In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #
import seaborn as sns
sns.set()
from scipy.stats import ttest_ind

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification



In [32]:
#df=pd.read_csv('health_insurance.csv')
df=pd.read_csv('~/work/teaching/ENGN2301/week 1/health_insurance.csv')

### df creates a 'dataframe' in python. It's like a spreadsheet in excel, with rows and columns. Use df.head() to take a look at just the first few rows

In [33]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Unnamed: 7,Unnamed: 8
0,19,female,27.9,0,yes,southwest,16885.0,,
1,31,female,25.74,0,no,southeast,3757.0,,
2,46,female,33.44,1,no,southeast,8241.0,,
3,37,female,27.74,3,no,northwest,7282.0,,
4,60,female,25.84,0,no,northwest,28923.0,,


### df.loc will give us particular columns and rows. Here we want the 'sex' column and only the females

In [9]:
df.loc[df["sex"] == 'female' ]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Unnamed: 7,Unnamed: 8
0,19,female,27.900,0,yes,southwest,16885.0,,
1,31,female,25.740,0,no,southeast,3757.0,,
2,46,female,33.440,1,no,southeast,8241.0,,
3,37,female,27.740,3,no,northwest,7282.0,,
4,60,female,25.840,0,no,northwest,28923.0,,
5,62,female,26.290,0,yes,southeast,27809.0,,
6,56,female,39.820,0,no,southeast,11091.0,,
7,52,female,30.780,1,no,northeast,10797.0,,
8,60,female,36.005,0,no,northeast,13229.0,,
9,30,female,32.400,1,no,southwest,4150.0,,


### Remember our t-test where we compared males and females in terms of their charges? let's run that again here

In [34]:
t_statistic, p_value = ttest_ind(df.loc[df["sex"] == 'female' ].charges,df.loc[df["sex"] == 'male' ].charges, equal_var=False)

In [35]:
print(f't_statistic: {t_statistic}\np_value: {p_value}')

t_statistic: -2.100847823212175
p_value: 0.03584453374892653


### To do further analysis on our dataframe, just like in excel, we need to convert both the sex and smoker variables to dummy variables 0/1. Females == 1 and Smokers == 1

In [36]:
sex=pd.get_dummies(df['sex'])
sex=sex['female']
df['sex']=sex
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Unnamed: 7,Unnamed: 8
0,19,1,27.9,0,yes,southwest,16885.0,,
1,31,1,25.74,0,no,southeast,3757.0,,
2,46,1,33.44,1,no,southeast,8241.0,,
3,37,1,27.74,3,no,northwest,7282.0,,
4,60,1,25.84,0,no,northwest,28923.0,,


In [37]:
smoker=pd.get_dummies(df['smoker'])
smoker=smoker['yes']
df['smoker']=smoker

### Now set up the data for model fitting, we want our X input data to be the age, sex, bmi and smoker columns of the dataframe and we want the y data to be the charges column of the dataframe

### --> the X input data are the independent variables that we are using to predict the dependent variable Y

In [38]:
X = df[['age', 'sex', 'bmi', 'smoker']]
y = df['charges']

### ok now we're going to split both X and y into training and test subsets

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

### First fit linear regression

In [40]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### How well does the model fit? model.score gives the coefficient of determination R^2. For more info: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

### first see R^2 on the training data (that was used to 'fit' the model i.e. determine the model parameters). Then see how that compares to the R^2 on the test data (that wasn't used to fit the model)

In [41]:
model.score(X_train, y_train)

0.7559254490751786

In [42]:
model.score(X_test, y_test)

0.7147529373724304

### some more output about the model fit (all on the test data)

In [43]:
predicted = model.predict(X_test)
explained_variance=metrics.explained_variance_score(y_test, predicted)
mean_absolute_error=metrics.mean_absolute_error(y_test, predicted) 
mse=metrics.mean_squared_error(y_test, predicted) 
r2=metrics.r2_score(y_test, predicted)

print('explained_variance: ', round(explained_variance,4))    
print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('RMSE: ', round(np.sqrt(mse),4))

explained_variance:  0.7161
r2:  0.7148
MAE:  4548.9949
RMSE:  6580.8501


### we can use the model to predict the cost for a particular person e.g. a 37 year old non-smoking male, with BMI=30

In [26]:
some_person = {'age' : 37,
        'sex' : 0,
        'bmi' : 30,
        'smoker' : 0
       }
index = [1]
some_person = pd.DataFrame(some_person,index)
print(some_person)
prediction_some_person = model.predict(some_person)
print("Estimated charge for this person is: $",str(prediction_some_person.round(2))[1:-1])

   age  sex  bmi  smoker
1   37    0   30       0
Estimated charge for this person is: $ 7492.43


### Now fit a neural network N.B. this takes much longer

In [45]:
clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
#clf = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=300,activation = 'relu',solver='adam',random_state=1)



In [51]:
#clf.fit(X_train,y_train)



MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=300, random_state=1)

In [50]:
nn_predicted=clf.predict(X_test)

In [51]:
explained_variance=metrics.explained_variance_score(y_test, nn_predicted)
mean_absolute_error=metrics.mean_absolute_error(y_test, nn_predicted) 
mse=metrics.mean_squared_error(y_test, nn_predicted) 
r2=metrics.r2_score(y_test, nn_predicted)

print('explained_variance: ', round(explained_variance,4))    
print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('RMSE: ', round(np.sqrt(mse),4))

explained_variance:  0.6594
r2:  0.6582
MAE:  4238.5037
RMSE:  7204.1386
