In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #
import seaborn as sns
sns.set()
from scipy.stats import ttest_ind

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification



In [13]:
df=pd.read_csv('health_insurance.csv')

In [14]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Unnamed: 7,Unnamed: 8
0,19,female,27.9,0,yes,southwest,16885.0,,
1,31,female,25.74,0,no,southeast,3757.0,,
2,46,female,33.44,1,no,southeast,8241.0,,
3,37,female,27.74,3,no,northwest,7282.0,,
4,60,female,25.84,0,no,northwest,28923.0,,


In [15]:
t_statistic, p_value = ttest_ind(df.loc[df["sex"] == 'female' ].charges,df.loc[df["sex"] == 'male' ].charges, equal_var=False)

In [16]:
print(f't_statistic: {t_statistic}\np_value: {p_value}')

t_statistic: -2.100847823212175
p_value: 0.03584453374892653


In [17]:
## convert sex variable to dummy 0/1

In [18]:
sex=pd.get_dummies(df['sex'])
sex=sex['female']
df['sex']=sex

In [19]:
smoker=pd.get_dummies(df['smoker'])
smoker=smoker['yes']
df['smoker']=smoker

# Now set up the data for model fitting

In [20]:
X = df[['age', 'sex', 'bmi', 'smoker']]
y = df['charges']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

## First fit linear regression

In [36]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [38]:
model.score(X_train, y_train)

0.7559254490751786

In [39]:
model.score(X_test, y_test)

0.7147529373724303

In [40]:
predicted = model.predict(X_test)
explained_variance=metrics.explained_variance_score(y_test, predicted)
mean_absolute_error=metrics.mean_absolute_error(y_test, predicted) 
mse=metrics.mean_squared_error(y_test, predicted) 
r2=metrics.r2_score(y_test, predicted)

print('explained_variance: ', round(explained_variance,4))    
print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('RMSE: ', round(np.sqrt(mse),4))

explained_variance:  0.7161
r2:  0.7148
MAE:  4548.9949
RMSE:  6580.8501


In [45]:
some_person = {'age' : 37,
        'sex' : 0,
        'bmi' : 30,
        'smoker' : 0
       }
index = [1]
some_person = pd.DataFrame(some_person,index)
print(some_person)
prediction_some_person = model.predict(some_person)
print("Estimated charge for this person is: $",str(prediction_new_client1.round(2))[1:-1])

   age  sex  bmi  smoker
1   37    0   30       0
Estimated charge for this person is: $ 7492.43


## Now fit a neural network

In [50]:
clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
#clf = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=300,activation = 'relu',solver='adam',random_state=1)

In [51]:
clf.fit(X_train,y_train)



MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=300, random_state=1)

In [52]:
predicted=clf.predict(X_test)

In [53]:
explained_variance=metrics.explained_variance_score(y_test, predicted)
mean_absolute_error=metrics.mean_absolute_error(y_test, predicted) 
mse=metrics.mean_squared_error(y_test, predicted) 
r2=metrics.r2_score(y_test, predicted)

print('explained_variance: ', round(explained_variance,4))    
print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('RMSE: ', round(np.sqrt(mse),4))

explained_variance:  0.2613
r2:  0.2422
MAE:  6431.9291
RMSE:  10726.4297
