In [185]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [215]:
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [187]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [188]:
from sklearn.preprocessing import LabelEncoder
df['sex'] = LabelEncoder().fit_transform(df['sex'])
df['smoker'] = LabelEncoder().fit_transform(df['smoker'])

In [189]:
dummies = pd.get_dummies(df['region'])
dumf = dummies.drop(['southwest'],axis=1)
dumf.head()

Unnamed: 0,northeast,northwest,southeast
0,0,0,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0


In [190]:
merg = pd.concat([df,dumf],axis=1)
merg.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [191]:
final= merg.drop('region',axis=1)
final.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,16884.924,0,0,0
1,18,1,33.77,1,0,1725.5523,0,0,1
2,28,1,33.0,3,0,4449.462,0,0,1
3,33,1,22.705,0,0,21984.47061,0,1,0
4,32,1,28.88,0,0,3866.8552,0,1,0


In [194]:
x = final.drop('charges',axis=1)
y = final['charges']

In [195]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20)

In [197]:
a={'Model Name':[], 'Mean_Absolute_Error_MAE':[] ,'Adj_R_Square':[] ,'Root_Mean_Squared_Error_RMSE':[] ,'Mean_Absolute_Percentage_Error_MAPE':[] ,'Mean_Squared_Error_MSE':[] ,'Root_Mean_Squared_Log_Error_RMSLE':[] ,'R2_score': [] }
results=pd.DataFrame(a)

In [198]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [199]:
model_lr = LinearRegression()
model_svr = SVR()
model_dtr = DecisionTreeRegressor()
model_rfr = RandomForestRegressor()
MM = [model_lr,model_svr,model_dtr,model_rfr]

In [200]:
for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    results = results.append(new_row, ignore_index=True)
    #------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 3884.554
Mean Squared Error (MSE): 29685454.06
Root Mean Squared Error (RMSE): 5448.436
R2_score: 0.802679
Root Mean Squared Log Error (RMSLE): 8.603
Mean Absolute Percentage Error (MAPE): 42.86 %
Adj R Square:  0.801491
------------------------------------------------------------------------------------------------------------
Model Name:  SVR()
Mean Absolute Error (MAE): 7913.986
Mean Squared Error (MSE): 160362598.581
Root Mean Squared Error (RMSE): 12663.435
R2_score: -0.065941
Root Mean Squared Log Error (RMSLE): 9.446
Mean Absolute Percentage Error (MAPE): 102.5 %
Adj R Square:  -0.072357
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 2552.818
Mean Squared Error (MSE): 31194711.31
Root Mean Squared Error (RMSE): 5585.223
R2_score: 0.792647
Root Mean Squared Log Error (RMSLE): 8.628
Mean Absolute Pe

In [201]:
results

Unnamed: 0,Model Name,Mean_Absolute_Error_MAE,Adj_R_Square,Root_Mean_Squared_Error_RMSE,Mean_Absolute_Percentage_Error_MAPE,Mean_Squared_Error_MSE,Root_Mean_Squared_Log_Error_RMSLE,R2_score
0,LinearRegression(),3884.554058,0.801491,5448.435928,42.861499,29685450.0,8.603084,0.802679
1,SVR(),7913.985839,-0.072357,12663.435497,102.498084,160362600.0,9.446474,-0.065941
2,DecisionTreeRegressor(),2552.817611,0.791399,5585.222584,29.605646,31194710.0,8.62788,0.792647
3,"(DecisionTreeRegressor(max_features=1.0, rando...",2425.471317,0.882255,4196.161335,30.300873,17607770.0,8.341925,0.88296


In [202]:
##lets use random forest for this one

In [203]:
model = RandomForestRegressor(n_estimators = 100, random_state = 1)

In [204]:
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

In [205]:
out = pd.DataFrame({'Actual_price': y_test ,'Predicted_price':y_pred})


In [206]:
res = df.merge(out,left_index=True,right_index=True)

In [207]:
res.sample(20)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Actual_price,Predicted_price
8,37,1,29.83,2,0,northeast,6406.4107,6406.4107,9701.566223
159,50,0,27.83,3,0,southeast,19749.38338,19749.38338,10827.118474
767,42,0,29.0,1,0,southwest,7050.642,7050.642,7023.675462
477,25,1,35.625,0,0,northwest,2534.39375,2534.39375,3368.201467
869,25,0,24.3,3,0,southwest,4391.652,4391.652,7304.583545
1092,23,1,32.7,3,0,southwest,3591.48,3591.48,5914.375195
171,49,1,30.3,0,0,southwest,8116.68,8116.68,9977.039289
339,46,0,27.72,1,0,southeast,8232.6388,8232.6388,11143.932057
103,61,0,29.92,3,1,southeast,30942.1918,30942.1918,29676.650175
541,20,0,31.79,2,0,southeast,3056.3881,3056.3881,5679.435714


In [208]:
model.score(x_test,y_test)

0.881816580104568

In [210]:
import pickle

In [212]:
pickle.dump(model,open('model.pkl','wb'))

In [213]:
pickle.dump(df,open('df.pkl','wb'))

In [214]:
x_train

Unnamed: 0,age,sex,bmi,children,smoker,northeast,northwest,southeast
1332,52,0,44.700,3,0,0,0,0
165,47,1,28.215,4,0,1,0,0
1229,58,1,30.305,0,0,1,0,0
246,60,0,38.060,0,0,0,0,1
428,21,0,16.815,1,0,1,0,0
...,...,...,...,...,...,...,...,...
1326,42,0,32.870,0,0,1,0,0
1304,42,1,24.605,2,1,1,0,0
487,19,1,28.700,0,0,0,0,0
563,50,1,44.770,1,0,0,0,1
