In [134]:
import pandas as pd

In [135]:
# Read the insurance records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='insurance.csv')

In [136]:
# Preview the first five rows of the raw frame.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [137]:
# Preview the last five rows of the raw frame.
data.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [138]:
# (number of rows, number of columns) of the loaded frame
data.shape

(1338, 7)

In [91]:
# Per-column dtype and non-null count, plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [92]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [93]:
# Count missing values per column.
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [94]:
# Look at the rows again before encoding the text columns.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [95]:
# List the distinct labels in 'sex' so they can be encoded as integers next.
data['sex'].unique()

array(['female', 'male'], dtype=object)

In [96]:
# Encode 'sex' as an integer flag: male -> 1, female -> 0.
# The dtype guard makes the cell safe to re-run: calling .map() again on the
# already-encoded column would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['sex']):
    data['sex'] = data['sex'].map({'male':1,'female':0})

In [97]:
# Confirm that 'sex' now holds 0/1 instead of text labels.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [98]:
# List the distinct labels in 'smoker' before encoding them.
data['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [99]:
# Encode 'smoker' as an integer flag: yes -> 1, no -> 0.
# The dtype guard makes the cell safe to re-run: calling .map() again on the
# already-encoded column would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['smoker']):
    data['smoker'] = data['smoker'].map({'yes':1,'no':0})

In [100]:
# Confirm that 'smoker' now holds 0/1 instead of text labels.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [101]:
# List the distinct labels in 'region' before encoding them.
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [102]:
# Encode 'region' as integer codes 1-4.
# The dtype guard makes the cell safe to re-run: calling .map() again on the
# already-encoded column would find no matching keys and turn every value into NaN.
# NOTE(review): the codes are arbitrary, yet the models will treat them as an
# ordered numeric feature; one-hot encoding may be more appropriate.
if not pd.api.types.is_numeric_dtype(data['region']):
    data['region'] = data['region'].map({'southwest':1, 'southeast':2, 'northwest':3, 'northeast':4})

In [103]:
# Confirm that 'region' now holds integer codes (first two rows only).
data.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523


### Store Feature Matrix in X and Target in vector y

In [104]:
# Feature matrix: every column except the target 'charges'.
X = data.drop(columns=['charges'])

In [105]:
# Display the feature matrix (pandas abbreviates the middle rows).
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,1
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,3
4,32,1,28.880,0,0,3
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,3
1334,18,0,31.920,0,0,4
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,1


In [106]:
# Target vector: the 'charges' column.
y = data.loc[:, 'charges']

In [107]:
# Display the target vector (pandas abbreviates the middle rows).
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

### Train / Test Split
1. Split the data into two parts: a training set and a testing set
2. Train the model(s) on Training set
3. Test the model(s) on Testing set

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [110]:
# Display the training features; the row labels are the shuffled original indices.
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,0,19.950,2,0,3
1285,47,0,24.320,0,0,4
1142,52,0,24.860,0,0,2
969,39,0,34.320,5,0,2
486,54,0,21.470,3,0,3
...,...,...,...,...,...,...
1095,18,0,31.350,4,0,4
1130,39,0,23.870,5,0,2
1294,58,1,25.175,0,0,4
860,37,0,47.600,2,1,1


In [111]:
# train our models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [112]:
# Fit four candidate regressors on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# NOTE(review): SVR is fit here with default settings on unscaled features and
# an unscaled target; its test R² reported further down is negative, so scaling
# and tuning may be needed before it is a fair comparison.
sv = SVR()
sv.fit(X_train, y_train)

# The tree ensembles are randomized; seeding them keeps the reported scores
# reproducible across kernel restarts.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

Generate predictions on the test set to check model performance

In [113]:
# Predict charges for the held-out rows with each fitted model, in the order
# linear regression, SVR, random forest, gradient boosting.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    fitted.predict(X_test) for fitted in (lr, sv, rf, gb)
)

In [114]:
# Echo the four prediction arrays as a tuple (the output is long).
y_pred1,y_pred2,y_pred3,y_pred4

(array([ 8924.40724442,  7116.29501758, 36909.01352144,  9507.87469118,
        27013.3500079 , 10790.77956153,   226.29844571, 16942.71599941,
         1056.63079407, 11267.91997309, 28048.59793155,  9424.36324087,
         5326.32232088, 38460.06017922, 40303.40597026, 37147.01010262,
        15287.91876684, 35965.05485917,  9179.1753067 , 31510.8319849 ,
         3797.79068365, 10070.82803304,  2312.57551348,  7074.41348194,
        11352.37224357, 12907.77079523, 14448.84678727,  6205.65997921,
         9917.00839638,  2239.50032819,  9060.55469043, 13120.56214535,
         4617.70702822,  3467.91218926,  4402.74821855, 12967.91608907,
         1927.44498944,  8757.9180081 , 33324.35180597, 32638.47697026,
         3852.41756615,  4370.39670883, 14080.76023234, 11478.63402576,
         8829.26135924, 12046.15119133,  5322.80515731,  3100.71182484,
        35546.60547574,  9201.61196817, 15894.23763341,  2406.04003607,
        12397.52052544,  1433.90617387, 13448.14094304, 12519.54

Evaluate Our Model

In [115]:
from sklearn import metrics

In [116]:
# R² on the test set for each model's predictions, in the same order as the
# prediction arrays.
result1, result2, result3, result4 = (
    metrics.r2_score(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [117]:
# Show the four R² scores side by side.
result1,result2,result3,result4

(0.7833463107364539,
 -0.07229762787861826,
 0.8647753507775777,
 0.877993618163719)

In [118]:
# Tabulate each algorithm next to its test-set R² score.
# The column labels are kept exactly as spelled because the plotting cell
# looks them up by name.
result = pd.DataFrame(
    [
        ('LinearRegression', result1),
        ('SVR', result2),
        ('RandomForestRegressor', result3),
        ('GradientBoostingRegressor', result4),
    ],
    columns=['Alogirthms', 'Predicted Result'],
)

In [119]:
# Display the comparison table.
result

Unnamed: 0,Alogirthms,Predicted Result
0,LinearRegression,0.783346
1,SVR,-0.072298
2,RandomForestRegressor,0.864775
3,GradientBoostingRegressor,0.877994


Visualize

In [120]:
import plotly.express as px

In [121]:
# Horizontal bar chart comparing the test-set R² score of each algorithm.
fig = px.bar(
    result,
    y = 'Alogirthms',
    x = 'Predicted Result',
    color='Alogirthms',
    text_auto=',.0%',  # print each bar's value as a whole-number percentage
    title='Test-set R² score by regression algorithm',
    # `labels` only changes the text shown on axes and legend; the frame's
    # column names are still used unchanged for the lookups above.
    labels={'Alogirthms': 'Algorithm', 'Predicted Result': 'R² score'},
)
fig.show()

In [122]:
import joblib

In [139]:
# Refit gradient boosting on the full dataset (X, y) to produce the model
# that is saved to disk in the next cell.
# This rebinds `gb`, replacing the estimator that was scored on the test split.
# The seed makes the saved model reproducible from a fresh kernel.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X, y)

In [140]:
# Persist the fitted model to the file 'Insurance_Model' in the working directory.
joblib.dump(value=gb, filename='Insurance_Model')

['Insurance_Model']

In [141]:
# Load the persisted model back from disk.
model = joblib.load(filename='Insurance_Model')

In [144]:
# Feature values for a single customer, using the same integer encodings that
# were applied to 'sex', 'smoker' and 'region' earlier in the notebook.
new_customer = dict(
    age=19,
    sex=0,
    bmi=27.900,
    children=0,
    smoker=1,
    region=1,
)
# One-row frame (row label 0) with one column per feature.
new_df = pd.DataFrame([new_customer], index=[0])
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,1


In [145]:
# Predict charges for the new customer with the reloaded model.
model.predict(new_df)

array([18901.83209496])