In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import tkinter as tk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR #Support Vector Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn .ensemble import GradientBoostingRegressor

In [None]:
# Read the insurance records from 'insurance.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv('insurance.csv')
# Preview the first 5 rows.
data.head()

In [None]:
# Preview the last 5 rows.
data.tail()

In [None]:
# (rows, columns) of the frame. `shape` is an attribute of the DataFrame, not a
# method, so it is read without parentheses.
data.shape

In [None]:
# Unpack the (rows, columns) tuple once, then report each dimension.
row_count, column_count = data.shape
print("Number of rows: ", row_count)
print("Number of columns: ", column_count)

In [None]:
# Print a summary of the frame loaded from the CSV (columns, non-null counts, dtypes).
data.info()

In [None]:
# Count the missing values in each column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
# How to read the percentile rows: the 25% row is the value below which a quarter
# of the records fall, e.g. 25% of the people are younger than the age shown there
# (about 27 according to the original note -- confirm against the output).

In [None]:
# Summary statistics for every column, including the categorical (non-numeric) ones.
data.describe(include='all')

In [None]:
# Machine learning algorithms work on numbers, not strings, so the categorical
# columns have to be converted to numeric codes before training.
# Start by listing the distinct labels present in the 'sex' column.
data['sex'].unique()

In [None]:
# Encode 'sex' as a number: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Only encode while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(data['sex']):
    data['sex'] = data['sex'].map(sex_codes)
# Fail here, rather than later during model fitting, if a label was not mapped.
assert data['sex'].notna().all(), "unexpected or missing value in 'sex' column"

In [None]:
# Encode 'smoker' as a number: yes -> 1, no -> 0.
smoker_codes = {'yes': 1, 'no': 0}
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Only encode while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(data['smoker']):
    data['smoker'] = data['smoker'].map(smoker_codes)
# Fail here, rather than later during model fitting, if a label was not mapped.
assert data['smoker'].notna().all(), "unexpected or missing value in 'smoker' column"
# Show the first rows to check the encoded column.
data.head()

In [None]:
# List the distinct labels present in the 'region' column before encoding it.
data['region'].unique()

In [None]:
# Encode 'region' as an integer code per label.
region_codes = {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4}
# Series.map turns every value that is not a key of the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Only encode while the column still holds the raw string labels.
if not pd.api.types.is_numeric_dtype(data['region']):
    data['region'] = data['region'].map(region_codes)
# Fail here, rather than later during model fitting, if a label was not mapped.
assert data['region'].notna().all(), "unexpected or missing value in 'region' column"

In [None]:
# Show the last rows to check the frame after the categorical columns were encoded.
data.tail()

In [None]:
# Split the frame into the target vector y (the 'charges' column) and the
# feature matrix X (every remaining column).
y = data['charges']
X = data.drop(columns=['charges'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the four candidate regressors on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# NOTE(review): SVR is fitted on unscaled features; scikit-learn recommends
# scaling inputs for SVMs -- consider a StandardScaler pipeline if its scores
# look poor.
svm = SVR()
svm.fit(X_train, y_train)

# Random forest and gradient boosting are stochastic. Seed them (same seed as
# the train/test split) so that Restart & Run All reproduces the same models,
# predictions and scores instead of slightly different ones on every run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

In [None]:
# Predict the test split with each fitted model, in LR, SVM, RF, GB order.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    estimator.predict(X_test) for estimator in (lr, svm, rf, gb)
)

# Put the true charges next to every model's prediction so the models can be
# compared row by row.
df = pd.DataFrame({
    'Actual': y_test,
    'LR': y_pred1,
    'SVM': y_pred2,
    'RF': y_pred3,
    'GB': y_pred4,
})
df

In [None]:
# Visually compare each model's predictions with the actual charges on the
# first 11 test samples, one panel per model.
# `df` keeps y_test's shuffled row labels as its index; plotting the Series
# directly would use those labels as x values and make the lines zig-zag back
# and forth. Reset the index so the x axis is simply the sample position.
preview = df.iloc[0:11].reset_index(drop=True)

fig, axes = plt.subplots(2, 2)
for ax, model_name in zip(axes.flat, ['LR', 'SVM', 'RF', 'GB']):
    ax.plot(preview['Actual'], label='Actual')
    ax.plot(preview[model_name], label=model_name)
    ax.set_title('Actual vs ' + model_name)
    ax.set_xlabel('Test sample')
    ax.set_ylabel('Charges')
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
#evaluating the models using R-squared, which measures the goodness of fit: the greater the R-squared value, the better the model
from sklearn import metrics

In [None]:
# R-squared of each model's predictions against the true test charges,
# in LR, SVM, RF, GB order.
score1, score2, score3, score4 = (
    metrics.r2_score(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [None]:
# R-squared scores in LR, SVM, RF, GB order (higher is better).
print(score1, score2, score3, score4)

In [None]:
# Mean absolute error of each model on the test split, in LR, SVM, RF, GB
# order; the lowest value is the best.
s1, s2, s3, s4 = (
    metrics.mean_absolute_error(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [None]:
# Mean absolute errors in LR, SVM, RF, GB order (lower is better).
print(s1, s2, s3, s4)

In [None]:
# One hand-written sample to predict for, using the same numeric encoding as
# the training columns (sex: 1 = male, smoker: 1 = yes, region: 2 = southeast).
testData = dict(
    age=40,
    sex=1,
    bmi=40.30,
    children=4,
    smoker=1,
    region=2,
)

# Wrap the sample in a single-row frame (row label 0).
# Note: this rebinds `df`, which previously held the prediction comparison table.
df = pd.DataFrame(testData, index=[0])
df

In [None]:
# Predict the charges for the new sample with the gradient boosting model
# (when the notebook is run top to bottom, `gb` here is the model fitted on the
# training split).
new_pred = gb.predict(df)
new_pred

In [None]:
# Before deployment, retrain the chosen model on the entire dataset so it
# learns from every available row.
# Seed the model (same seed as the train/test split) so the saved model is
# reproducible across Restart & Run All.
# Note: this rebinds `gb`; from here on it has seen the test rows as well.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X, y)

In [None]:
# Persist the fitted model to the file 'insurance_prediction' in the working directory.
joblib.dump(gb,'insurance_prediction')

In [None]:
# Reload the saved model and predict for the same sample. The prediction differs
# from the earlier one because this model was trained on the entire dataset.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model = joblib.load('insurance_prediction')
model.predict(df)