In [1]:
import pandas as pd

In [2]:
# Load the insurance dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv('insurance.csv')

# EDA

In [3]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
data.shape

(1338, 7)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
data.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


## Convert Columns From String ['sex', 'smoker', 'region'] To Numerical Values

In [9]:
data['sex'].unique()

array(['female', 'male'], dtype=object)

In [10]:
data['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [11]:
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [12]:
# Integer codes assigned to each categorical column. The GUI prompts further
# down ("Male Or Female [1/0]", "Smoker Yes/No [1/0]", "Region [1-4]") rely on
# exactly these codes, so keep both in sync.
CATEGORY_CODES = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}

for column, codes in CATEGORY_CODES.items():
    # Series.map turns every value that is missing from the mapping into NaN.
    # Skipping columns that are already numeric makes this cell safe to re-run
    # instead of wiping the previously encoded values.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    encoded = data[column].map(codes)
    # Fail loudly on labels the mapping does not know rather than passing NaN
    # on to the models.
    if encoded.isnull().any():
        unknown = sorted(set(data.loc[encoded.isnull(), column].astype(str)))
        raise ValueError("Unmapped values in column %r: %s" % (column, unknown))
    data[column] = encoded

In [13]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.88,0,0,3,3866.8552


## Feature Selection

In [14]:
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [15]:
# Feature matrix: every column of `data` except the target `charges`.
X = data.drop(columns=['charges'])

In [16]:
# Target: the `charges` column that the models learn to predict.
y = data['charges']

### 9. Train/Test split
#### 1. Split the data into two parts: a training set and a testing set
#### 2. Train the model(s) on the training set
#### 3. Test the model(s) on the testing set

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
y_train

560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
           ...     
1095     4561.18850
1130     8582.30230
1294    11931.12525
860     46113.51100
1126    10214.63600
Name: charges, Length: 1070, dtype: float64

### 10. Import the models

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

### 11. Model Training

In [21]:
# Fit four regressors on the same training split so they can be compared on
# the held-out test rows.
lr = LinearRegression()
lr.fit(X_train, y_train)

# NOTE(review): this SVR is fitted with default hyper-parameters on unscaled
# features and an unscaled target; its R^2 in the evaluation section is
# negative. Consider scaling or tuning before drawing conclusions from it.
svm = SVR()
svm.fit(X_train, y_train)

# The tree ensembles are stochastic; pinning random_state makes the reported
# scores reproducible under Restart & Run All.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

gr = GradientBoostingRegressor(random_state=42)
gr.fit(X_train, y_train)

GradientBoostingRegressor()

### Prediction on Test Data

In [22]:
# Score the held-out rows with each fitted model (order: lr, svm, rf, gr).
y_pred1, y_pred2, y_pred3, y_pred4 = (
    estimator.predict(X_test) for estimator in (lr, svm, rf, gr)
)

# Side-by-side table of the true charges and each model's prediction.
df1 = pd.DataFrame(
    {
        'Actual': y_test,
        'Lr': y_pred1,
        'svm': y_pred2,
        'rf': y_pred3,
        'gr': y_pred4,
    }
)

In [23]:
df1.head()

Unnamed: 0,Actual,Lr,svm,rf,gr
764,9095.06825,8924.407244,9548.261584,10989.238326,11001.128629
887,5272.1758,7116.295018,9492.515425,5244.800143,5840.174656
890,29330.98315,36909.013521,9648.758701,28452.573176,28001.980112
1293,9301.89355,9507.874691,9555.044136,9771.831221,9745.291602
259,33750.2918,27013.350008,9420.421978,34615.801228,33639.100981


## Evaluating the Algorithm

In [24]:
from sklearn import metrics

In [25]:
# R^2 on the test split for each model, in the order lr, svm, rf, gr.
score1, score2, score3, score4 = (
    metrics.r2_score(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [26]:
print(score1,score2,score3,score4)

0.7833463107364539 -0.07229762787861826 0.8627736902542527 0.8779726251291786


In [27]:
# Mean absolute error on the test split for each model, in the order
# lr, svm, rf, gr.
s1, s2, s3, s4 = (
    metrics.mean_absolute_error(y_test, predicted)
    for predicted in (y_pred1, y_pred2, y_pred3, y_pred4)
)

In [28]:
print(s1,s2,s3,s4)

4186.508898366434 8592.428727899724 2457.732899181811 2447.9515580545844


## Predict Charges For New Customer

In [29]:
# One hypothetical customer, using the same integer codes as the training
# data. A dedicated name is used here so the `data` DataFrame loaded at the
# top of the notebook is not overwritten by a dict, which would break the
# earlier cells if they were re-run.
new_customer = {'age' : 40,
                'sex' : 1,
                'bmi' : 40.30,
                'children' : 4,
                'smoker' : 1,
                'region' : 2}

# index=[0] is required because every value in the dict is a scalar.
df = pd.DataFrame(new_customer,index=[0])
df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,40,1,40.3,4,1,2


In [30]:
# Predict with the gradient-boosting model that was fitted on the training
# split; `predict` returns an array with one value per input row.
new_pred = gr.predict(df)
print("Medical Insurance cost for new : ",new_pred)

Medical Insurance cost for new :  [43013.23345491]


## Save Model Using Joblib

In [31]:
# Refit gradient boosting on the full dataset (X, y) so the saved model has
# seen every row. random_state is pinned so that re-running the notebook
# produces the same persisted model.
gr = GradientBoostingRegressor(random_state=42)
gr.fit(X,y)

GradientBoostingRegressor()

In [32]:
import joblib

In [33]:
# Persist the fitted model to the file 'model_joblib_gr' in the working
# directory; the GUI callback loads it from the same path.
joblib.dump(gr,'model_joblib_gr')

['model_joblib_gr']

In [34]:
# Reload the model from disk to check the round trip. joblib files are
# pickle-based, so only load files that come from a trusted source.
model = joblib.load('model_joblib_gr')

In [35]:
# Sanity-check the reloaded model on one customer. The model was fitted on a
# DataFrame, so the row is passed with the same column names; this lets
# scikit-learn validate the feature order instead of warning about missing
# feature names.
model.predict(pd.DataFrame([[40,1,40.3,4,1,2]], columns=X.columns))



array([42148.361888])

### GUI

In [36]:
from tkinter import *

In [37]:
import joblib

In [38]:
def show_entry():
    """Predict the insurance cost from the six form fields and display it.

    Reads the module-level Entry widgets ``e1``..``e6`` (age, sex, BMI,
    number of children, smoker flag, region code, in that order), loads the
    model saved as 'model_joblib_gr' and shows the prediction in ``master``.

    If any field is empty or not a number, an error message is shown in the
    window instead of letting ``ValueError`` escape from the Tk callback.
    """
    # Column names and order must match the frame the saved model was fitted on.
    feature_names = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
    fields = (e1, e2, e3, e4, e5, e6)

    # Drop whatever a previous click left in the output rows so labels do not
    # pile up on top of each other. Rows 8-9 are used because the Predict
    # button is gridded without an explicit row and ends up in row 7.
    for stale in master.grid_slaves(row=8) + master.grid_slaves(row=9):
        stale.destroy()

    # Validate before touching the model file so bad input fails fast.
    try:
        values = [float(field.get()) for field in fields]
    except ValueError:
        Label(master, text="Invalid input: every field must be a number").grid(
            row=8, columnspan=2)
        return

    model = joblib.load('model_joblib_gr')
    # Pass a named DataFrame so scikit-learn can check the feature order.
    features = pd.DataFrame([values], columns=feature_names)
    result = model.predict(features)

    Label(master, text = "Insurance Cost").grid(row=8)
    # predict() returns a one-element array; show the number, not the array repr.
    Label(master, text="{:.2f}".format(result[0])).grid(row=9)

In [39]:
# Build the prediction form. `show_entry` (the Predict callback) must already
# be defined, and it reads the Entry widgets e1..e6 created here.
master = Tk()
master.title('Insurance Cost Prediction')
# grid() returns None, so create the widget first and lay it out separately;
# otherwise `label` would be bound to None instead of the Label.
label = Label(master, text = 'Insurance Cost Prediction',
        bg = 'black', fg = 'white')
label.grid(row=0, columnspan=2)

# Creating labels for features (one per row, matching the entry order below)
Label(master,text = "Enter Your Age").grid(row=1)
Label(master,text = "Male Or Female [1/0]").grid(row=2)
Label(master,text = "Enter Your BMI Value").grid(row=3)
Label(master,text = "Enter Number of Children").grid(row=4)
Label(master,text = "Smoker Yes/No [1/0]").grid(row=5)
Label(master,text = "Region [1-4]").grid(row=6)

# Creating input widget instances: e1..e6 hold age, sex, BMI, children,
# smoker and region respectively
e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)

# Setting position for input widgets (second column, next to their labels)
e1.grid(row=1,column=1)
e2.grid(row=2,column=1)
e3.grid(row=3,column=1)
e4.grid(row=4,column=1)
e5.grid(row=5,column=1)
e6.grid(row=6,column=1)

# Creating the button that triggers the prediction via show_entry
Button(master, text="Predict",command=show_entry).grid()

# Blocks this cell until the window is closed.
master.mainloop()

