# **IMPORTING THE NECESSARY LIBRARIES**

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [43]:
#Importing the insurance dataset into Jupyter notebook
dataset = pd.read_csv("/Users/muralik/Desktop/Datasets/insurance.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


**CLEANING THE DATA**

In [44]:
#Dropping the duplicate values
dataset.drop_duplicates()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [45]:
#Checking the shape of the dataset
dataset.shape

(1338, 7)

In [46]:
#Getting the first five values of the data
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


**EXPLORATORY DATA ANALYSIS**

In [47]:
#Encoding the data
dataset.sex.replace(["female","male"],[0,1],inplace=True)

In [48]:
#Getting the unique values from the field region and encoding the same
dataset.region.unique()
dataset.region.replace(["southwest","southeast","northwest","northeast"],[0,1,2,3],inplace=True)

In [49]:
#Getting the unique values from the field smoker and encoding the same
dataset["smoker"].unique()
dataset.smoker.replace(["yes","no"],[0,1],inplace=True)

**CHECKING THE CORRELATION OF THE MODEL**

In [50]:
dataset[["age","charges"]].corr()

Unnamed: 0,age,charges
age,1.0,0.299008
charges,0.299008,1.0


In [51]:
#Transforming age to get a better correlation
dataset["Transformed_age"] = np.log(dataset["age"])
dataset[["Transformed_age","charges"]].corr()

Unnamed: 0,Transformed_age,charges
Transformed_age,1.0,0.289967
charges,0.289967,1.0


In [52]:
dataset[["sex","charges"]].corr()

Unnamed: 0,sex,charges
sex,1.0,0.057292
charges,0.057292,1.0


In [53]:
#Transforming sex to get a better correlation 
dataset["Transformed_Sex"] = np.power(dataset["sex"],2)
dataset[["Transformed_Sex","charges"]].corr()

Unnamed: 0,Transformed_Sex,charges
Transformed_Sex,1.0,0.057292
charges,0.057292,1.0


In [54]:
dataset[["bmi","charges"]].corr()

Unnamed: 0,bmi,charges
bmi,1.0,0.198341
charges,0.198341,1.0


In [55]:
#Transforming the feature BMI to get better correlation
dataset["Transformed_bmi"] = np.log(dataset["bmi"])
dataset[["Transformed_bmi","charges"]].corr()

Unnamed: 0,Transformed_bmi,charges
Transformed_bmi,1.0,0.199266
charges,0.199266,1.0


In [56]:
dataset[["children","charges"]].corr()

Unnamed: 0,children,charges
children,1.0,0.067998
charges,0.067998,1.0


In [57]:
#Transforming the feature children to get better correlation
dataset["Transformed_children"] = np.power(dataset["children"],3)
dataset[["Transformed_children","charges"]].corr()

Unnamed: 0,Transformed_children,charges
Transformed_children,1.0,0.01016
charges,0.01016,1.0


In [58]:
dataset[["smoker","charges"]].corr()

Unnamed: 0,smoker,charges
smoker,1.0,-0.787251
charges,-0.787251,1.0


In [59]:
dataset[["region","charges"]].corr()

Unnamed: 0,region,charges
region,1.0,0.006208
charges,0.006208,1.0


We are able to see that other than whether a person smokes or not (Feature - Smoker) and age,
there is no strong correlation between any other features and the target variable, thus we can drop
the other unnecessary features to get a better ML model

**SPLITING THE DATA**

In [60]:
#Splitting the data into training and testing
X_values = dataset[["smoker","age"]]
Y_values = dataset[["charges"]]
X_train,X_test,Y_train,Y_test = train_test_split(X_values,Y_values,test_size=0.3,random_state=42)
Regressor = LinearRegression()
Regressor.fit(X_train,Y_train)

In [61]:
for idx,col in enumerate(X_train.columns):
    print(f"The Coefficient of {col} is {Regressor.coef_[0][idx]}")

The Coefficient of smoker is -23665.569918297217
The Coefficient of age is 279.2612064703921


In [62]:
Y_intercept = Regressor.intercept_[0]
Y_intercept

21168.01724829703

In [63]:
Y_pred = Regressor.predict(X_test)

**CHECKING THE ACCURACY OF THE REGRESSION MODEL**

In [64]:
from sklearn.metrics import r2_score
Accuracy = r2_score(Y_test,Y_pred)
print(f"The accuracy of the model is {Accuracy * 100} %")

The accuracy of the model is 74.12271370215589 %


**APPLYING POLYNOMIAL LINEAR REGRESSION**

In [65]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly =  PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_train)
Linreg = LinearRegression()
Linreg.fit(X_poly,Y_train)

In [66]:
Y_pred_poly = Linreg.predict(poly.fit_transform(X_test))

**CHECKING THE ACCURACY OF THE MODEL**

In [67]:
Acc = r2_score(Y_test,Y_pred_poly)
print(f"The accuracy of the model is {Acc * 100} %")

The accuracy of the model is 74.03545314708906 %
