In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import style
style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from 'insurance.csv' (path is relative to the working
# directory) and display the resulting frame as the cell output.
data = pd.read_csv('insurance.csv')
data

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
data.info()

In [None]:
# Count the distinct values in each column.
data.nunique()

In [None]:
# List the distinct values found in a few selected columns.
# Each label is printed verbatim in front of the column's unique values.
for label, column in (
    ("children : ", 'children'),
    ("region  : ", 'region'),
    ("age  : ", 'age'),
):
    print(label, data[column].unique())

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# drop_duplicates() returns a new frame and leaves `data` untouched, so the
# result has to be assigned back; otherwise the duplicated rows stay in the
# data used by every later cell. The index is rebuilt so it stays contiguous.
data = data.drop_duplicates().reset_index(drop=True)
data

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of records per region.
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
sns.countplot(data=data, x='region', ax=ax)

In [None]:
# Number of records per age, split by sex.
fig, ax = plt.subplots(figsize=(15, 11), dpi=90)
sns.countplot(data=data, x='age', hue='sex', ax=ax)
plt.show()

In [None]:
# Number of smokers and non-smokers, split by sex.
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
sns.countplot(data=data, x='smoker', hue='sex', ax=ax)
plt.show()

In [None]:
# Distribution of charges for smokers versus non-smokers.
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
sns.boxplot(data=data, x='smoker', y='charges', ax=ax)

In [None]:
# Bar plot of charges grouped by smoker status (seaborn's default estimator).
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
sns.barplot(data=data, x='smoker', y='charges', ax=ax)

In [None]:
# Distribution of BMI: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated and scheduled for removal from seaborn; this is
# the replacement recommended by seaborn for the same histogram + KDE figure
# (stat='density' and cut=3 mirror distplot's defaults).
plt.figure(figsize=(15, 5), dpi=90)
sns.histplot(data=data, x='bmi', kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Scatter of BMI (y axis) against charges (x axis).
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
sns.scatterplot(data=data, x='charges', y='bmi', ax=ax)

In [None]:
# Charges as a function of BMI, one line per smoker status.
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
sns.lineplot(data=data, x='bmi', y='charges', hue='smoker', ax=ax)

In [None]:
# Encode sex as an integer: male -> 0, female -> 1.
sex_mapping = {'male': 0, 'female': 1}
# replace() leaves already-encoded values alone, so re-running this cell is
# harmless. The explicit cast is needed because relying on replace() to turn
# the column into an integer dtype by itself is deprecated in recent pandas;
# it also fails loudly if a label outside the mapping is present.
data['sex'] = data['sex'].replace(sex_mapping).astype('int64')

In [None]:
# Encode smoker as an integer: yes -> 1, no -> 0.
# Series.map turns every value missing from the dict into NaN, so running the
# cell a second time (when the column already holds 0/1) silently wiped the
# column. replace() keeps already-encoded values, and the explicit cast both
# guarantees an integer dtype and raises on any unexpected label.
data['smoker'] = data['smoker'].replace({'yes': 1, 'no': 0}).astype('int64')

In [None]:
# Remove the 'region' column from the frame.
# The in-place drop raised KeyError when the cell was executed a second time;
# errors='ignore' makes the cell safe to re-run, and rebinding `data` avoids
# inplace mutation.
data = data.drop(columns='region', errors='ignore')

In [None]:
# Preview the first rows of the frame in its current state.
data.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the frame's columns.
fig, ax = plt.subplots(figsize=(15, 5), dpi=90)
correlations = data.corr()
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# Positional split: every column except the last becomes the feature matrix,
# and the last column becomes the target vector.
feature_frame = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
X = feature_frame.to_numpy()
Y = target_column.to_numpy()

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
Y

In [None]:
# Report the dimensions of the feature matrix and of the target vector.
for caption, array in (
    ('Independent Feature Set Shape : ', X),
    ('Dependent Feature Shape       : ', Y),
):
    print(caption, array.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Report the dimensions of each part of the train/test split.
for caption, array in (
    ('Training data shape   : ', x_train),
    ('Training labels shape : ', y_train),
    ('Testing data shape    : ', x_test),
    ('Testing labels shape  : ', y_test),
):
    print(caption, array.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# The scaler is fitted on the training split only, so the test split does not
# influence the learned minimum/maximum of each feature.
scaler = MinMaxScaler()
scaler.fit(x_train)

In [None]:
# Scale both splits with the scaler fitted on the training data.
# NOTE: this overwrites the unscaled arrays, so executing the cell twice
# applies the scaling to already-scaled values.
x_train, x_test = (scaler.transform(split) for split in (x_train, x_test))

In [None]:
# Shape of the training features after scaling.
print(x_train.shape)

In [None]:
# Print the scaled test features (the full array is written to the output).
print(x_test)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression with default settings on the scaled training
# features and the training targets.
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [None]:
# Spot-check: predictions for the first five test rows.
regressor.predict(x_test[:5])

In [None]:
# Predictions for both splits, used by the metrics computed afterwards.
pred_train, pred_test = map(regressor.predict, (x_train, x_test))

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions on the test split.
mean_absolute_error(y_test, pred_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 on each split; comparing the two values shows how well the fit
# carries over from the training data to the held-out data.
for caption, actual, predicted in (
    ('Training Set Score : ', y_train, pred_train),
    ('Testing Set Score  : ', y_test, pred_test),
):
    print(caption, r2_score(actual, predicted))

In [None]:
import pickle

# Persist the fitted model to the file "regressor".
# `dump` is a function of the pickle module, not a method of the file object:
# the previous `pick_op.dump(...)` raised AttributeError and left the file
# handle open. The `with` block closes the file even if pickling fails.
# NOTE: the model was trained on MinMax-scaled features, and the fitted
# `scaler` is not saved by this cell.
with open("regressor", "wb") as pick_op:
    pickle.dump(regressor, pick_op)

In [None]:
import pickle

# Serialise the fitted model to bytes first, then write those bytes to the
# "regressor" file in binary mode; the file is closed when the block exits.
serialized_model = pickle.dumps(regressor)
with open("regressor", "wb") as pick_op:
    pick_op.write(serialized_model)
