In [4]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'numpy'

### **Data Collection**

In [None]:
# Read the insurance records from the CSV file located next to the notebook,
# then preview the first four rows.
insurance_dset = pd.read_csv(filepath_or_buffer='insurance.csv')
insurance_dset.head(n=4)

: 

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
insurance_dset.info()

: 

Here, the 'charges' column is the target variable, i.e. the value the model is meant to predict.

All other columns are the features. Of these, 'sex', 'smoker', and 'region' are categorical features.

### **Preprocessing-1**

In [None]:
# Data cleaning: count the missing entries in every column.
insurance_dset.isna().sum()

: 

### **Data Analysis**

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# columns that describe() summarizes by default.
insurance_dset.describe()

: 

In [None]:
# Apply seaborn's default theme once; it then styles every later plot.
sns.set_theme()

# histplot is an axes-level function, so it draws into the 7x6 figure created
# here. The figure-level displot opens its own figure instead, which would
# leave this one empty and ignore the requested size.
fig, ax = plt.subplots(figsize=(7, 6))
sns.histplot(insurance_dset['age'], ax=ax)
ax.set_title('Age Distribution')
plt.show()

: 

In [None]:
# Bar chart of the number of records in each 'sex' category.
fig, ax = plt.subplots(figsize=(4, 3))
sns.countplot(data=insurance_dset, x='sex', ax=ax)
ax.set_title('Sex Distribution')
plt.show()

: 

In [None]:
# Histogram of the 'bmi' column. The axes-level histplot draws into the 5x4
# figure created here; the figure-level displot would open a separate figure
# and leave this one empty.
fig, ax = plt.subplots(figsize=(5, 4))
sns.histplot(insurance_dset['bmi'], ax=ax)
ax.set_title('BMI Distribution')
plt.show()

: 

In [None]:
# Bar chart of the number of records for each value of 'children'.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(data=insurance_dset, x='children', ax=ax)
ax.set_title('Children')
plt.show()

: 

In [None]:
# Exact record count behind each bar of the 'children' chart.
insurance_dset.loc[:, 'children'].value_counts()

: 

In [None]:
# Bar chart of the number of records in each 'smoker' category.
fig, ax = plt.subplots(figsize=(4, 3))
sns.countplot(data=insurance_dset, x='smoker', ax=ax)
ax.set_title('Smoker Distribution')
plt.show()

: 

In [None]:
# Exact record count for each 'smoker' category.
insurance_dset.loc[:, 'smoker'].value_counts()

: 

In [None]:
# Bar chart of the number of records in each 'region' category.
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(data=insurance_dset, x='region', ax=ax)
ax.set_title('Region')
plt.show()

: 

In [None]:
# Exact record count for each 'region' category.
insurance_dset.loc[:, 'region'].value_counts()

: 

In [None]:
# Histogram of the target column 'charges'.
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(insurance_dset['charges'], ax=ax)
ax.set_title('Price Distribution')
plt.show()

: 

In [None]:
# 'charges' against 'bmi', with points coloured by the 'smoker' column.
sns.scatterplot(
    data=insurance_dset,
    x='bmi',
    y='charges',
    hue='smoker',
    s=30,
)
plt.show()

: 

In [None]:
# 'charges' against 'age', with points coloured by the 'smoker' column.
sns.scatterplot(
    data=insurance_dset,
    x='age',
    y='charges',
    hue='smoker',
    s=40,
)
plt.show()

: 

### **Preprocessing-2**

Encoding the categorical features

In [None]:
# Integer code assigned to each label of the categorical columns.
# NOTE(review): 'region' is nominal, so codes 0-3 impose an artificial order on
# a linear model; one-hot encoding may be preferable, but it would change the
# feature layout that the single-record prediction cell relies on.
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3},
}

# Build a new frame instead of mutating in place. Values that are not a known
# label (e.g. codes left by a previous run of this cell) pass through
# unchanged, so re-running the cell is safe. The explicit cast gives integer
# columns without relying on replace()'s deprecated silent downcasting, and
# raises if an unexpected label or missing value is still present.
insurance_dset = insurance_dset.assign(**{
    column: insurance_dset[column]
    .map(lambda value, codes=codes: codes.get(value, value))
    .astype('int64')
    for column, codes in category_codes.items()
})
insurance_dset.head()

: 

Splitting the features and target

In [None]:
# Target: the 'charges' column. Features: every remaining column.
y = insurance_dset['charges']
x = insurance_dset.drop(columns=['charges'])

: 

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# printing the whole feature frame as plain text.
x.head()

: 

In [None]:
# Preview the first target values instead of printing the whole series.
y.head()

: 

Train-Test split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)
# Shapes of the full, training and test feature sets.
print(x.shape, x_train.shape, x_test.shape)

: 

**Training: Linear Regression Model**

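Equation of the fitted model: $y = m_1 x_1 + m_2 x_2 + \dots + m_n x_n + c$

where $x_i$ are the input features, $y$ is the predicted insurance charge, $m_i$ are the learned coefficients (slopes), and $c$ is the intercept.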

In [None]:
# Create a linear regression estimator with its default settings (not yet fitted).
model = LinearRegression()

: 

In [None]:
# Fit the regression on the training features and their charges.
model.fit(X=x_train, y=y_train)

: 

Testing

In [None]:
# Predictions on the training rows, used for the training-set R² below.
lin_prediction = model.predict(X=x_train)

: 

In [None]:
# R² of the model on the data it was trained on.
r2_train = metrics.r2_score(y_true=y_train, y_pred=lin_prediction)
print('R squared value:', r2_train)

: 

In [None]:
# Predictions on the held-out test rows.
lin_test = model.predict(X=x_test)

: 

In [None]:
# R² of the model on the held-out test rows.
r2_test = metrics.r2_score(y_true=y_test, y_pred=lin_test)
print('R squared value:', r2_test)

: 

In [None]:
# Feature values for one record, already integer-encoded.
# NOTE(review): presumably listed in the same order as the training columns
# (x_train.columns) — confirm before changing the feature set.
input_data = (63, 0, 21.66, 1, 0, 1)

# Convert to a NumPy array and reshape to a single row (one instance).
input_array = np.asarray(input_data)
input_reshaped = input_array.reshape(1, -1)

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column labels. Passing the bare array makes scikit-learn warn that
# "X does not have valid feature names" and skips its column-name check.
input_frame = pd.DataFrame(input_reshaped, columns=x_train.columns)

prediction = model.predict(input_frame)
print('Insurance cost is:', prediction[0])

: 

### **Pickling the model**

In [None]:
# Serialize the fitted model to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails; the original left the handle
# returned by open() unclosed. `pickle` is imported in the notebook's top
# import cell.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

: 

In [None]:
# Restore the model saved above. The context manager closes the file handle,
# which the original left open.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

: 