<a href="https://colab.research.google.com/github/Jeniloza/test-python-demo/blob/main/DS_600_Flask_Project_HIPP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
!pip install pandasql

Data Collection & Analysis

In [None]:
# Read 'insurance.csv' (relative path, resolved against the working directory)
# into a pandas DataFrame.
insurance_dataset = pd.read_csv(filepath_or_buffer='insurance.csv')

In [None]:
# Preview the first five rows of the DataFrame.
insurance_dataset.head(n=5)

In [None]:
from pandasql import sqldf


def pysqldf(q):
    """Run the SQL query string ``q`` through pandasql.

    The notebook's global namespace is passed as the lookup environment, so a
    query can refer to DataFrames by their variable name.
    """
    return sqldf(q, globals())

In [None]:
# Select every row and column of insurance_dataset via SQL and display the result.
my_dataframe = pysqldf("select * from insurance_dataset")
my_dataframe

In [None]:
# number of rows and columns, shown as a (rows, columns) tuple
insurance_dataset.shape

In [None]:
# summary of the dataset: column names, non-null counts and dtypes
insurance_dataset.info()

Categorical Features:

1.   Sex
2.   Smoker
3.   Region

In [None]:
# Count the missing values in each column.
insurance_dataset.isna().sum()

Data Analysis

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the dataset
insurance_dataset.describe()

In [None]:
# Distribution of the 'age' column.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with a KDE overlay on a density scale is its documented replacement.
sns.set()  # apply seaborn's default theme to this and all later plots
plt.figure(figsize=(6, 6))
sns.histplot(insurance_dataset['age'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Age Distribution')
plt.show()

In [None]:
# Bar chart with the number of rows for each value of the 'sex' column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=insurance_dataset, x='sex', ax=ax)
ax.set_title('Sex Distribution')
plt.show()

In [None]:
# number of rows for each distinct value of 'sex'
insurance_dataset['sex'].value_counts()

In [None]:
# Distribution of the 'bmi' column.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with a KDE overlay on a density scale is its documented replacement.
plt.figure(figsize=(6, 6))
sns.histplot(insurance_dataset['bmi'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('BMI Distribution')
plt.show()

Normal BMI Range --> 18.5 to 24.9

In [None]:
# Bar chart with the number of rows for each value of the 'children' column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=insurance_dataset, x='children', ax=ax)
ax.set_title('Children')
plt.show()

In [None]:
# number of rows for each distinct value of 'children'
insurance_dataset['children'].value_counts()

In [None]:
# Bar chart with the number of rows for each value of the 'smoker' column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=insurance_dataset, x='smoker', ax=ax)
ax.set_title('smoker')
plt.show()

In [None]:
# number of rows for each distinct value of 'smoker'
insurance_dataset['smoker'].value_counts()

In [None]:
# Bar chart with the number of rows for each value of the 'region' column.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=insurance_dataset, x='region', ax=ax)
ax.set_title('region')
plt.show()

In [None]:
# number of rows for each distinct value of 'region'
insurance_dataset['region'].value_counts()

In [None]:
# Distribution of the 'charges' column (the regression target).
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with a KDE overlay on a density scale is its documented replacement.
plt.figure(figsize=(6, 6))
sns.histplot(insurance_dataset['charges'], kde=True, stat='density', kde_kws={'cut': 3})
plt.title('Charges Distribution')
plt.show()

Data Pre-Processing

Encoding the categorical features

In [None]:
# Encode the three categorical columns as integer codes in a single call.
# The nested mapping reads {column: {original label: integer code}}.
# Note that for 'smoker' the label 'yes' becomes 0 and 'no' becomes 1.
insurance_dataset.replace(
    {
        'sex': {'male': 0, 'female': 1},
        'smoker': {'yes': 0, 'no': 1},
        'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
    },
    inplace=True,
)

Splitting the Features and Target

In [None]:
# Target: the 'charges' column. Features: every other column.
Y = insurance_dataset['charges']
X = insurance_dataset.drop(columns=['charges'])

In [None]:
# Preview the feature matrix. A bare last expression is rendered as a rich
# table, unlike print(), which emits a plain-text dump of the frame.
X.head()

In [None]:
# Preview the first values of the target series instead of printing all of it.
Y.head()

Splitting the data into Training data & Testing Data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Shapes of the full feature matrix, the training split and the test split.
print(*(part.shape for part in (X, X_train, X_test)))

Model Training

Linear Regression

In [None]:
# create a new (not yet fitted) Linear Regression model
regressor = LinearRegression()

In [None]:
# fit the model on the training split
regressor.fit(X_train, Y_train)

Model Evaluation

In [None]:
# Predict the target for the rows the model was trained on.
training_data_prediction = regressor.predict(X_train)

In [None]:
# 2x2 correlation matrix between the actual training targets and the
# model's predictions for the same rows.
correlation_matrix = np.corrcoef(Y_train, training_data_prediction)
print(correlation_matrix)

# The off-diagonal entry is the correlation between the two series.
correlation_xy = correlation_matrix[0][1]
print(correlation_xy)

# Squaring the correlation coefficient gives an R squared value.
r_squared = correlation_xy ** 2
print(r_squared)

In [None]:
# R squared on the training split, computed by scikit-learn.
# The label names the split so this line can be told apart from the test score.
r2_train = metrics.r2_score(Y_train, training_data_prediction)
print('R squared value (train) : ', r2_train)

An R squared value close to 1 means the model fits the data well; a value close to 0 means the model explains little of the variation in the target.

In [None]:
# predict the target for the held-out test split
test_data_prediction = regressor.predict(X_test)

In [None]:
# R squared on the held-out test split.
# The label names the split so this line can be told apart from the training score.
r2_test = metrics.r2_score(Y_test, test_data_prediction)
print('R squared value (test) : ', r2_test)

Save model using joblib

In [None]:
# Persist the fitted model to 'mymodel.pkl' in the working directory.
# joblib is imported once, in the notebook's import cell.
joblib.dump(regressor, 'mymodel.pkl')

Building a Predictive System

In [None]:
# Reload the model saved in 'mymodel.pkl'.
# joblib is imported once, in the notebook's import cell.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
load_regressor = joblib.load('mymodel.pkl')

In [None]:
# Feature values for a single prediction.
# NOTE(review): the values must follow the column order of the training
# features and use the integer codes assigned in the encoding step;
# confirm the order against X.columns.
input_data = (24,1,19.3,0,1,3)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape to a single row: (1, n_features)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The model was fitted on a DataFrame, so predicting on a bare array makes
# scikit-learn warn that "X does not have valid feature names". Label the row
# with the names the model recorded at fit time; fall back to unlabeled
# columns on scikit-learn versions that do not store them.
feature_names = getattr(load_regressor, 'feature_names_in_', None)
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)

prediction = load_regressor.predict(input_frame)

print('The insurance cost is USD($)', prediction[0], 'per year')