### Problem Statement 

###### m Decide the medical insurance charges using linear regression

## Import libraries 

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pickle
import json

In [3]:
import warnings
warnings.filterwarnings("ignore")

## Data Gathering

In [4]:
df = pd.read_csv("medical_insurance.csv")

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
df.shape

(1338, 7)

In [7]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
## There is no null value in the dataset 

## Data Preprocessing

## Feature Engineering

In [10]:
# Label Encoding - replacing the categorical columns with numbers

In [11]:
df["sex"] = df.sex.replace({"female":0,"male":1})

In [12]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [13]:
df.smoker = df.smoker.replace({"yes":1,"no":0})

In [14]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [15]:
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [16]:
# One Hot Encoding - It transforms categorical data into a format that machine learning models can easily understand and use

In [17]:
# need numeric columns - So the region values not follow the any order so we can apply one hot encoding to create the dumpy 

In [18]:
df["region"].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [19]:
# Step 1: Convert region column to int
df['region'] = df['region']

# Step 2: Apply get_dummies
df = pd.get_dummies(df, columns=['region']).astype(int)

In [20]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27,0,1,16884,0,0,0,1
1,18,1,33,1,0,1725,0,0,1,0
2,28,1,33,3,0,4449,0,0,1,0
3,33,1,22,0,0,21984,0,1,0,0
4,32,1,28,0,0,3866,0,1,0,0


## Model Training

In [21]:
x = df.drop("charges",axis=1)
y = df.charges

In [22]:
# test_size → how much data goes for testing (like 20% or 30%).
# random_state → just a number that makes sure the shuffle happens the same way every time, so you get the same split every time you run the code.

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 34) 

In [24]:
model = LinearRegression()

In [25]:
model.fit(x_train, y_train)

In [26]:
y_pred = model.predict(x_test)

## Model Evaluation 

#### Evaluation on test data

In [27]:
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
r2score_ = r2_score(y_test,y_pred)

print("Mean Squared Error: ",mse)
print("Mean Absolute Error: ", mae)
print("R2 Score: ", r2score_)

Mean Squared Error:  41579008.40299168
Mean Absolute Error:  4418.564907673939
R2 Score:  0.7442644417447485


#### Evaluation on train data

In [28]:
y_pred_train = model.predict(x_train)

In [29]:
mse = mean_squared_error(y_train,y_pred_train)
mae = mean_absolute_error(y_train,y_pred_train)
r2_score_ = r2_score(y_train,y_pred_train)


print("Mean Squared Error:",mse)
print("Mean Absolute Error:",mae)
print("R2 Score:",r2_score_)

Mean Squared Error: 35299245.58099208
Mean Absolute Error: 4100.6934497090515
R2 Score: 0.7519234373394956


In [30]:
# Training Expectation     R2 On Training Data       R2 on Testing     Overfit/UnderFit
# 89-95                         75                        70                UnderFit
# 80                            75                        73                Underfit
# 78                            76                        74                Ok ok fit
# 78                            77                        76                ok to good fit
# 78                            77                        65                Overfit
# 95                            77                        65                Underfit

## Feacture Selection 

In [31]:
# Feature Selection - is the process of choosing a subset of the most relevant features (variables) from a dataset to build a machine learning model

In [32]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges',
       'region_northeast', 'region_northwest', 'region_southeast',
       'region_southwest'],
      dtype='object')

#### Testing a single input 

In [33]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27,0,1,16884,0,0,0,1
1,18,1,33,1,0,1725,0,0,1,0
2,28,1,33,3,0,4449,0,0,1,0
3,33,1,22,0,0,21984,0,1,0,0
4,32,1,28,0,0,3866,0,1,0,0


#### User input testing

In [34]:
# 18	1	33	1	0	1725	0	0	1	0

In [35]:
age = 19
sex = "male"
bmi = 28
children = 0
smoker = "yes"
region = "southwest"

In [36]:
project_data = {"sex":{"female":0,"male":1},
               "smoker":{"yes":1,"no":0},
                "columns":list(x.columns)}

In [37]:
project_data

{'sex': {'female': 0, 'male': 1},
 'smoker': {'yes': 1, 'no': 0},
 'columns': ['age',
  'sex',
  'bmi',
  'children',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [38]:
# The data convert into the json format 

In [39]:
project_data["sex"][sex]

1

In [40]:
test_array = np.zeros(len(x.columns))
test_array

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [41]:
                      'region_northeast'    'region_northwest' 'region_southeast'   'region_southwest'
'region_northeast'           1                       0                 0                    0
'region_northwest'           0                       1                 0                    0
'region_southeast'           0                       0                 1                    0
'region_southwest'           0                       0                 0                    1

SyntaxError: invalid syntax (3954921756.py, line 2)

In [None]:
test_array[0] = age
test_array[1] = project_data["sex"][sex]
test_array[2] = bmi
test_array[3] = children
test_array[4] = project_data["smoker"][smoker]

test_array

In [None]:
test_series = pd.Series(test_array,index = x.columns)

In [None]:
test_series

In [None]:
column_names = x.columns

In [None]:
region = "southwest" # Input
region = "region_" + region
region

In [None]:
test_series["region_southwest"]

In [None]:
test_series[region] = 1

In [None]:
test_series

In [None]:
charges = model.predict([test_series])[0]
print(f"The insurance charges for the individual is {charges:0.2f}")

In [None]:
with open("Linear_model.pickle","wb") as file:
    pickle.dump(model,file)

In [None]:
with open("Project_data.json","w") as file:
    json.dump(project_data, file)

In [None]:
test_series2 = pd.Series(np.zeros(len(project_data["columns"])),index = project_data["columns"])

In [None]:
test_series2