# __Medical Insurance Prediction__

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings("ignore")

### __STEP-1:__
_Load Data_

In [2]:
# Read the insurance dataset from disk and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="data/medical_insurance.csv")
df.head(n=3)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


### __STEP-2:__
_EDA_

1. Convert the "gender", "smoker", and "region" columns to numeric data types.

In [3]:
# Summarise column dtypes and non-null counts to see which columns still need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Encode the two binary categorical columns as integers.
# The mapped Series is assigned back to the frame rather than calling
# `replace(..., inplace=True)` on `df.gender` / `df.smoker`: an in-place call on
# an attribute-accessed column operates on a temporary object under pandas
# copy-on-write and would leave `df` unchanged.
df["gender"] = df["gender"].map({"male": 0, "female": 1})
df["smoker"] = df["smoker"].map({"no": 0, "yes": 1})

# `map` yields NaN for any label missing from the dictionaries; fail loudly here
# instead of passing incomplete features on to model training.
assert df[["gender", "smoker"]].notna().all().all(), "unmapped gender/smoker category"

# One-hot encode `region` into integer indicator columns named region_<value>.
df = pd.get_dummies(df, columns=["region"], dtype=int)

### __STEP-3:__
_Model Training_

In [5]:
# Feature matrix: every column except the prediction target.
x = df.drop(columns=["charges"])
# Target vector: the insurance charges column.
y = df["charges"]

In [6]:
# Send 80% of the rows to the training set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=100,
)

In [7]:
# Fit an ordinary linear regression on the training split; `fit` returns the
# estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
lr

In [8]:
# Report the fitted coefficients (one per feature column) and the intercept.
slope_section = f"{'*'*10} Slope Of The Lines:{'*'*10}\n{lr.coef_}"
intercept_section = f"{'*'*11} Intercept Value:{'*'*12}\n{lr.intercept_}"
print(f"{slope_section}\n\n{intercept_section}")

********** Slope Of The Lines:**********
[ 2.63750608e+02 -1.26152749e+01  3.11571755e+02  4.73045625e+02
  2.34931453e+04  6.09598800e+02  2.05975445e+02 -3.07731627e+02
 -5.07842618e+02]

*********** Intercept Value:************
-11946.315221945957


### __STEP-4:__
_Model Evaluation_

__1. Training Data__

In [9]:
# Score the model on the rows it was trained on.
y_train_pred = lr.predict(X_train)

# Compute MSE once and reuse it for RMSE.
train_mse = mean_squared_error(y_train, y_train_pred)
train_metrics = {
    "MSE": train_mse,
    "RMSE": np.sqrt(train_mse),
    "MAE": mean_absolute_error(y_train, y_train_pred),
    "R Squared": r2_score(y_train, y_train_pred),
}
for label, value in train_metrics.items():
    print(f"{label}: {value}")

MSE: 37656160.7908156
RMSE: 6136.461585540612
MAE: 4222.212645388819
R Squared: 0.7380636904176421


__2. Testing Data__

In [10]:
# Score the model on the held-out rows.
y_test_pred = lr.predict(X_test)

# Compute MSE once and reuse it for RMSE.
test_mse = mean_squared_error(y_test, y_test_pred)
test_metrics = {
    "MSE": test_mse,
    "RMSE": np.sqrt(test_mse),
    "MAE": mean_absolute_error(y_test, y_test_pred),
    "R Squared": r2_score(y_test, y_test_pred),
}
for label, value in test_metrics.items():
    print(f"{label}: {value}")

MSE: 32193193.042996
RMSE: 5673.904567667313
MAE: 3916.437775810605
R Squared: 0.7946968492150817


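__Conclusion:__ _The R² scores are only moderate on both the training (≈0.74) and test (≈0.79) sets, which suggests the linear model is underfitting the data._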

# __Testing User Inputs__

__Columns__

In [11]:
# Keep the training feature columns; the model was fitted on X_train, so this
# is the column order its inputs must follow.
tr = X_train.columns
tr

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

__Creating a Feature Row with Default Values of 0__

In [12]:
# Single-row feature matrix with one zero-initialised slot per training column.
# The width comes from `tr` rather than a literal 9, so the row stays in step
# with the model's feature set if the encoding ever changes.
user_info = np.zeros((1, len(tr)))
user_info

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])

__Encoding String Inputs__

In [13]:
# Integer codes for the categorical user inputs, keyed by feature name.
label_enc_data = {
    "gender": dict(male=0, female=1),
    "smoker": dict(yes=1, no=0),
}
label_enc_data

{'gender': {'male': 0, 'female': 1}, 'smoker': {'yes': 1, 'no': 0}}

__User Info__

In [14]:
# Example applicant; edit these values to score a different person.
age = 29
gender = "female"      # "male" or "female"
bmi = 21.85
children = 0
smoker = "yes"         # "yes" or "no"
region = "northeast"   # northeast, northwest, southeast or southwest

__Input Assignment__

In [15]:
# Start from a clean row so re-running this cell with a different region does
# not leave the previously selected region flag set to 1.
user_info[:] = 0

# Look each slot up by column name instead of a hard-coded position, so the
# assignments stay correct if the training column order changes.
user_info[0, tr.get_loc("age")] = age                                   # age
user_info[0, tr.get_loc("gender")] = label_enc_data["gender"][gender]   # gender
user_info[0, tr.get_loc("bmi")] = bmi                                   # bmi
user_info[0, tr.get_loc("children")] = children                         # children
user_info[0, tr.get_loc("smoker")] = label_enc_data["smoker"][smoker]   # smoker

# One-hot region flag; `get_loc` raises a KeyError naming the missing column
# when `region` is not one of the encoded regions.
user_info[0, tr.get_loc(f"region_{region}")] = 1                        # region

__Updated User Values__

In [16]:
# Display the populated feature row to verify the assignments above.
user_info

array([[29.  ,  1.  , 21.85,  0.  ,  1.  ,  1.  ,  0.  ,  0.  ,  0.  ]])

__Final Prediction Based on Trained Model__

In [17]:
# The model was fitted on a DataFrame, so pass the row with the same column
# labels. This lets scikit-learn check feature names and order at predict time
# instead of trusting a bare array.
user_features = pd.DataFrame(user_info, columns=tr)
insurance_charges = lr.predict(user_features)[0]
print(f"{age} years old {gender} with BMI {bmi:.1f}, having {children} children and smoking status is {smoker}, from region {region}.\nInsurance charges would be Rs.{insurance_charges:,.2f}.")

29 years old female with BMI 21.9, having 0 children and smoking status is yes, from region northeast.
Insurance charges would be Rs.26,600.42.
