In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
df = pd.read_csv('Medical-Insurance.csv')

In [5]:
df.head()

Unnamed: 0,Age,Gender,Body Mass Index,No. of child,smoke/non-smoker,Region,Insurance Price
0,19,1,27.9,0,1,3,16884.924
1,18,2,33.77,1,0,4,1725.5523
2,28,2,33.0,3,0,4,4449.462
3,33,2,22.705,0,0,1,21984.47061
4,32,2,28.88,0,0,1,3866.8552


### EDA

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               2772 non-null   int32  
 1   Gender            2772 non-null   int64  
 2   Body Mass Index   2772 non-null   float64
 3   No. of child      2772 non-null   int64  
 4   smoke/non-smoker  2772 non-null   object 
 5   Region            2772 non-null   int64  
 6   Insurance Price   2772 non-null   float64
dtypes: float64(2), int32(1), int64(3), object(1)
memory usage: 140.9+ KB


In [15]:
df['Age'].unique()

array(['19', '18', '28', '33', '32', '31', '46', '37', '60', '25', '62',
       '23', '56', '27', '52', '30', '34', '59', '63', '55', '22', '26',
       '35', '24', '41', '38', '36', '21', '48', '40', '58', '53', '43',
       '64', '20', '61', '44', '57', '29', '45', '54', '49', '47', '51',
       '42', '50', '39'], dtype=object)

In [14]:
df['Age'] = df['Age'].replace('?','42')

In [16]:
df['Age'] = df['Age'].astype(int)

In [20]:
df['smoke/non-smoker'].unique()

array(['1', '0', '?'], dtype=object)

In [26]:
delete_rows = df[df['smoke/non-smoker'] == '?'].index

In [27]:
df.drop(delete_rows, inplace=True)

In [29]:
df['smoke/non-smoker'].unique()

array(['1', '0'], dtype=object)

In [31]:
df['smoke/non-smoker'] = df['smoke/non-smoker'].astype(int)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2765 entries, 0 to 2771
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               2765 non-null   int32  
 1   Gender            2765 non-null   int64  
 2   Body Mass Index   2765 non-null   float64
 3   No. of child      2765 non-null   int64  
 4   smoke/non-smoker  2765 non-null   int32  
 5   Region            2765 non-null   int64  
 6   Insurance Price   2765 non-null   float64
dtypes: float64(2), int32(2), int64(3)
memory usage: 151.2 KB


In [34]:
df.drop(columns = ['Body Mass Index'], axis=1, inplace=True)

In [36]:
df.drop(columns = ['No. of child'], axis=1, inplace=True)

In [37]:
df.head()

Unnamed: 0,Age,Gender,smoke/non-smoker,Region,Insurance Price
0,19,1,1,3,16884.924
1,18,2,0,4,1725.5523
2,28,2,0,4,4449.462
3,33,2,0,1,21984.47061
4,32,2,0,1,3866.8552


In [39]:
df.isnull().sum()

Age                 0
Gender              0
smoke/non-smoker    0
Region              0
Insurance Price     0
dtype: int64

## Train Test Split

In [40]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [41]:
X.head()

Unnamed: 0,Age,Gender,smoke/non-smoker,Region
0,19,1,1,3
1,18,2,0,4
2,28,2,0,4
3,33,2,0,1
4,32,2,0,1


In [42]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: Insurance Price, dtype: float64

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)

In [47]:
X_test

Unnamed: 0,Age,Gender,smoke/non-smoker,Region
1995,30,2,1,3
1223,20,1,1,4
1392,46,1,0,4
2569,48,1,0,2
1341,19,2,0,3
...,...,...,...,...
2337,51,2,1,4
1701,52,2,0,2
103,61,1,1,4
321,26,1,0,2


## Feature Scaling

In [48]:
scaler = StandardScaler()

In [49]:
X_train_scaled = scaler.fit_transform(X_train)

In [50]:
X_train_scaled

array([[ 0.99783225,  0.97587854, -0.5088852 ,  1.27785004],
       [-0.86158933,  0.97587854, -0.5088852 ,  1.27785004],
       [ 0.64025117, -1.02471769, -0.5088852 , -0.49154232],
       ...,
       [-0.93310555, -1.02471769, -0.5088852 , -1.37623849],
       [ 1.64147818,  0.97587854,  1.96507976, -1.37623849],
       [ 1.2838971 ,  0.97587854, -0.5088852 ,  0.39315386]])

In [51]:
X_test_scaled = scaler.fit_transform(X_test)

In [52]:
X_test_scaled

array([[-0.65160483,  1.02379107,  2.01826522,  0.37112258],
       [-1.34296605, -0.9767618 ,  2.01826522,  1.25573803],
       [ 0.45457313, -0.9767618 , -0.49547502,  1.25573803],
       ...,
       [ 1.49161496, -0.9767618 ,  2.01826522,  1.25573803],
       [-0.92814931, -0.9767618 , -0.49547502, -0.51349288],
       [ 1.6298872 , -0.9767618 , -0.49547502,  0.37112258]])

## Model Building

In [53]:
model = LinearRegression()

In [54]:
model.fit(X_train_scaled, y_train)

In [55]:
y_pred = model.predict(X_test_scaled)

In [56]:
y_pred

array([30546.00106563, 27954.3235955 , 10453.86728124, 10760.74492658,
        3068.61202332, 13352.42239671,  7371.54413806, 38402.98996462,
        2903.75602758,  4570.28499843, 10068.01244293, 27194.03109195,
        3057.19485025,  6330.18763096, 11685.89342586, 27347.46991462,
        3057.19485025, 15318.92740309,  6832.25047708, 15086.51138745,
       14779.63374211,  3724.11369211,  3491.69767648,  9847.01360037,
       14779.63374211,  8075.69379477,  3749.9273339 , 12378.62590951,
        3338.25885381, 12801.71156266, 36594.43934418, 10068.01244293,
        8835.98629831,  3057.19485025, 10079.429616  , 12341.39509465,
        9154.28111672, 28414.64006351, 37710.25748098,  6678.81165441,
        2915.17320065,  7806.04696428, 27347.46991462, 10607.30610391,
       11685.89342586, 31227.3163762 , 11183.83057974, 28414.64006351,
        2915.17320065,  7218.10531539,  6727.45964233, 13891.71605768,
       13071.35839315,  2671.34001195,  6060.54080047, 13610.65205413,
      

## Model Performance

In [57]:
mse = mean_squared_error(y_test, y_pred)

In [58]:
mse

39692339.63558076

In [59]:
accuracy = r2_score(y_test, y_pred)

In [60]:
accuracy

0.723008920606866