In [1]:
#importing libraries
import numpy as np
import pandas as pd
# `plt` is the conventional alias for the pyplot interface; aliasing the bare
# `matplotlib` package instead would make calls such as plt.plot(...) fail.
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Read the insurance CSV from its raw GitHub URL and preview the first five rows
url = 'https://raw.githubusercontent.com/MAliHasnain/Insurance_data_Prediction/main/insurance.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,medicalCost
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Print a structural summary of the frame: row count, column names,
# non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          1338 non-null   int64  
 1   sex          1338 non-null   object 
 2   bmi          1338 non-null   float64
 3   children     1338 non-null   int64  
 4   smoker       1338 non-null   object 
 5   region       1338 non-null   object 
 6   medicalCost  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Encoding categorical variables
# Each step is guarded so that re-running this cell is a no-op instead of
# mapping the already-encoded 0/1 values to NaN or failing because the
# 'region' column has already been replaced by its dummy columns.
sex_codes = {'female': 0, 'male': 1}
smoker_codes = {'no': 0, 'yes': 1}
if df['sex'].isin(list(sex_codes)).any():
    df['sex'] = df['sex'].map(sex_codes)
if df['smoker'].isin(list(smoker_codes)).any():
    df['smoker'] = df['smoker'].map(smoker_codes)
if 'region' in df.columns:
    # dtype=int keeps the region_* dummy columns as numeric 0/1 values
    df = pd.get_dummies(df, columns=['region'], dtype=int)

# Calculating correlation coefficients
# The matrix is computed over the whole encoded frame so the region_* dummy
# columns take part; the six-column view used below is then sliced out of it.
base_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'medicalCost']
full_correlations = df.corr()
correlation_coefficients = full_correlations.loc[base_columns, base_columns].copy()

# Merging region correlations
# Filtering the six-column matrix would find no region_* columns and always
# produce 0, so the region columns are taken from the full matrix instead.
region_correlations = full_correlations.loc[base_columns].filter(like='region_')
merged_region_correlation = region_correlations.sum(axis=1)
# NOTE(review): summing signed correlations across the dummy columns lets
# positive and negative values cancel out; confirm this is the intended summary.

# Updating correlation coefficients
correlation_coefficients['region'] = merged_region_correlation

# Displaying correlation coefficients
print(correlation_coefficients['medicalCost'])

age            0.299008
sex            0.057292
bmi            0.198341
children       0.067998
smoker         0.787251
medicalCost    1.000000
Name: medicalCost, dtype: float64


In [5]:
# Selecting the top three predictors
# The target is removed by label before ranking, so the selection does not
# rely on its self-correlation (1.0) happening to sort into the first slot.
top_predictors = (
    correlation_coefficients['medicalCost']
    .drop('medicalCost')
    .abs()
    .nlargest(3)
    .index
)
top_predictors

Index(['smoker', 'age', 'bmi'], dtype='object')

In [6]:
# Fit one single-feature linear regression per top predictor, then predict on
# the same rows the model was fitted on.
medical_cost = df['medicalCost']

# Each predictor is reshaped into a one-column 2-D array, as the estimator expects
feature_columns = [df[top_predictors[i]].values.reshape(-1, 1) for i in range(3)]

# fit() returns the fitted estimator, so creation and fitting are chained
model1, model2, model3 = [
    LinearRegression().fit(column, medical_cost) for column in feature_columns
]

# Predictions of each model on its own predictor column
predictions1, predictions2, predictions3 = [
    fitted.predict(column)
    for fitted, column in zip((model1, model2, model3), feature_columns)
]

In [7]:
# Calculate performance metrics
# Every score compares the observed medicalCost with predictions made on the
# same rows the models were fitted on.
def _single_feature_scores(predicted):
    """Return (MSE, MAE, R-squared) of `predicted` against df['medicalCost']."""
    observed = df['medicalCost']
    return (
        mean_squared_error(observed, predicted),
        mean_absolute_error(observed, predicted),
        r2_score(observed, predicted),
    )


def _print_scores(header, mse, mae, r2):
    """Print one model's three metrics under the given header line."""
    print(header)
    print("MSE:", mse)
    print("MAE:", mae)
    print("R-squared:", r2)


mse1, mae1, r2_1 = _single_feature_scores(predictions1)
mse2, mae2, r2_2 = _single_feature_scores(predictions2)
mse3, mae3, r2_3 = _single_feature_scores(predictions3)

# Print performance metrics, separated by a blank line between models
_print_scores("Performance metrics for model 1:", mse1, mae1, r2_1)
print()
_print_scores("Performance metrics for model 2:", mse2, mae2, r2_2)
print()
_print_scores("Performance metrics for model 3:", mse3, mae3, r2_3)

Performance metrics for model 1:
MSE: 55720715.954185426
MAE: 5662.089609343061
R-squared: 0.6197648148218988

Performance metrics for model 2:
MSE: 133440978.61376347
MAE: 9055.14962050455
R-squared: 0.08940589967885804

Performance metrics for model 3:
MSE: 140777900.09850758
MAE: 9172.351145507564
R-squared: 0.03933913991786253


In [8]:
# Model 1: one linear regression on the three top predictors taken together.
# This rebinds `model1`, which previously held the first single-feature model.
y_model1 = df['medicalCost'].values
X_model1 = df[top_predictors].values
model1 = LinearRegression().fit(X_model1, y_model1)
predictions_model1 = model1.predict(X_model1)

# Score the predictions against the same rows used for fitting
mse_model1, mae_model1, r2_model1 = (
    metric(y_model1, predictions_model1)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the three metrics
print("Performance metrics for Model 1:")
for label, value in (("MSE:", mse_model1), ("MAE:", mae_model1), ("R-squared:", r2_model1)):
    print(label, value)

Performance metrics for Model 1:
MSE: 37005395.750507504
MAE: 4216.775692234115
R-squared: 0.7474771588119513


In [10]:
# Initializing and predicting fit Model 2:
# Model 2 uses every column except the target. This rebinds `model2`, which
# previously held the second single-feature model.
# to_numpy(dtype=float) guarantees a float feature matrix; plain .values can
# return a generic object array when the frame mixes integer, float and
# boolean (dummy) columns.
X_model2 = df.drop(columns='medicalCost').to_numpy(dtype=float)
y_model2 = df['medicalCost'].values
# NOTE(review): the feature matrix keeps every region_* dummy column; together
# they are collinear with the intercept, so individual coefficients are not
# uniquely determined (predictions are unaffected). Consider dropping one dummy.
model2 = LinearRegression()
model2.fit(X_model2, y_model2)
predictions_model2 = model2.predict(X_model2)

# Calculating performance metrics for Model 2 (on the same rows used for fitting):
mse_model2 = mean_squared_error(y_model2, predictions_model2)
mae_model2 = mean_absolute_error(y_model2, predictions_model2)
r2_model2 = r2_score(y_model2, predictions_model2)

#Printing Performance metrics
print("Performance metrics for Model 2:")
print("MSE:", mse_model2)
print("MAE:", mae_model2)
print("R-squared:", r2_model2)

Performance metrics for Model 2:
MSE: 36501893.00741544
MAE: 4170.886894163586
R-squared: 0.7509130345985207
