<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/medical_cost_charge_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository that contains the dataset
!git clone https://github.com/zaka-ai/medical-cost-prediction

# Make the repository's data/ folder the working directory for the rest of the notebook
%cd medical-cost-prediction/data/

In [None]:
# Import necessary libraries and functions
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_validate,train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR

In [None]:
# Load the dataset into a pandas DataFrame.
# The clone cell has already changed the working directory to the repository's
# data/ folder, so a relative path is enough; an absolute "/content/..." path
# would only exist on Colab.
path_to_data="insurance.csv"
data=pd.read_csv(path_to_data)
data.head()

In [None]:
# Record the largest charge in the data; it is reused later to turn normalized
# predictions back into charge units
max_charge = data.loc[:, 'charges'].max()
print(max_charge)

In [None]:
# Report how many rows and columns the dataset has
print("This data contains: {} rows and {} columns.".format(*data.shape))

In [None]:
# Count how many records fall into each value of the 'children' column
data.loc[:, 'children'].value_counts()

In [None]:
# Print the DataFrame summary produced by DataFrame.info()
data.info()

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Impute missing BMI values with the BMI column mean.
# The fill is restricted to the 'bmi' column: a bare fillna(scalar) would write the
# BMI mean into every column that has a gap, not just into 'bmi'.
data=data.fillna({'bmi': data['bmi'].mean()})

In [None]:
# Two side-by-side scatter plots: charges against BMI (left) and against age (right)
fig, (ax_bmi, ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: charges vs. BMI
ax_bmi.scatter(data['bmi'], data['charges'])
ax_bmi.set_title('Scatter of Charges by BMI')

# Right panel: charges vs. age
ax.scatter(data['age'], data['charges'])
ax.set_title('Scatter of Charges by Age')

In [None]:
# Partition the records into smokers and non-smokers
smoker_flag = data['smoker']
data_smoker = data[smoker_flag.eq("yes")]
data_non_smoker = data[smoker_flag.eq("no")]

In [None]:
# Compare the charge distributions of smokers (left) and non-smokers (right)
fig, (ax_smoker, ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of charges for smokers
ax_smoker.hist(data_smoker['charges'])
ax_smoker.set_title('Distribution of charges for smokers')

# Right panel: histogram of charges for non-smokers
ax.hist(data_non_smoker['charges'])
ax.set_title('Distribution of charges for non-smokers')

In [None]:
# Partition the records by the value of the 'sex' column
sex_label = data['sex']
data_male = data[sex_label.eq('male')]
data_female = data[sex_label.eq('female')]

In [None]:
# Compare the charge distributions of males (left) and females (right)
fig, (ax_male, ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of charges for males
ax_male.hist(data_male['charges'])
ax_male.set_title('Distribution of charges for males')

# Right panel: histogram of charges for females
ax.hist(data_female['charges'])
ax.set_title('Distribution of charges for females')

In [None]:
# Plot one histogram of charges per distinct number of children.
# The panels are derived from the values present in the data instead of a
# hard-coded count, so the figure stays correct if the set of values changes.
children_values = sorted(data['children'].dropna().unique())

# squeeze=False always returns a 2-D array of axes, even for a single panel;
# the first (only) row is taken so the panels can be iterated directly
fig, axs = plt.subplots(1, len(children_values), figsize=(20, 5), squeeze=False)
axs = axs[0]

# Iterate over the number of children
for panel, n_children in zip(axs, children_values):
    # Filter data for the current number of children
    data_children_i = data[data['children'] == n_children]

    # Plot the distribution of charges for the current number of children
    panel.hist(data_children_i['charges'])
    panel.set_title(f'Distribution of charges for {n_children} children')

# Adjust layout to prevent clipping of titles
plt.tight_layout()





In [None]:
# Distinct labels found in the 'region' column
unique_regions = data['region'].unique()

# One histogram panel per region, laid out in a single row
fig, ax = plt.subplots(1, len(unique_regions), figsize=(20, 5))

# Pair each panel with its region and draw that region's charge distribution
for region_axis, region in zip(ax, unique_regions):
    in_region = data['region'] == region
    region_axis.hist(data.loc[in_region, 'charges'])
    region_axis.set_title(f'Distribution of charges for {region}')

# Adjust layout to prevent clipping of titles
plt.tight_layout()

In [None]:
# Remove the 'region' and 'sex' columns, which are not used as model features
data = data.drop(columns=['region', 'sex'])

In [None]:
# Replace the smoker labels with integer codes learned by a LabelEncoder
encoder = LabelEncoder()
encoder.fit(data['smoker'])
data['smoker'] = encoder.transform(data['smoker'])

In [None]:
# Scale each column by dividing it by its own maximum value
data_max = data.max()
data = data / data_max

In [None]:
# Show summary statistics of the (now normalized) columns
data.describe()

In [None]:
# Build the feature matrix and target vector, then hold out 20% of the rows for testing.
# Features are selected by name (every column except 'charges') so the split does not
# silently depend on 'charges' being the last column of the frame.
x=data.drop(columns='charges').values
y=data['charges'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Compare three regressors with 3-fold cross-validation, scored by MSE and MAE.
# Only the training split is used, so the held-out test rows play no part in
# choosing the model.
cv_scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error']

# Linear Regression cross-validation
LR = LinearRegression()
LR_scores = cross_validate(LR, x_train, y_train, cv=3, scoring=cv_scoring)

# Support Vector Regression cross-validation.
# The instance gets its own name: binding it to `SVR` would overwrite the imported
# class and make this cell raise a TypeError when it is run a second time.
SVR_model = SVR()
SVR_scores = cross_validate(SVR_model, x_train, y_train, cv=3, scoring=cv_scoring)

# Random Forest Regression cross-validation.
# A fixed random_state makes the forest's scores reproducible between runs.
RFR = RandomForestRegressor(random_state=42)
RFR_scores = cross_validate(RFR, x_train, y_train, cv=3, scoring=cv_scoring)


In [None]:
# Print the mean cross-validated MAE and MSE of each model
models = ['LR', 'SVR', 'RFR']
scores = [LR_scores, SVR_scores, RFR_scores]

for position, label in enumerate(models):
    cv_result = scores[position]
    # The scorers return negated errors, so flip the sign back before reporting
    mean_mae = -cv_result['test_neg_mean_absolute_error'].mean()
    mean_mse = -cv_result['test_neg_mean_squared_error'].mean()
    print(label + ":")
    print("MAE:", mean_mae)
    print("MSE:", mean_mse)
    print("------------------------")


In [None]:
# Candidate values for each random-forest hyperparameter explored by the search
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search over every combination: 3-fold CV, negative-MSE scoring, n_jobs=-1
grid_search = GridSearchCV(
    RFR,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

# Run the search on the training split
grid_search.fit(x_train, y_train)

# Report the winning combination; best_score_ is a negated MSE, so flip its sign
print("Best Hyperparameters:", grid_search.best_params_)
print("Best MSE:", -grid_search.best_score_)


In [None]:
# Define the tuned model from the hyperparameters selected by the grid search.
# Reading best_params_ keeps this cell in sync with the search result instead of
# relying on values copied by hand; the fixed random_state makes the fit reproducible.
tuned_RFR=RandomForestRegressor(**grid_search.best_params_, random_state=42)

In [None]:
# Train the tuned random forest on the training split
tuned_RFR.fit(X=x_train, y=y_train)

In [None]:
# Check the MAE and the MSE on the held-out test data.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are symmetric,
# so the numbers are unchanged, but the documented order avoids wrong results if an
# asymmetric metric is added here later.
# The errors are in normalized units because 'charges' was divided by its maximum.
y_hat=tuned_RFR.predict(x_test)
print("MAE:",mean_absolute_error(y_test,y_hat))
print("MSE:",mean_squared_error(y_test,y_hat))

In [None]:
# Define a charge predictor function
def charge_predictor(input):
    """Predict charges in original units for one or more feature rows.

    Args:
        input: A single feature row (1-D sequence) or a batch of rows (2-D
            array-like). The values must be scaled the same way as the training
            features, i.e. each column divided by its maximum, because the model
            was fitted on the normalized data.

    Returns:
        A 1-D array with one predicted charge per row, converted back to charge
        units by multiplying the normalized prediction by ``max_charge``.
    """
    # predict() needs a 2-D array; promote a single 1-D sample to a one-row batch
    features = np.atleast_2d(input)
    return tuned_RFR.predict(features)*max_charge


In [None]:
# Save the fitted model to disk.
# scikit-learn estimators have no .save() method, and the fitted estimator in this
# notebook is `tuned_RFR`, so it is serialized with pickle.
# NOTE: only unpickle model files that come from a trusted source.
with open('tuned_model.pkl', 'wb') as model_file:
    pickle.dump(tuned_RFR, model_file)