Step 1: Load and Preprocess the Data

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, StandardScaler 

#load dataset 
df_bmi = pd.read_excel("../Data/Medibuddy_BMI.xlsx") 
df_region =pd.read_excel("../Data/Medibuddy_Region.xlsx") 

#Merge 
df = pd.merge(df_bmi, df_region, on="Policy no.") 

#drop policy no since it is not relevant 
df.drop(columns=["Policy no."], inplace=True) 

#Encode categorical varivable  
encoder = LabelEncoder() 
df["sex"] = encoder.fit_transform(df["sex"]) 
df["region"] = encoder.fit_transform(df["region"])  
df["smoker"] = encoder.fit_transform(df["smoker"])
#feature selection 
x = df.drop(columns=["charges in INR"]) 
y = df["charges in INR"] 

#split data into 80 20 ratio 
x_train,x_test,y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=42) 

#scale numerical features 
scaler = StandardScaler() 
x_train =scaler.fit_transform(x_train) 
x_test =scaler.fit_transform(x_test)


In [8]:
# Convert scaled arrays back to DataFrames to retain column names
t_x_train = pd.DataFrame(x_train, columns=x.columns)
t_x_test = pd.DataFrame(x_test, columns=x.columns)

# Check column names
print("x_train columns:", t_x_train.columns)
print("x_test columns:", t_x_test.columns)

x_train columns: Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')
x_test columns: Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')


Step 2: Train the Machine Learning Model

In [9]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score 

model = RandomForestRegressor(n_estimators=100, random_state=42) 

model.fit(x_train, y_train) 

y_pred = model.predict(x_test) 

#Evaliate the model 

mae = mean_absolute_error(y_test,y_pred) 
mse = mean_squared_error(y_test, y_pred) 
r2 = r2_score(y_test, y_pred) 

print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, R2 Score: {r2:.2f}")

MAE: 2721.20, MSE: 21459598.88, R2 Score: 0.86


Step 3: Hyperparameter Tuning with GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV 

#Define hyperparameter grid 
param_grid = {
    'n_estimators': [50, 100, 200], 
    'max_depth':[None, 10, 20], 
    'min_samples_split':[2, 5, 10]
} 

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv= 5, n_jobs=-1)  
grid_search.fit(x_train,y_train) 

best_model = grid_search.best_estimator_ 
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}


In [2]:
# Save the best model
import joblib
joblib.dump(best_model, "best_model3.pkl") 
