In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRFRegressor
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

# Standardization object
sc = StandardScaler()

# Load dataset
dataset = pd.read_csv('insurance.csv')
dataset.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [2]:
# Identify categorical and numerical columns
categorical_variable = dataset.select_dtypes(include='object').columns
numerical_variable = dataset.select_dtypes(include='int64').columns

print(categorical_variable)
print(numerical_variable)

# Encoding categorical features
dataset['sex'] = dataset['sex'].map({'male': 0, 'female': 1})
dataset['smoker'] = dataset['smoker'].map({'yes': 1, 'no': 0})
dataset['region'] = dataset['region'].map({'northwest': 0, 'northeast': 1, 'southeast': 2, 'southwest': 3})


Index(['sex', 'smoker', 'region'], dtype='object')
Index(['age', 'children'], dtype='object')


In [3]:
# Features (X) and target (y)
X = dataset.drop(columns='charges')
y = dataset['charges']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print shapes
print("Size of X_train:", X_train.shape)
print("Size of y_train:", y_train.shape)
print("Size of X_test:", X_test.shape)
print("Size of y_test:", y_test.shape)


Size of X_train: (936, 6)
Size of y_train: (936,)
Size of X_test: (402, 6)
Size of y_test: (402,)


In [4]:
# Initialize and train XGBoost Random Forest Regressor
regressor_xgb = XGBRFRegressor()
regressor_xgb.fit(X_train, y_train)

# Predict
y_pred_xgb = regressor_xgb.predict(X_test)

# Evaluation
r2_score_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

# Print results
print("R2 score (test): {:.3f}".format(r2_score_xgb))
print("MAE : {:.3f}".format(mae_xgb))
print("RMSE : {:.3f}".format(rmse_xgb))


R2 score (test): 0.875
MAE : 2501.163
RMSE : 4283.380


In [5]:
# Save model and scaler using pickle
import pickle

# Save the trained model
with open('training_model.sav', 'wb') as model_file:
    pickle.dump(regressor_xgb, model_file)

# Save the fitted scaler
with open('scaler.sav', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)
