# Modeling and Evaluation

## Objectives
Predict medical insurance charges using customer profile information.

## Inputs
- Processed customer dataset with feature engineering.

## Outputs
- Trained ML regression model.
- Feature importance ranking.

# Change working directory

In [None]:
import os
# Capture the current working directory; the next cell derives its parent from it.
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Step up one level: the parent of the recorded directory becomes the working directory.
# NOTE: this relies on `current_dir` from the earlier cell; re-running after
# `current_dir` has been refreshed moves up one more level.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory so the displayed value reflects the chdir above.
current_dir = os.getcwd()
current_dir

---

In [None]:
import warnings
# Ignore warnings of category FutureWarning so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Load Engineered Dataset

In [None]:
import pandas as pd
# Path is relative to the working directory set at the top of the notebook.
df_path = 'outputs/datasets/engineered/insurance_engineered.csv'
df = pd.read_csv(df_path)
# Preview the first rows of the loaded frame.
df.head()

Stop displaying warnings messages

# ML Pipeline with all data

Since we have already done the feature engineering in the previous notebook, we can directly create the ML pipeline with all the data.

In [None]:
from sklearn.pipeline import Pipeline

# Feature Engineering
from feature_engine.selection import SmartCorrelatedSelection

def FullPipeline(model):
    """Build the two-step modelling pipeline around ``model``.

    Steps, in order:
      * ``correlation_filter`` - ``SmartCorrelatedSelection`` configured with
        Pearson correlation, a 0.8 threshold and variance-based selection.
      * ``model`` - the estimator passed in.

    Returns the (unfitted) ``Pipeline``.
    """
    correlation_filter = SmartCorrelatedSelection(
        method='pearson',
        threshold=0.8,
        selection_method='variance',
    )
    steps = [
        ('correlation_filter', correlation_filter),
        ('model', model),
    ]
    return Pipeline(steps)

---

## Hyperparameter Optimisation

**Hyperparameter Optimisation**

This is the process of tuning the hyperparameters of a machine learning model to improve its performance. It involves searching for the best combination of hyperparameters that yield the highest performance on a validation set.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
import numpy as np
import pandas as pd

class HyperparameterOptimizationSearch(BaseEstimator):
    """Grid-search several candidate models and summarise their CV scores.

    Every estimator in ``models`` is wrapped with ``FullPipeline`` and tuned
    with ``GridSearchCV`` using the grid stored under the same key in
    ``params``. The fitted searches are kept in ``grid_searches``.
    """

    def __init__(self, models, params):
        """
        models: dict of {model_name: estimator}
        params: dict of {model_name: param_grid}
        """
        self.models = models
        self.params = params
        # Filled by fit(): {model_name: fitted GridSearchCV}
        self.grid_searches = {}

    def fit(self, X, y, cv=5, n_jobs=-1, verbose=1, scoring=None):
        """Run one ``GridSearchCV`` per entry in ``models``.

        X, y: training features and target, passed to ``GridSearchCV.fit``.
        cv, n_jobs, verbose, scoring: forwarded unchanged to ``GridSearchCV``.

        Returns ``self``, following the scikit-learn estimator convention.
        Raises ``KeyError`` if a model name has no entry in ``params``.
        """
        for model_name, model in self.models.items():
            print(f"\nRunning GridSearchCV for {model_name}\n")

            # Wrap the bare estimator in the shared preprocessing pipeline
            pipeline = FullPipeline(model)

            # Grid registered for this model (keys use the 'model__' prefix)
            param_grid = self.params[model_name]

            gs = GridSearchCV(
                estimator=pipeline,
                param_grid=param_grid,
                cv=cv,
                n_jobs=n_jobs,
                verbose=verbose,
                scoring=scoring,
                return_train_score=True
            )

            gs.fit(X, y)
            self.grid_searches[model_name] = gs

        return self

    def score_summary(self, sort_by='mean_score'):
        """Tabulate per-candidate test scores across all fitted searches.

        sort_by: summary column used to order the rows, best (highest) first.

        Returns a tuple ``(summary, grid_searches)`` where ``summary`` is a
        DataFrame with one row per (estimator, parameter combination) holding
        the min / mean / max / std of the per-fold test scores followed by the
        parameter columns, and ``grid_searches`` is the dict of fitted
        ``GridSearchCV`` objects keyed by model name.
        """
        def row(name, scores, params):
            return pd.Series({
                'estimator': name,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
                **params
            })

        rows = []
        for name, gs in self.grid_searches.items():
            params_list = gs.cv_results_['params']
            # Use the number of folds actually run: `gs.cv` is whatever the
            # caller passed (int, None, splitter or iterable), so it is not
            # always usable with range(); `n_splits_` is always an int.
            splits = [
                gs.cv_results_[f'split{i}_test_score']
                for i in range(gs.n_splits_)
            ]
            all_scores = np.vstack(splits).T  # shape: (n_candidates, n_splits)

            rows.extend(
                row(name, s, p) for p, s in zip(params_list, all_scores)
            )

        df = pd.DataFrame(rows).sort_values(by=sort_by, ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        return df[columns + [c for c in df.columns if c not in columns]], self.grid_searches


## Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Target (y) is the transformed charge; both charge columns are kept out of
# the features (X) so the target cannot leak into the predictors.
target_column = 'charges_transformed'
y = df[target_column]
X = df.drop(columns=['charges', target_column])

# 80/20 train-test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

print(f"X_train shape: {X_train.shape}  y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}  y_test shape: {y_test.shape}")

## Grid Search CV - Sklearn

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Candidate regressors, keyed by the name used in the search summary.
# random_state is pinned where the estimator accepts it so runs are repeatable.
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

# Parameter grids, keyed by the same names as models_quick_search.
# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
params_quick_search = {
    # Empty grid: a single candidate with default settings
    "LinearRegression": {},
    # 3 * 2 * 2 = 12 candidates
    "DecisionTreeRegressor": {
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2]
    },
    # 2 * 3 * 2 * 2 * 1 = 24 candidates
    "RandomForestRegressor": {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2],
        'model__bootstrap': [True]
    },
    # 2 ** 6 = 64 candidates
    "GradientBoostingRegressor": {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.1, 0.2],
        'model__max_depth': [3, 5],
        'model__subsample': [0.8, 1.0],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2]
    },
    # 2 ** 5 = 32 candidates
    "XGBRegressor": {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.1, 0.2],
        'model__max_depth': [3, 5],
        'model__subsample': [0.8, 1.0],
        'model__colsample_bytree': [0.8, 1.0]
    },
}

**Run Grid Search CV**

In [None]:
# Grid Search
# Tune every candidate on the training set only, scoring with R2 over 5 folds
# and using all available cores (n_jobs=-1).
search = HyperparameterOptimizationSearch(
    models=models_quick_search,
    params=params_quick_search
)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

Check the results

In [None]:
# score_summary returns (summary DataFrame, dict of fitted GridSearchCV objects);
# rows are ordered by mean cross-validated score, highest first.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
print(grid_search_summary)

**Evaluate the Best Model**

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the evaluation metrics of ``pipeline`` for the train set, then the test set."""
    print("Model Evaluation \n")
    datasets = (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    )
    for heading, features, target in datasets:
        print(heading)
        regression_evaluation(features, target, pipeline)

def regression_evaluation(X, y, pipeline):
    """Print R2, MAE, MSE and RMSE of ``pipeline``'s predictions on ``X`` against ``y``.

    X: features passed to ``pipeline.predict``.
    y: true target values, on the same scale the pipeline predicts.
    pipeline: fitted estimator exposing ``predict``.

    Values are rounded with the built-in ``round`` instead of the ``.round``
    method: the method exists only on NumPy scalars, so the original call
    fails with ``AttributeError`` when a metric returns a plain Python float.
    """
    prediction = pipeline.predict(X)
    # Compute MSE once and reuse it for the RMSE line
    mse = mean_squared_error(y, prediction)
    print('R2 Score:', round(r2_score(y, prediction), 3))
    print('Mean Absolute Error:', round(mean_absolute_error(y, prediction), 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))
    print("\n")

# The summary was requested sorted by mean score (highest first), so its
# first row names the winning estimator.
top_row = grid_search_summary.iloc[0]
best_model = top_row['estimator']
print("Best Model:", best_model)

# Pipeline refitted by GridSearchCV with the best parameter combination
best_grid_search = grid_search_pipelines[best_model]
best_regressor_pipeline = best_grid_search.best_estimator_

regression_performance(X_train, y_train, X_test, y_test, best_regressor_pipeline)

Parameters for best model

In [None]:
# Parameter combination selected by the grid search for the winning model
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

The best regressor pipeline

In [None]:
# Despite the `clf` suffix this is a regression pipeline: the best estimator
# found by the grid search for the winning model.
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

In [None]:
pipeline_clf.steps

With these strong evaluation scores, you are already in excellent shape and an extensive search is not going to be necessary.

- High R² on test set (0.847) 
- Low gap between train/test scores — not overfitting
- Low error metrics — strong predictions
- Stable RMSE on train/test — very balanced model

---

## Assess feature importance

In [None]:
# Access the fitted model (final step) from the pipeline
# NOTE: the name and the plot title assume the winning model is XGBRegressor.
xgb_model = pipeline_clf.named_steps['model']
# Transform training set using the pipeline up to the model
X_transformed = pipeline_clf[:-1].transform(X_train)

# Take feature names from the transformed data, not from X_train: the
# correlation filter may drop columns, and the model's importances only
# cover the columns it was actually trained on.
feature_names = list(X_transformed.columns)

import matplotlib.pyplot as plt
import seaborn as sns

# Get importance values (one per feature seen by the model)
importances = xgb_model.feature_importances_

# Create DataFrame for plotting, most important feature first
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importance - XGBRegressor')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
print(importance_df)


---

## Evaluate Pipeline on Train and Test Sets

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

def regression_evaluation(X, y, pipeline):
    """Print R2, MAE, MSE and RMSE of ``pipeline``'s predictions on ``X`` against ``y``.

    X: features passed to ``pipeline.predict``.
    y: true target values, on the same scale the pipeline predicts.
    pipeline: fitted estimator exposing ``predict``.

    Values are rounded with the built-in ``round`` instead of the ``.round``
    method: the method exists only on NumPy scalars, so the original call
    fails with ``AttributeError`` when a metric returns a plain Python float.
    """
    prediction = pipeline.predict(X)
    # Compute MSE once and reuse it for the RMSE line
    mse = mean_squared_error(y, prediction)
    print('R2 Score:', round(r2_score(y, prediction), 3))
    print('Mean Absolute Error:', round(mean_absolute_error(y, prediction), 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))
    print("\n")

def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the evaluation metrics of ``pipeline`` for the train set, then the test set."""
    print("Model Evaluation \n")
    datasets = (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    )
    for heading, features, target in datasets:
        print(heading)
        regression_evaluation(features, target, pipeline)


In [None]:
# Report train and test metrics for the selected pipeline (on the transformed-charges scale used for y).
regression_performance(X_train, y_train, X_test, y_test, pipeline_clf)


We used R2 Score, MAE and RMSE to evaluate the model performance, since Confusion Matrix and Accuracy are not suitable for regression problems.

**Evaluation Conclusion**

The XGBRegressor demonstrates strong predictive performance and generalization capability in estimating medical insurance costs. Its test set R² score of 0.847 indicates that the model explains approximately 85% of the variance in insurance charges for unseen data, which is excellent for a real-world regression task. Low and consistent error across training and test sets shows good generalization with minimal overfitting. Feature importance analysis confirms which variables most influence the cost, helping the business better understand risk drivers.


# Push files to Repo

We will generate the following files
* Train set
* Test set
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* features importance plot

In [None]:
import joblib
import os

# Versioned output folder for everything this notebook exports.
version = "v2"
file_path = f"outputs/ml_pipelines/{version}"

# exist_ok=True makes re-running the cell harmless when the folder already
# exists, while genuine failures (e.g. permissions) still raise here instead
# of being printed and resurfacing later as a confusing write error.
os.makedirs(file_path, exist_ok=True)

## Train Set

In [None]:
# Sanity-check the training features before exporting them.
print(X_train.shape)
X_train.head()

In [None]:
# index=False: the row index is not written to the file.
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train

In [None]:
# Training target (transformed charges), written without the row index.
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test Set

In [None]:
# Sanity-check the held-out features before exporting them.
print(X_test.shape)
X_test.head()

In [None]:
# index=False: the row index is not written to the file.
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test

In [None]:
# Held-out target (transformed charges), written without the row index.
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

## ML Pipelines: Feature Engineering and Modelling

Pipeline responsible for correlated-feature selection and the model

In [None]:
pipeline_clf

In [None]:
# Persist the fitted pipeline (correlation filter + model) for reuse outside this notebook.
joblib.dump(value=pipeline_clf ,
            filename=f"{file_path}/clf_pipeline_model.pkl")

In [None]:
from sklearn.preprocessing import PowerTransformer

# Fit a Yeo-Johnson transformer on the raw `charges` column.
# NOTE(review): this is fitted on the full dataset (train and test rows), and
# `y_transformed` is not used anywhere in the visible code. Presumably the
# fitted transformer is saved so predictions can be mapped back to the
# original charges scale - confirm it reproduces the `charges_transformed`
# column created in the previous notebook.
pt = PowerTransformer(method='yeo-johnson')
y_transformed = pt.fit_transform(df[['charges']])

In [None]:
# Persist the fitted target transformer next to the model pipeline.
joblib.dump(pt, f"{file_path}/power_transformer.pkl")

## Feature Importance Plot and CSV

In [None]:
# Redraw the importance chart on an explicit figure/axes pair so it can be saved
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=importance_df,
    x='Importance',
    y='Feature',
    palette='viridis',
    ax=ax,
)
ax.set_title('Feature Importance - XGBRegressor')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()

# Save to PNG (before show, while the figure is still current)
fig.savefig(f"{file_path}/feature_importance_xgb.png", dpi=300)
plt.show()

In [None]:
# Export the ranked importances (sorted descending when importance_df was built).
importance_df.to_csv(f"{file_path}/feature_importance_xgb.csv", index=False)