In [1]:

import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Add the path to the script if it's not in the same directory
sys.path.append("../scripts")  # Adjust this path

from training import train_and_log_model, initialize_mlflow, explain_model_with_shap
from utils import count_group_contribution, create_bar_chart, create_grouped_bar_chart, create_boxplot
from data_prep import load_data, handle_missing_data, engineer_features, create_features

  from .autonotebook import tqdm as notebook_tqdm


### Load the data

In [2]:
file_path = "../data/MachineLearningRating_v3.txt"

data = load_data(file_path)


### Data Cleaning

This section focuses on ensuring the quality of the data through a series of cleaning steps.

1. **Identifying Missing Values:**
   The code identifies which columns have missing data. These missing values can impact the accuracy of our models, so we need to handle them carefully.

2.  **Addressing Missing Data:**
    *   Columns with substantial missing data (over 50%) are removed from the analysis because they don’t offer enough reliable information.
    *   Rows that contain missing values in columns that have very few missing values are removed to make sure we are working with a clean dataset.
    *   For columns with a moderate amount of missing values, the missing values are filled using the most frequent value to preserve the data's shape and reduce any impact on the model performance.


In [3]:
olumn_na_rations = data.isna().mean()
print(olumn_na_rations[olumn_na_rations > 0].sort_values(ascending=False) * 100)

NumberOfVehiclesInFleet    100.000000
CrossBorder                 99.930207
CustomValueEstimate         77.956560
WrittenOff                  64.183810
Converted                   64.183810
Rebuilt                     64.183810
NewVehicle                  15.327998
Bank                        14.594670
AccountType                  4.022806
Gender                       0.953507
MaritalStatus                0.825819
mmcode                       0.055195
VehicleType                  0.055195
make                         0.055195
VehicleIntroDate             0.055195
NumberOfDoors                0.055195
bodytype                     0.055195
kilowatts                    0.055195
cubiccapacity                0.055195
Cylinders                    0.055195
Model                        0.055195
CapitalOutstanding           0.000200
dtype: float64


#### 2) Addressing this missing values

In [4]:
data = handle_missing_data(data)
# Finally check for the missing values
column_na_rations = data.isna().mean()
print(column_na_rations[column_na_rations > 0].sort_values(ascending=False) * 100)

Series([], dtype: float64)


### Feature Engineering

Here, we create new features that may assist the model with predictions, using domain specific knowledge.

1. **Calculating Optimal Premium:**
   A new feature called `OptimalPremium` is introduced. It represents a premium value that ensures at least a break-even return for the company, based on the amount claimed and the original premium. We assume that a company wants to ensure that they have a positive return for each customer, and this feature allows them to do this effectively.

In [5]:
data = engineer_features(data)

In [6]:
#Check the result of the top 10 rows.
print(data[['TotalClaims', 'TotalPremium', 'OptimalPremium']].sort_values(by='TotalClaims' , ascending=False).head(10))

          TotalClaims  TotalPremium  OptimalPremium
451249  393092.105263    243.538333   393092.105263
601844  376432.491228    562.617807   376432.491228
818316  363343.421053   1065.027982   363343.421053
173451  304338.657895    818.206140   304338.657895
172766  302361.149123    825.392281   302361.149123
402798  286686.431053    783.867018   286686.431053
803349  269311.929825    806.181579   269311.929825
904510  265789.473684   1060.473070   265789.473684
920077  263157.632807    857.887193   263157.632807
710484  261307.017544   1261.964035   261307.017544


### Feature Encoding

This step transforms categorical features into numerical representations, as most machine learning models need numerical data.

Categorical features are transformed using Label Encoding to assign a numerical label for each category, this ensures that the model can work with the categorical data

In [7]:
X, y = create_features(data)

### Data Splitting

The dataset is split into training and testing sets, where 80% of the data will be used to train the model and 20% of the data will be used to evaluate the model's performance

In [8]:
# Split the data 80-20
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2)

### Training Models

Here, we implement various machine learning models.

1.  **Initializing MLflow:**
    An MLflow tracking URI and experiment are initialized to manage and keep track of model training and evaluation.

2.  **Training the models:**
      The code sets up several machine learning models, including Linear Regression, Decision Tree Regressor, Random Forest Regressor, and Gradient Boosting Regressor. It will then train each model and record the results in the MLflow server.

In [9]:
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
}

In [10]:
best_model = None
best_r2 = -float('inf')  # Initialize as negative infinity
best_mse = float('inf')  # Initialize MSE as positive infinity

In [11]:
# 1) Initialize mlflow tracking uri
tracking_uri = "notebook"
experiment_name = "Optimum Price"
tracking_id = initialize_mlflow(uri=tracking_uri, experiment_name=experiment_name)


In [12]:
# Train each model and log with MLflow
for model_name, model in models.items():
    print(f"Training {model_name}...")
    mse, r2, trained_model = train_and_log_model(model, model_name, tracking_id, x_train, y_train, x_test, y_test)

Training LinearRegression...


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 701.84it/s] 


Training RandomForestRegressor...


Downloading artifacts: 100%|██████████| 7/7 [00:05<00:00,  1.27it/s] 


Training GradientBoostingRegressor...


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 655.99it/s] 


Training DecisionTreeRegressor...


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 520.04it/s]


In [13]:
 # Compare based on r2 score (or you can change to mse)
if r2 > best_r2:
        best_r2 = r2
        best_mse = mse
        best_model = trained_model


# Print out the best model's results
print(f"Best model: {best_model.__class__.__name__}")
print(f"Best R2 Score: {best_r2}")
print(f"Best MSE: {best_mse}")

Best model: DecisionTreeRegressor
Best R2 Score: -0.11943651272459466
Best MSE: 5491871.41892026


In [19]:
### Explain the model
# Generate SHAP explanations for the best model
shap_values = explain_model_with_shap(best_model, x_train, x_test)