In [20]:
import numpy as np
import pandas as pd
from constants import Columns, ModelConstants
from utils import TransactionDataset, TuneHyperParams, FinalModelPerformance
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline
from joblib import dump

In [12]:
#Getting the dataset from GitHub and splitting the data

data = TransactionDataset().get_training_test_split()

In [13]:
onehot_columns = [Columns.CUSTOMER_TYPE]
ordinal_columns = [Columns.SPECIFIC_HOLIDAY]

In [14]:
#Pre-Define the columns transformation that we want to make

ordinal_and_onehot_transformation = make_column_transformer(
    (OneHotEncoder(), onehot_columns),
    (OrdinalEncoder(), ordinal_columns),
    remainder="passthrough"
)

In [15]:
#Pre-Defining the model that we want to use and the over/under-sampling methods

random_forest = RandomForestClassifier(random_state=ModelConstants.RANDOM_STATE)
tomek_links = TomekLinks(sampling_strategy='majority')

In [16]:
rf_pipeline = Pipeline(
    [
        ("column_transforms", ordinal_and_onehot_transformation),
        ("Tomek_Links_UnderSampling", tomek_links),
        ("random_forest", random_forest)
    ]
)

In [17]:
#Choosing hyperparameters and justification

random_params_rf = {
    'random_forest__bootstrap': [True, False], # Prevent over-fitting and reduce variance if set to True, check if there is a difference
    'random_forest__criterion': ['gini', 'entropy', 'log_loss'], # Testing different split functions
    'random_forest__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], # Testing optimal depth and prevent over-fitting when set to None
    'random_forest__max_features': ['log2', 'sqrt'], # Number of features to consider when looking for the best split
    'random_forest__min_samples_leaf': [1, 2, 4, 6, 8, 10], # Minimum number of samples required to be at a leaf node
    'random_forest__min_samples_split': [2, 5, 10, 15, 20, 25], # Minimum number of samples required to split an internal node
    'random_forest__n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], # Number of trees in the forest, a lot of trees can slow down the training process
    'random_forest__class_weight': ["balanced", {0:3,1:7}, {0:2,1:8}, {0:1,1:9}, None] # Takes into account the imbalance in the decision class
}

In [18]:
#Running a Random Search on the pipeline with the above selected parameters and fitting/evaluating the performance on the training data

best_model_rf = (
    TuneHyperParams()
    .random_grid_search(
        rf_pipeline,
        random_params_rf
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_model()
)

{   'random_forest__bootstrap': True,
    'random_forest__class_weight': {0: 3, 1: 7},
    'random_forest__criterion': 'gini',
    'random_forest__max_depth': 60,
    'random_forest__max_features': 'log2',
    'random_forest__min_samples_leaf': 8,
    'random_forest__min_samples_split': 20,
    'random_forest__n_estimators': 400}
Best parameter (CV score: 0.680):


In [19]:
#Cross-validating the best model on the training data to get a better overall look on the performance using different performance metrics

final_model = FinalModelPerformance(
    model=best_model_rf,
    data=data
)

final_model.get_cross_validation_results()

Unnamed: 0,Metric for Training Set,Score
0,fit_time mean score,2.57863
1,score_time mean score,0.116947
2,5-fold CV F1 Score mean score,0.679953
3,5-fold CV Accuracy mean score,0.891131
4,5-fold CV Balanced Accuracy mean score,0.829771
5,5-fold CV Matthew's Correlation Coefficient me...,0.618315


In [3]:
final_model.get_final_model_performance()

Unnamed: 0,Metric for Testing Set,Score
0,F1 Score,0.701235
1,Accuracy,0.90086
2,Balanced Accuracy,0.807434
3,Matthew's Correlation Coefficient,0.643525


In [21]:
dump(final_model.model, "./models_exports/random_forest_classifier.joblib")

['./models_exports/random_forest_classifier.joblib']