In [1]:
import numpy as np
import pandas as pd
from constants import Columns, ModelConstants, Resample
from utils import TransactionDataset, get_cross_validation_results, TuneHyperParams, get_final_model_performance
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
#from sklearn.pipeline import Pipeline
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline #Keep in mind this is a diffrent package then SKLearn

In [2]:
# Load the transaction dataset (the original note says it is fetched from GitHub)
# and keep the train/test split object that the following cells read via
# `data.TRAINING` and `data.TESTING`.
data = (
    TransactionDataset()
    .get_training_test_split()
)

In [3]:
# Categorical columns, grouped by the encoder each group is routed to in the
# column transformer defined in the next cell.
ordinal_columns = [Columns.SPECIFIC_HOLIDAY]  # handled by OrdinalEncoder
onehot_columns = [Columns.CUSTOMER_TYPE]  # handled by OneHotEncoder

In [4]:
# Column-wise preprocessing used as the first pipeline step:
#   * columns in `onehot_columns`  -> one-hot encoded
#   * columns in `ordinal_columns` -> ordinal (integer) encoded
#   * every other column           -> passed through unchanged
#
# Both encoders are told explicitly what to do with categories they did not see
# during `fit`. With the scikit-learn defaults (handle_unknown="error") a category
# that is missing from a training fold raises ValueError as soon as it shows up
# in the validation fold or in the test set.
#   * OneHotEncoder: an unknown category becomes an all-zero indicator row.
#   * OrdinalEncoder: an unknown category is mapped to the sentinel -1, which
#     cannot collide with the 0..n-1 codes assigned to known categories.

ordinal_and_onehot_transformation = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), onehot_columns),
    (
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ordinal_columns,
    ),
    remainder="passthrough",
)

In [5]:
# Estimator and under-sampler that are plugged into the pipeline below.
#
# The forest is seeded so that bootstrapping and feature sub-sampling are
# repeatable: without a fixed `random_state` every kernel restart trains a
# different forest and the cross-validation / test scores cannot be reproduced.
# NOTE(review): if `ModelConstants` already exposes a project-wide seed, use that
# instead of this literal — TODO confirm.
RANDOM_STATE = 42

random_forest = RandomForestClassifier(random_state=RANDOM_STATE)

# Tomek-links under-sampling restricted to the majority class.
tomek_links = TomekLinks(sampling_strategy='majority')

In [6]:
# imbalanced-learn pipeline: encode columns -> under-sample -> random forest.
# The step names are what the "<step>__<param>" keys in the search grids refer to.
rf_pipeline = Pipeline(
    steps=[
        ("column_transforms", ordinal_and_onehot_transformation),
        ("tomek_links_undersampling", tomek_links),
        ("random_forest", random_forest),
    ]
)

In [7]:
# Fit the pipeline, with the forest still at its constructor settings, on the
# training split. `fit` returns the pipeline, which the notebook then displays.
rf_pipeline.fit(
    X=data.TRAINING.predictors,
    y=data.TRAINING.outcome,
)

In [11]:
# Hyperparameter search space for the randomized search, with the reason each
# parameter is included. Keys are assembled as "random_forest__<parameter>" so
# that every candidate is routed to the "random_forest" step of the pipeline.

random_params = {
    f"random_forest__{param_name}": candidates
    for param_name, candidates in {
        # True is expected to curb overfitting / variance; False is included to check whether it matters
        "bootstrap": [True, False],
        # Compare different split-quality functions
        "criterion": ["gini", "entropy", "log_loss"],
        # Depth caps 10, 20, ..., 100 (None, i.e. unlimited depth, is not among the candidates)
        "max_depth": list(range(10, 101, 10)),
        # Number of features considered when looking for the best split
        "max_features": ["log2", "sqrt"],
        # Minimum number of samples required at a leaf node
        "min_samples_leaf": [1, 2, 4, 6, 8, 10],
        # Minimum number of samples required to split an internal node
        "min_samples_split": [2, 5, 10, 15, 20, 25],
        # Number of trees: 100, 200, then 400..2000 in steps of 200 (many trees slow training down)
        "n_estimators": [100, 200, *range(400, 2001, 200)],
        # Ways of accounting for the imbalance in the decision class
        "class_weight": ["balanced", {0: 3, 1: 7}, {0: 2, 1: 8}, {0: 1, 1: 9}, None],
    }.items()
}

In [12]:
# Randomized hyperparameter search over `random_params`, fitted on the training
# split. The object returned by the last call in the chain is kept as `rgs`;
# later cells read `rgs.best_estimator_` from it.

rgs = (
    TuneHyperParams()
    .random_grid_search(rf_pipeline, random_params)
    .fit_model(data.TRAINING.predictors, data.TRAINING.outcome)
    .get_best_scores_and_params()
)

{'random_forest__n_estimators': 400, 'random_forest__min_samples_split': 5, 'random_forest__min_samples_leaf': 4, 'random_forest__max_features': 'sqrt', 'random_forest__max_depth': 80, 'random_forest__criterion': 'entropy', 'random_forest__class_weight': 'balanced', 'random_forest__bootstrap': True}
Best parameter (CV score: 0.676):


In [None]:
# Best-performing configuration found by the randomized search so far. Each value
# is wrapped in a one-element list, so the dict has the same shape as a search
# grid with a single candidate per parameter.

best_parameters = {
    f"random_forest__{param_name}": [best_value]
    for param_name, best_value in {
        "bootstrap": True,
        "criterion": "entropy",
        "max_depth": 80,
        "max_features": "sqrt",
        "min_samples_leaf": 4,
        "min_samples_split": 5,
        "n_estimators": 400,
        "class_weight": "balanced",  # the original note also mentions {0:3,1:7}
    }.items()
}

In [10]:
# Cross-validate the best estimator from the search on the training split to get
# a broader view of its performance across several metrics.

get_cross_validation_results(
    rgs.best_estimator_, data.TRAINING.predictors, data.TRAINING.outcome
)

fit_time: 14.8177738904953
score_time: 0.3107225656509399
test_accuracy: 0.8884671208282295
test_balanced_accuracy: 0.8343039605421859
test_f1: 0.6790784256296393


{'fit_time': array([14.82632637, 14.61016655, 14.85092878, 14.26845288, 14.21779251,
        15.44095039, 14.66646981, 14.76143217, 15.17474031, 15.36047912]),
 'score_time': array([0.42268491, 0.303056  , 0.32901955, 0.28854966, 0.31171274,
        0.30463958, 0.27675891, 0.29169297, 0.29803395, 0.28107738]),
 'test_accuracy': array([0.88843398, 0.89355169, 0.88536336, 0.89355169, 0.875     ,
        0.88114754, 0.90163934, 0.8954918 , 0.89139344, 0.87909836]),
 'test_balanced_accuracy': array([0.840718  , 0.82246256, 0.83357526, 0.83044609, 0.8267118 ,
        0.79816045, 0.86126725, 0.82275166, 0.85844869, 0.84849784]),
 'test_f1': array([0.68405797, 0.67901235, 0.6744186 , 0.68484848, 0.65340909,
        0.63975155, 0.71764706, 0.68125   , 0.70056497, 0.67582418])}

In [13]:
# Final evaluation of the best estimator from the search; the helper is handed
# both the training split and the held-out testing split.
get_final_model_performance(rgs.best_estimator_, data.TRAINING, data.TESTING)

Unnamed: 0,Test,Score
0,F1 Score,0.704192
1,Accuracy,0.898812
2,Balanced Accuracy,0.802371
3,Matthew's Correlation Coefficient,0.647093
