In [1]:
import numpy as np
import pandas as pd
from constants import Columns, ModelConstants, Resample
from utils import TransactionDataset, get_cross_validation_results, TuneHyperParams
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

In [2]:
data = TransactionDataset().get_training_test_split()

Resample.no_resample
Resample.no_resample
Resample.no_resample


In [None]:
data.TRAINING.outcome.value_counts()

In [3]:
onehot_columns = [Columns.CUSTOMER_TYPE + 'potato']
ordinal_columns = [Columns.SPECIFIC_HOLIDAY]

In [5]:
simple_columns_transform = make_column_transformer(
    ( OneHotEncoder(), onehot_columns),
    remainder="passthrough"
)

ordinal_and_onehot_transformation = make_column_transformer(
    ( OneHotEncoder(), onehot_columns),
    ( OrdinalEncoder(), ordinal_columns),
    remainder="passthrough"
)

In [6]:
random_forest = RandomForestClassifier()

In [7]:
rf_pipeline_1 = Pipeline(
    [
        ("column_transforms", simple_columns_transform),
        ("random_forest", random_forest)
    ]
)

rf_pipeline_2 = Pipeline(
    [
        ("column_transforms", ordinal_and_onehot_transformation),
        ("random_forest", random_forest)
    ]
)

In [8]:
rf_pipeline_2.fit(data.TRAINING.predictors, data.TRAINING.outcome)

ValueError: A given column is not a column of the dataframe

In [None]:
#Choosing hyperparameters and justification

random_params = {
    'random_forest__bootstrap': [True, False], #Prevent overfitting and reduce variance
    'random_forest__criterion': ['gini', 'entropy', 'log_loss'], #Testing diffrent split functions
    'random_forest__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], #Testing optimal depth and prevent overfitting when set to None
    'random_forest__max_features': ['log2', 'sqrt'], #Number of features to consider when looking for the best split
    'random_forest__min_samples_leaf': [1, 2, 4], #Minimum number of samples required to be at a leaf node
    'random_forest__min_samples_split': [2, 5, 10], #Minimum number of samples required to split an internal node
    'random_forest__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], #Number of trees in the forest, a lot of trees can slow down the training process
    'random_forest__class_weight': ["balanced", {0:3,1:7}, {0:2,1:8}, {0:1,1:9}] #Takes into account the imbalance in the decision class
}

In [None]:
gs = (
    TuneHyperParams()
    .random_grid_search(
        rf_pipeline_1,
        random_params
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
    )

In [None]:
gs2 = (
    TuneHyperParams()
    .random_grid_search(
        rf_pipeline_2,
        random_params
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
    )

In [None]:
data.TESTING.outcome.value_counts() #.1855

In [None]:
data.TRAINING.outcome.value_counts()

In [None]:
get_cross_validation_results(
    gs2,
    data.TRAINING.predictors,
    data.TRAINING.outcome
)