In [1]:
from utils import (
    get_cross_validation_results,
    TransactionDataset,
    TuneHyperParams,
    get_final_model_performance
)

from constants import Columns, ModelConstants, ResamplingStrategy

In [2]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    FunctionTransformer,
    Normalizer,
    PowerTransformer,
    RobustScaler
)

from sklearn.compose import make_column_transformer, make_column_selector

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN

In [3]:
# Build the dataset wrapper, then ask it for the train/test partition.
# Later cells read `data.TRAINING` (with `.predictors` / `.outcome`) and
# `data.TESTING` from the returned object.
transaction_dataset = TransactionDataset()
data = transaction_dataset.get_training_test_split()

In [4]:
# log(1 + x) transform with expm1 as its inverse; check_inverse=False turns
# off the func/inverse_func round-trip validation.
log_transform = FunctionTransformer(
    func=np.log1p, inverse_func=np.expm1, check_inverse=False
)

# Explicit low-to-high ordering of the holiday-proximity levels. The levels
# are given as strings, so the column is expected to hold strings as well.
order_mapping = [['0', '0.2', '0.4', '0.6', '0.8', '1']]


def _build_column_transformer(numeric_transformer):
    """Build the shared preprocessing layout around one numeric transformer.

    Every candidate preprocessor in this notebook treats the categorical
    columns identically and differs only in how numeric columns are handled,
    so the common part lives here instead of being copy-pasted per candidate.

    Parameters
    ----------
    numeric_transformer : transformer
        Transformer applied to every column selected by
        ``make_column_selector(dtype_include="number")``.

    Returns
    -------
    ColumnTransformer
        One-hot encoding for ``Columns.CUSTOMER_TYPE``, ordinal encoding
        (using ``order_mapping``) for ``Columns.SPECIFIC_HOLIDAY``, and
        ``numeric_transformer`` for the numeric columns.
    """
    return make_column_transformer(
        (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
        (OrdinalEncoder(categories=order_mapping), [Columns.SPECIFIC_HOLIDAY]),
        (numeric_transformer, make_column_selector(dtype_include="number")),
    )


# Candidate preprocessors explored by the hyper-parameter search.
# NOTE(review): Normalizer rescales each sample (row) to unit norm rather than
# scaling each feature column - confirm that this is the intended behaviour.
norm_and_one_hot_transforms = _build_column_transformer(Normalizer())

# NOTE(review): log1p assumes the numeric features are > -1 - confirm.
log_and_ordinal_transforms = _build_column_transformer(log_transform)

power_transforms = _build_column_transformer(PowerTransformer())

robust_transformation = _build_column_transformer(RobustScaler())

In [5]:
# Base classifier used as the last pipeline step. Penalty, solver and C are
# left at their defaults here because the tuning grid overrides them.
#
# warm_start is intentionally left at its default (False): it has no effect
# with the liblinear solver, and with saga it would make a re-fit of the same
# estimator instance start from the previous coefficients, so results could
# depend on how often a cell was executed.
regression_model = LogisticRegression(
    random_state=ModelConstants.RANDOM_STATE,
    # Ten times the project-wide iteration budget.
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
)

In [6]:
# Both over-samplers share the project-wide seed so resampling is repeatable.
sampler_seed = {"random_state": ModelConstants.RANDOM_STATE}

smote_sampling = SMOTE(**sampler_seed)
adasyn_sampling = ADASYN(**sampler_seed)

In [7]:
# Baseline pipeline handed to the hyper-parameter search:
# preprocess columns -> over-sample -> fit the classifier.
# The step names are the prefixes used by the keys of the tuning grid.
pipeline_steps = [
    ("column_transformation", log_and_ordinal_transforms),
    ("sampling", smote_sampling),
    ("logistic_regression", regression_model),
]

smote_column_transforms_model = Pipeline(steps=pipeline_steps)

In [8]:
# Whole-step alternatives: each entry replaces the pipeline step of that name.
candidate_transformers = [
    log_and_ordinal_transforms,
    norm_and_one_hot_transforms,
    power_transforms,
    robust_transformation,
]
candidate_samplers = [smote_sampling, adasyn_sampling]

# Resampling targets: two named strategies plus a range of float values.
candidate_sampling_strategies = [
    ResamplingStrategy.MINORITY_ONLY,
    ResamplingStrategy.ALL,
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
]

# Search space keyed by pipeline step name ("<step>") or by a parameter of a
# step ("<step>__<parameter>").
tuning_params = {
    "column_transformation": candidate_transformers,
    "sampling": candidate_samplers,
    "sampling__sampling_strategy": candidate_sampling_strategies,
    "logistic_regression__penalty": ["l1", "l2"],
    "logistic_regression__solver": ["saga", "liblinear"],
    "logistic_regression__C": [0.1, 0.5, 1, 1.5, 2, 2.5, 3],
}

In [10]:
# Run the hyper-parameter search step by step: configure it with the baseline
# pipeline and the grid, fit it on the training split, then keep the winner.
# Fitting prints the selected parameters and the best CV score.
tuner = TuneHyperParams()
configured_search = tuner.random_grid_search(
    smote_column_transforms_model,
    tuning_params
)
fitted_search = configured_search.fit_model(
    data.TRAINING.predictors,
    data.TRAINING.outcome
)
best_model = fitted_search.get_best_model()


{'sampling__sampling_strategy': 0.3, 'sampling': ADASYN(random_state=123, sampling_strategy=0.3), 'logistic_regression__solver': 'liblinear', 'logistic_regression__penalty': 'l2', 'logistic_regression__C': 1.5, 'column_transformation': ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                 ['Customer_Type']),
                                ('ordinalencoder',
                                 OrdinalEncoder(categories=[['0', '0.2', '0.4',
                                                             '0.6', '0.8',
                                                             '1']]),
                                 ['SpecificHoliday']),
                                ('powertransformer', PowerTransformer(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x13fd3e080>)])}
Best parameter (CV score: 0.656):


In [11]:
# Cross-validate the selected model on the training split only. The result
# table is left as the cell's last expression so the notebook renders it.
training_split = data.TRAINING
cross_validation_summary = get_cross_validation_results(
    best_model,
    training_split.predictors,
    training_split.outcome,
)
cross_validation_summary

Unnamed: 0,Test,Score
0,fit_time,0.267738
1,score_time,0.01546
2,test_F1 Score,0.656153
3,test_Accuracy,0.881811
4,test_Balanced Accuracy,0.817002
5,test_Matthew's Correlation Coefficient,0.589814


In [12]:
# Final check of the selected model, passing both the training and the
# held-out testing split to the project helper. The score table is the cell's
# last expression so the notebook renders it.
final_performance = get_final_model_performance(
    best_model,
    data.TRAINING,
    data.TESTING,
)
final_performance

Unnamed: 0,Test,Score
0,F1 Score,0.669963
1,Accuracy,0.890619
2,Balanced Accuracy,0.789773
3,Matthew's Correlation Coefficient,0.605999
