In [1]:
from utils import (
    FinalModelPerformance,
    TransactionDataset,
    TuneHyperParams,
)

from constants import Columns, ModelConstants, ResamplingStrategy

In [16]:
from pathlib import Path

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    FunctionTransformer,
    Normalizer,
    PowerTransformer,
    RobustScaler
)
from sklearn.compose import make_column_transformer, make_column_selector

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, NearMiss

from joblib import dump


In [3]:
# Obtain the training/test split; later cells read data.TRAINING.predictors
# and data.TRAINING.outcome from the returned object.
transaction_dataset = TransactionDataset()
data = transaction_dataset.get_training_test_split()

In [4]:
# Element-wise log(1 + x), with expm1 registered as the inverse. The
# round-trip inverse check is switched off via check_inverse=False.
log_transform = FunctionTransformer(
    func=np.log1p, inverse_func=np.expm1, check_inverse=False
)

# Explicit level order used to ordinal-encode Columns.SPECIFIC_HOLIDAY.
# The levels are strings, listed here from '0' up to '1'.
order_mapping = [['0', '0.2', '0.4', '0.6', '0.8', '1']]


def _build_column_transforms(numeric_transformer):
    """Build the shared column transformer around one numeric transformer.

    Every preprocessing variant in this notebook uses the same two
    categorical encoders and differs only in how numeric columns are
    handled, so the common part lives here instead of being copy-pasted.

    Parameters
    ----------
    numeric_transformer : transformer
        Applied to every column selected by
        ``make_column_selector(dtype_include="number")``.

    Returns
    -------
    ColumnTransformer
        One-hot encoding of ``Columns.CUSTOMER_TYPE``, ordinal encoding of
        ``Columns.SPECIFIC_HOLIDAY`` (ordered by ``order_mapping``), and
        ``numeric_transformer`` on the numeric columns. ``remainder`` is left
        at the library default.
    """
    return make_column_transformer(
        (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
        (OrdinalEncoder(categories=order_mapping), [Columns.SPECIFIC_HOLIDAY]),
        (numeric_transformer, make_column_selector(dtype_include="number")),
    )


# NOTE(review): sklearn's Normalizer rescales each sample (row) rather than
# each feature (column) - confirm that is the intended numeric treatment.
norm_and_one_hot_transforms = _build_column_transforms(Normalizer())

log_and_ordinal_transforms = _build_column_transforms(log_transform)

power_transforms = _build_column_transforms(PowerTransformer())

robust_transformation = _build_column_transforms(RobustScaler())

In [5]:
# Base classifier used as the "logistic_regression" pipeline step. Penalty,
# solver, C and class_weight are not set here because they are supplied by
# the tuning dictionaries further down.
#
# warm_start is deliberately left at its default (False): scikit-learn
# documents it as useless for the liblinear solver, and for other solvers it
# would make a second fit() on the same instance start from the previous
# coefficients, so results would depend on execution history.
regression_model = LogisticRegression(
    random_state=ModelConstants.RANDOM_STATE,
    # Iteration budget is ten times the project-wide MAX_ITERATIONS constant.
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
)

In [6]:
# Both over-samplers are seeded with the project-wide RANDOM_STATE.
smote_oversampling, adasyn_oversampling = (
    sampler_cls(random_state=ModelConstants.RANDOM_STATE)
    for sampler_cls in (SMOTE, ADASYN)
)

# Under-samplers are built with the library defaults.
near_miss_under_sampling = NearMiss()
tomek_under_sampling = TomekLinks()

In [8]:
# Baseline pipeline handed to the hyper-parameter search. The step names are
# the prefixes used by the tuning dictionaries ("column_transformation",
# "over_sampling", "under_sampling", "logistic_regression").
pipeline_steps = [
    ("column_transformation", log_and_ordinal_transforms),
    ("over_sampling", smote_oversampling),
    ("under_sampling", tomek_under_sampling),
    ("logistic_regression", regression_model),
]
column_transforms_only_model = Pipeline(pipeline_steps)

In [9]:
# Candidate values for the "logistic_regression" step of the pipeline.
tune_logistic_regression_penalty = dict(logistic_regression__penalty=["l1", "l2"])
tune_logistic_regression_solver = dict(logistic_regression__solver=["saga", "liblinear"])
tune_logistic_regression_C = dict(logistic_regression__C=[0.1, 0.5, 1, 1.5, 2, 2.5, 3])

# (weight for class 1, weight for class 0) pairs. Kept as literals rather than
# computed as 1 - w so the floats are exactly the ones written here.
class_weight_pairs = [
    (0.55, 0.45),
    (0.6, 0.4),
    (0.65, 0.35),
    (0.7, 0.3),
    (0.75, 0.25),
    (0.8, 0.2),
]
tune_logistic_regression_class_weight = dict(
    logistic_regression__class_weight=[
        None,
        "balanced",
        *({1: weight_1, 0: weight_0} for weight_1, weight_0 in class_weight_pairs),
    ]
)

# Whole-step alternatives: each list entry replaces the named pipeline step.
tune_column_transformation = dict(
    column_transformation=[
        log_and_ordinal_transforms,
        norm_and_one_hot_transforms,
        power_transforms,
        robust_transformation,
    ]
)

tune_over_sampling = dict(
    over_sampling=[smote_oversampling, adasyn_oversampling]
)

# sampling_strategy candidates for whichever over-sampler is in the pipeline:
# two named strategies from ResamplingStrategy plus a range of float values.
tune_over_sampling__sampling_strategy = dict(
    over_sampling__sampling_strategy=[
        ResamplingStrategy.MINORITY_ONLY,
        ResamplingStrategy.ALL,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
    ]
)

# "passthrough" is also a candidate for the under-sampling step.
tune_under_sampling = dict(
    under_sampling=[
        near_miss_under_sampling,
        tomek_under_sampling,
        "passthrough",
    ]
)


# Search space 1: every tunable, including the choice of over-sampler and its
# sampling_strategy.
tuning_params_1 = {
    **tune_column_transformation,
    **tune_logistic_regression_class_weight,
    **tune_logistic_regression_penalty,
    **tune_logistic_regression_solver,
    **tune_logistic_regression_C,
    **tune_over_sampling,
    **tune_over_sampling__sampling_strategy,
    **tune_under_sampling,
}

# Search space 2: same as above but without the two over-sampling entries, so
# the pipeline's own "over_sampling" step is left as configured.
tuning_params_2 = {
    **tune_column_transformation,
    **tune_logistic_regression_class_weight,
    **tune_logistic_regression_penalty,
    **tune_logistic_regression_solver,
    **tune_logistic_regression_C,
    **tune_under_sampling,
}


In [10]:
# Both search spaces are passed to random_grid_search together.
search_spaces = [tuning_params_1, tuning_params_2]

hyper_param_search = TuneHyperParams().random_grid_search(
    column_transforms_only_model,
    search_spaces,
)

# Fit on the training partition only; the test partition is not touched here.
fitted_search = hyper_param_search.fit_model(
    data.TRAINING.predictors,
    data.TRAINING.outcome,
)

best_model = fitted_search.get_best_model()

{   'column_transformation': ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                 ['Customer_Type']),
                                ('ordinalencoder',
                                 OrdinalEncoder(categories=[['0', '0.2', '0.4',
                                                             '0.6', '0.8',
                                                             '1']]),
                                 ['SpecificHoliday']),
                                ('powertransformer', PowerTransformer(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x1359c7040>)]),
    'logistic_regression__C': 0.1,
    'logistic_regression__class_weight': {0: 0.4, 1: 0.6},
    'logistic_regression__penalty': 'l2',
    'logistic_regression__solver': 'liblinear',
    'over_sampling': SMOTE(random_state=123, sampling_strategy=0.5),
    'over_sampling__sampling_strategy': 0.5,
    'under_sampling': 'passthrough'}

In [11]:
# Wrap the tuned model together with the train/test split for the reporting
# calls in the following cells.
final_model_performance_lr = FinalModelPerformance(data=data, model=best_model)

In [14]:
# Cross-validation metrics; the bare name on the last line makes the notebook
# display the result.
cross_validation_results = final_model_performance_lr.get_cross_validation_results()
cross_validation_results

Unnamed: 0,Metric for Training Set,Score
0,fit_time mean score,0.241923
1,score_time mean score,0.018467
2,5-fold CV F1 Score mean score,0.66327
3,5-fold CV Accuracy mean score,0.873719
4,5-fold CV Balanced Accuracy mean score,0.841874
5,5-fold CV Matthew's Correlation Coefficient me...,0.600981


In [15]:
# Final model metrics; the bare name on the last line makes the notebook
# display the result.
final_model_scores = final_model_performance_lr.get_final_model_performance()
final_model_scores

Unnamed: 0,Metric for Testing Set,Score
0,F1 Score,0.67933
1,Accuracy,0.882425
2,Balanced Accuracy,0.776068
3,Matthew's Correlation Coefficient,0.619154


In [17]:
# Persist the model held by final_model_performance_lr with joblib.
# joblib.dump does not create missing parent directories, so make sure the
# export folder exists first; otherwise this cell raises FileNotFoundError on
# a fresh checkout.
model_export_path = "./models_exports/logistic_regression_classifier.joblib"
Path(model_export_path).parent.mkdir(parents=True, exist_ok=True)

# The original string (not a Path) is passed so the returned list of
# filenames is unchanged.
dump(final_model_performance_lr.model, model_export_path)

['./models_exports/logistic_regression_classifier.joblib']