In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from utils import (
    get_cross_validation_results,
    get_final_model_performance,
    TransactionDataset,
    TuneHyperParams,
)

from constants import Resample, Columns, ModelConstants

In [3]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, Normalizer, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, make_column_selector

### Obtain Dataset and split it

In [4]:
data = TransactionDataset().get_training_test_split(resample=Resample.no_resample)

### Defining Column Transformations

In [13]:
log_transform = FunctionTransformer(
    func=np.log1p, inverse_func=np.expm1, check_inverse=False
)

order_mapping = [['0', '0.2', '0.4', '0.6', '0.8', '1']]

norm_and_one_hot_transforms = make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(categories=order_mapping), [Columns.SPECIFIC_HOLIDAY]),
    (Normalizer(), make_column_selector(dtype_include="number"))
)

log_and_ordinal_transforms = make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(categories=order_mapping), [Columns.SPECIFIC_HOLIDAY]),
    (log_transform, make_column_selector(dtype_include="number"))
)

power_transforms = make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(categories=order_mapping), [Columns.SPECIFIC_HOLIDAY]),
    (PowerTransformer(), make_column_selector(dtype_include="number"))
)

In [6]:
regression_model = LogisticRegression(
    random_state=ModelConstants.RANDOM_STATE,
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
    warm_start=True
)

### Defining Model Pipelines

In [14]:
log_and_ordinal_model_pipeline = Pipeline(
    [
        ("column_transformation", log_and_ordinal_transforms),
        ("logistic_regression", regression_model)
    ]
)

norm_and_one_hot_model_pipeline = Pipeline(
    [
        ("column_transformation", norm_and_one_hot_transforms),
        ("logistic_regression", regression_model)
    ]
)

pca_and_log_ordinal_model_pipeline = Pipeline(
    [
        ("column_transformation", log_and_ordinal_transforms),
        ("principal_components", PCA()),
        ("logistic_regression", regression_model)
    ]
)

power_transform_model_pipeline = Pipeline(
    [
        ("column_transformation", power_transforms),
        ("principal_components", PCA()),
        ("logistic_regression", regression_model)
    ]
)

### Tuning Logistic Regression Models

In [8]:
tuning_params = {
    "logistic_regression__penalty": ["l1", "l2"],
    "logistic_regression__solver": ["saga", "liblinear"],
    "logistic_regression__C": [0.1, 0.5, 1, 1.5, 2, 2.5, 3],
    "logistic_regression__class_weight": [
        "balanced",
        {1: 0.55, 0: 0.45},
        {1: 0.6, 0: 0.4},
        {1: 0.65, 0: 0.35},
        {1: 0.7, 0: 0.3},
        {1: 0.75, 0: 0.25},
        {1: 0.8, 0: 0.2},
    ],
}

In [9]:
gs_log_ordinal = (
    TuneHyperParams()
    .full_grid_search(
     log_and_ordinal_model_pipeline,
     tuning_params
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
)

{'logistic_regression__C': 1.5, 'logistic_regression__class_weight': {1: 0.8, 0: 0.2}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'saga'}
Best parameter (CV score: 0.666):


In [10]:
gs_norm = (
    TuneHyperParams()
    .full_grid_search(
     norm_and_one_hot_model_pipeline,
     tuning_params
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
)

{'logistic_regression__C': 1.5, 'logistic_regression__class_weight': 'balanced', 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'liblinear'}
Best parameter (CV score: 0.616):


In [17]:
pca_tuning = {
    "principal_components__n_components": [3, 5, 7, 9, 11, 13, 15],
    "principal_components__whiten": [True, False]
}

tuning_params = tuning_params | pca_tuning

gs_pca_power = (
    TuneHyperParams()
    .full_grid_search(
     power_transform_model_pipeline,
     tuning_params
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
)

{'logistic_regression__C': 0.1, 'logistic_regression__class_weight': {1: 0.75, 0: 0.25}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'saga', 'principal_components__n_components': 15, 'principal_components__whiten': True}
Best parameter (CV score: 0.667):


In [16]:
gs_pca = (
    TuneHyperParams()
    .full_grid_search(
     pca_and_log_ordinal_model_pipeline,
     tuning_params
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
)

{'logistic_regression__C': 0.1, 'logistic_regression__class_weight': {1: 0.8, 0: 0.2}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'liblinear', 'principal_components__n_components': 11, 'principal_components__whiten': False}
Best parameter (CV score: 0.667):


### Refitting models after Grid Search

In [24]:
# {'logistic_regression __C': 1.5, 'logistic_regression__class_weight': {1: 0.8, 0: 0.2}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'saga'}

final_log_model = LogisticRegression(
    C=1.5,
    penalty="l1",
    solver="saga",
    class_weight={1: 0.8, 0: 0.2},
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
    random_state=ModelConstants.RANDOM_STATE,
    warm_start=True
)

final_log_model_pipeline = Pipeline(
    [
        ("column_transformation", log_and_ordinal_transforms),
        ("logistic_regression", final_log_model)
    ]
)

get_cross_validation_results(
    final_log_model_pipeline,
    data.TRAINING.predictors,
    data.TRAINING.outcome
)

results = get_final_model_performance(
    final_log_model_pipeline,
    data.TRAINING,
    data.TESTING
)

print(results)

fit_time: 4.789946794509888
score_time: 0.00689387321472168
test_accuracy: 0.880068627615484
test_balanced_accuracy: 0.8317220389462638
test_f1: 0.6648300886446668
                                Test     Score
0                           F1 Score  0.681657
1                           Accuracy  0.889799
2                  Balanced Accuracy  0.787254
3  Matthew's Correlation Coefficient  0.619925


In [23]:
# {'logistic_regression__C': 1.5, 'logistic_regression__class_weight': 'balanced', 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'liblinear'}

final_norm_model = LogisticRegression(
    C=1.5,
    solver="liblinear",
    penalty="l1",
    class_weight="balanced",
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
    random_state=ModelConstants.RANDOM_STATE,
    warm_start=True
)

final_norm_model_pipeline = Pipeline(
    [
        ("column_transformation", norm_and_one_hot_transforms),
        ("logistic_regression", final_norm_model)
    ]
)

get_cross_validation_results(
    final_norm_model_pipeline,
    data.TRAINING.predictors,
    data.TRAINING.outcome
)

results = get_final_model_performance(
    final_norm_model_pipeline,
    data.TRAINING,
    data.TESTING
)

print(results)

fit_time: 0.5719465970993042
score_time: 0.00810391902923584
test_accuracy: 0.8790450861620552
test_balanced_accuracy: 0.7782856374781862
test_f1: 0.6203327651254187
                                Test     Score
0                           F1 Score  0.636242
1                           Accuracy  0.888980
2                  Balanced Accuracy  0.791557
3  Matthew's Correlation Coefficient  0.571044


In [22]:
# {'logistic_regression__C': 0.1, 'logistic_regression__class_weight': {1: 0.8, 0: 0.2}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'liblinear', 'principal_components__n_components': 11, 'principal_components__whiten': False}


final_log_pca_model = LogisticRegression(
    C=0.1,
    solver="liblinear",
    penalty="l1",
    class_weight={1: 0.8, 0: 0.2},
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
    random_state=ModelConstants.RANDOM_STATE,
    warm_start=True
)

final_pca_model_pipeline = Pipeline(
    [
        ("column_transformation", log_and_ordinal_transforms),
        ("principal_components", PCA(n_components=11, whiten=False)),
        ("logistic_regression", final_log_pca_model)
    ]
)

get_cross_validation_results(
    final_pca_model_pipeline,
    data.TRAINING.predictors,
    data.TRAINING.outcome
)

results = get_final_model_performance(
    final_pca_model_pipeline,
    data.TRAINING,
    data.TESTING
)

print(results)

fit_time: 0.05274362564086914
score_time: 0.0097930908203125
test_accuracy: 0.879147650049499
test_balanced_accuracy: 0.8346525059058598
test_f1: 0.6657237530635807
                                Test     Score
0                           F1 Score  0.681499
1                           Accuracy  0.888570
2                  Balanced Accuracy  0.785155
3  Matthew's Correlation Coefficient  0.619932


In [21]:
# {'logistic_regression__C': 0.1, 'logistic_regression__class_weight': {1: 0.75, 0: 0.25}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'saga', 'principal_components__n_components': 15, 'principal_components__whiten': True}

final_power_pca_model = LogisticRegression(
    C=0.1,
    solver="saga",
    penalty="l1",
    class_weight={1: 0.75, 0: 0.25},
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
    random_state=ModelConstants.RANDOM_STATE,
    warm_start=True
)

final_pca_power_model_pipeline = Pipeline(
    [
        ("column_transformation", log_and_ordinal_transforms),
        ("principal_components", PCA(n_components=15, whiten=True)),
        ("logistic_regression", final_power_pca_model)
    ]
)

get_cross_validation_results(
    final_pca_power_model_pipeline,
    data.TRAINING.predictors,
    data.TRAINING.outcome
)

results = get_final_model_performance(
    final_pca_power_model_pipeline,
    data.TRAINING,
    data.TESTING
)

print(results)

fit_time: 0.07852847576141357
score_time: 0.009978222846984863
test_accuracy: 0.8809914928603788
test_balanced_accuracy: 0.8229355682743515
test_f1: 0.6596332984749401
                                Test     Score
0                           F1 Score  0.682927
1                           Accuracy  0.893486
2                  Balanced Accuracy  0.794173
3  Matthew's Correlation Coefficient  0.621370
