In [None]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
from utils import (
    get_cross_validation_results,
    get_final_model_performance,
    TransactionDataset,
    TuneHyperParams,
)

from constants import Resample, Columns, ModelConstants

In [2]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, make_column_selector

In [3]:
# Build the train/test split without resampling. Later cells read
# `data.TRAINING` / `data.TESTING` and their `.predictors` / `.outcome` fields.
data = TransactionDataset().get_training_test_split(
    resample=Resample.no_resample,
)

### Defining Column Transformations

In [4]:
# Element-wise log(1 + x) with expm1 registered as its inverse;
# check_inverse=False skips the fit-time round-trip verification.
log_transform = FunctionTransformer(
    np.log1p,
    inverse_func=np.expm1,
    check_inverse=False,
)

# Selects every numeric-dtype column of the input frame.
numeric_columns = make_column_selector(dtype_include="number")

# Variant 1: one-hot the customer type, ordinal-encode the holiday column and
# apply the log transform to the numeric columns.
log_and_ordinal_transforms = make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(), [Columns.SPECIFIC_HOLIDAY]),
    (log_transform, numeric_columns),
)

# Variant 2: one-hot both categorical columns and pass the numeric columns
# through Normalizer.
# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# each column - confirm a per-feature scaler was not the intent.
norm_and_one_hot_transforms = make_column_transformer(
    (OneHotEncoder(), [Columns.SPECIFIC_HOLIDAY, Columns.CUSTOMER_TYPE]),
    (Normalizer(), numeric_columns),
)

In [5]:
# Base (untuned) classifier placed into each candidate pipeline; penalty,
# solver, C and class weights are left at their defaults here and varied by
# the tuning grid.
regression_model = LogisticRegression(
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
    warm_start=True,
    random_state=ModelConstants.RANDOM_STATE,
)

### Defining Model Pipelines

In [6]:
# Three candidate pipelines. The step names ("column_transformation",
# "principal_components", "logistic_regression") are the prefixes used by the
# `<step>__<param>` keys of the tuning grids.
# NOTE(review): all three pipelines hold the SAME `regression_model` instance;
# presumably the tuning helper clones the pipeline before fitting - verify
# before fitting any of these pipelines directly.

# Log/ordinal preprocessing -> logistic regression.
log_and_ordinal_model_pipeline = Pipeline(
    steps=[
        ("column_transformation", log_and_ordinal_transforms),
        ("logistic_regression", regression_model),
    ]
)

# Normalizer/one-hot preprocessing -> logistic regression.
norm_and_one_hot_model_pipeline = Pipeline(
    steps=[
        ("column_transformation", norm_and_one_hot_transforms),
        ("logistic_regression", regression_model),
    ]
)

# Log/ordinal preprocessing -> PCA -> logistic regression.
pca_and_log_ordinal_model_pipeline = Pipeline(
    steps=[
        ("column_transformation", log_and_ordinal_transforms),
        ("principal_components", PCA()),
        ("logistic_regression", regression_model),
    ]
)

### Tuning Logistic Regression Models

In [None]:
# Search grid for the "logistic_regression" pipeline step. Each keyword is a
# `<step>__<parameter>` key mapped to the list of candidate values.
tuning_params = dict(
    logistic_regression__penalty=["l1", "l2"],
    logistic_regression__solver=["saga", "liblinear"],
    logistic_regression__C=[0.1, 0.5, 1, 1.5, 2, 2.5, 3],
    logistic_regression__class_weight=[
        "balanced",
        # Explicit weightings that progressively favour class 1 over class 0.
        *(
            {1: positive_weight, 0: negative_weight}
            for positive_weight, negative_weight in (
                (0.55, 0.45),
                (0.6, 0.4),
                (0.65, 0.35),
                (0.7, 0.3),
                (0.75, 0.25),
                (0.8, 0.2),
            )
        ),
    ],
)

In [None]:
# Run the full grid search for the log/ordinal pipeline on the training split
# and keep whatever `get_best_scores_and_params()` returns.
gs_log_ordinal = (
    TuneHyperParams()
    .full_grid_search(log_and_ordinal_model_pipeline, tuning_params)
    .fit_model(data.TRAINING.predictors, data.TRAINING.outcome)
    .get_best_scores_and_params()
)

In [None]:
# Same search as for the log/ordinal pipeline, applied to the
# Normalizer/one-hot pipeline on the training split.
gs_norm = (
    TuneHyperParams()
    .full_grid_search(norm_and_one_hot_model_pipeline, tuning_params)
    .fit_model(data.TRAINING.predictors, data.TRAINING.outcome)
    .get_best_scores_and_params()
)

In [None]:
# Extra search dimensions that only apply to the "principal_components" step
# of the PCA pipeline.
pca_tuning = {
    "principal_components__n_components": [3, 5, 7, 9, 11, 13, 15],
    "principal_components__whiten": [True, False],
}

# Merge into a NEW name instead of rebinding `tuning_params`. The other two
# pipelines have no "principal_components" step, so overwriting the shared
# grid would hand them parameters for a step they do not contain if their
# search cells were re-run after this one.
pca_tuning_params = tuning_params | pca_tuning

# Run the full grid search for the PCA pipeline over the combined
# logistic-regression + PCA grid, using the training split.
gs_pca = (
    TuneHyperParams()
    .full_grid_search(pca_and_log_ordinal_model_pipeline, pca_tuning_params)
    .fit_model(data.TRAINING.predictors, data.TRAINING.outcome)
    .get_best_scores_and_params()
)

### Refitting Models After Grid Search

In [7]:
# Best parameters recorded from the log/ordinal grid search (CV score: 0.666):
# {'logistic_regression__C': 1.5, 'logistic_regression__class_weight': {1: 0.8, 0: 0.2}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'saga'}

# Classifier rebuilt with those values hard-coded, so this cell does not
# depend on the search cells having been run.
final_log_model = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=1.5,
    class_weight={1: 0.8, 0: 0.2},
    warm_start=True,
    random_state=ModelConstants.RANDOM_STATE,
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
)

final_log_model_pipeline = Pipeline(
    steps=[
        ("column_transformation", log_and_ordinal_transforms),
        ("logistic_regression", final_log_model),
    ]
)

# Cross-validation results computed from the training split only.
get_cross_validation_results(
    final_log_model_pipeline, data.TRAINING.predictors, data.TRAINING.outcome
)

# Final evaluation using both splits; kept as the cell's last expression so
# its return value is rendered as the cell output.
get_final_model_performance(final_log_model_pipeline, data.TRAINING, data.TESTING)

fit_time: 5.054137134552002
score_time: 0.007306718826293945
test_accuracy: 0.880068627615484
test_balanced_accuracy: 0.8317220389462638
test_f1: 0.6648300886446668


Unnamed: 0,Test,Score
0,F1 Score,0.681657
1,Accuracy,0.889799
2,Balanced Accuracy,0.787254
3,Matthew's Correlation Coefficient,0.619925


In [10]:
# Best parameters recorded from the Normalizer/one-hot grid search (CV score: 0.617):
# {'logistic_regression__C': 1.5, 'logistic_regression__class_weight': 'balanced', 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'liblinear'}

# Classifier rebuilt with those values hard-coded, so this cell does not
# depend on the search cells having been run.
final_norm_model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=1.5,
    class_weight="balanced",
    warm_start=True,
    random_state=ModelConstants.RANDOM_STATE,
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
)

final_norm_model_pipeline = Pipeline(
    steps=[
        ("column_transformation", norm_and_one_hot_transforms),
        ("logistic_regression", final_norm_model),
    ]
)

# Cross-validation results computed from the training split only.
get_cross_validation_results(
    final_norm_model_pipeline, data.TRAINING.predictors, data.TRAINING.outcome
)

# Final evaluation using both splits; kept as the cell's last expression so
# its return value is rendered as the cell output.
get_final_model_performance(final_norm_model_pipeline, data.TRAINING, data.TESTING)

fit_time: 0.39056107997894285
score_time: 0.00755312442779541
test_accuracy: 0.878942836887763
test_balanced_accuracy: 0.7779587663627822
test_f1: 0.6198352238036511


Unnamed: 0,Test,Score
0,F1 Score,0.634538
1,Accuracy,0.888161
2,Balanced Accuracy,0.789735
3,Matthew's Correlation Coefficient,0.568762


In [11]:
# Best parameters recorded from the PCA + log/ordinal grid search (CV score: 0.667):
# {'logistic_regression__C': 0.1, 'logistic_regression__class_weight': {1: 0.8, 0: 0.2}, 'logistic_regression__penalty': 'l1', 'logistic_regression__solver': 'liblinear', 'principal_components__n_components': 11, 'principal_components__whiten': False}

# Classifier rebuilt with those values hard-coded, so this cell does not
# depend on the search cells having been run.
final_log_pca_model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=0.1,
    class_weight={1: 0.8, 0: 0.2},
    warm_start=True,
    random_state=ModelConstants.RANDOM_STATE,
    max_iter=ModelConstants.MAX_ITERATIONS * 10,
)

# The PCA step carries the tuned n_components / whiten values from the
# recorded search result.
final_pca_model_pipeline = Pipeline(
    steps=[
        ("column_transformation", log_and_ordinal_transforms),
        ("principal_components", PCA(n_components=11, whiten=False)),
        ("logistic_regression", final_log_pca_model),
    ]
)

# Cross-validation results computed from the training split only.
get_cross_validation_results(
    final_pca_model_pipeline, data.TRAINING.predictors, data.TRAINING.outcome
)

# Final evaluation using both splits; kept as the cell's last expression so
# its return value is rendered as the cell output.
get_final_model_performance(final_pca_model_pipeline, data.TRAINING, data.TESTING)

fit_time: 0.05665748119354248
score_time: 0.010228633880615234
test_accuracy: 0.879147650049499
test_balanced_accuracy: 0.8346525059058598
test_f1: 0.6657237530635807


Unnamed: 0,Test,Score
0,F1 Score,0.681499
1,Accuracy,0.88857
2,Balanced Accuracy,0.785155
3,Matthew's Correlation Coefficient,0.619932
