In [1]:
import pandas as pd
import numpy as np

In [27]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.decomposition import PCA


In [3]:
from constants import Columns, ModelConstants
from utils import TuneHyperParams, TransactionDataset, get_cross_validation_results

In [4]:
data = TransactionDataset().get_training_test_split()

In [36]:
drop_columns = [
    Columns.INFO_PAGE_TIME,
    Columns.ACCOUNT_PAGE_TIME,
    Columns.PRODUCT_PAGE_TIME,
    Columns.AD_CAMPAIGN_1, 
    Columns.AD_CAMPAIGN_2,
    Columns.AD_CAMPAIGN_3
]


column_transformation_1 = make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(), [Columns.SPECIFIC_HOLIDAY]),
    (FunctionTransformer(np.log1p), make_column_selector(dtype_include="number")),
    ("drop", drop_columns),
    remainder="passthrough"
)

column_transformation_2 =  make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(), [Columns.SPECIFIC_HOLIDAY]),
    remainder="passthrough"
)

In [16]:
nb_model = BernoulliNB(force_alpha=True)

In [17]:
principal_components = PCA()

In [23]:
nb_model_pipeline_1 = Pipeline(
    [
        ("column_transformations", column_transformation_1),
        ("principal_components", principal_components),
        ("nb_model", nb_model)
    ]
)

nb_model_pipeline_2 = Pipeline(
    [
        ("column_transformations", column_transformation_2),
        ("principal_components", principal_components),
        ("nb_model", nb_model)
    ]
)

In [34]:
params_to_tune = {
    "principal_components__n_components": [3, 5, 7, 9, 11, 13],
    "nb_model__alpha": [0, 0.7, 0.9, 1],
    "nb_model__class_prior": [
        [0.7, 0.25],
        [0.6, 0.3],
        [0.6, 0.35],
        [0.7, 0.3],
        [0.55, 0.35],
        [0.55, 0.45]
    ]
}


gs = (
    TuneHyperParams()
    .grid_search(
        nb_model_pipeline_2,
        params_to_tune
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
);


{'nb_model__alpha': 0, 'nb_model__class_prior': [0.6, 0.3], 'principal_components__n_components': 5}
Best parameter (CV score: 0.647):


In [35]:
params_to_tune = {
    "principal_components__n_components": [3, 5, 7, 9, 11, 13],
    "nb_model__alpha": [0, 0.7, 0.9, 1],
    "nb_model__class_prior": [
        [0.7, 0.25],
        [0.6, 0.3],
        [0.6, 0.35],
        [0.7, 0.3],
        [0.55, 0.35],
        [0.55, 0.45]
    ]
}


gs = (
    TuneHyperParams()
    .grid_search(
        nb_model_pipeline_1,
        params_to_tune
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
);


{'nb_model__alpha': 0, 'nb_model__class_prior': [0.55, 0.45], 'principal_components__n_components': 5}
Best parameter (CV score: 0.620):


### With SVC

In [51]:
sv_classifier = SVC()

In [52]:
svc_pipeline1 = Pipeline(
    [("column_transformation", column_transformation_1),
     ('svc_model',sv_classifier)
     ]    
)

svc_pipeline2= Pipeline([("column_transformation",column_transformation_2),
                         ('svc_model', sv_classifier)
                         ]
)

In [57]:
params_to_tune = {
    "svc_model__C": [0.0, 0.1, 1, 10, 100],
     "svc_model__class_weight": [
        "balanced",
        {1: 0.3, 0:0.7},
        {1: 0.35, 0:0.65},
        {1: 0.4, 0:0.6},
        {1: 0.45, 0:0.55},
        {1: 0.2, 0:0.8},
    ]
}


gs = (
    TuneHyperParams()
    .grid_search(
    svc_pipeline1,
    params_to_tune
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
);

30 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/shanxie/.local/share/virtualenvs/ML_Challenge-_ai04I7D/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanxie/.local/share/virtualenvs/ML_Challenge-_ai04I7D/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanxie/.local/share/virtualenvs/ML_Challenge-_ai04I7D/lib/python3.10/site-packages/sklearn/svm/_base.py", line 180, in fit
    self._validate_params()
 

{'svc_model__C': 0.1, 'svc_model__class_weight': 'balanced'}
Best parameter (CV score: 0.661):


In [55]:
params_to_tune = {
    "svc_model__C": [0.0, 0.1, 1, 10, 100],
    "svc_model__class_weight": [
        "balanced",
        {1: 0.3, 0:0.7},
        {1: 0.35, 0:0.65},
        {1: 0.4, 0:0.6},
        {1: 0.45, 0:0.55},
        {1: 0.2, 0:0.8},
    ]
    
}


gs = (
    TuneHyperParams()
    .grid_search(
    svc_pipeline2,
    params_to_tune
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_scores_and_params()
);

30 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/shanxie/.local/share/virtualenvs/ML_Challenge-_ai04I7D/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanxie/.local/share/virtualenvs/ML_Challenge-_ai04I7D/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanxie/.local/share/virtualenvs/ML_Challenge-_ai04I7D/lib/python3.10/site-packages/sklearn/svm/_base.py", line 180, in fit
    self._validate_params()
 

{'svc_model__C': 100, 'svc_model__class_weight': 'balanced'}
Best parameter (CV score: 0.604):


NameError: name 'get_final_model_performance' is not defined