In [1]:
import pandas as pd
import numpy as np


In [22]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE,ADASYN
from joblib import dump

In [3]:
from constants import Columns, ModelConstants
from utils import (
    TuneHyperParams, 
    TransactionDataset, 
    FinalModelPerformance
)

In [4]:
data = TransactionDataset().get_training_test_split()

In [5]:
drop_columns = [
    Columns.INFO_PAGE_TIME,
    Columns.ACCOUNT_PAGE_TIME,
    Columns.PRODUCT_PAGE_TIME,
    Columns.AD_CAMPAIGN_1, 
    Columns.AD_CAMPAIGN_2,
    Columns.AD_CAMPAIGN_3
]


column_transformation_1 = make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(), [Columns.SPECIFIC_HOLIDAY]),
    (FunctionTransformer(np.log1p), make_column_selector(dtype_include="number")),
    ("drop", drop_columns),
    remainder="passthrough"
)

column_transformation_2 =  make_column_transformer(
    (OneHotEncoder(), [Columns.CUSTOMER_TYPE]),
    (OrdinalEncoder(), [Columns.SPECIFIC_HOLIDAY]),
    remainder="passthrough"
)

In [6]:
smote_oversampling = SMOTE (random_state= ModelConstants.RANDOM_STATE)
adasyn_oversampling = ADASYN(random_state= ModelConstants.RANDOM_STATE)

In [7]:
nb_model = BernoulliNB(force_alpha=True)

In [8]:
principal_components = PCA()

In [9]:
nb_model_pipeline = Pipeline(
    [
        ("column_transformations", column_transformation_1),
        ("data_sampling", smote_oversampling),
        ("principal_components", principal_components),
        ("nb_model", nb_model)
    ]
)


In [10]:
params_to_tune = {
    "column_transformations" : [column_transformation_1 ,column_transformation_2],
    "data_sampling":[smote_oversampling , adasyn_oversampling, "passthrough"],
    "principal_components__n_components": [3, 5, 7, 9, 11, 13],
    "nb_model__alpha": [0, 0.7, 0.9, 1],
    "nb_model__class_prior": [
        [0.7, 0.25],
        [0.6, 0.3],
        [0.6, 0.35],
        [0.7, 0.3],
        [0.55, 0.35],
        [0.55, 0.45]
    ]
}


best_model_nb = (
    TuneHyperParams()
    .full_grid_search(
        nb_model_pipeline,
        params_to_tune
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_model()
);


{   'column_transformations': ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder', OneHotEncoder(),
                                 ['Customer_Type']),
                                ('ordinalencoder', OrdinalEncoder(),
                                 ['SpecificHoliday'])]),
    'data_sampling': 'passthrough',
    'nb_model__alpha': 0,
    'nb_model__class_prior': [0.6, 0.3],
    'principal_components__n_components': 5}
Best parameter (CV score: 0.647):


In [11]:
final_model_evaluation_nb = FinalModelPerformance(
 model=best_model_nb,
 data=data
)

In [12]:
final_model_evaluation_nb.get_cross_validation_results()

Unnamed: 0,Metric for Training Set,Score
0,fit_time mean score,0.044897
1,score_time mean score,0.017828
2,5-fold CV F1 Score mean score,0.646885
3,5-fold CV Accuracy mean score,0.885498
4,5-fold CV Balanced Accuracy mean score,0.798395
5,5-fold CV Matthew's Correlation Coefficient me...,0.579477


In [13]:
final_model_evaluation_nb.get_final_model_performance()

Unnamed: 0,Metric for Testing Set,Score
0,F1 Score,0.662286
1,Accuracy,0.894715
2,Balanced Accuracy,0.800931
3,Matthew's Correlation Coefficient,0.599931


In [24]:
dump(final_model_evaluation_nb.model, "./models_exports/naive_bayes_classifier.joblib")

['./models_exports/naive_bayes_classifier.joblib']

### With SVC

In [14]:
sv_classifier = SVC(random_state=ModelConstants.RANDOM_STATE)

In [15]:
svc_pipeline = Pipeline(
    [("column_transformation", column_transformation_1),
    ("data_sampling", smote_oversampling ),
     ('svc_model',sv_classifier)
     ]    
)



In [16]:
params_to_tune = {
    "column_transformation" : [column_transformation_1],
    "data_sampling":[smote_oversampling , adasyn_oversampling, "passthrough"],
    "svc_model__C": [0.1, 1, 10, 100],
    "svc_model__kernel" :["linear"]
}


best_model_svc = (
    TuneHyperParams()
    .random_grid_search(
    svc_pipeline,
    params_to_tune
    )
    .fit_model(
        data.TRAINING.predictors,
        data.TRAINING.outcome
    )
    .get_best_model()
);

{   'column_transformation': ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder', OneHotEncoder(),
                                 ['Customer_Type']),
                                ('ordinalencoder', OrdinalEncoder(),
                                 ['SpecificHoliday']),
                                ('functiontransformer',
                                 FunctionTransformer(func=<ufunc 'log1p'>),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x13eb5bac0>),
                                ('drop', 'drop',
                                 ['Info_Page_Time', 'Account_Page_Time',
                                  'ProductPage_Time', 'Ad_Campaign_1',
                                  'Ad_Campaign2', 'Ad_Campaign3'])]),
    'data_sampling': SMOTE(random_state=123),
    'svc_model__C': 100,
    'svc_model__kernel': 'linear'}
Best parameter (CV score: 0.659):


In [17]:
final_model_evaluation_svc = FinalModelPerformance(
    model=best_model_svc,
    data=data
)


In [20]:
final_model_evaluation_svc.get_cross_validation_results()

Unnamed: 0,Metric for Training Set,Score
0,fit_time mean score,165.977011
1,score_time mean score,0.25133
2,5-fold CV F1 Score mean score,0.659497
3,5-fold CV Accuracy mean score,0.874539
4,5-fold CV Balanced Accuracy mean score,0.834882
5,5-fold CV Matthew's Correlation Coefficient me...,0.595188


In [19]:
final_model_evaluation_svc.get_final_model_performance()

Unnamed: 0,Metric for Testing Set,Score
0,F1 Score,0.682648
1,Accuracy,0.886112
2,Balanced Accuracy,0.781317
3,Matthew's Correlation Coefficient,0.622174


In [23]:
dump(final_model_evaluation_svc.model, "./models_exports/support_vector_classifier.joblib")

['./models_exports/support_vector_classifier.joblib']