In [None]:
import lib._util.visualplot as vp
import lib._util.mlpipe as mlpipe

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval

# Feature scaling
from lib._class.DFRobustScaler import DFRobustScaler
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np
import itertools
from tqdm import tqdm

# Time measurement
import time
from datetime import timedelta

# Scikit-Learn
from sklearn.model_selection import cross_validate, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline as SklearnPipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier

from sklearn.gaussian_process.kernels import RBF
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier

# Imbalanced-Learn
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

# Plotly
import plotly.express as px
import plotly.figure_factory as ff

# Constants

In [None]:
SOURCE_PATH_DATA = 'resources/data/'
OUT_PATH_GRAPH   = 'resources/output/graph/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/mlg-ulb/creditcardfraud/home
- Time: Number of seconds elapsed between this transaction and the first transaction in the dataset
- V1-V28: Anonymised features, likely the result of a PCA dimensionality reduction applied to protect user identities and sensitive features
- Amount: Transaction amount
- Class: 1 for fraudulent transactions, 0 otherwise

In [None]:
# Read the CSV lazily in 50,000-row chunks, then stitch the chunks back into
# a single DataFrame.
df_chunks = pd.read_csv(f'{SOURCE_PATH_DATA}creditcard.csv', sep=',', chunksize=50_000)
data_df   = pd.concat(df_chunks)

# Cell output: (rows, columns) of the loaded dataset
data_df.shape

In [None]:
data_df.head()

In [None]:
vp.faststat(data_df)

In [None]:
vp.value_count(data_df, 'Class')

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 2048})

###### Box

In [None]:
vp.box(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - Box',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

###### KDE

In [None]:
vp.kde(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - KDE',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

# Phase 2 - Data Preparation
- Remove duplicated data
- Reduce memory usage

In [None]:
# Fit the project's duplicate-removal transformer on the full dataset.
# NOTE(review): presumably `keep='mean'` collapses duplicated feature rows by
# averaging the 'Class' target -- confirm against DFDuplicateRemoval.
duplicate_removal = DFDuplicateRemoval(target='Class', keep='mean')
duplicate_removal.fit(data_df)

# Observe duplicated data
duplicate_df = duplicate_removal.duplicate_df

duplicate_df

In [None]:
vp.value_count(duplicate_df, 'Class')

In [None]:
# Observe if duplicated data are having different target label
vp.value_count(
    duplicate_df.groupby(duplicate_removal.subset).agg(
        Class=('Class', 'mean')
    ).reset_index(),
    'Class'
)

In [None]:
# Remove duplicated data
data_df = duplicate_removal.transform(data_df)

data_df.shape

In [None]:
vp.value_count(data_df, 'Class')

In [None]:
# TODO
# - change to class
data_df = mlpipe.reduce_memory_usage(data_df)

vp.faststat(data_df)

# Phase 3 - Classification
- Baseline

In [None]:
# Accumulator for every evaluated method. The lists are parallel: position i
# in each list belongs to the i-th call of `train_test_evaluation`, so later
# cells select methods by position.
# NOTE: re-running an evaluation cell appends again and shifts those positions.
metric_dict = {
    'roc_auc':  [],  # per-fold ROC-AUC scores from cross validation
    'f1_macro': [],  # per-fold macro-F1 scores from cross validation
    'cv_score': [],  # per-fold average of ROC-AUC and macro-F1
    'matrix':   [],  # confusion matrix on the test set
    'method':   [],  # display name of the method
}

In [None]:
# Separate features & target
X, y = mlpipe.xy_split(data_df, 'Class')

vp.value_count(y.to_frame(), 'Class')
X.shape

In [None]:
# Separate dataset (stratified on the target so both parts keep the class ratio)
# NOTE(review): test_size=.8 holds out 80% of the rows for testing and leaves
# only 20% for training -- confirm this is intentional (e.g. to keep the
# re-sampling/search phases fast) and not a swapped train/test fraction.
X_train, X_test, y_train, y_test = mlpipe.dataset_split(X, y, test_size=.8, stratify=y, random_state=0)

# Show the class distribution of each part
print('Train dataset:\n-----------------------')
vp.value_count(y_train.to_frame(), 'Class')
print('\nTest dataset:\n----------------------')
vp.value_count(y_test.to_frame(), 'Class')

In [None]:
def dict_sampling(metric_dict, indexes):
    """Return a filtered copy of ``metric_dict`` keeping only selected positions.

    Parameters
    ----------
    metric_dict : dict
        Mapping of key -> list; the lists are treated as parallel columns.
    indexes : iterable
        Positions to keep. Integral floats (e.g. ``5.0``) match their integer
        position. May be a one-shot iterable such as a generator.

    Returns
    -------
    dict
        New dict with the same keys; each list holds the elements whose
        position is in ``indexes``, in their original order. The input is
        not modified.
    """
    # Materialise once: a generator would otherwise be exhausted while
    # filtering the first key, and set membership is O(1) per element.
    wanted = set(indexes)

    return {
        key: [value for position, value in enumerate(values) if position in wanted]
        for key, values in metric_dict.items()
    }

def cross_validation(estimator, X, y, n_splits=10, show_score=True):
    """Cross-validate ``estimator`` with stratified folds on macro-F1 and ROC-AUC.

    Parameters
    ----------
    estimator : estimator or pipeline passed to ``cross_validate``
    X, y : features and target
    n_splits : int, number of stratified folds
    show_score : bool, print mean (std) of both metrics when True

    Returns
    -------
    dict
        The ``cross_validate`` result; the per-fold scores are under
        ``'test_roc_auc'`` and ``'test_f1_macro'``.
    """
    folds = StratifiedKFold(n_splits)
    cv_dict = cross_validate(estimator, X, y,
                             scoring=['f1_macro', 'roc_auc'],
                             cv=folds,
                             verbose=10,
                             n_jobs=-1)

    if not show_score:
        return cv_dict

    roc_scores = cv_dict['test_roc_auc']
    f1_scores  = cv_dict['test_f1_macro']

    roc_mean, roc_std = np.mean(roc_scores), np.std(roc_scores)
    print(f'CV ROC-AUC:  {roc_mean:.5f} ({roc_std:.5f})')

    f1_mean, f1_std = np.mean(f1_scores), np.std(f1_scores)
    print(f'CV F1-Macro: {f1_mean:.5f} ({f1_std:.5f})\n')

    return cv_dict

def train_test_evaluation(method, metric_dict, cv_dict,
                          model, X_train, y_train, X_test, y_test,
                          train_pipeline=None, test_pipeline=None):
    """Fit ``model`` on the training data, evaluate it on the test data and
    record the results in ``metric_dict``.

    Parameters
    ----------
    method : str
        Display name stored under ``metric_dict['method']``.
    metric_dict : dict of list
        Accumulator with keys 'method', 'roc_auc', 'f1_macro', 'cv_score'
        and 'matrix'; one element is appended to each list.
    cv_dict : dict
        Cross-validation result holding 'test_roc_auc' and 'test_f1_macro'
        (assumed to be numpy arrays so they can be averaged element-wise).
    model : classifier exposing ``fit``, ``predict`` and ``predict_proba``
    X_train, y_train, X_test, y_test : train / test features and targets
    train_pipeline : optional transformer pipeline
        Fitted on and applied to ``X_train`` before training.
    test_pipeline : optional transformer pipeline
        Applied to ``X_test`` (``transform`` only). Defaults to
        ``train_pipeline`` when omitted.

    Returns
    -------
    None
    """
    # Model training
    X_fit = X_train if train_pipeline is None else train_pipeline.fit_transform(X_train)
    model.fit(X_fit, y_train)

    # Evaluate test data: reuse the training pipeline unless a dedicated one is given
    if test_pipeline is None:
        test_pipeline = train_pipeline

    # Transform the test set once and reuse it for both predictions
    X_eval = X_test if test_pipeline is None else test_pipeline.transform(X_test)

    eval_dict = mlpipe.eval_classif(
        y_test,
        model.predict(X_eval),
        # Probability of the last class (the positive class for binary targets)
        y_prob=model.predict_proba(X_eval)[:, -1],
        return_evaluation=True
    )

    # Gather every value first so a missing key cannot leave the parallel
    # lists of `metric_dict` with different lengths.
    roc_auc  = cv_dict['test_roc_auc']
    f1_macro = cv_dict['test_f1_macro']
    matrix   = eval_dict['matrix']

    metric_dict['method'].append(method)
    metric_dict['roc_auc'].append(roc_auc)
    metric_dict['f1_macro'].append(f1_macro)
    metric_dict['cv_score'].append((roc_auc + f1_macro) / 2)
    metric_dict['matrix'].append(matrix)

In [None]:
# Shared pre-processing steps and baseline model. These scaler instances are
# reused by the later phases of the notebook.
standard_scaler = DFStandardScaler(columns=['Time', 'Amount'])
minmax_scaler   = DFMinMaxScaler()
model           = LogisticRegression(random_state=0, n_jobs=-1)

# Cross validation (scaling lives inside the pipeline so it is fitted per fold)
cv_dict = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)

# Evaluation: fit on the full training set, score on the held-out test set.
# This becomes entry 0 of `metric_dict`.
train_test_evaluation(
    'Baseline',
    metric_dict,
    cv_dict,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

# Phase 4 - Classification
- Class Weighting

In [None]:
# (display name, class_weight mapping) pairs to compare. Each mapping is
# passed to LogisticRegression(class_weight=...) in the next cell.
# NOTE(review): the exact formulas behind class_weight / class_ratio and the
# `normalize` flag live in lib._util.mlpipe -- confirm there.
class_weights = [
    ('Weight', mlpipe.class_weight(y_train)),
    ('Weight (Normalize)', mlpipe.class_weight(y_train, normalize=True)),
    ('Ratio', mlpipe.class_ratio(y_train)),
    ('Ratio (Normalize)', mlpipe.class_ratio(y_train, normalize=True)),
]

In [None]:
# Evaluate one logistic regression per class-weighting scheme. Each iteration
# appends one entry to `metric_dict` (entries 1-4 when the cells are run once,
# in order).
for method, weight_dict in class_weights:
    print(f'\n{method}:')
    
    model = LogisticRegression(random_state=0, n_jobs=-1,
                               class_weight=weight_dict)
    
    # Cross validation
    cv_dict = cross_validation(
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
            ('model', model),
        ], verbose=True),
        X_train,
        y_train
    )

    # Evaluation
    train_test_evaluation(
        method,
        metric_dict,
        cv_dict,
        model,
        X_train, y_train, X_test, y_test,
        train_pipeline=Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
        ], verbose=True)
    )

###### Box

In [None]:
def plot_score(metric_dict, title, layout_kwargs={}):
    """Draw box plots of the cross-validation scores, one box per method.

    Parameters
    ----------
    metric_dict : dict of list
        Must provide the parallel lists 'method', 'cv_score', 'f1_macro' and
        'roc_auc'; the score entries hold the per-fold values of a method.
    title : str
        Figure title, forwarded to ``vp.box``.
    layout_kwargs : dict
        Extra layout options; 'showlegend' is always forced to False.
    """
    eval_df       = pd.DataFrame(metric_dict)
    score_columns = ('method', 'cv_score', 'f1_macro', 'roc_auc')

    # One long-format frame per method: the method name is broadcast against
    # its per-fold score arrays.
    per_method_frames = [
        pd.DataFrame({column: eval_df.at[row, column] for column in score_columns})
        for row in eval_df.index
    ]
    long_df = pd.concat(per_method_frames, axis=0)

    merged_layout = dict(layout_kwargs)
    merged_layout['showlegend'] = False

    vp.box(
        long_df,
        color='method',
        max_col=1,
        title=title,
        out_path=OUT_PATH_GRAPH,
        layout_kwargs=merged_layout,
        box_kwargs={'boxmean': 'sd', 'boxpoints': False}
    )

In [None]:
plot_score(metric_dict,
           'Phase 4 - Box - Class Weighting')

###### Confusion Matrix

In [None]:
def plot_matrix(metric_dict, title, z_max=None, max_col=4, layout_kwargs={}):
    """Plot the confusion matrix of every method as a grid of annotated heatmaps.

    Parameters
    ----------
    metric_dict : dict of list
        Uses 'matrix' (one confusion-matrix DataFrame per method) and
        'method' (used as the x-axis title of each subplot).
    title : str
        Figure title.
    z_max : number, optional
        Upper bound of the colour scale shared by all heatmaps.
    max_col : int
        Maximum number of subplots per row.
    layout_kwargs : dict
        Extra layout options; they override the generated 'annotations' when
        they contain that key.
    """
    data_groups = []
    annotations = ()

    for index, matrix_df in enumerate(metric_dict['matrix']):
        # Reverse sorting to ensure plotly display is same as dataframe layout
        flipped_df = matrix_df.sort_index(ascending=False)

        # Heatmap data
        fig = ff.create_annotated_heatmap(
            z=flipped_df.values,
            x=[f'Pred {x}' for x in flipped_df.columns],
            y=[f'True {x}' for x in flipped_df.index],
            colorscale='Portland',
            zmin=0,
            zmax=z_max
        )
        data_groups.append(fig['data'])

        # Heatmap annotation: re-target the cell labels to this subplot's axes.
        # The first subplot uses 'x'/'y', the following ones 'x2'/'y2', ...
        suffix = '' if index == 0 else index + 1
        xref, yref = f'x{suffix}', f'y{suffix}'

        annotation = fig['layout']['annotations']
        for cell_label in annotation:
            cell_label['xref'] = xref
            cell_label['yref'] = yref
        # tuple() keeps the concatenation valid whatever sequence type is returned
        annotations += tuple(annotation)

    vp.datagroups_subplots(data_groups,
                           xaxis_titles=metric_dict['method'],
                           max_col=max_col,
                           title=title,
                           out_path=OUT_PATH_GRAPH,
                           layout_kwargs={
                               'annotations': annotations,
                               **layout_kwargs,
                           })

In [None]:
plot_matrix(metric_dict,
            'Phase 4 - Confusion Matrix - Class Weighting',
            z_max=y_test.value_counts().values[-1],
            max_col=3)

# Phase 5 - Classification
- Re-sampling

In [None]:
def _resample_pipeline(*sampler_steps):
    """Build a verbose imblearn Pipeline: shared standard scaler, then ``sampler_steps``."""
    return Pipeline(steps=[('standard_scaler', standard_scaler), *sampler_steps], verbose=True)

def _adasyn():
    """Fresh ADASYN over-sampler."""
    return ADASYN(random_state=0, n_jobs=-1)

def _borderline_smote():
    """Fresh Borderline-SMOTE over-sampler."""
    return BorderlineSMOTE(random_state=0, n_jobs=-1)

def _svm_smote():
    """Fresh SVM-SMOTE over-sampler backed by a class-balanced, iteration-capped SVC."""
    return SVMSMOTE(svm_estimator=SVC(class_weight='balanced', max_iter=200, random_state=0),
                    random_state=0, n_jobs=-1)

def _enn():
    """Fresh Edited Nearest Neighbours cleaner applied to all classes."""
    return EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')

def _tomek():
    """Fresh Tomek-links cleaner applied to all classes."""
    return TomekLinks(n_jobs=-1, sampling_strategy='all')

# (display name, re-sampling pipeline) pairs: every over-sampler is combined
# with both cleaning strategies. Each pipeline gets its own sampler instances;
# only `standard_scaler` is shared.
pipelines = [
    ('ADASYN + ENN',      _resample_pipeline(('adasyn', _adasyn()), ('enn', _enn()))),
    ('ADASYN + Tomek',    _resample_pipeline(('adasyn', _adasyn()), ('tomek', _tomek()))),
    ('SMOTE + ENN',       _resample_pipeline(('smote_enn', SMOTEENN(random_state=0, n_jobs=-1)))),
    ('SMOTE + Tomek',     _resample_pipeline(('smote_tomek', SMOTETomek(random_state=0, n_jobs=-1)))),
    ('BL SMOTE + ENN',    _resample_pipeline(('borderline_smote', _borderline_smote()), ('enn', _enn()))),
    ('BL SMOTE + Tomek',  _resample_pipeline(('borderline_smote', _borderline_smote()), ('tomek', _tomek()))),
    ('SVM SMOTE + ENN',   _resample_pipeline(('svm_smote', _svm_smote()), ('enn', _enn()))),
    ('SVM SMOTE + Tomek', _resample_pipeline(('svm_smote', _svm_smote()), ('tomek', _tomek()))),
]

In [None]:
# Evaluate a plain logistic regression behind every re-sampling pipeline.
# Each iteration appends one entry to `metric_dict`.
for method, resample_pipeline in pipelines:
    print(f'\n{method}:')
    
    # Cross validation
    model = LogisticRegression(random_state=0, n_jobs=-1)
    # Copy the step list so the scaler/model are appended to the CV pipeline
    # only and `resample_pipeline` itself stays a pure re-sampler.
    steps = resample_pipeline.steps.copy()
    steps.append(('minmax_scaler', minmax_scaler))
    steps.append(('model', model))

    # Re-sampling must happen inside each CV fold, hence the single pipeline.
    # Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
    cv_dict = cross_validation(
        Pipeline(steps, verbose=True),
        X_train,
        y_train
    )
    
    # Re-sampling (also fits the shared `standard_scaler` on X_train)
    X_bal, y_bal = resample_pipeline.fit_resample(X_train, y_train)
    
    # Evaluation
    # X_bal is already standard-scaled, so training only adds min-max scaling.
    # The test pipeline is only `transform`-ed: it relies on `standard_scaler`
    # (fitted above) and `minmax_scaler` (fitted by train_pipeline) being the
    # same shared instances.
    train_test_evaluation(
        method,
        metric_dict,
        cv_dict,
        model,
        X_bal, y_bal, X_test, y_test,
        train_pipeline=Pipeline(steps=[
            ('minmax_scaler', minmax_scaler),
        ], verbose=True),
        test_pipeline=Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
        ], verbose=True)
    )

###### Box

In [None]:
# Entry 0 is the baseline; entries 5-12 are the eight re-sampling methods
# (assuming the evaluation cells above ran once, in order).
# Integer positions are used instead of float values from np.linspace.
plot_score(dict_sampling(metric_dict, [0] + list(range(5, 13))),
           'Phase 5 - Box - Re-sampling')

###### Confusion Matrix

In [None]:
# Entry 0 is the baseline; entries 5-12 are the eight re-sampling methods
# (assuming the evaluation cells above ran once, in order).
# Integer positions are used instead of float values from np.linspace.
plot_matrix(dict_sampling(metric_dict, [0] + list(range(5, 13))),
            'Phase 5 - Confusion Matrix - Re-sampling',
            # Cap the colour scale at the size of the least frequent test class
            z_max=y_test.value_counts().values[-1],
            layout_kwargs={'height': 1000})

# Phase 6 - Classification
- Hyperparameter Tuning

In [None]:
def cv_search(X, y, estimator, param_distributions, n_splits=10, n_iter=100, cv=None):
    """Run a randomized hyper-parameter search scored on macro-F1 and ROC-AUC.

    Parameters
    ----------
    X, y : features and target
    estimator : estimator or pipeline to tune
    param_distributions : dict or list of dict, search space
    n_splits : int, folds of the default stratified splitter
    n_iter : int, number of sampled parameter settings
    cv : optional cross-validation strategy; overrides ``n_splits`` when given

    Returns
    -------
    RandomizedSearchCV
        The fitted search. ``refit`` is disabled, so it only exposes the
        cross-validation results, not a best estimator.
    """
    splitter = cv if cv is not None else StratifiedKFold(n_splits=n_splits)

    search_options = dict(
        scoring=['f1_macro', 'roc_auc'],
        cv=splitter,
        n_jobs=-1,
        verbose=10,
        n_iter=n_iter,
        random_state=0,
        refit=False,
    )
    search = RandomizedSearchCV(estimator, param_distributions, **search_options)
    search.fit(X, y)

    return search

def cv_results(search):
    """Summarise a fitted multi-metric search and pick the best parameter set.

    Parameters
    ----------
    search : fitted search object
        Must expose ``cv_results_`` with 'params' and the mean/rank columns
        of the 'f1_macro' and 'roc_auc' scorers.

    Returns
    -------
    result_df : pandas.DataFrame
        One row per candidate with the mean/rank of both metrics and
        'cv_score' (half the sum of mean macro-F1 and mean ROC-AUC), sorted
        by 'cv_score' in descending order. A missing (NaN) mean is skipped
        by the sum, so it contributes 0 to 'cv_score'.
    best_params : dict
        The 'params' entry of the top-ranked row.

    Raises
    ------
    IndexError
        If the search produced no candidates.
    """
    score_columns = ['mean_test_f1_macro', 'mean_test_roc_auc']

    # Built as one non-mutating chain: no column assignment on a slice of the
    # results frame (SettingWithCopy pattern) and no in-place sort.
    result_df = (
        pd.DataFrame(search.cv_results_)[[
            'params',
            'mean_test_f1_macro', 'rank_test_f1_macro',
            'mean_test_roc_auc', 'rank_test_roc_auc'
        ]]
        .assign(cv_score=lambda df: df[score_columns].sum(axis=1) / 2)
        .sort_values(by='cv_score', ascending=False)
    )
    best_params = result_df['params'].values[0]

    return result_df, best_params

In [None]:
# Options shared by every LogisticRegression grid. 'model__' targets the
# pipeline step named 'model'. C is tuned separately in a second search.
logistic_params = {
    # 'model__C': [.001, .01, .1, 1, 10, 100, 1000],
    'model__fit_intercept': [True, False],
    'model__max_iter': [1000],
    'model__multi_class': ['auto'],
}

# One grid per valid solver/penalty family, so the search never samples a
# combination that LogisticRegression rejects.
search_params = [
    {
        # L1 penalty: only 'liblinear' and 'saga' support it
        # (newton-cg, lbfgs and sag handle L2 / no penalty only).
        **logistic_params,
        'model__solver': ['liblinear', 'saga'],
        'model__penalty': ['l1'],
    },
    {
        # L2 penalty, solvers without a dual formulation
        **logistic_params,
        'model__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'model__penalty': ['l2'],
    },
    {
        # L2 penalty with liblinear, the only solver offering the dual option
        **logistic_params,
        'model__solver': ['liblinear'],
        'model__penalty': ['l2'],
        'model__dual': [True, False],
    },
    {
        # Elastic-net is only available with 'saga'
        **logistic_params,
        'model__solver': ['saga'],
        'model__penalty': ['elasticnet'],
        'model__l1_ratio': np.linspace(.1, .9, 9),
    }
]

### Class Weight (Normalize)
- Best performance among class weighting

In [None]:
# Find best solver combination (low cardinality)
model  = LogisticRegression(random_state=0, n_jobs=-1,
                            class_weight=mlpipe.class_weight(y_train, normalize=True))
search = cv_search(
    X_train, y_train,
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    search_params
)

In [None]:
result_df, best_params = cv_results(search)

result_df

In [None]:
best_params

In [None]:
# Find best C value corresponding to solver combination (high cardinality)
# Every parameter chosen by the previous search is pinned as a single-value
# list, so only C varies in this second search.
search_params = [
    {
        **{k: [v] for k,v in best_params.items()},
        'model__C': [.001, .01, .1, 1, 10, 100, 1000],
    }
]

model  = LogisticRegression(random_state=0, n_jobs=-1,
                            class_weight=mlpipe.class_weight(y_train, normalize=True))
search = cv_search(
    X_train, y_train,
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    search_params
)

In [None]:
result_df, best_params = cv_results(search)

result_df

In [None]:
best_params

In [None]:
model = LogisticRegression(random_state=0, n_jobs=-1,
                           class_weight=mlpipe.class_weight(y_train, normalize=True),
                           **{k.replace('model__', ''): v for k,v in best_params.items()})

# Cross validation
cv_dict = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)

# Evaluation
train_test_evaluation(
    'Weight (Normalize) - Tuned',
    metric_dict,
    cv_dict,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

### SVM SMOTE + ENN
- Best performance among re-sampling

In [None]:
# Find best C value corresponding to solver combination (high cardinality)
# NOTE: `best_params` still holds the result of the class-weight search above;
# its solver/penalty choice is reused here and only C is searched again.
# This cell therefore depends on the previous section having been run.
search_params = [
    {
        **{k: [v] for k,v in best_params.items() if k != 'model__C'},
        'model__C': [.001, .01, .1, 1, 10, 100, 1000],
    }
]

model  = LogisticRegression(random_state=0, n_jobs=-1)
# Re-sampling steps sit inside the pipeline so they are applied per CV fold
search = cv_search(
    X_train, y_train,
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('svm_smote', SVMSMOTE(svm_estimator=SVC(class_weight='balanced', max_iter=200, random_state=0),
                               random_state=0, n_jobs=-1)),
        ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    search_params
)

In [None]:
result_df, best_params = cv_results(search)

result_df

In [None]:
best_params

In [None]:
# Rebuild the model with the tuned parameters; the 'model__' pipeline prefix
# is stripped so the keys become plain LogisticRegression arguments.
model = LogisticRegression(random_state=0, n_jobs=-1,
                           **{k.replace('model__', ''): v for k,v in best_params.items()})

# Cross validation
# Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
cv_dict = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('svm_smote', SVMSMOTE(svm_estimator=SVC(class_weight='balanced', max_iter=200, random_state=0),
                               random_state=0, n_jobs=-1)),
        ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)

# Re-sampling (also fits the shared `standard_scaler` on X_train)
X_bal, y_bal = Pipeline(steps=[
    ('standard_scaler', standard_scaler),
    ('svm_smote', SVMSMOTE(svm_estimator=SVC(class_weight='balanced', max_iter=200, random_state=0),
                           random_state=0, n_jobs=-1)),
    ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
], verbose=True).fit_resample(X_train, y_train)

# Evaluation
# X_bal is already standard-scaled, so training only adds min-max scaling;
# the test pipeline reuses the same (already fitted) scaler instances.
train_test_evaluation(
    'SVM SMOTE + ENN - Tuned',
    metric_dict,
    cv_dict,
    model,
    X_bal, y_bal, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('minmax_scaler', minmax_scaler),
    ], verbose=True),
    test_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

###### Box

In [None]:
plot_score(dict_sampling(metric_dict, [0, 2, 11, 13, 14]),
           'Phase 6 - Box - Tuned')

###### Confusion Matrix

In [None]:
plot_matrix(dict_sampling(metric_dict, [0, 2, 11, 13, 14]),
            'Phase 6 - Confusion Matrix - Tuned',
            z_max=y_test.value_counts().values[-1],
            max_col=3)

# Phase 7 - Classification
- Model Stacking (Baseline)

In [None]:
def get_estimator(name, class_weight=None, params={}):
    """Create a ready-to-fit classifier (or scaler + classifier pipeline) by name.

    Parameters
    ----------
    name : str
        Class name of the classifier, e.g. ``'LinearSVC'``.
    class_weight : mapping, optional
        Passed as ``class_weight`` to the estimators that are built with it;
        for ``XGBClassifier`` its entry ``[1]`` is passed as
        ``scale_pos_weight`` instead.
    params : dict
        Extra keyword arguments for the classifier itself.

    Returns
    -------
    The classifier, wrapped in a ``make_pipeline`` with its scaler(s) where
    one is configured. LinearSVC, PassiveAggressiveClassifier, Perceptron and
    RidgeClassifier are additionally wrapped in ``CalibratedClassifierCV``.

    Raises
    ------
    Exception
        If ``name`` is not one of the supported classifiers.

    NOTE(review): ``SGDClassifier`` is returned without a calibration wrapper,
    unlike the other linear models above -- confirm this is intended.
    """
    # Keyword-argument bundles shared between the estimators
    seeded          = {'random_state': 0}
    parallel        = {'n_jobs': -1}
    seeded_parallel = {'random_state': 0, 'n_jobs': -1}

    seeded_weighted          = {**seeded, 'class_weight': class_weight}
    seeded_parallel_weighted = {**seeded_parallel, 'class_weight': class_weight}
    svm_kwargs               = {**seeded_weighted, 'probability': True}
    # Evaluated eagerly for every `name`, so `class_weight[1]` must exist
    # whenever `class_weight` is given.
    xgb_kwargs = {
        **seeded_parallel,
        'scale_pos_weight': None if class_weight is None else class_weight[1],
    }

    def standardized(estimator):
        # Standard scaling in front of the estimator
        return make_pipeline(DFStandardScaler(), estimator)

    def robust_minmax(estimator):
        # Robust scaling followed by min-max scaling in front of the estimator
        return make_pipeline(DFRobustScaler(), DFMinMaxScaler(), estimator)

    # Lazy builders: only the requested estimator is instantiated
    builders = {
        'KNeighborsClassifier': lambda: standardized(
            KNeighborsClassifier(**parallel, **params)),
        'LinearSVC': lambda: standardized(
            CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(**seeded_weighted, **params), **parallel))),
        'NuSVC': lambda: standardized(
            OneVsRestClassifier(NuSVC(**svm_kwargs, **params), **parallel)),
        'SVC': lambda: standardized(
            OneVsRestClassifier(SVC(**svm_kwargs, **params), **parallel)),
        'XGBClassifier': lambda: XGBClassifier(**xgb_kwargs, **params),
        'GradientBoostingClassifier': lambda: GradientBoostingClassifier(**seeded, **params),
        'ExtraTreesClassifier': lambda: ExtraTreesClassifier(**seeded_parallel_weighted, **params),
        'RandomForestClassifier': lambda: RandomForestClassifier(**seeded_parallel_weighted, **params),
        'DecisionTreeClassifier': lambda: DecisionTreeClassifier(**seeded_weighted, **params),
        'ExtraTreeClassifier': lambda: ExtraTreeClassifier(**seeded_weighted, **params),
        'LogisticRegression': lambda: robust_minmax(
            LogisticRegression(**seeded_parallel_weighted, **params)),
        'PassiveAggressiveClassifier': lambda: robust_minmax(
            CalibratedClassifierCV(PassiveAggressiveClassifier(**seeded_parallel_weighted, **params))),
        'Perceptron': lambda: robust_minmax(
            CalibratedClassifierCV(Perceptron(**seeded_parallel_weighted, **params))),
        'RidgeClassifier': lambda: standardized(
            CalibratedClassifierCV(RidgeClassifier(**seeded_weighted, **params))),
        'SGDClassifier': lambda: robust_minmax(
            SGDClassifier(**seeded_parallel_weighted, **params)),
        'LinearDiscriminantAnalysis': lambda: standardized(
            LinearDiscriminantAnalysis(**params)),
        'QuadraticDiscriminantAnalysis': lambda: standardized(
            QuadraticDiscriminantAnalysis(**params)),
        'MLPClassifier': lambda: robust_minmax(
            MLPClassifier(**seeded, **params)),
        'GaussianProcessClassifier': lambda: standardized(
            GaussianProcessClassifier(**seeded_parallel, **params)),
        'BernoulliNB': lambda: BernoulliNB(**params),
        'CategoricalNB': lambda: CategoricalNB(**params),
        'ComplementNB': lambda: ComplementNB(**params),
        'GaussianNB': lambda: GaussianNB(**params),
        'MultinomialNB': lambda: MultinomialNB(**params),
        'DummyClassifier': lambda: DummyClassifier(**seeded, **params),
    }

    build = builders.get(name)
    if build is None:
        raise Exception(f'{name} not found.')

    return build()

def model_name(model):
    """Return the class name of the innermost estimator wrapped by ``model``.

    Wrappers are followed through their ``estimator`` attribute, or through
    ``base_estimator`` when ``estimator`` is absent or unset, until an object
    without a usable wrapped estimator is reached.

    Parameters
    ----------
    model : object
        An estimator, possibly wrapped (e.g. calibration or one-vs-rest wrappers).

    Returns
    -------
    str
        Class name of the innermost estimator.
    """
    name = model.__class__.__name__

    # Forest ensembles are models in their own right: their `estimator` /
    # `base_estimator` attribute is only the template of the trees they grow,
    # so it must not be unwrapped.
    if name in ('RandomForestClassifier', 'ExtraTreesClassifier'):
        return name

    for attribute in ('estimator', 'base_estimator'):
        wrapped = getattr(model, attribute, None)
        # Skip unset slots (None) and string placeholders such as 'deprecated'
        if wrapped is not None and not isinstance(wrapped, str):
            return model_name(wrapped)

    return name

def get_estimators(class_weight=None):
    """Build the named base estimators used by the stacking model.

    Parameters
    ----------
    class_weight : mapping, optional
        Forwarded to ``get_estimator`` for every base model.

    Returns
    -------
    list of (str, estimator)
        Pairs in the format expected by ``StackingClassifier(estimators=...)``.
        The name is the class name of the innermost classifier; for pipelines
        it is taken from the last step.
    """
    base_model_names = [
        'LinearSVC',
        'XGBClassifier',
        'GradientBoostingClassifier',
        'RandomForestClassifier',
        'PassiveAggressiveClassifier',
        'QuadraticDiscriminantAnalysis',
    ]
    estimators = [get_estimator(name, class_weight) for name in base_model_names]

    named_estimators = []
    for estimator in estimators:
        # isinstance (not an exact type comparison) so Pipeline subclasses
        # are unwrapped as well.
        if isinstance(estimator, SklearnPipeline):
            label = model_name(estimator.steps[-1][1])
        else:
            label = model_name(estimator)
        named_estimators.append((label, estimator))

    return named_estimators

In [None]:
def cv_base_models(estimators, X, y):
    """Cross-validate every base model and tabulate its mean / std scores.

    Parameters
    ----------
    estimators : iterable of (str, estimator)
        Named base models, e.g. ``StackingClassifier.estimators``.
    X, y : features and target

    Returns
    -------
    pandas.DataFrame
        Indexed by model name (index name ''), with the columns
        'mean_roc_auc', 'std_roc_auc', 'mean_f1_macro' and 'std_f1_macro'.
    """
    columns = ['model', 'mean_roc_auc', 'std_roc_auc', 'mean_f1_macro', 'std_f1_macro']
    rows    = []

    # Cross validation
    for name, estimator in estimators:
        print(name)

        fold_scores = cross_validation(estimator, X, y, show_score=False)
        roc_scores  = fold_scores['test_roc_auc']
        f1_scores   = fold_scores['test_f1_macro']

        rows.append((
            name,
            np.mean(roc_scores), np.std(roc_scores),
            np.mean(f1_scores), np.std(f1_scores),
        ))

    eval_df = pd.DataFrame(rows, columns=columns).set_index('model')
    eval_df.index.name = ''

    return eval_df

In [None]:
# Same class weighting for the base models and the meta model
weight_dict = mlpipe.class_weight(y_train, normalize=True)

# Final model: base estimators stacked under a logistic-regression meta model
model = StackingClassifier(
    estimators=get_estimators(
        weight_dict
    ),
    final_estimator=LogisticRegression(
        class_weight=weight_dict,
        random_state=0,
        n_jobs=-1
    ),
    # Stratified 5-fold split used internally by the stacking classifier
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10
)

In [None]:
# Cross validation (Base Models)
eval_df = cv_base_models(model.estimators, X_train, y_train)
eval_df

In [None]:
# Cross validation (Meta Model)
cv_dict = cross_validation(
    model,
    X_train,
    y_train,
    n_splits=5
)

In [None]:
def time_taken(seconds):
    """Print an elapsed duration, given in seconds, as ``H:MM:SS[.ffffff]``.

    The line is preceded by a blank line so it stands apart from earlier output.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {elapsed}')

In [None]:
# Wall-clock start of the evaluation; the elapsed time is printed at the end of the cell
EXEC_START = time.time()

# Evaluation
# Hands the run label, the shared metric_dict, the cross-validation results and
# both data splits to train_test_evaluation (defined elsewhere in this notebook).
train_test_evaluation(
    'Model Stacking (Baseline)',
    metric_dict,
    cv_dict,
    model,
    X_train, y_train, X_test, y_test
)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Phase 8 - Classification
- Model Stacking (Hyperparameter Tuning)

In [None]:
# Find best combination (low cardinality)
# Only hyperparameters with a handful of candidate values are searched in this
# pass. Wide-range ones (C, tree depth / split / leaf / feature limits,
# regularisation strengths, sub-sampling ratios) are left to the
# high-cardinality search that follows.
_SVC_KEY = 'calibratedclassifiercv__base_estimator__estimator__'
_PAC_KEY = 'calibratedclassifiercv__base_estimator__'

# Settings shared by every LinearSVC grid
linearsvc_params = {
    _SVC_KEY + 'fit_intercept': [True, False],
    _SVC_KEY + 'max_iter': [1000],
}
# No low-cardinality settings are shared by the tree ensembles in this pass
tree_params = {}


# One entry per base estimator, in the same order as get_estimators() returns them.
# Each entry is a list of grids; separate grids keep incompatible options apart.
search_params = [
    # LinearSVC
    [
        dict(linearsvc_params, **{
            _SVC_KEY + 'multi_class': ['ovr'],
            _SVC_KEY + 'penalty': ['l2'],
            _SVC_KEY + 'loss': ['squared_hinge'],
            _SVC_KEY + 'dual': [True, False],
        }),
        dict(linearsvc_params, **{
            _SVC_KEY + 'multi_class': ['ovr'],
            _SVC_KEY + 'penalty': ['l2'],
            _SVC_KEY + 'loss': ['hinge'],
            _SVC_KEY + 'dual': [True],
        }),
        dict(linearsvc_params, **{
            _SVC_KEY + 'multi_class': ['ovr'],
            _SVC_KEY + 'penalty': ['l1'],
            _SVC_KEY + 'loss': ['squared_hinge'],
            _SVC_KEY + 'dual': [False],
        }),
        dict(linearsvc_params, **{
            _SVC_KEY + 'multi_class': ['crammer_singer'],
        }),
    ],
    # XGBClassifier
    [
        dict(
            n_estimators=[250],
            learning_rate=[.1, .01],
            objective=['reg:logistic'],
            booster=['gbtree', 'gblinear', 'dart'],
            subsample=[.8],
        ),
    ],
    # GradientBoostingClassifier
    [
        dict(
            tree_params,
            criterion=['friedman_mse'],
            n_estimators=[250],
            loss=['deviance', 'exponential'],
            learning_rate=[.1, .01],
        ),
    ],
    # RandomForestClassifier
    [
        dict(
            tree_params,
            criterion=['gini', 'entropy'],
            n_estimators=[250],
            bootstrap=[False],
        ),
        dict(
            tree_params,
            criterion=['gini', 'entropy'],
            n_estimators=[250],
            bootstrap=[True],
        ),
    ],
    # PassiveAggressiveClassifier
    [
        {
            _PAC_KEY + 'fit_intercept': [True, False],
            _PAC_KEY + 'max_iter': [1000],
            _PAC_KEY + 'loss': ['hinge', 'squared_hinge'],
            _PAC_KEY + 'average': [True, False],
        },
    ],
    # QuadraticDiscriminantAnalysis
    [
        {
            'quadraticdiscriminantanalysis__reg_param': np.linspace(0, 1, 11),
        },
    ],
]

In [None]:
# Per-model results of the low-cardinality search
eval_dict = {
    'model':      [],
    'roc_auc':    [],
    'f1_macro':   [],
    'best_param': [],
}
weight_dict = mlpipe.class_weight(y_train, normalize=True)

# search_params is positional: entry i belongs to the i-th base estimator
for index, (name, estimator) in enumerate(get_estimators(weight_dict)):
    print(name)
    eval_dict['model'].append(name)

    # Cross validation search
    search = cv_search(
        X_train, y_train,
        estimator,
        search_params[index]
    )
    result_df, best_params = cv_results(search)

    # Reduce every key to the bare hyperparameter name expected by get_estimator.
    # Each key is stripped on its own (text after the last '__'), so keys without
    # a nesting prefix pass through untouched, keys with different prefix lengths
    # are handled correctly, and an empty best_params no longer raises IndexError.
    bare_params = {key.rsplit('__', 1)[-1]: value for key, value in best_params.items()}

    best_estimator = get_estimator(name, weight_dict, bare_params)

    cv_dict = cross_validation(best_estimator, X_train, y_train, show_score=False)
    # The prefixed keys are stored on purpose: they are valid search-space keys
    # for the same estimator and can be fed back into a later search verbatim.
    eval_dict['best_param'].append(best_params)
    eval_dict['roc_auc'].append(cv_dict['test_roc_auc'])
    eval_dict['f1_macro'].append(cv_dict['test_f1_macro'])

In [None]:
# Collapse the per-fold score arrays into mean / standard deviation columns,
# then discard the raw arrays so the table shows one summary row per model.
lowcard_df = (
    pd.DataFrame(eval_dict)
    .assign(
        mean_roc_auc=lambda frame: frame['roc_auc'].apply(np.mean),
        std_roc_auc=lambda frame: frame['roc_auc'].apply(np.std),
        mean_f1_macro=lambda frame: frame['f1_macro'].apply(np.mean),
        std_f1_macro=lambda frame: frame['f1_macro'].apply(np.std),
    )
    .drop(columns=['roc_auc', 'f1_macro'])
)

lowcard_df

In [None]:
# Find best combination (high cardinality)
# Wide-range hyperparameters only; one flat grid per base estimator, in the same
# order as get_estimators() returns them.
_SVC_KEY = 'calibratedclassifiercv__base_estimator__estimator__'
_PAC_KEY = 'calibratedclassifiercv__base_estimator__'
_C_CANDIDATES_SVC = [.001, .01, .1, 1, 10, 100, 1000]
_C_CANDIDATES_PAC = [.001, .01, .1, 1, 10, 100, 1000]

# Size / depth limits shared by the tree ensembles
tree_params = dict(
    max_depth=[None, 3, 5, 7, 10, 15],
    min_samples_split=np.linspace(.1, 1, 10),
    min_samples_leaf=np.linspace(.1, .5, 5),
    max_features=['sqrt', 'log2'] + list(np.linspace(.5, 1, 6)),
)


search_params = [
    # LinearSVC
    {_SVC_KEY + 'C': _C_CANDIDATES_SVC},
    # XGBClassifier
    dict(
        max_depth=[None, 3, 5, 7, 10, 15],
        colsample_bytree=np.linspace(.5, 1, 6),
        reg_alpha=np.linspace(0, 1, 11),
        reg_lambda=np.linspace(0, 1, 11),
    ),
    # GradientBoostingClassifier
    dict(tree_params, subsample=np.linspace(.5, 1, 6)),
    # RandomForestClassifier
    # NOTE(review): max_samples is only meaningful with bootstrap=True — confirm
    # what happens when the low-cardinality winner used bootstrap=False.
    dict(tree_params, max_samples=np.linspace(.5, 1, 6)),
    # PassiveAggressiveClassifier
    {_PAC_KEY + 'C': _C_CANDIDATES_PAC},
    # QuadraticDiscriminantAnalysis: no high cardinality hyperparameters
    {},
]

In [None]:
# Per-model results of the high-cardinality search
eval_dict = {
    'model':      [],
    'roc_auc':    [],
    'f1_macro':   [],
    'best_param': [],
}
weight_dict = mlpipe.class_weight(y_train, normalize=True)

# search_params and lowcard_df are positional: entry i belongs to the i-th base estimator
for index, (name, estimator) in enumerate(get_estimators(weight_dict)):
    print(name)
    eval_dict['model'].append(name)

    # Cross validation search
    search = cv_search(
        X_train, y_train,
        estimator,
        {
            **search_params[index],
            # Include best low cardinality hyperparameters (each pinned to a single value)
            **{k: [v] for k,v in lowcard_df['best_param'][index].items()}
        }
    )
    result_df, best_params = cv_results(search)

    # Reduce every key to the bare hyperparameter name expected by get_estimator.
    # Each key is stripped on its own (text after the last '__'), so keys without
    # a nesting prefix pass through untouched, keys with different prefix lengths
    # are handled correctly, and an empty best_params no longer raises IndexError.
    best_params = {key.rsplit('__', 1)[-1]: value for key, value in best_params.items()}

    best_estimator = get_estimator(
        name,
        weight_dict,
        best_params,
    )

    cv_dict = cross_validation(best_estimator, X_train, y_train, show_score=False)
    # The stripped parameters are stored here so they can be passed to get_estimator directly
    eval_dict['best_param'].append(best_params)
    eval_dict['roc_auc'].append(cv_dict['test_roc_auc'])
    eval_dict['f1_macro'].append(cv_dict['test_f1_macro'])

In [None]:
# Collapse the per-fold score arrays into mean / standard deviation columns,
# then discard the raw arrays so the table shows one summary row per model.
highcard_df = (
    pd.DataFrame(eval_dict)
    .assign(
        mean_roc_auc=lambda frame: frame['roc_auc'].apply(np.mean),
        std_roc_auc=lambda frame: frame['roc_auc'].apply(np.std),
        mean_f1_macro=lambda frame: frame['f1_macro'].apply(np.mean),
        std_f1_macro=lambda frame: frame['f1_macro'].apply(np.std),
    )
    .drop(columns=['roc_auc', 'f1_macro'])
)

highcard_df

In [None]:
# Get base estimators with best hyperparameters
weight_dict = mlpipe.class_weight(y_train, normalize=True)

# Rebuild each base estimator from its tuned parameters and pair it with a name.
# For a scikit-learn Pipeline the name is taken from its final step.
estimators = []
for index in tqdm(highcard_df.index):
    name, params = highcard_df.at[index, 'model'], highcard_df.at[index, 'best_param']
    tuned = get_estimator(name, weight_dict, params)
    named_step = tuned.steps[-1][1] if type(tuned) == SklearnPipeline else tuned
    estimators.append((model_name(named_step), tuned))

# Final model
meta_learner = LogisticRegression(class_weight=weight_dict, random_state=0, n_jobs=-1)
model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10,
)

In [None]:
# Cross validation (Meta Model)
# Runs the notebook's `cross_validation` helper on the stacking model built from
# the tuned base estimators, with 5 splits; the result is later handed to
# train_test_evaluation.
cv_dict = cross_validation(
    model,
    X_train,
    y_train,
    n_splits=5
)

In [None]:
# Wall-clock start of the evaluation; the elapsed time is printed at the end of the cell
EXEC_START = time.time()

# Evaluation
# Hands the run label, the shared metric_dict, the cross-validation results and
# both data splits to train_test_evaluation (defined elsewhere in this notebook).
train_test_evaluation(
    'Model Stacking (Base Tuned)',
    metric_dict,
    cv_dict,
    model,
    X_train, y_train, X_test, y_test
)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Phase 9 - Classification
- Model Stacking (Combination)

In [None]:
# Reference: https://towardsdatascience.com/stacking-classifiers-for-higher-predictive-performance-566f963e4840#:~:text=What%20is%20Stacking%3F,any%20classifier%20of%20your%20choice.
# Enumerate every subset of the tuned base estimators, largest subsets first.
min_estimator  = 2                 # a stack needs at least two base models to combine
max_estimator  = len(estimators)

# Each subset is stored as a list because StackingClassifier documents `estimators`
# as a list of (name, estimator) tuples, whereas itertools.combinations yields tuples.
sub_estimators = [
    list(subset)
    for n_sub in range(max_estimator, min_estimator - 1, -1)
    for subset in itertools.combinations(estimators, n_sub)
]

len(sub_estimators)

In [None]:
# Find best base-models combination
search_params = [
    {
        'model__estimators': sub_estimators,
    }
]

# Final model
model = StackingClassifier(
    estimators=[],  # placeholder; every candidate subset is injected through 'model__estimators'
    final_estimator=LogisticRegression(
        class_weight=weight_dict,
        random_state=0,
        n_jobs=-1
    ),
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10
)

# Single predefined split, expressed as row POSITIONS in the concatenated data:
# rows [0, n_train) come from the training set, rows [n_train, n_train + n_test)
# from the test set. The positions are generated with np.arange instead of being
# read from X_train.index / X_test.index, because index labels only coincide with
# positions when both frames carry a fresh 0..n-1 RangeIndex
# (assuming cv_search forwards `cv` to a scikit-learn search, which indexes by position).
n_train, n_test = len(X_train), len(X_test)
holdout_split = [(np.arange(n_train), np.arange(n_train, n_train + n_test))]

# NOTE(review): candidates are scored on X_test, so test-set metrics reported
# afterwards for the selected combination are optimistically biased — consider
# carving a validation split out of X_train instead.

# Cross validation search
search = cv_search(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True),
    Pipeline(steps=[
        ('model', model),
    ], verbose=True),
    search_params,
    cv=holdout_split
)

In [None]:
# cv_results unpacks the finished search into a results table and the winning
# parameter setting (presumably a dict keyed by 'model__estimators' — confirm
# against cv_results).
result_df, best_estimators = cv_results(search)

result_df

In [None]:
# Display the winning base-estimator combination
best_estimators

In [None]:
# Final model
# The search keys carry a 'model__' pipeline prefix; removing it turns them into
# StackingClassifier keyword arguments.
model = StackingClassifier(
    **{k.replace('model__', ''): v for k,v in best_estimators.items()},
    final_estimator=LogisticRegression(
        class_weight=weight_dict,
        random_state=0,
        n_jobs=-1
    ),
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10
)

# Cross validation (Meta Model)
# Recompute cv_dict for THIS model. Without it, the evaluation of the best
# combination would be given the cross-validation results still held from the
# previously evaluated stacking model.
cv_dict = cross_validation(
    model,
    X_train,
    y_train,
    n_splits=5
)

In [None]:
# Wall-clock start of the evaluation; the elapsed time is printed at the end of the cell
EXEC_START = time.time()

# Evaluation
# Hands the run label, the shared metric_dict, the cross-validation results and
# both data splits to train_test_evaluation (defined elsewhere in this notebook).
train_test_evaluation(
    'Model Stacking (Base Combination)',
    metric_dict,
    cv_dict,
    model,
    X_train, y_train, X_test, y_test
)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

# Phase 10 - Classification
- Model Stacking (Meta Tuned)

In [None]:
# Settings shared by every meta-learner (LogisticRegression) grid.
# The regularisation strength C is not searched here; it is tuned in a separate,
# later search once the solver / penalty combination is fixed.
logistic_params = {
    'model__final_estimator__fit_intercept': [True, False],
    'model__final_estimator__max_iter': [1000],
    'model__final_estimator__multi_class': ['auto'],
}

# One grid per penalty so that only solver / penalty pairs LogisticRegression
# actually supports are tried.
search_params = [
    # L1: supported by liblinear and saga only (newton-cg, lbfgs and sag reject it)
    {
        **logistic_params,
        'model__final_estimator__solver': ['liblinear', 'saga'],
        'model__final_estimator__penalty': ['l1'],
    },
    # L2 with every solver except liblinear, which gets its own grid below
    {
        **logistic_params,
        'model__final_estimator__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'model__final_estimator__penalty': ['l2'],
    },
    # L2 with liblinear, the only combination for which `dual` applies
    {
        **logistic_params,
        'model__final_estimator__solver': ['liblinear'],
        'model__final_estimator__penalty': ['l2'],
        'model__final_estimator__dual': [True, False],
    },
    # Elastic-net: saga only, with the L1/L2 mixing ratio searched
    {
        **logistic_params,
        'model__final_estimator__solver': ['saga'],
        'model__final_estimator__penalty': ['elasticnet'],
        'model__final_estimator__l1_ratio': np.linspace(.1, .9, 9),
    }
]

In [None]:
# Find best solver combination (low cardinality)
# The search keys carry a 'model__' pipeline prefix; removing it turns them into
# StackingClassifier keyword arguments.
model = StackingClassifier(
    **{k.replace('model__', ''): v for k,v in best_estimators.items()},
    final_estimator=LogisticRegression(
        class_weight=weight_dict,
        random_state=0,
        n_jobs=-1
    ),
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10
)

# Single predefined split, expressed as row POSITIONS in the concatenated data:
# rows [0, n_train) come from the training set, rows [n_train, n_train + n_test)
# from the test set. The positions are generated with np.arange instead of being
# read from X_train.index / X_test.index, because index labels only coincide with
# positions when both frames carry a fresh 0..n-1 RangeIndex
# (assuming cv_search forwards `cv` to a scikit-learn search, which indexes by position).
n_train, n_test = len(X_train), len(X_test)
holdout_split = [(np.arange(n_train), np.arange(n_train, n_train + n_test))]

# NOTE(review): candidates are scored on X_test, so test-set metrics reported
# afterwards for the selected setting are optimistically biased.

# Cross validation search
search = cv_search(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True),
    Pipeline(steps=[
        ('model', model),
    ], verbose=True),
    search_params,
    cv=holdout_split
)

In [None]:
# cv_results unpacks the finished search into a results table and the winning
# parameter setting (presumably a dict keyed by 'model__final_estimator__...'
# names — confirm against cv_results).
result_df, best_params = cv_results(search)

result_df

In [None]:
# Display the winning meta-learner setting from the solver / penalty search
best_params

In [None]:
# Find best C value corresponding to solver combination (high cardinality)
search_params = [
    {
        # Pin every previously selected setting to its single best value ...
        **{k: [v] for k,v in best_params.items()},
        # ... and search only the regularisation strength
        'model__final_estimator__C': [.001, .01, .1, 1, 10, 100, 1000],
    }
]

# The search keys carry a 'model__' pipeline prefix; removing it turns them into
# StackingClassifier keyword arguments.
model = StackingClassifier(
    **{k.replace('model__', ''): v for k,v in best_estimators.items()},
    final_estimator=LogisticRegression(
        class_weight=weight_dict,
        random_state=0,
        n_jobs=-1
    ),
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10
)

# Single predefined split, expressed as row POSITIONS in the concatenated data:
# rows [0, n_train) come from the training set, rows [n_train, n_train + n_test)
# from the test set. The positions are generated with np.arange instead of being
# read from X_train.index / X_test.index, because index labels only coincide with
# positions when both frames carry a fresh 0..n-1 RangeIndex
# (assuming cv_search forwards `cv` to a scikit-learn search, which indexes by position).
n_train, n_test = len(X_train), len(X_test)
holdout_split = [(np.arange(n_train), np.arange(n_train, n_train + n_test))]

# NOTE(review): candidates are scored on X_test, so test-set metrics reported
# afterwards for the selected setting are optimistically biased.

# Cross validation search
search = cv_search(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True),
    Pipeline(steps=[
        ('model', model),
    ], verbose=True),
    search_params,
    cv=holdout_split
)

In [None]:
# cv_results unpacks the finished search into a results table and the winning
# parameter setting; best_params is overwritten here with the outcome of the C search.
result_df, best_params = cv_results(search)

result_df

In [None]:
# Display the winning meta-learner setting from the C search
best_params

In [None]:
# Turn the search results back into constructor keyword arguments by removing
# the pipeline prefixes from their keys.
stack_kwargs = {key.replace('model__', ''): value for key, value in best_estimators.items()}
meta_kwargs  = {key.replace('model__final_estimator__', ''): value for key, value in best_params.items()}

# Stack of the selected base estimators with the tuned logistic-regression meta-learner
meta_learner = LogisticRegression(
    class_weight=weight_dict,
    random_state=0,
    n_jobs=-1,
    **meta_kwargs
)
model = StackingClassifier(
    final_estimator=meta_learner,
    cv=StratifiedKFold(5),
    n_jobs=-1,
    verbose=10,
    **stack_kwargs
)

# Cross validation (the model is wrapped in a single-step pipeline for this call)
wrapped_model = Pipeline(steps=[('model', model)], verbose=True)
cv_dict = cross_validation(wrapped_model, X_train, y_train, n_splits=5)

# Evaluation (performed on the bare model, not the pipeline wrapper)
train_test_evaluation(
    'Model Stacking (Meta Tuned)',
    metric_dict,
    cv_dict,
    model,
    X_train, y_train, X_test, y_test
)

###### Box

In [None]:
# Box plot for a selection of recorded runs: entries 0, 2 and 15-18 of metric_dict
# (presumably positions in insertion order — confirm against dict_sampling).
# range() supplies plain integers; np.linspace would produce floats (15.0, ...).
plot_score(dict_sampling(metric_dict, [0, 2] + list(range(15, 19))),
           'Phase 10 - Box - Model Stacking')

###### Confusion Matrix

In [None]:
# Confusion matrices for a selection of recorded runs: entries 0, 2 and 15-18 of
# metric_dict (presumably positions in insertion order — confirm against dict_sampling).
# range() supplies plain integers; np.linspace would produce floats (15.0, ...).
# value_counts() sorts counts in descending order by default, so [-1] picks the
# count of the least frequent class in y_test as z_max.
plot_matrix(dict_sampling(metric_dict, [0, 2] + list(range(15, 19))),
            'Phase 10 - Confusion Matrix - Model Stacking',
            z_max=y_test.value_counts().values[-1],
            max_col=3)