In [None]:
import lib._util.visualplot as vp
import lib._util.mlpipe as mlpipe

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np
from tqdm import tqdm

# Scikit-Learn
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.dummy import DummyClassifier

from sklearn.gaussian_process.kernels import RBF
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier

# Imbalanced-Learn
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

# Plotly
import plotly.express as px
import plotly.figure_factory as ff

# Constant Variable

In [None]:
# Directory the input CSV is read from; used via plain string concatenation, hence the trailing slash
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the plotting helpers as `out_path`
OUT_PATH_GRAPH   = 'resources/output/graph/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/mlg-ulb/creditcardfraud/home
- Time: Number of seconds elapsed between this transaction and the first transaction in the dataset
- V1-V28: Possibly the result of a PCA dimensionality reduction, applied to protect user identities and sensitive features
- Amount: Transaction amount
- Class: 1 for fraudulent transactions, 0 otherwise

In [None]:
# Read the CSV in chunks of 50,000 rows, then concatenate the chunks into a single frame
df_chunks = pd.read_csv(f'{SOURCE_PATH_DATA}creditcard.csv', sep=',', chunksize=50_000)
data_df   = pd.concat(df_chunks)

# Cell output: (rows, columns) of the loaded data
data_df.shape

In [None]:
# Preview the first rows of the loaded data
data_df.head()

In [None]:
# NOTE(review): presumably a quick per-column summary of the frame — confirm in lib._util.visualplot
vp.faststat(data_df)

In [None]:
# Inspect the balance of the target label (Class: 1 = fraudulent, 0 = otherwise)
vp.value_count(data_df, 'Class')

###### Histogram

In [None]:
# Histogram plot of the raw data; the output location is OUT_PATH_GRAPH
# NOTE(review): `max_col=4` presumably means four subplots per row — confirm in lib._util.visualplot
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 2048})

###### Box

In [None]:
# Box plot of the raw data, coloured by the target label so both classes can be compared
vp.box(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - Box',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

###### KDE

In [None]:
# KDE plot of the raw data, coloured by the target label so both classes can be compared
vp.kde(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - KDE',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

# Phase 2 - Data Preparation
- Remove duplicated data

In [None]:
# Fit the project's duplicate detector on the full dataset.
# NOTE(review): presumably rows count as duplicates when their features match, and
# `keep='mean'` controls how the 'Class' label of a duplicate group is resolved —
# confirm in lib._class.DFDuplicateRemoval
duplicate_removal = DFDuplicateRemoval(target='Class', keep='mean')
duplicate_removal.fit(data_df)

# Observe duplicated data
duplicate_df = duplicate_removal.duplicate_df

# Cell output: the duplicated rows found by the fitted detector
duplicate_df

In [None]:
# Target label balance among the duplicated rows only
vp.value_count(duplicate_df, 'Class')

In [None]:
# Observe if duplicated data are having different target label
vp.value_count(
    duplicate_df.groupby(duplicate_removal.subset).agg(
        Class=('Class', 'mean')
    ).reset_index(),
    'Class'
)

In [None]:
# Remove duplicated data
data_df = duplicate_removal.transform(data_df)

data_df.shape

In [None]:
# Target label balance after the duplicate-removal transform
vp.value_count(data_df, 'Class')

# Phase 3 - Classification
- Baseline

In [None]:
# Per-experiment accumulators filled by train_test_evaluation(): one entry per
# evaluated method in each list, so the three lists stay aligned by position.
metric_dict = {key: [] for key in ('cv_score', 'matrix', 'method')}

In [None]:
# Separate features & target
X, y = mlpipe.xy_split(data_df, 'Class')

vp.value_count(y.to_frame(), 'Class')
X.shape

In [None]:
# Separate dataset
X_train, X_test, y_train, y_test = mlpipe.dataset_split(X, y, test_size=.3, stratify=y, random_state=0)

print('Train dataset:\n-----------------------')
vp.value_count(y_train.to_frame(), 'Class')
print('\nTest dataset:\n----------------------')
vp.value_count(y_test.to_frame(), 'Class')

In [None]:
def cross_validation(estimator, X, y):
    """Score ``estimator`` with 10-fold stratified cross-validation.

    Parameters
    ----------
    estimator : estimator or pipeline accepted by ``cross_val_score``.
    X : feature data.
    y : target labels (used for stratification and scoring).

    Returns
    -------
    The per-fold ROC AUC scores returned by ``cross_val_score``.
    """
    ten_fold = StratifiedKFold(10)
    fold_scores = cross_val_score(estimator, X, y,
                                  scoring='roc_auc',
                                  cv=ten_fold,
                                  verbose=10,
                                  n_jobs=-1)
    return fold_scores

def train_test_evaluation(method, metric_dict, cv_scores,
                          model, X_train, y_train, X_test, y_test,
                          train_pipeline, test_pipeline=None):
    """Fit ``model`` on the transformed training data and record its test evaluation.

    Parameters
    ----------
    method : label of the experiment, stored under ``metric_dict['method']``.
    metric_dict : dict of lists, mutated in place. One entry is appended to each of
        'method', 'cv_score' and 'matrix'; a missing key is created on the fly, so
        accumulators that were declared without one of them still work.
    cv_scores : cross-validation scores, stored unchanged under 'cv_score'.
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training data; ``X_train`` goes through
        ``train_pipeline.fit_transform`` before fitting.
    X_test, y_test : hold-out data used for the evaluation.
    train_pipeline : pipeline fitted on (and applied to) the training features.
    test_pipeline : pipeline applied to ``X_test`` via ``transform`` only; defaults
        to ``train_pipeline``. It is not fitted here, so its steps must already be
        fitted when a separate pipeline is passed.

    Returns
    -------
    None. Results are appended to ``metric_dict``.
    """
    # Model training
    model.fit(
        train_pipeline.fit_transform(X_train),
        y_train
    )

    # Evaluate test data (transform once and reuse for labels and probabilities)
    if test_pipeline is None:
        test_pipeline = train_pipeline
    X_test_transformed = test_pipeline.transform(X_test)

    eval_dict = mlpipe.eval_classif(
        y_test,
        model.predict(X_test_transformed),
        # Probability of the last class, i.e. the positive label in the binary case
        y_prob=model.predict_proba(X_test_transformed)[:, -1],
        return_evaluation=True
    )

    # Resolve every value before mutating, so a failure cannot leave the lists ragged
    matrix = eval_dict['matrix']

    metric_dict.setdefault('method', []).append(method)
    metric_dict.setdefault('cv_score', []).append(cv_scores)
    metric_dict.setdefault('matrix', []).append(matrix)

In [None]:
# Module-level scaler instances; the same objects are reused by every later phase
standard_scaler = DFStandardScaler(columns=['Time', 'Amount'])
minmax_scaler   = DFMinMaxScaler()
model           = LogisticRegression(random_state=0, n_jobs=-1)

# Cross validation (scalers live inside the pipeline passed to the cross-validation helper)
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

# Evaluation (fits the scalers and the model on the full train split, scores on the test split)
train_test_evaluation(
    'Baseline',
    metric_dict,
    scores,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

# Phase 4 - Classification
- Class Weighting

In [None]:
# (label, class_weight value) pairs to compare; each value is later passed as
# LogisticRegression(class_weight=...).
# NOTE(review): the exact weighting formulas live in lib._util.mlpipe — confirm there
class_weights = [
    ('Class Weight', mlpipe.class_weight(y_train)),
    ('Class Weight (Normalize)', mlpipe.class_weight(y_train, normalize=True)),
    ('Class Ratio', mlpipe.class_ratio(y_train)),
    ('Class Ratio (Normalize)', mlpipe.class_ratio(y_train, normalize=True)),
    ('Class Ratio Floor', mlpipe.class_ratio(y_train, rounding='floor')),
    ('Class Ratio Floor (Normalize)', mlpipe.class_ratio(y_train, rounding='floor', normalize=True)),
    ('Class Ratio Ceil', mlpipe.class_ratio(y_train, rounding='ceil')),
    ('Class Ratio Ceil (Normalize)', mlpipe.class_ratio(y_train, rounding='ceil', normalize=True)),
]

In [None]:
# Repeat the baseline experiment once per class-weighting scheme
for method, class_weight_dict in class_weights:
    print(f'\n{method}:')
    
    # Fresh model per scheme so every run starts unfitted
    model = LogisticRegression(random_state=0, n_jobs=-1,
                               class_weight=class_weight_dict)
    
    # Cross validation
    scores = cross_validation(
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
            ('model', model),
        ], verbose=True),
        X_train,
        y_train
    )
    print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

    # Evaluation (appends this scheme's results to metric_dict)
    train_test_evaluation(
        method,
        metric_dict,
        scores,
        model,
        X_train, y_train, X_test, y_test,
        train_pipeline=Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
        ], verbose=True)
    )

# Phase 5 - Classification
- Re-sampling

In [None]:
# (label, re-sampling pipeline) pairs to compare. Every pipeline standard-scales first,
# then over-samples (ADASYN / SMOTE / Borderline SMOTE / SVM SMOTE) and cleans with
# either Edited Nearest Neighbours or Tomek links. The min-max scaler and the model
# are appended later, in the loop that consumes this list.
pipelines = [
    (
        'ADASYN + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('adasyn', ADASYN(random_state=0, n_jobs=-1)),
            ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'ADASYN + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('adasyn', ADASYN(random_state=0, n_jobs=-1)),
            ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'SMOTE + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('smote_enn', SMOTEENN(random_state=0, n_jobs=-1)),
        ], verbose=True)
    ),
    (
        'SMOTE + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('smote_tomek', SMOTETomek(random_state=0, n_jobs=-1)),
        ], verbose=True)
    ),
    (
        'Borderline SMOTE + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
            ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'Borderline SMOTE + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
            ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'SVM SMOTE + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            # max_iter caps the inner SVC's training iterations
            ('svm_smote', SVMSMOTE(svm_estimator=SVC(class_weight='balanced', max_iter=1000, random_state=0),
                                   random_state=0, n_jobs=-1)),
            ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'SVM SMOTE + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('svm_smote', SVMSMOTE(svm_estimator=SVC(class_weight='balanced', max_iter=1000, random_state=0),
                                   random_state=0, n_jobs=-1)),
            ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
]

In [None]:
# Repeat the experiment once per re-sampling pipeline
for method, resample_pipeline in pipelines:
    print(f'\n{method}:')
    
    # Cross validation
    model = LogisticRegression(random_state=0, n_jobs=-1)
    # Copy the step list so the appended scaler/model do not leak into `resample_pipeline`
    steps = resample_pipeline.steps.copy()
    steps.append(('minmax_scaler', minmax_scaler))
    steps.append(('model', model))

    # Re-sampling is part of the cross-validated pipeline rather than applied up front.
    # Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
    scores = cross_validation(
        Pipeline(steps, verbose=True),
        X_train,
        y_train
    )
    print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')
    
    # Re-sampling (this also fits the shared `standard_scaler` instance on X_train)
    X_bal, y_bal = resample_pipeline.fit_resample(X_train, y_train)
    
    # Evaluation
    # X_bal is already standard-scaled, so training only adds the min-max scaler.
    # The test pipeline is only `transform`-ed, so it relies on the shared scaler
    # instances having been fitted by the two steps above.
    train_test_evaluation(
        method,
        metric_dict,
        scores,
        model,
        X_bal, y_bal, X_test, y_test,
        train_pipeline=Pipeline(steps=[
            ('minmax_scaler', minmax_scaler),
        ], verbose=True),
        test_pipeline=Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
        ], verbose=True)
    )

###### Box

In [None]:
def plot_metrics(metric_dict, title):
    """Draw one box per method from the cross-validation scores in ``metric_dict``.

    Parameters
    ----------
    metric_dict : dict of equally long lists; the 'method' and 'cv_score'
        entries are plotted.
    title : title forwarded to the plotting helper.
    """
    eval_df = pd.DataFrame(metric_dict)

    # Long format: one row per fold score, labelled with the method it belongs to
    per_method_frames = [
        pd.DataFrame({'method': method_name, 'cv_score': fold_scores})
        for method_name, fold_scores in zip(eval_df['method'], eval_df['cv_score'])
    ]
    long_df = pd.concat(per_method_frames, axis=0)

    box_options = {
        'boxmean': 'sd',
        'boxpoints': False,
    }
    vp.box(long_df,
           color='method',
           max_col=1,
           title=title,
           out_path=OUT_PATH_GRAPH,
           layout_kwargs={'showlegend': False},
           box_kwargs=box_options)

In [None]:
# Compare the CV score spread of every method evaluated so far (baseline, class weighting, re-sampling)
plot_metrics(metric_dict, 'Phase 5 - Box - CV Score')

###### Confusion Matrix

In [None]:
def plot_matrix(metric_dict, title, z_max=None):
    """Plot every confusion matrix in ``metric_dict`` as a grid of annotated heatmaps.

    Parameters
    ----------
    metric_dict : dict whose 'matrix' list holds one confusion-matrix DataFrame
        per method and whose 'method' list supplies the subplot titles.
    title : title forwarded to the plotting helper.
    z_max : upper bound of the colour scale (``None`` leaves it to plotly).
    """
    heatmap_traces = []
    all_annotations = ()

    for position, confusion_df in enumerate(metric_dict['matrix']):
        # Flip the row order so the rendered heatmap mirrors the dataframe layout
        flipped_df = confusion_df.sort_index(ascending=False)

        heatmap = ff.create_annotated_heatmap(
            z=flipped_df.values,
            x=[f'Pred {label}' for label in flipped_df.columns],
            y=[f'True {label}' for label in flipped_df.index],
            colorscale='Portland',
            zmin=0,
            zmax=z_max
        )
        heatmap_traces.append(heatmap['data'])

        # Re-anchor the cell annotations to this subplot's axes:
        # the first subplot uses 'x'/'y', later ones 'x2'/'y2', 'x3'/'y3', ...
        axis_suffix = position + 1 if position != 0 else ''
        cell_notes = heatmap['layout']['annotations']
        for note in cell_notes:
            note['xref'] = f'x{axis_suffix}'
            note['yref'] = f'y{axis_suffix}'
        all_annotations += cell_notes

    layout_options = {
        'height': 1500,
        'annotations': all_annotations,
    }
    vp.datagroups_subplots(heatmap_traces,
                           xaxis_titles=metric_dict['method'],
                           max_col=4,
                           title=title,
                           out_path=OUT_PATH_GRAPH,
                           layout_kwargs=layout_options)

In [None]:
# Cap the colour scale at the last entry of y_test.value_counts(); value_counts() sorts
# descending by default, so that is the count of the rarest class in the test split
plot_matrix(metric_dict, 'Phase 5 - Confusion Matrix', z_max=y_test.value_counts().values[-1])

# Phase 6 - Classification
- Hyperparameter Tuning

In [None]:
def cv_search(X, y, estimator, param_distributions, n_splits=10):
    """Run a randomized hyperparameter search scored by cross-validated ROC AUC.

    Parameters
    ----------
    X, y : training features and labels.
    estimator : estimator or pipeline to tune.
    param_distributions : search space passed to ``RandomizedSearchCV``.
    n_splits : number of stratified folds (default 10).

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object. ``refit=False`` means no final
    estimator is retrained on the full data.
    """
    search_options = {
        'scoring': 'roc_auc',
        'cv': StratifiedKFold(n_splits=n_splits),
        'n_jobs': -1,
        'verbose': 10,
        'n_iter': 100,
        'random_state': 0,
        'refit': False,
    }
    randomized_search = RandomizedSearchCV(estimator, param_distributions, **search_options)
    randomized_search.fit(X, y)

    return randomized_search

### Class Weight (Normalize)
- Best performance among class weighting

In [None]:
# Reference: https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5
model  = LogisticRegression(random_state=0, n_jobs=-1,
                            class_weight=mlpipe.class_weight(y_train, normalize=True))
search = cv_search(
    X_train, y_train,
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    param_distributions={
        'model__C': np.logspace(-4, 4, 20),
        'model__penalty': ['l2'],
    }
)

In [None]:
# Tabulate the search results, best mean test score first
result_df = pd.DataFrame(search.cv_results_)
result_df[['params', 'mean_test_score', 'rank_test_score']].sort_values(by='mean_test_score', ascending=False)

In [None]:
# Best parameter combination found by the search (keys still carry the 'model__' prefix)
search.best_params_

In [None]:
# Rebuild the model with the best parameters; the 'model__' pipeline prefix is stripped
# so the keys match LogisticRegression's own keyword arguments
model = LogisticRegression(random_state=0, n_jobs=-1,
                           class_weight=mlpipe.class_weight(y_train, normalize=True),
                           **{k.replace('model__', ''): v for k,v in search.best_params_.items()})

# Cross validation
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

# Evaluation (appends the tuned run to metric_dict)
train_test_evaluation(
    'CV Search (Class Weight)',
    metric_dict,
    scores,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

### Borderline SMOTE + Tomek
- 3rd best performance among re-sampling
- The best performance comes from SVM SMOTE + ENN and SVM SMOTE + Tomek, but both are too time-consuming to tune

In [None]:
# Tune C with the re-sampling steps inside the searched pipeline
model  = LogisticRegression(random_state=0, n_jobs=-1)
search = cv_search(
    X_train, y_train,
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
        ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    param_distributions={
        'model__C': np.logspace(-4, 4, 20),
        'model__penalty': ['l2'],
    },
    # 5 folds here instead of cv_search's default of 10
    n_splits=5
)

In [None]:
# Tabulate the search results, best mean test score first
result_df = pd.DataFrame(search.cv_results_)
result_df[['params', 'mean_test_score', 'rank_test_score']].sort_values(by='mean_test_score', ascending=False)

In [None]:
# Best parameter combination found by the search (keys still carry the 'model__' prefix)
search.best_params_

In [None]:
# Rebuild the model with the best parameters; the 'model__' pipeline prefix is stripped
# so the keys match LogisticRegression's own keyword arguments
model = LogisticRegression(random_state=0, n_jobs=-1,
                           **{k.replace('model__', ''): v for k,v in search.best_params_.items()})

# Cross validation (re-sampling is part of the cross-validated pipeline)
# Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
        ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

# Re-sampling (this also fits the shared `standard_scaler` instance on X_train)
X_bal, y_bal = Pipeline(steps=[
    ('standard_scaler', standard_scaler),
    ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
], verbose=True).fit_resample(X_train, y_train)

# Evaluation
# X_bal is already standard-scaled, so training only adds the min-max scaler; the test
# pipeline is only `transform`-ed and relies on the shared scalers being fitted above
train_test_evaluation(
    'CV Search (Borderline SMOTE + Tomek)',
    metric_dict,
    scores,
    model,
    X_bal, y_bal, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('minmax_scaler', minmax_scaler),
    ], verbose=True),
    test_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

###### Box

In [None]:
# Same box plot as Phase 5; metric_dict now also contains the two tuned runs
plot_metrics(metric_dict, 'Phase 6 - Box - CV Score')

###### Confusion Matrix

In [None]:
# Cap the colour scale at the last entry of y_test.value_counts(); value_counts() sorts
# descending by default, so that is the count of the rarest class in the test split
plot_matrix(metric_dict, 'Phase 6 - Confusion Matrix', z_max=y_test.value_counts().values[-1])

# Phase 7 - Classification
- Model Stacking (Baseline)

In [None]:
# Fresh accumulators for the stacking phases (replaces the Phase 3-6 results).
# NOTE(review): train_test_evaluation() only records 'method', 'cv_score' and
# 'matrix'; nothing visible in this notebook fills the other metric lists.
metric_dict = {
    key: []
    for key in (
        'macro_precision',
        'macro_recall',
        'macro_f1',
        'roc_auc',
        'pr_auc',
        'cv_score',
        'method',
    )
}

In [None]:
def cv_base_models(estimators, X, y):
    """Cross-validate each base estimator on its own and summarise the fold scores.

    Parameters
    ----------
    estimators : iterable of ``(name, estimator)`` pairs.
    X, y : training features and labels.

    Returns
    -------
    DataFrame indexed by estimator name (sorted, index name ``''``) with the mean
    ('score-m') and standard deviation ('score-s') of the fold scores, rounded
    to 3 decimals.
    """
    names, means, stds = [], [], []

    # Cross validation: each estimator is scored behind the same two scalers
    for name, estimator in tqdm(estimators):
        scaled_pipeline = Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
            ('model', estimator),
        ], verbose=True)
        fold_scores = cross_validation(scaled_pipeline, X, y)

        names.append(name)
        means.append(np.mean(fold_scores))
        stds.append(np.std(fold_scores))

    summary_df = pd.DataFrame({
        'model':   names,
        'score-m': means,
        'score-s': stds,
    }).set_index('model')

    for column in ('score-m', 'score-s'):
        summary_df[column] = summary_df[column].round(3)
    summary_df.index.name = ''

    return summary_df.sort_index()

In [None]:
# Class weights shared by every estimator below that accepts `class_weight`
weight_dict = mlpipe.class_weight(y_train, normalize=True)

# Base estimators of the stack
models = [
    OneVsRestClassifier(LinearSVC(class_weight=weight_dict, random_state=0), n_jobs=-1),
    OneVsRestClassifier(SVC(max_iter=1000, probability=True, class_weight=weight_dict, random_state=0), n_jobs=-1),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    DummyClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    ExtraTreesClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    GradientBoostingClassifier(random_state=0),
    RandomForestClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    DecisionTreeClassifier(class_weight=weight_dict, random_state=0),
    LogisticRegression(class_weight=weight_dict, random_state=0, n_jobs=-1),
    PassiveAggressiveClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    Perceptron(class_weight=weight_dict, random_state=0, n_jobs=-1),
    RidgeClassifier(class_weight=weight_dict, random_state=0),
    SGDClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    MLPClassifier(random_state=0)
]

# Final model
model = StackingClassifier(
    estimators=[
        # Each entry is a (name, estimator) pair. One-vs-rest wrappers are named after
        # the estimator they wrap, so the two wrapped SVMs get distinct names.
        (x.estimator.__class__.__name__, x) if isinstance(x, OneVsRestClassifier)
        else (x.__class__.__name__, x)
        for x in models
    ],
    final_estimator=LogisticRegression(class_weight=weight_dict, random_state=0, n_jobs=-1),
    cv=StratifiedKFold(10),
    n_jobs=-1,
    verbose=10
)

In [None]:
# Cross validation (Base Models)
eval_df = cv_base_models(model.estimators, X_train, y_train)
eval_df

In [None]:
# Cross validation (Meta Model)
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

In [None]:
# Evaluation
train_test_evaluation(
    'Model Stacking (Baseline)',
    metric_dict,
    scores,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True),
)

# Phase 8 - Classification
- Model Stacking (Hyperparameter Tuning)

In [None]:
# TODO

# Phase 9 - Classification
- Model Stacking (Combination)

In [None]:
# TODO
# Reference: https://towardsdatascience.com/stacking-classifiers-for-higher-predictive-performance-566f963e4840#:~:text=What%20is%20Stacking%3F,any%20classifier%20of%20your%20choice.