In [None]:
import lib._util.visualplot as vp
import lib._util.mlpipe as mlpipe

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np
from tqdm import tqdm

# Scikit-Learn
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import StackingClassifier

# Imbalanced-Learn
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

# Plotly
import plotly.express as px

# Constant Variable

In [None]:
SOURCE_PATH_DATA = 'resources/data/'
OUT_PATH_GRAPH   = 'resources/output/graph/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/mlg-ulb/creditcardfraud/home
- Time: Number of seconds elapsed between this transaction and the first transaction in the dataset
- V1-V28: Anonymised features, presumably the result of a PCA dimensionality reduction applied to protect user identities and sensitive features
- Amount: Transaction amount
- Class: 1 for fraudulent transactions, 0 otherwise

In [None]:
df_chunks = pd.read_csv(f'{SOURCE_PATH_DATA}creditcard.csv', sep=',', chunksize=50_000)
data_df   = pd.concat(df_chunks)

data_df.shape

In [None]:
data_df.head()

In [None]:
vp.faststat(data_df)

In [None]:
vp.value_count(data_df, 'Class')

###### Histogram

In [None]:
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 2048})

###### Box

In [None]:
vp.box(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - Box',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

###### KDE

In [None]:
vp.kde(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - KDE',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

# Phase 2 - Data Preparation
- Remove duplicated data

In [None]:
duplicate_removal = DFDuplicateRemoval(target='Class', keep='mean')
duplicate_removal.fit(data_df)

# Observe duplicated data
duplicate_df = duplicate_removal.duplicate_df

duplicate_df

In [None]:
vp.value_count(duplicate_df, 'Class')

In [None]:
# Observe if duplicated data are having different target label
vp.value_count(
    duplicate_df.groupby(duplicate_removal.subset).agg(
        Class=('Class', 'mean')
    ).reset_index(),
    'Class'
)

In [None]:
# Remove duplicated data
data_df = duplicate_removal.transform(data_df)

data_df.shape

In [None]:
vp.value_count(data_df, 'Class')

# Phase 3 - Classification
- Baseline

In [None]:
metric_dict = {
    'macro_precision': [],
    'macro_recall':    [],
    'macro_f1':        [],
    'roc_auc':         [],
    'pr_auc':          [],
    'cv_score':        [],
    'method':          [],
}

In [None]:
# Separate features & target
X, y = mlpipe.xy_split(data_df, 'Class')

vp.value_count(y.to_frame(), 'Class')
X.shape

In [None]:
# Separate dataset
X_train, X_test, y_train, y_test = mlpipe.dataset_split(X, y, test_size=.3, stratify=y, random_state=0)

print('Train dataset:\n-----------------------')
vp.value_count(y_train.to_frame(), 'Class')
print('\nTest dataset:\n----------------------')
vp.value_count(y_test.to_frame(), 'Class')

In [None]:
def cross_validation(estimator, X, y):
    """Cross-validate ``estimator`` on ``(X, y)`` and return the fold scores.

    Parameters
    ----------
    estimator : estimator or pipeline forwarded unchanged to ``cross_val_score``.
    X : features to cross-validate on.
    y : target labels matching ``X``.

    Returns
    -------
    Whatever ``cross_val_score`` returns: one ROC AUC score per fold.
    """
    # Fixed evaluation protocol shared by every experiment in the notebook:
    # ROC AUC over 10 stratified folds, run on all available cores.
    cv_options = {
        'scoring': 'roc_auc',
        'cv': StratifiedKFold(10),
        'verbose': 10,
        'n_jobs': -1,
    }
    return cross_val_score(estimator, X, y, **cv_options)

def train_test_evaluation(method, metric_dict, cv_scores,
                          model, X_train, y_train, X_test, y_test,
                          train_pipeline, test_pipeline=None):
    """Fit ``model`` on the training data, score it on the test data and log the metrics.

    Parameters
    ----------
    method : label stored under ``metric_dict['method']`` for this run.
    metric_dict : dict of lists; one entry is appended to each of the keys
        ``method``, ``cv_score``, ``roc_auc``, ``pr_auc``, ``macro_precision``,
        ``macro_recall`` and ``macro_f1``. Mutated in place.
    cv_scores : cross-validation scores recorded as-is under ``cv_score``.
    model : classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.
    train_pipeline : pre-processing pipeline; ``fit_transform`` is applied to ``X_train``.
    test_pipeline : pipeline whose ``transform`` is applied to ``X_test``.
        Defaults to ``train_pipeline`` when ``None``.

    Returns
    -------
    None. Results are written into ``metric_dict``.
    """
    # Model training
    model.fit(
        train_pipeline.fit_transform(X_train),
        y_train
    )

    # Evaluate test data
    if test_pipeline is None:
        test_pipeline = train_pipeline

    # Transform the test set once and reuse it for both the hard predictions
    # and the probabilities (previously the transform ran twice).
    X_test_prepared = test_pipeline.transform(X_test)

    eval_dict = mlpipe.eval_classif(
        y_test,
        model.predict(X_test_prepared),
        # Last probability column, i.e. the last entry of the model's classes
        y_prob=model.predict_proba(X_test_prepared)[:, -1],
        return_evaluation=True
    )

    macro_avg = eval_dict['report']['macro avg']

    metric_dict['method'].append(method)
    metric_dict['cv_score'].append(cv_scores)
    metric_dict['roc_auc'].append(eval_dict['roc_auc'])
    metric_dict['pr_auc'].append(eval_dict['pr_auc'])
    metric_dict['macro_precision'].append(macro_avg['precision'])
    metric_dict['macro_recall'].append(macro_avg['recall'])
    metric_dict['macro_f1'].append(macro_avg['f1-score'])

In [None]:
# Shared pre-processing steps and baseline model. These same scaler objects are
# reused by the pipelines built in the later phases of the notebook.
standard_scaler = DFStandardScaler(columns=['Time', 'Amount'])
minmax_scaler   = DFMinMaxScaler()
model           = LogisticRegression(random_state=0, n_jobs=-1)

# Cross validation
# The scalers sit inside the pipeline handed to cross_val_score together with the model.
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

# Evaluation
# Fits the scalers and the model on the training split, scores the hold-out
# split, and appends the results to metric_dict under the label 'Baseline'.
train_test_evaluation(
    'Baseline',
    metric_dict,
    scores,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

# Phase 4 - Classification
- Class Weighting

In [None]:
def class_weight(y, normalize=False):
    """Compute scikit-learn 'balanced' class weights for the labels in ``y``.

    Parameters
    ----------
    y : array-like of class labels.
    normalize : when True, divide every weight by the sum of all weights so
        that the returned weights add up to 1.

    Returns
    -------
    dict mapping each distinct label in ``y`` to its weight.
    """
    classes = np.unique(y)
    # Pass everything by keyword: these parameters are keyword-only in current
    # scikit-learn releases, and the keyword form also works on older ones.
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
    weight_dict = dict(zip(classes, weights))

    if normalize:
        # Compute the normalisation constant once instead of once per class
        total = np.sum(weights)
        return {k: v / total for k, v in weight_dict.items()}
    return weight_dict

# Reference:
# - https://machinelearningmastery.com/cost-sensitive-neural-network-for-imbalanced-classification/?fbclid=IwAR1PcEicqDXadG9hsNE-Tf4RQQ_DpIaCV4LRcuizGbTC9Ek5PiMbB_x26bU
# - https://www.youtube.com/watch?v=D6AChZlN5m0
def class_ratio(y, rounding=None, normalize=False):
    """Weight class 1 by the majority/minority count ratio, keeping class 0 at 1.

    Parameters
    ----------
    y : object exposing ``value_counts()`` whose result is indexable by the
        labels 0 and 1 via ``.loc``.
    rounding : one of ``None``, ``'round'``, ``'ceil'`` or ``'floor'``. When
        given, the ratio is rounded that way and cast to ``int``; ``None``
        keeps the raw ratio.
    normalize : when True, divide both weights by their sum.

    Returns
    -------
    dict of the form ``{0: weight_for_class_0, 1: weight_for_class_1}``.

    Raises
    ------
    AssertionError : if ``rounding`` is not one of the accepted values.
    """
    roundings = [None, 'round', 'ceil', 'floor']
    assert rounding in roundings, f'rounding not in valid list: {roundings}'

    counts = y.value_counts()
    ratio = counts.loc[0] / counts.loc[1]

    # Map each rounding mode onto the NumPy function that implements it
    rounders = {'round': np.round, 'ceil': np.ceil, 'floor': np.floor}
    if rounding is None:
        minority_weight = ratio
    else:
        minority_weight = int(rounders[rounding](ratio))

    weight_dict = {0: 1, 1: minority_weight}
    if not normalize:
        return weight_dict

    total = np.sum(list(weight_dict.values()))
    return {label: weight / total for label, weight in weight_dict.items()}

In [None]:
class_weights = [
    ('Class Weight', class_weight(y_train)),
    ('Class Weight (Normalize)', class_weight(y_train, normalize=True)),
    ('Class Ratio', class_ratio(y_train)),
    ('Class Ratio (Normalize)', class_ratio(y_train, normalize=True)),
    ('Class Ratio Floor', class_ratio(y_train, rounding='floor')),
    ('Class Ratio Floor (Normalize)', class_ratio(y_train, rounding='floor', normalize=True)),
    ('Class Ratio Ceil', class_ratio(y_train, rounding='ceil')),
    ('Class Ratio Ceil (Normalize)', class_ratio(y_train, rounding='ceil', normalize=True)),
]

In [None]:
for method, class_weight_dict in class_weights:
    print(f'\n{method}:')
    
    model = LogisticRegression(class_weight=class_weight_dict, random_state=0, n_jobs=-1)
    
    # Cross validation
    scores = cross_validation(
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
            ('model', model),
        ], verbose=True),
        X_train,
        y_train
    )
    print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

    # Evaluation
    train_test_evaluation(
        method,
        metric_dict,
        scores,
        model,
        X_train, y_train, X_test, y_test,
        train_pipeline=Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
        ], verbose=True)
    )

# Phase 5 - Classification
- Re-sampling

In [None]:
pipelines = [
    (
        'ADASYN + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('adasyn', ADASYN(random_state=0, n_jobs=-1)),
            ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'ADASYN + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('adasyn', ADASYN(random_state=0, n_jobs=-1)),
            ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'SMOTE + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('smote_enn', SMOTEENN(random_state=0, n_jobs=-1)),
        ], verbose=True)
    ),
    (
        'SMOTE + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('smote_tomek', SMOTETomek(random_state=0, n_jobs=-1)),
        ], verbose=True)
    ),
    (
        'Borderline SMOTE + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
            ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'Borderline SMOTE + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
            ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'SVM SMOTE + ENN',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('svm_smote', SVMSMOTE(random_state=0, n_jobs=-1)),
            ('enn', EditedNearestNeighbours(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
    (
        'SVM SMOTE + Tomek',
        Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('svm_smote', SVMSMOTE(random_state=0, n_jobs=-1)),
            ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
        ], verbose=True)
    ),
]

In [None]:
# For every re-sampling recipe: cross-validate it with the re-sampler inside the
# pipeline, then fit on the re-sampled training split and score the hold-out split.
for method, resample_pipeline in pipelines:
    print(f'\n{method}:')
    
    # Cross validation
    # Extend a copy of the recipe's steps with min-max scaling and the model, so
    # the re-sampling runs inside the pipeline given to cross_val_score.
    model  = LogisticRegression(random_state=0, n_jobs=-1)
    steps = resample_pipeline.steps.copy()
    steps.append(('minmax_scaler', minmax_scaler))
    steps.append(('model', model))
    pipeline = Pipeline(steps, verbose=True)

    # Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
    scores = cross_validation(
        pipeline,
        X_train,
        y_train
    )
    print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')
    
    # Re-sampling
    # Only the training split is re-sampled; X_bal has already been through the
    # recipe's standard_scaler step.
    X_bal, y_bal = resample_pipeline.fit_resample(X_train, y_train)
    
    # Evaluation
    # train_pipeline applies only min-max scaling because X_bal is already standardised.
    # test_pipeline is only transformed, never fitted, inside train_test_evaluation.
    # NOTE(review): it appears to rely on standard_scaler having been fitted by
    # fit_resample above and minmax_scaler by train_pipeline (same objects) — confirm.
    train_test_evaluation(
        method,
        metric_dict,
        scores,
        model,
        X_bal, y_bal, X_test, y_test,
        train_pipeline=Pipeline(steps=[
            ('minmax_scaler', minmax_scaler),
        ], verbose=True),
        test_pipeline=Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
        ], verbose=True)
    )

###### Bar

In [None]:
def plot_metrics(metric_dict, title):
    """Draw a grouped bar chart comparing every recorded metric across methods.

    Parameters
    ----------
    metric_dict : dict of equally long lists; must contain the keys ``method``
        and ``cv_score``. Every other key is plotted as a metric.
    title : passed to ``vp.generate_plot`` as ``out_filename``.
    """
    eval_df = pd.DataFrame(metric_dict)
    # Collapse each run's cross-validation scores into a single plottable number
    eval_df['cv_mean'] = eval_df['cv_score'].apply(np.mean)

    metrics = [col for col in eval_df.columns if col not in ['cv_score', 'method']]

    # Long format: one (score, method, metric) row per method and metric
    long_df = pd.concat(
        [
            eval_df[[metric, 'method']]
            .rename(columns={metric: 'score'})
            .assign(metric=metric)
            for metric in metrics
        ],
        axis=0
    )

    fig = px.bar(long_df, x='metric', y='score', color='method', barmode='group')
    fig['layout']['legend_orientation'] = 'h'

    vp.generate_plot(fig, out_path=OUT_PATH_GRAPH, out_filename=title)

In [None]:
plot_metrics(metric_dict, 'Phase 5 - Bar - Metrics Comparison')

# Phase 6 - Classification
- Hyperparameter Tuning

In [None]:
def cv_search(X, y, estimator, param_distributions, n_splits=10, n_iter=100):
    """Run a randomised hyper-parameter search scored by ROC AUC.

    Parameters
    ----------
    X, y : training features and labels the search is fitted on.
    estimator : estimator or pipeline to tune.
    param_distributions : parameter space forwarded to ``RandomizedSearchCV``.
    n_splits : number of stratified folds used for each candidate.
    n_iter : number of parameter settings to sample. Defaults to 100, the value
        that used to be hard-coded, so existing calls behave as before. Callers
        with a smaller space can pass its size explicitly.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions,
        scoring='roc_auc',
        cv=StratifiedKFold(n_splits=n_splits),
        n_jobs=-1,
        verbose=10,
        n_iter=n_iter,
        random_state=0
    )
    search.fit(X, y)

    return search

### Class Weight (Normalize)
- Best performance among class weighting

In [None]:
# Reference: https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5
# Tune the regularisation parameter C of the class-weighted logistic regression
# over 20 log-spaced values between 1e-4 and 1e4, with an L2 penalty.
# NOTE(review): the search is fitted on the raw X_train with the bare model, while
# the cell that consumes search.best_params_ cross-validates and evaluates the
# model behind standard_scaler + minmax_scaler — confirm whether C should be
# tuned on the scaled features instead.
model  = LogisticRegression(random_state=0, n_jobs=-1,
                            class_weight=class_weight(y_train, normalize=True))
search = cv_search(
    X_train, y_train,
    estimator=model,
    param_distributions={
        'C': np.logspace(-4, 4, 20),
        'penalty': ['l2'],
    }
)

In [None]:
result_df = pd.DataFrame(search.cv_results_)
result_df[['params', 'mean_test_score', 'rank_test_score']].sort_values(by='mean_test_score', ascending=False)

In [None]:
search.best_params_

In [None]:
model = LogisticRegression(random_state=0, n_jobs=-1,
                           class_weight=class_weight(y_train, normalize=True),
                           **search.best_params_)

# Cross validation
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

# Evaluation
train_test_evaluation(
    'CV Search (Class Weight)',
    metric_dict,
    scores,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

### Borderline SMOTE + Tomek
- Third-best performance among the re-sampling methods
- The two best performers, SVM SMOTE + ENN and SVM SMOTE + Tomek, are too time-consuming to tune

In [None]:
model  = LogisticRegression(random_state=0, n_jobs=-1)
steps  = [
    ('standard_scaler', standard_scaler),
    ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
    ('minmax_scaler', minmax_scaler),
    ('model', model),
]
pipeline = Pipeline(steps, verbose=True)

search = cv_search(
    X_train, y_train,
    estimator=pipeline,
    param_distributions={
        'model__C': np.logspace(-4, 4, 20),
        'model__penalty': ['l2'],
    },
    n_splits=5
)

In [None]:
result_df = pd.DataFrame(search.cv_results_)
result_df[['params', 'mean_test_score', 'rank_test_score']].sort_values(by='mean_test_score', ascending=False)

In [None]:
search.best_params_

In [None]:
model = LogisticRegression(random_state=0, n_jobs=-1,
                           **{k.replace('model__', ''): v for k,v in search.best_params_.items()})

# Cross validation
steps = [
    ('standard_scaler', standard_scaler),
    ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
    ('minmax_scaler', minmax_scaler),
    ('model', model),
]
pipeline = Pipeline(steps, verbose=True)

# Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
scores = cross_validation(
    pipeline,
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

# Re-sampling
X_bal, y_bal = Pipeline(steps=[
    ('standard_scaler', standard_scaler),
    ('borderline_smote', BorderlineSMOTE(random_state=0, n_jobs=-1)),
    ('tomek', TomekLinks(n_jobs=-1, sampling_strategy='all')),
], verbose=True).fit_resample(X_train, y_train)

# Evaluation
train_test_evaluation(
    'CV Search (Borderline SMOTE + Tomek)',
    metric_dict,
    scores,
    model,
    X_bal, y_bal, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('minmax_scaler', minmax_scaler),
    ], verbose=True),
    test_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True)
)

###### Bar

In [None]:
plot_metrics(metric_dict, 'Phase 6 - Bar - Metrics Comparison')

# Phase 7 - Classification
- Model Stacking (Baseline)

In [None]:
metric_dict = {
    'macro_precision': [],
    'macro_recall':    [],
    'macro_f1':        [],
    'roc_auc':         [],
    'pr_auc':          [],
    'cv_score':        [],
    'method':          [],
}

In [None]:
def cv_base_models(estimators, X, y):
    """Cross-validate each base estimator and tabulate its score.

    Parameters
    ----------
    estimators : iterable of ``(name, estimator)`` pairs.
    X, y : features and labels passed to ``cross_validation``.

    Returns
    -------
    DataFrame indexed by estimator name (sorted, index name set to ``''``) with
    the columns ``score-m`` (mean fold score) and ``score-s`` (standard
    deviation of the fold scores), both rounded to 3 decimals.
    """
    records = []

    # Cross validation
    for name, estimator in tqdm(estimators):
        # Each candidate is scored behind the same two scaling steps
        pipeline = Pipeline(steps=[
            ('standard_scaler', standard_scaler),
            ('minmax_scaler', minmax_scaler),
            ('model', estimator),
        ], verbose=True)
        fold_scores = cross_validation(pipeline, X, y)
        records.append({
            'model':   name,
            'score-m': np.mean(fold_scores),
            'score-s': np.std(fold_scores),
        })

    eval_df = pd.DataFrame(records, columns=['model', 'score-m', 'score-s']).set_index('model')
    for column in ('score-m', 'score-s'):
        eval_df[column] = eval_df[column].round(3)
    eval_df.index.name = ''

    return eval_df.sort_index()

In [None]:
weight_dict = class_weight(y_train, normalize=True)

models = [
    OneVsRestClassifier(LinearSVC(class_weight=weight_dict, random_state=0), n_jobs=-1),
    OneVsRestClassifier(SVC(max_iter=1_000, probability=True, class_weight=weight_dict, random_state=0), n_jobs=-1),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    DummyClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    ExtraTreesClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    GradientBoostingClassifier(random_state=0),
    RandomForestClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    DecisionTreeClassifier(class_weight=weight_dict, random_state=0),
    LogisticRegression(class_weight=weight_dict, random_state=0, n_jobs=-1),
    PassiveAggressiveClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    Perceptron(class_weight=weight_dict, random_state=0, n_jobs=-1),
    RidgeClassifier(class_weight=weight_dict, random_state=0),
    SGDClassifier(class_weight=weight_dict, random_state=0, n_jobs=-1),
    MLPClassifier(random_state=0)
]

# Final model
# Stack every base model under a logistic-regression meta learner.
# Each base estimator is named after its class. OneVsRestClassifier wrappers are
# named after the estimator they wrap, so the two wrapped SVMs get distinct names.
model = StackingClassifier(
    estimators=[
        # The conditional must select between two complete (name, estimator)
        # tuples; splitting it across the tuple literal is a syntax error.
        (x.estimator.__class__.__name__, x)
        if isinstance(x, OneVsRestClassifier)
        else (x.__class__.__name__, x)
        for x in models
    ],
    final_estimator=LogisticRegression(class_weight=weight_dict, random_state=0, n_jobs=-1),
    cv=StratifiedKFold(10),
    n_jobs=-1,
    verbose=10
)

In [None]:
# Cross validation (Base Models)
eval_df = cv_base_models(model.estimators, X_train, y_train)
eval_df

In [None]:
# Cross validation (Meta Model)
scores = cross_validation(
    Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
        ('model', model),
    ], verbose=True),
    X_train,
    y_train
)
print(f'CV Score: {np.mean(scores) :.5f} ({np.std(scores) :.5f})\n')

In [None]:
# Evaluation
train_test_evaluation(
    'Model Stacking (Baseline)',
    metric_dict,
    scores,
    model,
    X_train, y_train, X_test, y_test,
    train_pipeline=Pipeline(steps=[
        ('standard_scaler', standard_scaler),
        ('minmax_scaler', minmax_scaler),
    ], verbose=True),
)

# Phase 8 - Classification
- Model Stacking (Hyperparameter Tuning)

In [None]:
# TODO