In [None]:
import lib._util.visualplot as vp

# Pre-processing
from lib._class.DFDuplicateRemoval import DFDuplicateRemoval

# Feature scaling
from lib._class.DFRobustScaler import DFRobustScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# Scikit-Learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

# Plotly
import plotly.express as px

# Constants

In [None]:
# Directory holding the input CSV. The trailing slash is required because the
# file name is appended by plain string formatting when the data is loaded.
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the plotting helpers as `out_path`.
OUT_PATH_GRAPH   = 'resources/output/graph/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/mlg-ulb/creditcardfraud/home
- Time: Number of seconds elapsed between this transaction and the first transaction in the dataset
- V1-V28: Principal components produced by a PCA transformation; the original features are withheld to protect user identities and sensitive information
- Amount: Transaction amount
- Class: 1 for fraudulent transactions, 0 otherwise

In [None]:
# Read the CSV in 50,000-row chunks and concatenate the chunks into one frame.
# 'Class' is loaded as str here and cast to int later, before modelling
# (presumably so the plotting helpers treat it as a categorical colour key).
# nrows=None reads the whole file; set an integer to load only the first rows.
df_chunks = pd.read_csv(f'{SOURCE_PATH_DATA}creditcard.csv', sep=',', chunksize=50_000,
                        dtype={'Class': str},
                        nrows=None)
data_df   = pd.concat(df_chunks)

# (rows, columns) of the loaded frame
data_df.shape

In [None]:
# Preview the first rows to sanity-check column names and values.
data_df.head()

In [None]:
# Summary statistics from the project helper (output format is defined in lib._util.visualplot).
vp.faststat(data_df)

###### Histogram

In [None]:
# Histograms of the columns of data_df on a 2048 px tall canvas.
# NOTE(review): max_col=4 presumably means four subplots per row and out_path the
# export directory; bin_algo='count' is interpreted by vp.histogram — confirm there.
vp.histogram(data_df,
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 2048})

###### Box

In [None]:
# Box plots of the columns of data_df, coloured by the 'Class' label.
# 'Class' is still a string column at this point, as loaded above.
# NOTE(review): max_col / out_path semantics are defined by vp.box — confirm there.
vp.box(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - Box',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

###### KDE

In [None]:
# Kernel density estimates of the columns of data_df, coloured by the 'Class' label.
# NOTE(review): max_col / out_path semantics are defined by vp.kde — confirm there.
vp.kde(data_df,
       color='Class',
       max_col=4,
       title='Phase 1 - KDE',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'height': 2048,
           'legend_orientation': 'h'
       })

# Phase 2 - Feature Engineering
- Time & Amount features

In [None]:
# Reference: https://www.kaggle.com/miguelniblock/optimizing-imbalanced-classification-100-recall
# 'Time' is in seconds since the first transaction. Converting it to a timedelta
# and taking the `hours` component yields 0-23; whole days are carried by a
# separate `days` component and are therefore dropped here.
data_df['Hour'] = pd.to_timedelta(data_df['Time'], unit='s').dt.components.hours

###### Histogram

In [None]:
# Distribution of 'Hour', one facet column per 'Class' value.
fig = px.histogram(data_df, x='Hour', facet_col='Class')
# NOTE(review): generate_plot presumably renders the figure and exports it to
# out_path under out_filename — confirm in lib._util.visualplot.
vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename='Phase 2 - Histogram - Hour')

In [None]:
# Distribution of the raw 'Time' column, one facet column per 'Class' value.
fig = px.histogram(data_df, x='Time', facet_col='Class')
# NOTE(review): generate_plot presumably renders the figure and exports it to
# out_path under out_filename — confirm in lib._util.visualplot.
vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename='Phase 2 - Histogram - Time')

###### Box

In [None]:
# Box plots restricted to the time/amount features, coloured by 'Class'.
# 'Class' is included in the selection because it is the colour key.
vp.box(data_df[['Time', 'Hour', 'Amount', 'Class']],
       color='Class',
       max_col=2,
       title='Phase 2 - Box',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'legend_orientation': 'h'
       })

###### KDE

In [None]:
# Density plots restricted to the time/amount features, coloured by 'Class'.
# 'Class' is included in the selection because it is the colour key.
vp.kde(data_df[['Time', 'Hour', 'Amount', 'Class']],
       color='Class',
       max_col=2,
       title='Phase 2 - KDE',
       out_path=OUT_PATH_GRAPH,
       layout_kwargs={
           'legend_orientation': 'h'
       })

In [None]:
# Binary flag: 1 when the transaction falls in hours 0-4, the window in which
# the plots above show fewer non-fraud transactions.
data_df['LTE_4_Hour'] = np.where(data_df['Hour'] <= 4, 1, 0)

# Binary flag: 1 for the first 100,000 seconds, where 'Time' looks roughly
# normally distributed.
data_df['LTE_100K_Time'] = np.where(data_df['Time'] <= 100_000, 1, 0)

# The label was loaded as str for plotting; make it numeric for modelling.
data_df['Class'] = data_df['Class'].astype(int)

# Move the label to the last column so that all features precede it.
data_df = data_df[[col for col in data_df.columns if col != 'Class'] + ['Class']]

###### Correlation Matrix

In [None]:
# Correlation matrix over data_df. 'Class' was cast to int in the previous cell,
# so it is numeric here.
# NOTE(review): absolute=True / matrix_type='upper' presumably plot |r| for the
# upper triangle only — confirm in vp.corrmat.
vp.corrmat(data_df,
           absolute=True,
           matrix_type='upper',
           title='Phase 2 - Correlation Matrix',
           out_path=OUT_PATH_GRAPH,
           heatmap_kwargs={
               'reversescale': True
           })

# Phase 3 - Data Preparation
- Remove duplicated data

In [None]:
# Fit the duplicate detector on the full frame, with 'Class' declared as the target column.
# NOTE(review): fit() evidently populates `duplicate_df` and `subset`, which the
# following cells read — confirm in lib._class.DFDuplicateRemoval.
duplicate_removal = DFDuplicateRemoval(target='Class')
duplicate_removal.fit(data_df)

In [None]:
# Inspect the rows that the fitted remover recorded as duplicates
duplicate_df = duplicate_removal.duplicate_df

duplicate_df

In [None]:
# Distribution of 'Class' among the rows flagged as duplicates.
vp.value_count(duplicate_df, 'Class')

In [None]:
# Check whether identical feature rows carry conflicting labels: within each
# group of duplicates the mean of 'Class' is exactly 0 or 1 when all copies
# agree, and strictly between the two when they disagree.
duplicate_df.groupby(duplicate_removal.subset)['Class'].mean().describe()

In [None]:
# Remove duplicated data by keeping 1st record
# NOTE: data_df is rebound to the de-duplicated frame, so every later cell works
# on the reduced data.
data_df = duplicate_removal.transform(data_df)

# (rows, columns) after de-duplication
data_df.shape

In [None]:
# Summary statistics again, now on the de-duplicated frame.
vp.faststat(data_df)

# Phase 4 - Classification
- Separate features & target
- Split dataset into train and test sets
- Feature scaling
- Classification

In [None]:
# Target is the 'Class' column; features are every other column, in their
# existing order.
y = data_df['Class']
X = data_df.drop(columns='Class')

# Class balance of the full, de-duplicated data set
vp.value_count(y.to_frame(), 'Class')

In [None]:
def dataset_split(X, y, random_state=None):
    """Stratified 80/20 train/test split with freshly numbered indexes.

    Parameters
    ----------
    X : features; must support ``reset_index`` (pandas DataFrame/Series).
    y : target; also used as the stratification key, so both splits keep the
        class proportions of ``y``.
    random_state : forwarded to ``train_test_split`` to make the shuffle
        reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each with its original index
        dropped and replaced by a 0..n-1 index.
    """
    parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

    # Row labels inherited from the source frame are discarded for each part.
    X_train, X_test, y_train, y_test = (part.reset_index(drop=True) for part in parts)

    return X_train, X_test, y_train, y_test

In [None]:
# Separate dataset
# random_state=0 pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = dataset_split(X, y, random_state=0)

# Class counts per split. The split is stratified on y, so both splits should
# show the same class proportions.
print('Train dataset:')
vp.value_count(y_train.to_frame(), 'Class')
print('\nTest dataset:')
vp.value_count(y_test.to_frame(), 'Class')

In [None]:
# Feature scaling
# Step 1 robust-scales only 'Time', 'Amount' and 'Hour'; step 2 applies min-max scaling.
# NOTE(review): DFMinMaxScaler() is built without a column list — presumably it
# scales every column; confirm in lib._class.DFMinMaxScaler.
robust_scaler = DFRobustScaler(columns=['Time', 'Amount', 'Hour'])
minmax_scaler = DFMinMaxScaler()

steps = [
    ('robust_scaler', robust_scaler),
    ('minmax_scaler', minmax_scaler),
]
pipeline = Pipeline(steps)

# Fit on the training split only and reuse the fitted pipeline on the test
# split, so no test-set statistics leak into the scalers.
# NOTE: this overwrites X_train / X_test. Re-running this cell without first
# re-running the split cell would scale already-scaled data.
X_train = pipeline.fit_transform(X_train)
X_test  = pipeline.transform(X_test)

In [None]:
def eval_classif(X, y, model):
    """Print a confusion matrix and a classification report for ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : features passed to ``model.predict``.
    y : ground-truth labels for ``X``.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    None
        Output is printed: the confusion matrix (rows labelled 'True',
        columns labelled 'Pred'), a blank line, then the classification
        report with 5 decimal digits.
    """
    predictions = model.predict(X)

    # Name both axes so the printed table shows which side is truth vs. prediction.
    matrix = pd.DataFrame(confusion_matrix(y, predictions)).rename_axis(index='True', columns='Pred')

    print(matrix, end='\n\n')
    print(classification_report(y, predictions, digits=5))

### Baseline Model

In [None]:
# Baseline: 100-tree random forest with no class weighting; the seed is fixed
# so the fitted forest is reproducible. fit() returns the estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

In [None]:
# Report the confusion matrix and classification report on both splits;
# a large train/test gap would point to overfitting.
print('Train dataset:')
eval_classif(X_train, y_train, model)

print('\nTest dataset:')
eval_classif(X_test, y_test, model)

### Class Weight Model

In [None]:
# Per-class weights from scikit-learn's 'balanced' heuristic:
#   n_samples / (n_classes * count_of_class)
# The arguments are passed by keyword on purpose: positional use was deprecated
# in scikit-learn 0.24 and raises TypeError from 1.0 onwards, while the keyword
# form works on every version.
classes           = np.unique(y_train)
weights           = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Map each class label to its weight, in the form RandomForestClassifier expects.
class_weight_dict = dict(zip(classes, weights))

class_weight_dict

In [None]:
# Same forest as the baseline, but samples are weighted per class using the
# weights in class_weight_dict. fit() returns the estimator itself.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    class_weight=class_weight_dict,
).fit(X_train, y_train)

In [None]:
# Report the confusion matrix and classification report of the class-weighted
# model on both splits.
print('Train dataset:')
eval_classif(X_train, y_train, model)

print('\nTest dataset:')
eval_classif(X_test, y_test, model)

### Class Ratio Model

In [None]:
# Reference:
# - https://machinelearningmastery.com/cost-sensitive-neural-network-for-imbalanced-classification/
# - https://www.youtube.com/watch?v=D6AChZlN5m0
# Class 0 keeps weight 1; class 1 is weighted by the ratio of class-0 to
# class-1 counts in the training labels.
n_class0          = y_train.value_counts().loc[0]
n_class1          = y_train.value_counts().loc[1]
class_weight_dict = {0: 1, 1: n_class0 / n_class1}

class_weight_dict

In [None]:
# Same forest again, this time with the count-ratio weights held in
# class_weight_dict. fit() returns the estimator itself.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    class_weight=class_weight_dict,
).fit(X_train, y_train)

In [None]:
# Report the confusion matrix and classification report of the class-ratio
# model on both splits.
print('Train dataset:')
eval_classif(X_train, y_train, model)

print('\nTest dataset:')
eval_classif(X_test, y_test, model)