In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

data_folder = Path("../../data")
dataset_location = data_folder / "creditcard.csv"

# Necessary for notebook to see src
import sys
sys.path.append('../..')

from src.visualization import tsne_visualization, pca_visualization

Device: cuda


In [2]:
original_dataset = pd.read_csv(dataset_location)
dataset = original_dataset.copy(deep=True)

# Original Train Set

In [3]:
from src.evaluation import evaluate_binary_classification_stratifedkfold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


(f1, precision, recall), _ = evaluate_binary_classification_stratifedkfold(
    original_dataset.iloc[:, :-1],
    original_dataset.iloc[:, -1],
    XGBClassifier(device=device, verbosity=2),
    transformers=StandardScaler(),
)


print(f'F1: {f1:>10.4f}')
print(f'Precision: {precision:>10.4f}')
print(f'Recall: {recall:>10.4f}')

F1:     0.8732
Precision:     0.9546
Recall:     0.8049


In [4]:
from sklearn.linear_model import LogisticRegression

(f1, precision, recall), _ = evaluate_binary_classification_stratifedkfold(
    original_dataset.iloc[:, :-1],
    original_dataset.iloc[:, -1],
    LogisticRegression(solver='newton-cholesky'),
    transformers=StandardScaler(),
)


print(f'F1: {f1:>10.4f}')
print(f'Precision: {precision:>10.4f}')
print(f'Recall: {recall:>10.4f}')

F1:     0.7258
Precision:     0.8774
Recall:     0.6200


# Time Feature Extracted Dataset

In [None]:
# Time Feature Extracted

dataset_new = dataset.copy(deep=True)

dataset_new.insert(0, 'Hour', (dataset['Time'] //  3600) % 24) 
# dataset_new.insert(0, 'Minute', (dataset['Time'] %  60))

dataset_new.drop(columns=['Time'], inplace=True)

In [30]:
(f1, precision, recall), _ = evaluate_binary_classification_stratifedkfold(
    dataset_new.iloc[:, :-1],
    dataset_new.iloc[:, -1],
    XGBClassifier(device=device, verbosity=2),
    transformers=StandardScaler(),
)


print(f'F1: {f1:>10.4f}')
print(f'Precision: {precision:>10.4f}')
print(f'Recall: {recall:>10.4f}')

F1:     0.8671
Precision:     0.9519
Recall:     0.7968


# BIN

In [None]:
bins = [0, 50, 100, 200, 500, 1000, 5000, 10000, 50000] 
# bins = [0, 10000, 50000] 
labels = [i for i in range(len(bins) - 1)]

dataset_bin = dataset.copy(deep=True)
dataset_bin.insert(0, 'Amount Range', pd.cut(dataset['Amount'], bins=bins, labels=labels, right=False).astype(int)) 
dataset_bin.drop(columns=['Amount'], inplace=True)

In [24]:
(f1, precision, recall), _ = evaluate_binary_classification_stratifedkfold(
    dataset_bin.iloc[:, :-1],
    dataset_bin.iloc[:, -1],
    XGBClassifier(device=device, verbosity=2),
    transformers=StandardScaler(),
)

print(f'F1: {f1:>10.4f}')
print(f'Precision: {precision:>10.4f}')
print(f'Recall: {recall:>10.4f}')

F1:     0.8735
Precision:     0.9616
Recall:     0.8008


# BOTH

In [25]:
# Time Feature Extracted

dataset_both = dataset.copy(deep=True)

dataset_both.insert(0, 'Hour', (dataset['Time'] //  3600) % 24) 
dataset_both.insert(0, 'Minute', (dataset['Time'] %  60))

dataset_both.drop(columns=['Time'], inplace=True)

bins = [0, 50, 100, 200, 500, 1000, 5000, 10000, 50000] 
labels = [i for i in range(len(bins) - 1)]

dataset_both.insert(0, 'Amount Range', pd.cut(dataset['Amount'], bins=bins, labels=labels, right=False).astype(int)) 

In [None]:
(f1, precision, recall), _ = evaluate_binary_classification_stratifedkfold(
    dataset_both.iloc[:, :-1],
    dataset_both.iloc[:, -1],
    XGBClassifier(device=device, verbosity=2),
    transformers=StandardScaler(),
)

print(f'F1: {f1:>10.4f}')
print(f'Precision: {precision:>10.4f}')
print(f'Recall: {recall:>10.4f}')

F1:     0.8679
Precision:     0.9563
Recall:     0.7948


# Test Every Combination

In [52]:
import itertools



truth_table = np.array(list(itertools.product([True, False], repeat=5)), dtype=np.bool_)
truth_table

evaluations = pd.DataFrame(
    truth_table,
    columns=['Time', 'Amount', 'Amount Bin', 'Time Hour', 'Time Min']
)

bins = [0, 50, 100, 200, 500, 1000, 5000, 10000, 50000] 
labels = [i for i in range(len(bins) - 1)]

f1_list = []
precision_list = []
recall_list = []

for idx in range(len(evaluations)): 

    temp_dataset = dataset.copy(deep=True)
    
    print(f'Iteration {idx + 1}')
    
    if evaluations.loc[idx, 'Amount Bin']:
        temp_dataset.insert(0, 'Amount Range', pd.cut(dataset['Amount'], bins=bins, labels=labels, right=False).astype(int)) 
    
    if evaluations.loc[idx, 'Time Hour']:
        temp_dataset.insert(0, 'Hour', (dataset['Time'] //  3600) % 24) 
    
    if evaluations.loc[idx, 'Time Min']:
        temp_dataset.insert(0, 'Minute', (dataset['Time'] %  60))    

    if not evaluations.loc[idx, 'Time']:
        temp_dataset.drop(columns=['Time'], inplace=True)

    if not evaluations.loc[idx, 'Amount']:
        temp_dataset.drop(columns=['Amount'], inplace=True)
        
    (f1, precision, recall), _ = evaluate_binary_classification_stratifedkfold(
        temp_dataset.iloc[:, :-1],
        temp_dataset.iloc[:, -1],
        XGBClassifier(device=device, verbosity=2),
        transformers=StandardScaler(),
    )
    
    f1_list.append(f1)
    precision_list.append(precision)
    recall_list.append(recall)
    

evaluations['F1 Score'] = f1_list 
evaluations['Precission Score'] = precision_list 
evaluations['Recall Score'] = recall_list 

evaluations

Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32


Unnamed: 0,Time,Amount,Amount Bin,Time Hour,Time Min,F1 Score,Precission Score,Recall Score
0,True,True,True,True,True,0.866877,0.953888,0.794723
1,True,True,True,True,False,0.868075,0.954243,0.796805
2,True,True,True,False,True,0.872865,0.956846,0.802865
3,True,True,True,False,False,0.873516,0.961579,0.800825
4,True,True,False,True,True,0.864699,0.951422,0.792703
5,True,True,False,True,False,0.867119,0.951924,0.796805
6,True,True,False,False,True,0.870076,0.958905,0.796764
7,True,True,False,False,False,0.873178,0.954628,0.804906
8,True,False,True,True,True,0.865043,0.949331,0.794723
9,True,False,True,True,False,0.86963,0.952067,0.800866


In [None]:
# Save Evaluations

evaluations.to_csv('credit_card_columns.csv')