In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

data_folder = Path("../data")
dataset_location = data_folder / "creditcard.csv"

# Necessary for notebook to see src
import sys
sys.path.append('..')

from src.visualization import tsne_visualization, pca_visualization

Device: cuda


In [2]:
original_dataset = pd.read_csv(dataset_location)
dataset = original_dataset.copy(deep=True)

In [3]:
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

scaler = StandardScaler()
scaler.set_output(transform='pandas')

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

predictions = log_reg.predict(X_test)

print(f'F1 Score: {f1_score(y_test, predictions)}')

F1 Score: 0.7167630057803468


In [10]:
from imblearn.under_sampling import NearMiss

near_miss = NearMiss(n_neighbors=10)

X_resampled, y_resampled = near_miss.fit_resample(X_train, y_train)

In [11]:
y_resampled.value_counts()

Class
0    394
1    394
Name: count, dtype: int64

In [12]:
from sklearn.metrics import precision_score, recall_score

log_reg_resampled = LogisticRegression()

log_reg_resampled.fit(X_resampled, y_resampled)

resampled_predictions = log_reg_resampled.predict(X_test)

print(f'F1 Score: {f1_score(y_test, resampled_predictions)}')
print(f'Precission Score: {recall_score(y_test, resampled_predictions)}')
print(f'Recall Score: {precision_score(y_test, resampled_predictions)}')

F1 Score: 0.006540267872673509
Precission Score: 0.9591836734693877
Recall Score: 0.003281320906203093


In [13]:
from sklearn.metrics import precision_score, recall_score
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()

xgb_classifier.fit(X_resampled, y_resampled)

resampled_predictions = xgb_classifier.predict(X_test)

print(f'F1 Score: {f1_score(y_test, resampled_predictions)}')
print(f'Precission Score: {recall_score(y_test, resampled_predictions)}')
print(f'Recall Score: {precision_score(y_test, resampled_predictions)}')

F1 Score: 0.0036796085200007586
Precission Score: 0.9897959183673469
Recall Score: 0.001843230403800475
