In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')

# Both locations are relative paths, so they resolve against the notebook's
# current working directory.
data_folder = Path("../../data")
dataset_location = data_folder.joinpath("creditcard.csv")

# Necessary for notebook to see src
import sys
sys.path.append('../..')

from src.visualization import tsne_visualization, pca_visualization

Device: cuda


In [2]:
# Read the CSV once and keep that frame untouched as the reference copy.
original_dataset = pd.read_csv(dataset_location)

# Work on an independent copy (DataFrame.copy is deep by default) so later
# steps cannot modify the frame that was read from disk.
dataset = original_dataset.copy()

# Train Test Split

In [3]:
from sklearn.model_selection import train_test_split

# Positional selection: every column except the last is a feature and the
# last column is the label.
features = dataset.iloc[:, :-1]
labels = dataset.iloc[:, -1]

# Shuffled 80/20 split with a fixed seed; stratifying on the labels keeps the
# class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

In [4]:
# Re-attach the training labels to the training features as one extra column,
# giving a single frame that holds features and label together.
train_label_frame = y_train.to_frame()
X_original = pd.concat((X_train, train_label_frame), axis="columns")

# Synthesizer Model

In [5]:
from sdv.metadata import SingleTableMetadata

# Start from empty single-table metadata and let SDV detect the column types
# from the training frame (features plus label column).
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(X_original)
# Bare last expression: the detected column -> sdtype mapping is shown as the
# cell output so it can be checked before training the synthesizer.
metadata.columns

{'Time': {'sdtype': 'numerical'},
 'V1': {'sdtype': 'numerical'},
 'V2': {'sdtype': 'numerical'},
 'V3': {'sdtype': 'numerical'},
 'V4': {'sdtype': 'numerical'},
 'V5': {'sdtype': 'numerical'},
 'V6': {'sdtype': 'numerical'},
 'V7': {'sdtype': 'numerical'},
 'V8': {'sdtype': 'numerical'},
 'V9': {'sdtype': 'numerical'},
 'V10': {'sdtype': 'numerical'},
 'V11': {'sdtype': 'numerical'},
 'V12': {'sdtype': 'numerical'},
 'V13': {'sdtype': 'numerical'},
 'V14': {'sdtype': 'numerical'},
 'V15': {'sdtype': 'numerical'},
 'V16': {'sdtype': 'numerical'},
 'V17': {'sdtype': 'numerical'},
 'V18': {'sdtype': 'numerical'},
 'V19': {'sdtype': 'numerical'},
 'V20': {'sdtype': 'numerical'},
 'V21': {'sdtype': 'numerical'},
 'V22': {'sdtype': 'numerical'},
 'V23': {'sdtype': 'numerical'},
 'V24': {'sdtype': 'numerical'},
 'V25': {'sdtype': 'numerical'},
 'V26': {'sdtype': 'numerical'},
 'V27': {'sdtype': 'numerical'},
 'V28': {'sdtype': 'numerical'},
 'Amount': {'sdtype': 'numerical'},
 'Class': {'sdt

In [6]:
from sdv.single_table import CTGANSynthesizer

# CTGAN synthesizer built from the detected metadata, set to 300 epochs;
# verbose=True is what produces the progress line in the recorded output.
ctgan = CTGANSynthesizer(metadata, epochs=300, verbose=True)

# Fit on the combined training frame (features + label column).
# NOTE: this is the slow step — the recorded run took about 57 minutes.
ctgan.fit(X_original) 

Gen. (-0.34) | Discrim. (-0.27): 100%|██████████| 300/300 [57:01<00:00, 11.40s/it]


In [11]:
from src.synthesizers import save_model

# Persist the fitted synthesizer through the project helper. The call is the
# cell's last expression, so its return value is displayed; in the recorded
# run that was the path '../../models/creditcard/CTGAN/main.pth'.
save_model(ctgan, synthesizer_type='CTGAN', dataset_name='creditcard', folder_path="../../models", model_name='main')

'../../models/creditcard/CTGAN/main.pth'

In [6]:
from sdv.single_table import CTGANSynthesizer

# Reload the previously saved synthesizer instead of retraining it.
#
# The loaded value is used as a full CTGANSynthesizer object (its methods are
# called later), not as a tensor state_dict, so it has to be unpickled with
# weights_only=False. The flag is passed explicitly: relying on torch.load's
# implicit default triggers a warning on current PyTorch releases and fails
# outright on releases where the default is weights_only=True.
#
# SECURITY: weights_only=False performs a full unpickle, which can execute
# arbitrary code. Only load checkpoint files that this project produced.
synthesizer: CTGANSynthesizer = torch.load(
    '../../models/creditcard/CTGAN/main.pth',
    weights_only=False,
)

  synthesizer: CTGANSynthesizer = torch.load('../../models/creditcard/CTGAN/main.pth')


In [7]:
import sdv

# Boolean masks selecting the rows of each class in the training frame.
is_class_0 = X_original['Class'] == 0
is_class_1 = X_original['Class'] == 1

# Rows to synthesize: the gap between the Class == 0 and Class == 1 counts, so
# that adding this many Class == 1 rows evens out the two classes.
num_rows_to_generate = len(X_original[is_class_0]) - len(X_original[is_class_1])

# Ask the synthesizer for exactly that many rows with Class fixed to 1.
minority_condition = sdv.sampling.Condition(
    column_values={'Class': 1},
    num_rows=num_rows_to_generate,
)
minority_class = synthesizer.sample_from_conditions(conditions=[minority_condition])

Sampling conditions: 100%|██████████| 227057/227057 [00:16<00:00, 13630.32it/s]


In [15]:
# Stack the sampled Class == 1 rows underneath the real training rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each input keeps its own index labels, so the result can contain repeated
# labels, which makes any later label-based selection or alignment ambiguous.
synhtetic_dataset = pd.concat([X_original, minority_class], axis=0, ignore_index=True)

# Positional split: every column except the last is a feature and the last
# column is the label.
X_synhtetic, y_synhtetic = synhtetic_dataset.iloc[:, :-1], synhtetic_dataset.iloc[:, -1]

# Train Set

In [19]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

# Baseline classifier: trained on the unmodified training split only.
original_model = XGBClassifier(device=device, verbosity=2)
original_model.fit(X_train, y_train)

# Score the baseline on the held-out test split.
prediction = original_model.predict(X_test)

for metric_name, metric_fn in (
    ('F1', f1_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
):
    print(f'{metric_name}: {metric_fn(y_true=y_test, y_pred=prediction):>10.4f}')

F1:     0.8449
Precision:     0.8876
Recall:     0.8061


In [20]:
# Second classifier with the same settings, trained on the training rows plus
# the synthetic Class == 1 rows.
synhtetic_model = XGBClassifier(device=device, verbosity=2)
synhtetic_model.fit(X_synhtetic, y_synhtetic)

# Evaluated on the same held-out test split as the baseline, so the printed
# scores are directly comparable.
prediction = synhtetic_model.predict(X_test)

for metric_name, metric_fn in (
    ('F1', f1_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
):
    print(f'{metric_name}: {metric_fn(y_true=y_test, y_pred=prediction):>10.4f}')

F1:     0.6052
Precision:     0.4740
Recall:     0.8367
