In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
import os
from pathlib import Path

# Pick the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Device: {device}')

# Project-relative locations of the raw data and of the saved models.
data_folder, model_folder = Path("../../data"), Path("../../models")
dataset_location = data_folder.joinpath("creditcard.csv")

# Necessary for notebook to see src
import sys
sys.path.append('../..')

from src.visualization import tsne_visualization, pca_visualization

Device: cuda


In [2]:
# Read the credit-card CSV once and keep that frame untouched;
# the cells below work on the independent deep copy `dataset`.
original_dataset = pd.read_csv(dataset_location)
dataset = original_dataset.copy(deep=True)

In [3]:
from sdv.metadata import SingleTableMetadata

# Let SDV infer the per-column sdtypes from the dataframe; the trailing
# expression displays the detected column -> sdtype mapping.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(dataset)
metadata.columns

{'Time': {'sdtype': 'numerical'},
 'V1': {'sdtype': 'numerical'},
 'V2': {'sdtype': 'numerical'},
 'V3': {'sdtype': 'numerical'},
 'V4': {'sdtype': 'numerical'},
 'V5': {'sdtype': 'numerical'},
 'V6': {'sdtype': 'numerical'},
 'V7': {'sdtype': 'numerical'},
 'V8': {'sdtype': 'numerical'},
 'V9': {'sdtype': 'numerical'},
 'V10': {'sdtype': 'numerical'},
 'V11': {'sdtype': 'numerical'},
 'V12': {'sdtype': 'numerical'},
 'V13': {'sdtype': 'numerical'},
 'V14': {'sdtype': 'numerical'},
 'V15': {'sdtype': 'numerical'},
 'V16': {'sdtype': 'numerical'},
 'V17': {'sdtype': 'numerical'},
 'V18': {'sdtype': 'numerical'},
 'V19': {'sdtype': 'numerical'},
 'V20': {'sdtype': 'numerical'},
 'V21': {'sdtype': 'numerical'},
 'V22': {'sdtype': 'numerical'},
 'V23': {'sdtype': 'numerical'},
 'V24': {'sdtype': 'numerical'},
 'V25': {'sdtype': 'numerical'},
 'V26': {'sdtype': 'numerical'},
 'V27': {'sdtype': 'numerical'},
 'V28': {'sdtype': 'numerical'},
 'Amount': {'sdtype': 'numerical'},
 'Class': {'sdt

# Train on Whole Data

In [4]:
from sdv.single_table import CTGANSynthesizer

# CTGAN over the complete dataset (both classes) for 300 epochs;
# verbose=True shows the generator/discriminator loss progress bar.
ctgan = CTGANSynthesizer(metadata, epochs=300, verbose=True)

# Long-running step: the recorded run took roughly 1h05m.
ctgan.fit(dataset)

Gen. (0.17) | Discrim. (-0.02): 100%|██████████| 300/300 [1:05:10<00:00, 13.03s/it] 


In [9]:
from src.synthesizers.save_model import save_model

# Persist the whole-data CTGAN; the call's result (the saved file path) is
# displayed as the cell output.
save_model(model=ctgan, synthesizer_type='CTGAN', dataset_name='creditcard', model_name='ctgan_default', folder_path=model_folder)

'../../models/creditcard/CTGAN/ctgan_default.pth'

In [30]:
from src.evaluation.metrics_evaluation import MetricsEvaluation
from sdv.sampling import Condition

# Number of independent synthetic samples averaged for every scenario.
N_EVAL_RUNS = 5


def average_evals(evaluator, real_data, sample_fn, eval_metadata, n_runs=N_EVAL_RUNS):
    """Average the evaluator's metrics over several freshly drawn synthetic samples.

    Parameters
    ----------
    evaluator : object exposing ``fit(real, synthetic, metadata)`` and ``get_evals()``
        The metrics evaluator; it is re-fitted on every run.
    real_data : pandas.DataFrame
        Real rows the synthetic samples are compared against.
    sample_fn : callable
        Zero-argument callable returning a new synthetic dataframe per call.
    eval_metadata : metadata object handed through to ``evaluator.fit``.
    n_runs : int
        How many samples to draw and average (must be >= 1).

    Returns
    -------
    The element-wise mean of the ``get_evals()`` results across the runs.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the mean would be undefined).
    """
    if n_runs < 1:
        raise ValueError(f'n_runs must be >= 1, got {n_runs}')

    runs = []
    for _ in range(n_runs):
        synthetic_data = sample_fn()
        evaluator.fit(real_data, synthetic_data, eval_metadata)
        runs.append(evaluator.get_evals())
    return sum(runs) / len(runs)


def sample_class(class_value, num_rows):
    """Draw ``num_rows`` synthetic rows from ``ctgan`` conditioned on ``Class == class_value``."""
    condition = Condition(column_values={'Class': class_value}, num_rows=num_rows)
    return ctgan.sample_from_conditions(conditions=[condition])


# Named `evaluator` rather than `eval` so the Python builtin is not shadowed.
evaluator = MetricsEvaluation()

metrics = pd.DataFrame()

# Mixed classes: unconditional samples compared against the full dataset.
metrics['Mixed'] = average_evals(
    evaluator, dataset, lambda: ctgan.sample(len(dataset)), metadata
)

# Per-class comparison, fraud (1) first and then normal (0): the real subset is
# filtered once and the synthetic sample is conditioned on the same class,
# with as many rows as the real subset.
# NOTE(review): in these runs 'Class' is constant in both real and synthetic
# data, which presumably explains JSD/TVT == 1.0 for the per-class columns - confirm.
for class_value in (1, 0):
    real_subset = dataset[dataset['Class'] == class_value]
    metrics[f'Class {class_value}'] = average_evals(
        evaluator,
        real_subset,
        lambda value=class_value, rows=len(real_subset): sample_class(value, rows),
        metadata,
    )

metrics

Sampling conditions: 100%|██████████| 492/492 [00:00<00:00, 1981.35it/s]
Sampling conditions: 100%|██████████| 492/492 [00:00<00:00, 2087.17it/s]
Sampling conditions: 100%|██████████| 492/492 [00:00<00:00, 2023.81it/s]
Sampling conditions: 100%|██████████| 492/492 [00:00<00:00, 1093.62it/s]
Sampling conditions: 100%|██████████| 492/492 [00:00<00:00, 1967.68it/s]
Sampling conditions: 100%|██████████| 284315/284315 [00:10<00:00, 26692.70it/s]
Sampling conditions: 100%|██████████| 284315/284315 [00:10<00:00, 26419.76it/s]
Sampling conditions: 100%|██████████| 284315/284315 [00:11<00:00, 25413.50it/s]
Sampling conditions: 100%|██████████| 284315/284315 [00:10<00:00, 26679.70it/s]
Sampling conditions: 100%|██████████| 284315/284315 [00:10<00:00, 26123.56it/s]


Unnamed: 0,Mixed,Class 1,Class 0
KST,0.871797,0.82126,0.947763
CJSD,0.860785,0.780996,0.960202
WD,0.017671,0.047505,0.003054
JSD,0.6453,1.0,1.0
TVT,0.671745,1.0,1.0


# Train Only on the Fraud Class

In [88]:
# Fraud rows only (Class == 1). The label column is dropped because it is
# constant within this subset. `.loc[...]` followed by a non-mutating `.drop()`
# builds a new frame instead of mutating a slice of `dataset` in place, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
fraud_dataset = dataset.loc[dataset['Class'] == 1].drop(columns=['Class'])

# Metadata is detected again because the column set differs from the full dataset.
fraud_metadata = SingleTableMetadata()
fraud_metadata.detect_from_dataframe(fraud_dataset)

# More epochs and explicit learning rates than the whole-data model above.
fraud_gan = CTGANSynthesizer(fraud_metadata, epochs=1200, generator_lr=5e-5, discriminator_lr=5e-5)

fraud_gan.fit(fraud_dataset)

In [89]:
# Draw as many synthetic fraud rows as there are real ones.
fraud_synhtetic_data = fraud_gan.sample(len(fraud_dataset))

from sdmetrics.single_table import KSComplement, CorrelationSimilarity
from src.metrics.js_divergence import ContinuousJSComplement
from src.metrics.wasserstein_distance import WassersteinDistance

# Score the synthetic sample against the real fraud rows with four metrics,
# each called as compute(real, synthetic, metadata).
print(f'KS: {KSComplement.compute(fraud_dataset, fraud_synhtetic_data, fraud_metadata)}')
print(f'CJS: {ContinuousJSComplement.compute(fraud_dataset, fraud_synhtetic_data, fraud_metadata)}')
print(f'WD: {WassersteinDistance.compute(fraud_dataset, fraud_synhtetic_data, fraud_metadata)}')
print(f'C: {CorrelationSimilarity.compute(fraud_dataset, fraud_synhtetic_data, fraud_metadata)}')


KS: 0.8281165311653117
CJS: 0.8160878322454286
WD: 0.045691271652630074
C: 0.8616574380872374


In [90]:
# Display the training-loss plot of the fraud-only model.
fraud_gan.get_loss_values_plot()

In [91]:
# Persist the fraud-only synthesizer. The model passed must be `fraud_gan`:
# passing `ctgan` would store the whole-data model under the 'ctgan_fraud' name.
save_model(model=fraud_gan, synthesizer_type='CTGAN', dataset_name='creditcard', model_name='ctgan_fraud', folder_path=model_folder)

'../../models/creditcard/CTGAN/ctgan_fraud.pth'

# Train on a Balanced Dataset (All Fraud Rows plus an Equal-Sized Random Subset of Class 0)

In [92]:
# Build a balanced (50/50) dataset: every fraud row plus an equally sized
# random draw of normal rows.
fraud_instances = dataset[dataset['Class'] == 1]
# A fixed random_state makes the under-sampled subset (and therefore the model
# trained on it) reproducible across kernel restarts.
normal_instances = dataset[dataset['Class'] == 0].sample(len(fraud_instances), random_state=42)

fifty_fifty_dataset = pd.concat([fraud_instances, normal_instances])
fifty_fifty_dataset

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
4920,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1
6329,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120787,75961.0,1.315385,-0.033609,-1.089996,-0.557079,2.029921,3.268133,-0.534981,0.808248,-0.052725,...,-0.313301,-1.068940,0.103788,0.992126,0.369394,0.110372,-0.028304,0.011838,1.99,0
162632,115259.0,1.950318,0.236633,-1.961433,1.049823,0.915245,-0.658555,0.750884,-0.319208,-0.531847,...,0.206640,0.584530,-0.023460,0.723351,0.451359,-0.541020,-0.040878,-0.058391,44.70,0
208620,137198.0,2.106530,-0.206361,-1.639475,-0.022334,0.543560,-0.035514,-0.017479,-0.027448,0.498432,...,-0.331499,-0.883607,0.173942,-1.467873,-0.187532,0.271703,-0.078561,-0.086361,1.98,0
137159,82038.0,0.995355,-0.541092,1.240747,0.865975,-1.351096,-0.111886,-0.753003,0.288338,0.910801,...,-0.006537,-0.096626,0.006826,0.540269,0.114556,0.285065,-0.010438,0.027015,74.99,0


In [97]:
# Metadata for the balanced frame, detected straight from its columns.
fifty_fifty_metadata = SingleTableMetadata()
fifty_fifty_metadata.detect_from_dataframe(fifty_fifty_dataset)

# CTGAN for the balanced data: 1500 epochs, both learning rates set to 5e-5.
fifty_fifty_gan = CTGANSynthesizer(
    fifty_fifty_metadata,
    epochs=1500,
    generator_lr=5e-5,
    discriminator_lr=5e-5,
)

fifty_fifty_gan.fit(fifty_fifty_dataset)

In [98]:
# Sample as many synthetic rows as the balanced dataset contains.
fifty_fifty_synthetic = fifty_fifty_gan.sample(len(fifty_fifty_dataset))

# Fit a fresh evaluator on the (real, synthetic, metadata) triple.
fifty_fifty_eval = MetricsEvaluation()
fifty_fifty_eval.fit(fifty_fifty_dataset, fifty_fifty_synthetic, fifty_fifty_metadata)

# Displays the metric scores as a pandas Series (KST, CJSD, WD, JSD, TVT).
fifty_fifty_eval.get_evals()

KST     0.865041
CJSD    0.849006
WD      0.026588
JSD     0.995688
TVT     0.993902
dtype: float64

In [99]:
# Display the training-loss plot of the balanced-data model.
fifty_fifty_gan.get_loss_values_plot()

In [100]:
# Persist the balanced-data synthesizer. The model passed must be
# `fifty_fifty_gan`: passing `ctgan` would store the whole-data model under
# the 'ctgan_fifty_fifty' name.
save_model(model=fifty_fifty_gan, synthesizer_type='CTGAN', dataset_name='creditcard', model_name='ctgan_fifty_fifty', folder_path=model_folder)

'../../models/creditcard/CTGAN/ctgan_fifty_fifty.pth'

# CTABGAN++

In [117]:
from src.synthesizers.ctabgan.ctabgan import CTABGAN

# 'Class' is the only categorical column and the classification target;
# every other column is passed as non-categorical.
target_column = 'Class'
feature_columns = [name for name in dataset.columns if name != target_column]

ctabgan = CTABGAN(
    raw_data=dataset,
    test_ratio=0.02,
    categorical_columns=[target_column],
    log_columns=[],
    mixed_columns={},
    general_columns=[],
    non_categorical_columns=feature_columns,
    integer_columns=['Time'],
    problem_type={'Classification': target_column},
)

In [None]:
# Training takes too long to finish here: the recorded progress bar stands at
# 3 of 150 iterations (~200 s each, about 8 hours remaining).
# NOTE(review): the run was presumably interrupted at that point - confirm.

ctabgan.fit()

  2%|▏         | 3/150 [10:02<8:11:49, 200.74s/it]
