In [None]:
#Data manipulation Libraries
import numpy as np
import pandas as pd 
import copy
from collections import Counter

#Preprocessing Libraries
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler

#Model Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from VAE import VAE_oversampling
from GAN import GAN
from CV import *
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Seed the global random number generators (Python's `random` and NumPy)
# with one shared value so steps that draw from them are repeatable.
import random

seed = 42
random.seed(seed)
np.random.seed(seed)

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')

# Preview the first five rows; as the last expression of the cell it is
# rendered as a table.
df.head(n=5)

### Train / test split (validation is done later via cross-validation on the training set)

In [None]:
# Separate the feature columns from the target column.
X = df.drop(columns=['Class'])
y = df['Class']

# Hold out 20% of the rows as the test set; the remaining 80% are used for
# training and cross-validation. The split is stratified on the target so both
# parts keep the class ratio, and random_state is pinned so re-running this
# cell on its own always yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=seed
)

# Check the number of records and the class balance of each part.
# Label 0 is reported as the majority class and label 1 as the minority class.
train_counts = Counter(y_train)
test_counts = Counter(y_test)
print('The number of records in the training dataset is', X_train.shape[0])
print('The number of records in the test dataset is', X_test.shape[0])
print(f"The training dataset has {train_counts[0]} records for the majority class and {train_counts[1]} records for the minority class.")
print(f"The test dataset has {test_counts[0]} records for the majority class and {test_counts[1]} records for the minority class.")

### Pre-processing

In [None]:
# Standardise the features. The scaler is fitted on the training set only and
# the same fitted transform is then applied to the test set.
scaler = StandardScaler()

# fit_transform / transform return new arrays, so X_train and X_test can be
# passed directly without a defensive deep copy. The original row index is
# kept; column labels are not passed, so the resulting frames get default
# integer column names.
X_train_processed = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index)
X_test_processed = pd.DataFrame(scaler.transform(X_test), index=X_test.index)

In [None]:
# Resample the (unscaled) training set with imbalanced-learn's SMOTE over-sampler.
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Fit a fresh scaler on the resampled training data, then apply it to the
# resampled training set and to the original test set.
scaler = StandardScaler().fit(X_train_smote)

# NOTE: the spelling `procesed` is kept because a later cell reads this exact name.
X_train_procesed_smote = pd.DataFrame(
    scaler.transform(X_train_smote),
    index=X_train_smote.index,
)
X_test_processed_smote = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
)

# Show the class counts after resampling.
counter = Counter(y_train_smote)
print(counter)

In [None]:
# Resample the (unscaled) training set with imbalanced-learn's ADASYN over-sampler.
adasyn = ADASYN(random_state=1)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# Learn the scaling statistics from the resampled training data only.
scaler = StandardScaler()
scaler.fit(X_train_adasyn)

# Apply the fitted scaler to both sets, keeping each frame's row index.
X_train_processed_adasyn = pd.DataFrame(scaler.transform(X_train_adasyn), index=X_train_adasyn.index)
X_test_processed_adasyn = pd.DataFrame(scaler.transform(X_test), index=X_test.index)

# Show the class counts after resampling.
counter = Counter(y_train_adasyn)
print(counter)


## Baseline: scaled features, no oversampling

In [None]:
# Run the project's model_cv helper (from the CV module) with a fresh random
# forest on the training data. Presumably this performs cross-validation -- see CV.py.
model_cv(X_train, y_train, RandomForestClassifier())

print("\n############### Evaluate model ###############")
# Train a new forest on the standardised training set, then score it on the
# standardised test set.
rf = RandomForestClassifier().fit(X_train_processed, y_train)

evaluate_model(X_test_processed, y_test, rf)

## With SMOTE

In [None]:
# Run the project's model_cv helper in its 'smote' mode with a fresh random
# forest. What the mode does is defined in CV.py and is not visible in this notebook.
model_cv(X_train, y_train, RandomForestClassifier(), 'smote')

print("\n############### Evaluate model ###############")
# Train a new forest on the SMOTE-resampled, standardised training set and
# score it on the test set scaled with the same scaler.
rf = RandomForestClassifier().fit(X_train_procesed_smote, y_train_smote)

evaluate_model(X_test_processed_smote, y_test, rf)

## With ADASYN

In [None]:
# Run the project's model_cv helper in its 'adasyn' mode with a fresh random
# forest. What the mode does is defined in CV.py and is not visible in this notebook.
model_cv(X_train, y_train, RandomForestClassifier(), 'adasyn')

print("\n############### Evaluate model ###############")
# Train a new forest on the ADASYN-resampled, standardised training set and
# score it on the test set scaled with the same scaler.
rf = RandomForestClassifier().fit(X_train_processed_adasyn, y_train_adasyn)

evaluate_model(X_test_processed_adasyn, y_test, rf)

## Data Augmentation with VAE

In [None]:
# Cross-validation of a random forest trained on VAE-augmented folds.
# StratifiedKFold and the metric functions are not imported explicitly in this
# notebook; presumably they come from `from CV import *` -- verify.
skf = StratifiedKFold(n_splits=5)

# Per-fold scores on the held-out fold.
lst_accuracy = []
lst_precision = []
lst_recall = []
lst_f1_score = []
lst_roc_auc_score = []

# Per-fold scores on the augmented training data.
lst_accuracy_train = []
lst_precision_train = []
lst_recall_train = []
lst_f1_score_train = []
lst_roc_auc_score_train = []

# Each metric function is paired with the list it feeds, so every fold is
# scored the same way without repeating one append line per metric.
# NOTE(review): ROC AUC is computed from hard class predictions
# (model.predict), not from predicted probabilities.
valid_metrics = (
    (accuracy_score, lst_accuracy),
    (precision_score, lst_precision),
    (recall_score, lst_recall),
    (f1_score, lst_f1_score),
    (roc_auc_score, lst_roc_auc_score),
)
train_metrics = (
    (accuracy_score, lst_accuracy_train),
    (precision_score, lst_precision_train),
    (recall_score, lst_recall_train),
    (f1_score, lst_f1_score_train),
    (roc_auc_score, lst_roc_auc_score_train),
)

model = RandomForestClassifier()

# Target share of class-1 rows after augmentation; with 0.5 the formula below
# reduces to (majority count - minority count).
frac = 0.5

for count, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    print("------------------------ KFold:", count+1, "---------------------------")
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

    # Class counts of this fold: label 0 is reported as majority, label 1 as minority.
    train_fold_counts = Counter(y_train_fold)
    valid_fold_counts = Counter(y_valid_fold)
    print(f"The training dataset has {train_fold_counts[0]} records for the majority class and {train_fold_counts[1]} records for the minority class.")
    print(f"The test dataset has {valid_fold_counts[0]} records for the majority class and {valid_fold_counts[1]} records for the minority class.")

    maj = train_fold_counts[0]
    mino = train_fold_counts[1]
    num_samples = round(1/(1/frac - 1) * maj - mino)

    print("NUmber of samples to be generated: ", num_samples)

    # Variational Oversampling
    vos = VAE_oversampling(hidden_dim=32,
                            latent_dim=10,
                            original_dim=30,
                            minority_class_id=1,
                            epochs=100,
                            batch_size=1,
                            num_samples_to_generate = num_samples,
                            random_state=0,
                            optimizer="adam")

    # Fit the VAE oversampling model on the training fold and get the new data set.
    X_res_val, y_res_val = vos.fit_sample(X_train_fold, y_train_fold)

    # Scale with statistics from the augmented training fold only. Fold-specific
    # names are used so the notebook-level X_train_processed DataFrame built in
    # the pre-processing cell is not overwritten by a per-fold array.
    std = StandardScaler()
    X_train_fold_processed = std.fit_transform(X_res_val)
    X_valid_fold_processed = std.transform(X_valid_fold)

    model.fit(X_train_fold_processed, y_res_val)

    train_predictions = model.predict(X_train_fold_processed)
    y_pred_test = model.predict(X_valid_fold_processed)

    for metric_fn, fold_scores in valid_metrics:
        fold_scores.append(metric_fn(y_valid_fold, y_pred_test))

    for metric_fn, fold_scores in train_metrics:
        fold_scores.append(metric_fn(y_res_val, train_predictions))


In [None]:
# Report the mean and standard deviation of every metric over the CV folds,
# first for the held-out folds and then for the training data.
cv_reports = (
    ('############ Validation #############', (
        ("Accuracy", lst_accuracy),
        ("Precision", lst_precision),
        ("Recall", lst_recall),
        ("F1 score", lst_f1_score),
        ("ROC_AUC", lst_roc_auc_score),
    )),
    ('############ Training #############', (
        ("Accuracy", lst_accuracy_train),
        ("Precision", lst_precision_train),
        ("Recall", lst_recall_train),
        ("F1 score", lst_f1_score_train),
        ("ROC_AUC", lst_roc_auc_score_train),
    )),
)

for banner, metric_rows in cv_reports:
    print(banner)
    for label, fold_values in metric_rows:
        # Every metric uses the same "<label>: <mean> (+/- <std>)" layout.
        print(f"{label}: {np.mean(fold_values):0.6f} (+/- {np.std(fold_values):0.6f})")

In [None]:
# Final VAE experiment: augment the whole training set, fit a random forest
# and score it on the held-out test set.
rf = RandomForestClassifier()

# Number of synthetic rows to request. `frac` is the target share of class-1
# rows after augmentation; with 0.5 the formula reduces to maj - mino.
maj = len(y_train[y_train == 0])
mino = len(y_train[y_train == 1])
frac = 0.5
num_samples = round(1/(1/frac - 1) * maj - mino)

vos = VAE_oversampling(hidden_dim=32,
                        latent_dim=10,
                        original_dim=30,
                        minority_class_id=1,
                        epochs=100,
                        batch_size=1,
                        num_samples_to_generate = num_samples,
                        random_state = 0,
                        optimizer="adam")

# Fit the VAE oversampling model and get the new data set.
X_res_val, y_res_val = vos.fit_sample(X_train, y_train)

# Scale with statistics from the augmented training data only. VAE-specific
# names are used so the X_train_processed / X_test_processed DataFrames built
# in the pre-processing cell are not overwritten by arrays of a different
# kind and length.
std = StandardScaler()
X_train_processed_vae = std.fit_transform(X_res_val)
X_test_processed_vae = std.transform(X_test)

rf.fit(X_train_processed_vae, y_res_val)

train_predictions = rf.predict(X_train_processed_vae)
print("################# Training Results ########################")
model_scores(y_res_val, train_predictions)

evaluate_model(X_test_processed_vae, y_test, rf)

## Data Augmentation with GAN

In [None]:
# Cross-validation of a random forest trained on GAN-augmented folds.
# StratifiedKFold and the metric functions are not imported explicitly in this
# notebook; presumably they come from `from CV import *` -- verify.
skf = StratifiedKFold(n_splits=5)

# Per-fold scores on the held-out fold.
lst_accuracy = []
lst_precision = []
lst_recall = []
lst_f1_score = []
lst_roc_auc_score = []

# Each metric function is paired with the list it feeds, so every fold is
# scored the same way without repeating one append line per metric.
# NOTE(review): ROC AUC is computed from hard class predictions
# (model.predict), not from predicted probabilities.
valid_metrics = (
    (accuracy_score, lst_accuracy),
    (precision_score, lst_precision),
    (recall_score, lst_recall),
    (f1_score, lst_f1_score),
    (roc_auc_score, lst_roc_auc_score),
)

model = RandomForestClassifier()

# Target share of class-1 rows after augmentation; with 0.5 the formula below
# reduces to (majority count - minority count).
frac = 0.5

for count, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    print("------------------------ KFold:", count+1, "---------------------------")
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

    # Class counts of this fold: label 0 is reported as majority, label 1 as minority.
    train_fold_counts = Counter(y_train_fold)
    valid_fold_counts = Counter(y_valid_fold)
    print(f"The training dataset has {train_fold_counts[0]} records for the majority class and {train_fold_counts[1]} records for the minority class.")
    print(f"The test dataset has {valid_fold_counts[0]} records for the majority class and {valid_fold_counts[1]} records for the minority class.")

    maj = train_fold_counts[0]
    mino = train_fold_counts[1]
    num_samples = round(1/(1/frac - 1) * maj - mino)

    print("NUmber of samples to be generated: ", num_samples)

    gan = GAN(generator_output_dim=30,
              discriminator_input_dim=30,
              noise_dim=100,
              num_samples=num_samples,
              epochs=100,
              batch_size=24,
              dropout=0.4)

    # Fit the GAN oversampling model on the training fold and get the new data set.
    X_res_val_gan, y_res_val_gan = gan.fit_sample(X_train_fold, y_train_fold)

    # Scale with statistics from the augmented training fold only. Fold-specific
    # names are used so the notebook-level X_train_processed DataFrame built in
    # the pre-processing cell is not overwritten by a per-fold array.
    std = StandardScaler()
    X_train_fold_processed = std.fit_transform(X_res_val_gan)
    X_valid_fold_processed = std.transform(X_valid_fold)

    model.fit(X_train_fold_processed, y_res_val_gan)

    y_pred_test = model.predict(X_valid_fold_processed)

    for metric_fn, fold_scores in valid_metrics:
        fold_scores.append(metric_fn(y_valid_fold, y_pred_test))


In [None]:
# Report the mean and standard deviation of every metric over the CV folds.
cv_metric_rows = (
    ("Accuracy", lst_accuracy),
    ("Precision", lst_precision),
    ("Recall", lst_recall),
    ("F1 score", lst_f1_score),
    ("ROC_AUC", lst_roc_auc_score),
)

for label, fold_values in cv_metric_rows:
    # Every metric uses the same "<label>: <mean> (+/- <std>)" layout.
    print(f"{label}: {np.mean(fold_values):0.6f} (+/- {np.std(fold_values):0.6f})")

In [None]:
# Final GAN experiment: augment the whole training set, fit a random forest
# and score it on the held-out test set.
rf = RandomForestClassifier()

# Number of synthetic rows to request. `frac` is the target share of class-1
# rows after augmentation; with 0.5 the formula reduces to maj - mino.
maj = len(y_train[y_train == 0])
mino = len(y_train[y_train == 1])
frac = 0.5
num_samples = round(1/(1/frac - 1) * maj - mino)

gan = GAN(generator_output_dim=30,
          discriminator_input_dim=30,
          noise_dim=100,
          num_samples=num_samples,
          epochs=100,
          batch_size=24,
          dropout=0.4)

# Fit the GAN oversampling model and get the new data set.
X_res_val_gan, y_res_val_gan = gan.fit_sample(X_train, y_train)

# Scale with statistics from the augmented training data only. GAN-specific
# names are used so the X_train_processed / X_test_processed DataFrames built
# in the pre-processing cell are not overwritten by arrays of a different
# kind and length.
std = StandardScaler()
X_train_processed_gan = std.fit_transform(X_res_val_gan)
X_test_processed_gan = std.transform(X_test)

rf.fit(X_train_processed_gan, y_res_val_gan)

train_predictions = rf.predict(X_train_processed_gan)
print("################# Training Results ########################")
model_scores(y_res_val_gan, train_predictions)

evaluate_model(X_test_processed_gan, y_test, rf)