In [125]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Read the raw transactions table from disk.
data = pd.read_csv('creditcard.csv')

# Flatten the frame into a float32 matrix (float32 is what the PyTorch code below works with).
raw_matrix = data.to_numpy()
dataset = raw_matrix.astype(np.float32)

# Robust-scale columns 0 and 29 in place; all parameters are spelled out explicitly.
# NOTE(review): presumably these are the Time and Amount columns — confirm against the CSV header.
# NOTE(review): the scaler is fit on the full matrix before any train/test split happens.
scaler = RobustScaler(
    with_centering=True,
    with_scaling=True,
    quantile_range=(25.0, 75.0),
    unit_variance=False,
)
scaled_columns = [0, 29]
dataset[:, scaled_columns] = scaler.fit_transform(dataset[:, scaled_columns])

In [126]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
data_train, data_test = train_test_split(
    dataset,
    test_size=0.3,
    random_state=42,
)

In [123]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# -----------------------------
# 3. Define the Autoencoder Model
# -----------------------------
class Autoencoder(nn.Module):
    """Fully connected autoencoder mapping 31 inputs to an 8-dimensional code and back.

    Layer widths are 31 -> 23 -> 19 -> 17 -> 8 in the encoder and the mirror
    image in the decoder. Hidden layers use Tanh; the final layer of each half
    is a plain Linear with no activation. The position of every module inside
    the two ``nn.Sequential`` containers determines the ``state_dict`` key
    names, so the order must not change if saved weights are to be loaded.
    """

    def __init__(self):
        super().__init__()

        # Encoder: dropout (p=0.1, then p=0.2) follows the first two hidden layers.
        self.encoder = nn.Sequential(
            nn.Linear(31, 23), nn.Tanh(), nn.Dropout(0.1),
            nn.Linear(23, 19), nn.Tanh(), nn.Dropout(0.2),
            nn.Linear(19, 17), nn.Tanh(),
            nn.Linear(17, 8),
        )

        # Decoder: mirrored widths, dropout rates reversed (p=0.2, then p=0.1).
        self.decoder = nn.Sequential(
            nn.Linear(8, 17), nn.Tanh(), nn.Dropout(0.2),
            nn.Linear(17, 19), nn.Tanh(), nn.Dropout(0.1),
            nn.Linear(19, 23), nn.Tanh(),
            nn.Linear(23, 31),
        )

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [127]:
# Split the training rows by class; the label is stored in the last column.
fraud_data = data_train[data_train[:, -1] == 1]
non_fraud_data = data_train[data_train[:, -1] == 0]

# Undersample the non-fraud rows to the same count as the fraud rows.
# A locally seeded generator makes the chosen subset repeatable on re-run
# without touching NumPy's global random state.
svm_rng = np.random.default_rng(42)
legit_data_limited = non_fraud_data[svm_rng.choice(non_fraud_data.shape[0], fraud_data.shape[0], replace=False)]

# Stack both classes, shuffle the rows, then separate features from labels.
train_dataset_SVM = np.vstack((legit_data_limited, fraud_data))
svm_rng.shuffle(train_dataset_SVM)
train_labels_SVM = train_dataset_SVM[:, -1]
train_dataset_SVM = train_dataset_SVM[:, :-1]

# Fit a linear SVM on 80% of the balanced subset and keep 20% for evaluation.
data_train_SVM, data_test_SVM, labels_train_SVM, labels_test_SVM = train_test_split(train_dataset_SVM, train_labels_SVM, test_size=0.2, random_state=42)
model = SVC(kernel='linear', C=1.0)
model.fit(data_train_SVM, labels_train_SVM)

labels_pred_test = model.predict(data_test_SVM)
labels_pred_train = model.predict(data_train_SVM)
# Accuracy is the fraction of matching labels. This uses NumPy (`np`), which
# the notebook imports; the previous `jnp` name was never imported.
accuracy_train = np.mean(labels_pred_train == labels_train_SVM)
accuracy_test = np.mean(labels_pred_test == labels_test_SVM)

print(f"Train Accuracy: {accuracy_train:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}")

Train Accuracy: 0.9578
Test Accuracy: 0.9580


In [None]:
# Load the trained autoencoder and switch it to inference mode (disables dropout).
myModel = Autoencoder()

# Load the saved state dictionary
myModel.load_state_dict(torch.load("model/autoencoder/autoencoder.pth", map_location=torch.device('cpu'), weights_only=True))
myModel.eval()

alpha = 0.9  # standard deviation of the Gaussian noise added in latent space
n_candidates = 300  # synthetic candidates generated per pass
max_empty_passes = 20  # stop if the SVM rejects every candidate this many passes in a row

data_train_oversampled = data_train.copy()
empty_passes = 0

# Keep generating synthetic fraud rows until fraud is no longer the minority class.
while (len(data_train_oversampled[data_train_oversampled[:, -1] == 1]) < len(data_train_oversampled[data_train_oversampled[:, -1] == 0])):
    # 1) Encode every current fraud row (real and previously accepted synthetic ones).
    fraud_data = data_train_oversampled[data_train_oversampled[:, -1] == 1]
    fraud_data_tensor = torch.tensor(fraud_data, dtype=torch.float32)

    # Inference only: no autograd graph is needed for encoding.
    with torch.no_grad():
        latent_representations = myModel.encoder(fraud_data_tensor)

    # Derive the sampling bounds from the tensor itself instead of relying on
    # `n` / `latent_dim` left over in the kernel from other cells.
    n, latent_dim = latent_representations.shape
    if n == 0:
        raise ValueError("No fraud rows available to interpolate between")

    # 2) Generate synthetic latent vectors by interpolating between fraud data points
    noisy_vectors = []
    for _ in range(n_candidates):
        # Pick two random fraud vectors (they may coincide).
        i = np.random.randint(0, n)
        z_i = latent_representations[i]
        j = np.random.randint(0, n)
        z_j = latent_representations[j]

        # Interpolation factor, uniform in [0, 1)
        lambda_ = np.random.rand()
        z_ij = lambda_*z_i + (1 - lambda_)*z_j

        # Add Gaussian noise around that interpolation
        noise = torch.normal(mean=0.0, std=alpha, size=(latent_dim,))
        z_syn = z_ij + noise

        noisy_vectors.append(z_syn.detach().numpy())

    noisy_vectors = np.array(noisy_vectors)

    # 3) Decode them to get synthetic data in original input space
    with torch.no_grad():
        X_synthetic = myModel.decoder(torch.tensor(noisy_vectors, dtype=torch.float32)).numpy()
    # The last decoded column is the label slot; drop it before classifying.
    X_synthetic_to_predict = X_synthetic[:, :-1]
    # 4) Validate with SVM
    y_pred = model.predict(X_synthetic_to_predict)

    # Overwrite the decoded label with the SVM's verdict.
    X_synthetic[:, -1] = y_pred

    # 5) Add only the candidates the SVM classifies as fraud to the training set
    X_synthetic_fraud = X_synthetic[X_synthetic[:, -1] == 1]

    data_train_oversampled = np.vstack((data_train_oversampled, X_synthetic_fraud))

    print("New synthetic data generated", len(X_synthetic_fraud))
    print(f"Number of fraud in the balanced dataset {len(data_train_oversampled[data_train_oversampled[:, -1] == 1])}/{len(data_train_oversampled[data_train_oversampled[:, -1] == 0])}")

    # Guard against an endless loop when no candidate is ever accepted.
    if len(X_synthetic_fraud) == 0:
        empty_passes += 1
        if empty_passes >= max_empty_passes:
            raise RuntimeError("SVM rejected every synthetic candidate for too many consecutive passes")
    else:
        empty_passes = 0

# for i in range(len(data_train_oversampled[data_train_oversampled[:, -1] == 1]) - len(data_train_oversampled[data_train_oversampled[:, -1] == 0])):
#     noisy_vectors = []
#     for _ in range(n):
#         # Pick a random fraud vector
#         i = np.random.randint(0, n)
#         z_i = latent_representations[i]

#         # Optionally pick a second random vector to interpolate
#         j = np.random.randint(0, n)
#         z_j = latent_representations[j]
        
#         # Interpolation factor
#         lambda_ = np.random.rand()
#         z_ij = lambda_*z_i + (1 - lambda_)*z_j
        
#         # Add Gaussian noise around that interpolation
#         noise = torch.normal(mean=0.0, std=alpha, size=(latent_dim,))
#         z_syn = z_ij + noise  # Both are tensors, so this should work directly

#         noisy_vectors.append(z_syn.detach().numpy())

#     noisy_vectors = np.array(noisy_vectors)

#     # 3) Decode them to get synthetic data in original input space
#     X_synthetic = myModel.decoder(torch.tensor(noisy_vectors, dtype=torch.float32)).detach().numpy()
#     # remove label
#     X_synthetic_to_predict = X_synthetic[:, :-1]
#     # 4) Validate with SVM
#     y_pred = model.predict(X_synthetic_to_predict)

#     X_synthetic[:, -1] = y_pred

#     # 5) Add the synthetic data to the training set
#     X_synthetic_fraud = X_synthetic[X_synthetic[:, -1] == 1]

#     data_train_oversampled = np.vstack((data_train_oversampled, X_synthetic_fraud))

# Report the final array shape and per-class row counts (label is the last column).
oversampled_labels = data_train_oversampled[:, -1]
fraud_total = int(np.count_nonzero(oversampled_labels == 1))
non_fraud_total = int(np.count_nonzero(oversampled_labels == 0))

print(data_train_oversampled.shape)
print("Number of fraud in the balanced dataset", fraud_total)
print("Number of non-fraud in the balanced dataset", non_fraud_total)

New synthetic data generated 283
Number of fraud in the balanced dataset 639/199008
New synthetic data generated 288
Number of fraud in the balanced dataset 927/199008
New synthetic data generated 292
Number of fraud in the balanced dataset 1219/199008
New synthetic data generated 290
Number of fraud in the balanced dataset 1509/199008
New synthetic data generated 287
Number of fraud in the balanced dataset 1796/199008
New synthetic data generated 297
Number of fraud in the balanced dataset 2093/199008
New synthetic data generated 288
Number of fraud in the balanced dataset 2381/199008
New synthetic data generated 285
Number of fraud in the balanced dataset 2666/199008
New synthetic data generated 287
Number of fraud in the balanced dataset 2953/199008
New synthetic data generated 286
Number of fraud in the balanced dataset 3239/199008
New synthetic data generated 289
Number of fraud in the balanced dataset 3528/199008
New synthetic data generated 288
Number of fraud in the balanced da

In [None]:
# Single-layer LSTM: 30 input features per time step, hidden state of size 3.
# batch_first=False means inputs are laid out as (seq_len, batch, features).
lstm = nn.LSTM(
    input_size=30,
    hidden_size=3,
    num_layers=1,
    batch_first=False,
)