In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

data = pd.read_csv('creditcard.csv')

dataset = data.to_numpy().astype(np.float32)  # Ensure data is float32 for PyTorch

scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), unit_variance=False)
dataset[:, [0, 29]] = scaler.fit_transform(dataset[:, [0, 29]])

fraud_data = dataset[dataset[:, -1] == 1]
non_fraud_data = dataset[dataset[:, -1] == 0]

Train Accuracy: 0.9517
Test Accuracy: 0.9645


In [None]:
legit_data_limited = non_fraud_data[np.random.choice(non_fraud_data.shape[0], 492, replace=False)]

train_dataset_SVM = np.vstack((legit_data_limited, fraud_data))
np.random.shuffle(train_dataset_SVM)
train_labels_SVM = train_dataset_SVM[:, -1]
train_dataset_SVM = train_dataset_SVM[:, :-1]

data_train, data_test, labels_train, labels_test = train_test_split(train_dataset_SVM, train_labels_SVM, test_size=0.2, random_state=42)
model = SVC(kernel='linear', C=1.0)
model.fit(data_train, labels_train)

labels_pred_test = model.predict(data_test)
labels_pred_train = model.predict(data_train)
accuracy_train = jnp.mean(labels_pred_train == labels_train)
accuracy_test = jnp.mean(labels_pred_test == labels_test)

print(f"Train Accuracy: {accuracy_train:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}")

In [7]:
# -----------------------------
# 3. Define the Autoencoder Model
# -----------------------------
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        # Encoder: 31 -> 23 -> 19 -> 17 -> 8 with dropout after first two hidden layers
        self.encoder = nn.Sequential(
            nn.Linear(31, 23),
            nn.Tanh(),
            nn.Dropout(0.1),
            nn.Linear(23, 19),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(19, 17),
            nn.Tanh(),
            nn.Linear(17, 8)
        )
        # Decoder: 8 -> 17 -> 19 -> 23 -> 31 with dropout after first two layers
        self.decoder = nn.Sequential(
            nn.Linear(8, 17),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(17, 19),
            nn.Tanh(),
            nn.Dropout(0.1),
            nn.Linear(19, 23),
            nn.Tanh(),
            nn.Linear(23, 31)
        )

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

In [20]:
# load the model 
myModel = Autoencoder()

# Load the saved state dictionary
myModel.load_state_dict(torch.load("model/autoencoder/autoencoderRobust.pth", map_location=torch.device('cpu'), weights_only=True))
myModel.eval()

fraud_data_tensor = torch.tensor(fraud_data, dtype=torch.float32)

with torch.no_grad():
    latent_representations = myModel.encoder(fraud_data_tensor)

# Add small Gaussian noise to generate variations
noise = torch.randn_like(latent_representations) * 0.05  # Adjust noise level as needed
synthetic_latent = latent_representations + noise

# Decode back to input space
with torch.no_grad():
    synthetic_data = myModel.decoder(synthetic_latent)

synthetic_data = synthetic_data.cpu().numpy()

# apply SVM model to the synthetic data
labels_pred_synth = model.predict(synthetic_data)
accuracy_synth = jnp.mean(labels_pred_synth == np.ones(len(labels_pred_synth)))


ValueError: X has 31 features, but SVC is expecting 30 features as input.