In [47]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import sys
import time
import seaborn as sns
import os
import math
import pickle as pkl

#%%
# ML
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
    f1_score,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    precision_recall_curve,
    r2_score,
    roc_auc_score,
    classification_report,
    roc_curve,
    auc
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer

import torch
import torch.nn as nn
import torch.optim as optim

# # Anomaly detection models
# import pyod
# from pyod.models.ocsvm import OCSVM
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import (LocalOutlierFactor, NearestNeighbors)

In [48]:
'''
Todos: 
- Try Different Scalers

''' 


'\nTodos: \n- Try Different Scalers\n\n'

In [49]:
# Data loading
df_head = pd.read_csv('../data/creditcard_2023_head.csv')

# Loading first setup
with open('../data/setup_1.pkl', 'rb') as f:
    setup1 = pkl.load(f)

X_train, _, X_test, y_test = setup1['X_train'], setup1['y_train'], setup1['X_test'], setup1['y_test']

# Todo: Try Different Scalers
scaler = MinMaxScaler().fit(X_train)  # Initialize the MinMaxScaler and fit to the training set
X_train_scaled = scaler.transform(X_train)  # the scaler is applied to the training set
X_test_scaled = scaler.transform(X_test)  # the scaler is applied to the test set

# Convert everything to DataFrame
# Assuming the first column is 'id' and the last column is 'amount'
columns = ['Feature_' + str(i) for i in range(1, X_train_scaled.shape[1]-1)] + ['Amount']
X_train_scaled_df = pd.DataFrame(X_train_scaled[:, 1:], columns=columns)  # Excluding 'id'
X_test_scaled_df = pd.DataFrame(X_test_scaled[:, 1:], columns=columns)  # Excluding 'id'

y_test_df = pd.DataFrame(y_test, columns=['Class'])

In [50]:
X_train_scaled_df_mini = X_train_scaled_df[:10000] # Prototyping with only 10000 instances.
# Timing and Training the model
start_time = time.time()
oc_svm = OneClassSVM().fit(X_train_df_mini)
duration = time.time() - start_time
print(f"Training time: {duration:.2f} seconds")

Training time: 0.00 seconds


In [51]:
# Assuming you have your X_train_scaled and X_test_scaled ready
# Set the number of neighbors
k = 5

# Initialize and fit the NearestNeighbors model
model = NearestNeighbors(n_neighbors=k)
model.fit(X_train_scaled_df_mini)

# Calculate the distance to the kth nearest neighbor for each point in the training set
distances, indices = model.kneighbors(X_train_scaled_df_mini)
kth_dist_train = distances[:, k-1]

# Calculate the distance to the kth nearest neighbor for each point in the test set
distances_test, indices_test = model.kneighbors(X_test_scaled_df)
kth_dist_test = distances_test[:, k-1]

# Set a threshold for anomaly detection
# This could be a fixed value or based on a percentile. Here we use the 95th percentile of the training distances.
threshold = np.percentile(kth_dist_train, 95)

# Detect anomalies in the training set
anomalies_train = kth_dist_train > threshold

# Detect anomalies in the test set
anomalies_test = kth_dist_test > threshold

# Now, anomalies_train and anomalies_test contain boolean arrays indicating anomalies in your datasets.

In [53]:
# Predict on the test set
y_pred_test = model.predict(X_test_scaled_df)
# Convert predictions to match y_test labels (0 for anomalies, 1 for normal)
y_pred_test = (y_pred_test == 1).astype(int)

# Calculate ROC Curve and AUC
fpr, tpr, _ = roc_curve(y_test_df, y_pred_test)
roc_auc = auc(fpr, tpr)

# Calculate Precision-Recall Curve and AUC
precision, recall, _ = precision_recall_curve(y_test_df, y_pred_test)
pr_auc = auc(recall, precision)

# Generate a classification report
class_report = classification_report(y_test_df, y_pred_test)

# Plotting the ROC and Precision-Recall Curves
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (kNN)')
plt.legend(loc="lower right")

plt.subplot(1, 2, 2)
plt.plot(recall, precision, color='blue', lw=2, label=f'PR curve (area = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve (kNN)')
plt.legend(loc="upper right")

plt.tight_layout()
plt.show()

print(class_report)

AttributeError: 'NearestNeighbors' object has no attribute 'predict'

In [33]:
# For kNN
x_train, x_test, y_train, y_test= train_test_split(x, y,
                                                   test_size= 0.2,
                                                   shuffle= True, #shuffle the data to avoid bias
                                                   random_state= 0)
x_train= np.asarray(x_train)
y_train= np.asarray(y_train)

x_test= np.asarray(x_test)
y_test= np.asarray(y_test)


array([[ 2.80671000e+05, -8.08528462e-01,  6.74858605e-01, ...,
         2.11655911e-01,  6.86938843e-01,  5.62733000e+03],
       [ 8.43760000e+04,  7.44825917e-02, -4.12612090e-01, ...,
        -1.84876612e-01,  3.22587263e-01,  1.50637600e+04],
       [ 1.42456000e+05,  7.56834805e-01, -4.49988696e-01, ...,
        -1.59757067e-01, -7.57162666e-02,  1.96396500e+04],
       ...,
       [ 2.33693000e+05, -5.35104472e-01, -8.24174857e-02, ...,
        -6.51894955e-02, -4.50267367e-01,  1.85104000e+03],
       [ 1.83175000e+05, -3.81533059e-01, -2.58213631e-02, ...,
         4.98182210e-02, -5.70302169e-02,  1.77748100e+04],
       [ 1.25630000e+04,  8.85956620e-01, -3.44200065e-01, ...,
        -2.24585410e-01, -1.47908027e-02,  2.98234000e+03]])