In [1]:
from utils.autoencoder import AutoEncoder, train_autoencoder
from sklearn.metrics import classification_report
import pandas as pd
from utils.scoring import purity_score, score_clustering, clustering_classification_report
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
from utils.spectral_clustering import score_spectral, extract_and_visualize_graph
from utils.clustering import score_dbscan, score_kmeans, fit_dbscan, fit_kmeans, extract_kmeans_cluster_labels, test_dbscan, tune_kmeans_dbscan
#from utils.SpectralNet import score_spectral_net
from utils.plotting import plot_2d_clusters
from xgboost import XGBClassifier
import warnings


warnings.filterwarnings('ignore')

# Load and preprocess data

In [2]:
data = pd.read_csv('creditcard.csv')
X = data.drop(['Class', 'Amount', 'Time'], axis=1)
y = data['Class']

X_scale = MinMaxScaler().fit_transform(X)

# sample data to reduce class imbalance
non_fraud_df = X_scale[y == 0][:2000]
fraud_df = X_scale[y == 1]

X_sample = np.vstack([non_fraud_df, fraud_df])
fraud_idx = np.zeros(len(X_sample))
fraud_idx[-len(fraud_df):] = 1

indices = np.arange(len(X_sample))
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X_sample, fraud_idx, indices, test_size=0.2, random_state=42, stratify=fraud_idx)

# Clustering on raw data

## Kmeans

In [6]:
n_clusters = [2, 3, 5, 10]
cluster_metrics, classification_metrics = score_kmeans(X_train, X_test, y_train, y_test)
results_df = pd.DataFrame(cluster_metrics)
reports_df = pd.DataFrame(classification_metrics)

results_df.round(2)

Unnamed: 0,n_clusters,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,2,0.84,0.63,0.56,0.79,0.64,0.6
1,3,1.09,0.48,0.79,1.05,0.47,0.81
2,5,1.97,0.12,0.77,1.93,0.13,0.79
3,10,1.96,0.11,0.82,1.74,0.12,0.83


## DBSCAN

In [7]:
eps = [0.1, 0.15, 0.2, 0.25]
min_samples = [5, 10, 15]
dbscan_cluster_metrics, dbscan_classification_metrics = score_dbscan(X_train, X_test, y_train, y_test, eps, min_samples)
dbscan_results_df = pd.DataFrame(dbscan_cluster_metrics)
dbscan_reports_df = pd.DataFrame(dbscan_classification_metrics)

dbscan_results_df.round(2)

Unnamed: 0,eps,min_samples,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,0.1,5,1.6,-0.22,0.49,1.57,-0.27,0.42
1,0.1,10,2.19,-0.19,0.38,2.11,-0.22,0.35
2,0.1,15,2.28,-0.23,0.31,2.31,-0.26,0.28
3,0.15,5,1.54,0.22,0.83,1.84,0.15,0.79
4,0.15,10,1.26,0.1,0.82,1.46,0.08,0.78
5,0.15,15,1.78,0.35,0.8,1.78,0.34,0.75
6,0.2,5,1.87,0.31,0.83,1.93,0.25,0.82
7,0.2,10,1.76,0.28,0.84,1.97,0.25,0.82
8,0.2,15,1.65,0.28,0.84,1.6,0.24,0.82
9,0.25,5,1.26,0.54,0.31,1.56,0.56,0.37


# Spectral Clustering
With pairwise euclidean minimum distance from approximated train centroids to Xtest

In [8]:
spectral_cluster_metrics, spectral_classification_metrics = score_spectral(X_train, X_test, y_train, y_test)
spectral_cluster_results_df = pd.DataFrame(spectral_cluster_metrics)
spectral_cluster_reports_df = pd.DataFrame(spectral_classification_metrics)

spectral_cluster_results_df.round(2)

Unnamed: 0,n_clusters,Train_DB,Train_Sil,Train_Pur,Test_DB,Test_Sil,Test_Pur
0,2,0.57,0.69,0.19,0.6,0.69,0.37
1,3,0.98,0.57,0.78,0.9,0.59,0.78
2,5,0.96,0.57,0.77,0.9,0.59,0.78
3,10,1.02,0.56,0.8,0.95,0.58,0.82


In [6]:
# graph, plt_obj = extract_and_visualize_graph(X_train, y_train, affinity='rbf', gamma=1.0)
# plt_obj.show()

## SpectralNet

In [7]:
spectralnet_cluster_metrics, spectralnet_classification_metrics = score_spectral_net(X_train, X_test, y_train, y_test)
spectralnet_cluster_results_df = pd.DataFrame(spectralnet_cluster_metrics)
spectralnet_cluster_reports_df = pd.DataFrame(spectralnet_classification_metrics)

spectralnet_cluster_results_df.round(2)

Training SpectralNet:


Train Loss: 0.1026208, Valid Loss: 0.1649356, LR: 0.001000: 100%|██████████| 30/30 [00:03<00:00,  7.95it/s]


Training SpectralNet:


Train Loss: 0.4866607, Valid Loss: 1.5635267, LR: 0.000100: 100%|██████████| 30/30 [00:03<00:00,  8.51it/s]


Training SpectralNet:


Train Loss: 2.6633844, Valid Loss: 11.4138193, LR: 0.001000: 100%|██████████| 30/30 [00:03<00:00,  8.70it/s] 


Training SpectralNet:


Train Loss: 14.6339464, Valid Loss: 28.5341110, LR: 0.001000: 100%|██████████| 30/30 [00:03<00:00,  8.54it/s] 


   n_clusters  Train_DB  Train_Sil  Train_Pur  Test_DB  Test_Sil  Test_Pur
0           2      0.95       0.57       0.77     0.90      0.60      0.77
1           3      1.00       0.56       0.82     0.95      0.57      0.83
2           5      0.99       0.56       0.81     4.48      0.11      0.24
3          10      1.10       0.53       0.87     3.72      0.15      0.02


Unnamed: 0,n_clusters,Train_Acc,Train_F1,Train_Recall,Train_Precision,Test_Acc,Test_F1,Test_Recall,Test_Precision
0,2,0.95,0.87,0.77,1.0,0.95,0.87,0.77,1.0
1,3,0.96,0.9,0.82,0.99,0.96,0.9,0.83,0.99
2,5,0.96,0.89,0.81,1.0,0.8,0.0,0.0,0.0
3,10,0.97,0.92,0.88,0.97,0.8,0.0,0.0,0.0


### Classifier - XGBoost - Baseline

In [62]:
#Defining the XGB model
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

#Define grid search for hyperparameter tuning of XGB
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.001, 0.01, 0.1, 0.2],
    "subsample": [0.8, 1, 1.2],
}
# Define Grid
grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    verbose=1
)

# Fit and use best estimator
grid.fit(X_train, y_train)
print(grid.best_params_)
best_xgb = grid.best_estimator_


Fitting 3 folds for each of 108 candidates, totalling 324 fits
{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}


In [63]:
#Predictions and Threshold predictions
y_pred = best_xgb.predict(X_test)
y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]

threshold = 0.9
y_pred_threshold = (y_pred_proba[:] > threshold).astype(int)

print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"{threshold} Threshold Classification Report:\n", classification_report(y_test, y_pred_threshold))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       400
         1.0       0.99      0.92      0.95        99

    accuracy                           0.98       499
   macro avg       0.98      0.96      0.97       499
weighted avg       0.98      0.98      0.98       499

0.9 Threshold Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       400
         1.0       0.99      0.89      0.94        99

    accuracy                           0.98       499
   macro avg       0.98      0.94      0.96       499
weighted avg       0.98      0.98      0.98       499



### Classifier - RandomForest - Baseline

In [64]:
#Define the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

#Define grid search for hyperparameter tuning of Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# Define Grid for RandomForest
grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    verbose=1
)

#Fit and use best estimator
grid.fit(X_train, y_train)
print(grid.best_params_)
best_rf = grid.best_estimator_

Fitting 3 folds for each of 24 candidates, totalling 72 fits
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [65]:
#Predictions and Threshold predictions
y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]

y_pred_threshold = (y_pred_proba[:] > threshold).astype(int)

print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"{threshold} Threshold Classification Report:\n", classification_report(y_test, y_pred_threshold))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       400
         1.0       0.99      0.89      0.94        99

    accuracy                           0.98       499
   macro avg       0.98      0.94      0.96       499
weighted avg       0.98      0.98      0.98       499

0.9 Threshold Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       400
         1.0       1.00      0.83      0.91        99

    accuracy                           0.97       499
   macro avg       0.98      0.91      0.94       499
weighted avg       0.97      0.97      0.96       499



### Classifier - Logistic Regression - Baseline

In [70]:
log_reg = LogisticRegression(random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  
    'penalty': ['l2', 'l1'],       
    'solver': ['liblinear', 'saga'] 
}

grid = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=3,
    verbose=1
)

grid.fit(X_train, y_train)
print(grid.best_params_)
best_log_reg = grid.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}


In [71]:
#Predictions and Threshold predictions
y_pred = best_log_reg.predict(X_test)
y_pred_proba = best_log_reg.predict_proba(X_test)[:, 1]

y_pred_threshold = (y_pred_proba[:] > threshold).astype(int)

print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"{threshold} Threshold Classification Report:\n", classification_report(y_test, y_pred_threshold))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.99      0.99       400
         1.0       0.97      0.91      0.94        99

    accuracy                           0.98       499
   macro avg       0.97      0.95      0.96       499
weighted avg       0.98      0.98      0.98       499

0.9 Threshold Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       400
         1.0       1.00      0.83      0.91        99

    accuracy                           0.97       499
   macro avg       0.98      0.91      0.94       499
weighted avg       0.97      0.97      0.96       499

