In [52]:
import torch
from model import AutoEncoder
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, DBSCAN
import pandas as pd
from sklearn.metrics import davies_bouldin_score, silhouette_score, f1_score, recall_score, precision_score, accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import seaborn as sns

# load from ordered dict
# state_dict = torch.load('model.pth')
# # load from state dict
# model = AutoEncoder()
# model.load_state_dict(state_dict)
# model.eval()

#### Load and preprocess data

In [18]:
# Read the transactions. 'Class' becomes the label vector; 'Class', 'Amount'
# and 'Time' are all removed from the feature matrix.
data = pd.read_csv('creditcard.csv')
X = data.drop(['Class', 'Amount', 'Time'], axis=1)
y = data['Class']

# Rescale the features with a default-configured MinMaxScaler.
# NOTE(review): the scaler is fitted on every row before the train/test split
# below, so held-out rows influence the scaling.
X_scale = MinMaxScaler().fit_transform(X)

# Reduce class imbalance: keep 2000 rows with y == 0 and every row with y == 1.
# NOTE(review): the slice takes the *first* 2000 y == 0 rows in file order; it
# is not a random sample.
non_fraud_df = X_scale[y == 0][:2000]
fraud_df = X_scale[y == 1]

# Stack the y == 0 rows first and the y == 1 rows last, then build the matching
# 0/1 label vector by flagging the trailing len(fraud_df) positions.
# NOTE(review): if fraud_df were empty, `[-0:]` would select (and flag) every row.
X_sample = np.vstack([non_fraud_df, fraud_df])
fraud_idx = np.zeros(len(X_sample))
fraud_idx[-len(fraud_df):] = 1

# Seeded, stratified 80/20 split; the same arguments are reused later so that
# other representations of X_sample can be split with these labels.
X_train, X_test, y_train, y_test = train_test_split(X_sample, fraud_idx, test_size=0.2, random_state=42, stratify=fraud_idx)

In [69]:
def purity_score(X, y, cluster_labels) -> float:
    """
    Compute the size-weighted, chance-adjusted purity of a clustering.

    For each cluster the dominant class and its share of the cluster (the raw
    purity) are determined. The raw purity is compared with the share that the
    same class has in ``y`` overall (the expected purity):

        adjusted = (purity - expected) / (1 - expected)   if purity > expected
        adjusted = 0                                      otherwise

    The result is the sum of the adjusted purities, each weighted by the
    fraction of samples that fall into the cluster.

    Parameters
    ----------
    X : sized container (ndarray, DataFrame, list, ...)
        The clustered samples. Only ``len(X)`` is used, as the denominator of
        the cluster weights.
    y : array-like or pandas.Series
        True class label per sample, in the same order as ``cluster_labels``.
    cluster_labels : array-like
        Cluster id per sample. Every distinct value is treated as a cluster
        (a noise label such as -1 is not treated specially).

    Returns
    -------
    float
        Weighted average adjusted purity; 0.0 when there are no clusters.
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    cluster_labels = np.asarray(cluster_labels)

    # The expected purity is taken from the labels passed in. When a split is
    # scored this is the split's own class distribution, which matches the
    # overall one only if the split was stratified.
    global_class_distribution = y.value_counts(normalize=True).to_dict()
    n_samples = len(X)

    weighted_purities = []
    for cluster_id in np.unique(cluster_labels):
        # Positions of the cluster's members. X itself is never indexed, so it
        # may be any sized container, not only an ndarray.
        member_positions = np.flatnonzero(cluster_labels == cluster_id)

        # Class shares inside the cluster and the dominant class among them.
        class_distribution = y.iloc[member_positions].value_counts(normalize=True)
        dominant_class = class_distribution.idxmax()
        purity = class_distribution[dominant_class]

        # Share of that class in the whole label vector.
        expected_purity = global_class_distribution[dominant_class]

        # purity <= 1, so the branch is only taken when expected_purity < 1 and
        # the division below cannot be by zero.
        if purity > expected_purity:
            adjusted_purity = (purity - expected_purity) / (1 - expected_purity)
        else:
            adjusted_purity = 0

        weighted_purities.append(adjusted_purity * (len(member_positions) / n_samples))

    return float(np.sum(weighted_purities))

def score_clustering(X, y, cluster_labels):
    """Return the tuple (Davies-Bouldin score, silhouette score, adjusted purity) for a labelling of X."""
    db_index = davies_bouldin_score(X, cluster_labels)
    silhouette = silhouette_score(X, cluster_labels)
    adjusted_purity = purity_score(X, y, cluster_labels)
    return db_index, silhouette, adjusted_purity

def clustering_classification_report(cluster_labels, class_labels):
    """
    Score a clustering as if it were a classifier.

    Each cluster is assigned the class that occurs most often among its
    members. Every sample is then "predicted" to belong to its cluster's class,
    and those predictions are compared with the true labels.

    Parameters
    ----------
    cluster_labels : array-like
        Cluster id per sample.
    class_labels : array-like
        True class per sample, in the same order as ``cluster_labels``.
        F1, recall and precision are computed with scikit-learn's default
        settings, so binary labels are expected.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'f1'``, ``'recall'`` and ``'precision'``.
    """
    # Use plain arrays so the two inputs are paired strictly by position; two
    # pandas Series with different indexes would otherwise be index-aligned by
    # the DataFrame constructor.
    cluster_labels = np.asarray(cluster_labels)
    class_labels = np.asarray(class_labels)
    df = pd.DataFrame({'cluster': cluster_labels, 'class': class_labels})

    # Most common class inside each cluster (value_counts sorts by frequency).
    cluster_class = df.groupby('cluster')['class'].agg(lambda x: x.value_counts().index[0])

    # Replace every sample's cluster id with that cluster's majority class.
    predicted_class = df['cluster'].map(cluster_class)

    # zero_division=0 returns the same value as the default ("warn") when a
    # metric is undefined (e.g. no cluster is dominated by the positive class),
    # but does not emit UndefinedMetricWarning.
    return {
        'accuracy': accuracy_score(class_labels, predicted_class),
        'f1': f1_score(class_labels, predicted_class, zero_division=0),
        'recall': recall_score(class_labels, predicted_class, zero_division=0),
        'precision': precision_score(class_labels, predicted_class, zero_division=0),
    }
    

## Clustering on raw data

### Kmeans

In [87]:
def fit_kmeans(X_train, n_clusters):
    """Fit a KMeans model (fixed random_state=42) with ``n_clusters`` clusters on ``X_train`` and return it."""
    model = KMeans(random_state=42, n_clusters=n_clusters)
    model.fit(X_train)
    return model

def extract_kmeans_cluster_labels(kmeans, X):
    """Return the cluster assignment of every row of ``X`` as given by ``kmeans.predict``."""
    assigned_labels = kmeans.predict(X)
    return assigned_labels

def format_clustering_metrics(train_scores, test_scores, **kwargs):
    """
    Flatten two (DB, silhouette, purity) score triples into one result row.

    Extra keyword arguments (e.g. ``n_clusters``) come first in the returned
    dict; the six metric entries follow and take precedence on a key clash.
    """
    return {
        **kwargs,
        'Train_DB': train_scores[0],
        'Train_Sil': train_scores[1],
        'Train_Pur': train_scores[2],
        'Test_DB': test_scores[0],
        'Test_Sil': test_scores[1],
        'Test_Pur': test_scores[2],
    }

def format_classification_metrics(train_metrics, test_metrics, **kwargs):
    """
    Flatten two metric dicts (keys 'accuracy', 'f1', 'recall', 'precision')
    into one result row.

    Extra keyword arguments (e.g. ``n_clusters``) come first in the returned
    dict, followed by the four train entries and then the four test entries.
    """
    row = dict(kwargs)

    row['Train_Acc'] = train_metrics['accuracy']
    row['Train_F1'] = train_metrics['f1']
    row['Train_Recall'] = train_metrics['recall']
    row['Train_Precision'] = train_metrics['precision']

    row['Test_Acc'] = test_metrics['accuracy']
    row['Test_F1'] = test_metrics['f1']
    row['Test_Recall'] = test_metrics['recall']
    row['Test_Precision'] = test_metrics['precision']

    return row

def evaluate_kmeans(X_train, X_test, y_train, y_test, n_clusters):
    """
    Fit k-means on the training data and score it on both splits.

    Returns a pair ``(clustering_scores, classification_metrics)``: two flat
    dicts, each tagged with ``n_clusters``, as built by
    ``format_clustering_metrics`` and ``format_classification_metrics``.
    """
    model = fit_kmeans(X_train, n_clusters)
    labels_train = extract_kmeans_cluster_labels(model, X_train)
    labels_test = extract_kmeans_cluster_labels(model, X_test)

    # Internal clustering quality (DB / silhouette / purity) per split.
    clustering_scores = format_clustering_metrics(
        score_clustering(X_train, y_train, labels_train),
        score_clustering(X_test, y_test, labels_test),
        n_clusters=n_clusters,
    )

    # Majority-class-per-cluster classification metrics per split.
    classification_metrics = format_classification_metrics(
        clustering_classification_report(labels_train, y_train),
        clustering_classification_report(labels_test, y_test),
        n_clusters=n_clusters,
    )
    return clustering_scores, classification_metrics


def score_kmeans(X_train, X_test, y_train, y_test, cluster_counts=None):
    """
    Run ``evaluate_kmeans`` once per cluster count.

    ``cluster_counts`` defaults to ``[2, 3, 5, 10]``. Returns two parallel
    lists (clustering rows, classification rows), one entry per cluster count.
    """
    counts = [2, 3, 5, 10] if cluster_counts is None else cluster_counts
    per_count = [
        evaluate_kmeans(X_train, X_test, y_train, y_test, k)
        for k in counts
    ]
    clustering_metrics_list = [clustering for clustering, _ in per_count]
    classification_metrics_list = [classification for _, classification in per_count]
    return clustering_metrics_list, classification_metrics_list

In [88]:
# Sweep k-means over score_kmeans' default cluster counts on the raw (scaled)
# features and turn both lists of result rows into tables.
cluster_metrics, classification_metrics = score_kmeans(X_train, X_test, y_train, y_test)
results_df = pd.DataFrame(cluster_metrics)
reports_df = pd.DataFrame(classification_metrics)

# The clustering table is printed as text; the classification table is the
# cell's last expression and is therefore rendered as the cell output.
print(results_df.round(2))
reports_df.round(2)


   n_clusters  Train_DB  Train_Sil  Train_Pur  Test_DB  Test_Sil  Test_Pur
0           2      0.84       0.63       0.56     0.79      0.64      0.60
1           3      1.09       0.48       0.79     1.05      0.47      0.81
2           5      1.97       0.12       0.77     1.93      0.13      0.79
3          10      1.96       0.11       0.82     1.74      0.12      0.83


Unnamed: 0,n_clusters,Train_Acc,Train_F1,Train_Recall,Train_Precision,Test_Acc,Test_F1,Test_Recall,Test_Precision
0,2,0.91,0.72,0.56,1.0,0.92,0.75,0.6,1.0
1,3,0.96,0.88,0.79,0.99,0.96,0.89,0.81,0.99
2,5,0.95,0.87,0.77,1.0,0.96,0.88,0.79,0.99
3,10,0.96,0.9,0.82,0.99,0.96,0.9,0.83,0.99


### DBSCAN

In [91]:

def _score_clustering_or_nan(X, y, cluster_labels):
    """Return ``score_clustering(X, y, cluster_labels)``, or three NaNs when there are fewer than two distinct labels."""
    # The Davies-Bouldin and silhouette scores are undefined for a single
    # group and scikit-learn raises ValueError in that case.
    if np.unique(cluster_labels).size < 2:
        return np.nan, np.nan, np.nan
    return score_clustering(X, y, cluster_labels)


def score_dbscan(X_train, X_test, y_train, y_test, eps=None, min_samples=20):
    """
    Fit DBSCAN on the training data for several ``eps`` values and score each fit.

    DBSCAN has no ``predict``; test points are labelled with the cluster of
    their nearest training core sample when that sample lies closer than
    ``eps``, and as noise (-1) otherwise. Noise points are counted and are also
    kept in the scoring, where -1 acts as one more cluster label.

    Parameters
    ----------
    X_train, X_test : ndarray
        Feature matrices. ``X_train`` is indexed with an integer array.
    y_train, y_test : array-like
        True class labels for the two splits.
    eps : iterable of float, optional
        Neighbourhood radii to try. Defaults to ``[0.2, 0.3, 0.4]``.
    min_samples : int, optional
        DBSCAN ``min_samples``. Defaults to 20.

    Returns
    -------
    (list of dict, list of dict)
        One clustering row (``Eps``, DB / silhouette / purity and outlier
        counts per split) and one classification row (``Eps``, accuracy / F1 /
        recall / precision per split) for every ``eps`` value. DB, silhouette
        and purity are NaN for a split whose labelling has fewer than two
        distinct values.
    """
    if eps is None:
        eps = [0.2, 0.3, 0.4]

    clustering_metrics = []
    classification_metrics = []

    for e in eps:
        dbscan = DBSCAN(eps=e, min_samples=min_samples)
        dbscan.fit(X_train)
        train_clusters = dbscan.labels_

        # Positions (not a boolean mask) of the core samples within X_train.
        core_indices = dbscan.core_sample_indices_

        if len(core_indices) == 0:
            # No core sample means no cluster exists: every test point is noise.
            test_clusters = np.full(len(X_test), -1)
        else:
            nn = NearestNeighbors(n_neighbors=1).fit(X_train[core_indices])
            distances, nearest = nn.kneighbors(X_test)
            # kneighbors returns (n_test, 1) arrays; flatten to one entry per test point.
            nearest_labels = train_clusters[core_indices[nearest.ravel()]]
            test_clusters = np.where(distances.ravel() < dbscan.eps, nearest_labels, -1)

        # Noise points per split (they stay in the scored labelling).
        train_outliers = np.sum(train_clusters == -1)
        test_outliers = np.sum(test_clusters == -1)

        train_scores = _score_clustering_or_nan(X_train, y_train, train_clusters)
        test_scores = _score_clustering_or_nan(X_test, y_test, test_clusters)
        clustering_metrics.append({
            'Eps': e,
            'Train_DB': train_scores[0],
            'Train_Sil': train_scores[1],
            'Train_Pur': train_scores[2],
            'Train_Outliers': train_outliers,
            'Test_DB': test_scores[0],
            'Test_Sil': test_scores[1],
            'Test_Pur': test_scores[2],
            'Test_Outliers': test_outliers
        })

        train_classification_report = clustering_classification_report(train_clusters, y_train)
        test_classification_report = clustering_classification_report(test_clusters, y_test)

        classification_metrics.append({
            'Eps': e,
            'Train_Acc': train_classification_report['accuracy'],
            'Test_Acc': test_classification_report['accuracy'],
            'Train_F1': train_classification_report['f1'],
            'Test_F1': test_classification_report['f1'],
            'Train_Recall': train_classification_report['recall'],
            'Test_Recall': test_classification_report['recall'],
            'Train_Precision': train_classification_report['precision'],
            'Test_Precision': test_classification_report['precision'],
        })

    return clustering_metrics, classification_metrics

# Sweep DBSCAN over score_dbscan's default eps values on the raw (scaled)
# features and turn both lists of result rows into tables.
dbscan_cluster_metrics, dbscan_classification_metrics = score_dbscan(X_train, X_test, y_train, y_test)
dbscan_results_df = pd.DataFrame(dbscan_cluster_metrics)
dbscan_reports_df = pd.DataFrame(dbscan_classification_metrics)

# The clustering table is printed as text; the classification table is the
# cell's last expression and is therefore rendered as the cell output.
print(dbscan_results_df.round(2))
dbscan_reports_df.round(2)



   Eps  Train_DB  Train_Sil  Train_Pur  Train_Outliers  Test_DB  Test_Sil  \
0  0.2      1.84       0.28       0.84             342     1.60      0.22   
1  0.3      0.78       0.67       0.27             108     0.69      0.68   
2  0.4      1.52       0.66       0.22              35     1.09      0.67   

   Test_Pur  Test_Outliers  
0      0.82             94  
1      0.33             34  
2      0.26              6  


Unnamed: 0,Eps,Train_Acc,Test_Acc,Train_F1,Test_F1,Train_Recall,Test_Recall,Train_Precision,Test_Precision
0,0.2,0.96,0.94,0.89,0.86,0.86,0.85,0.93,0.87
1,0.3,0.85,0.87,0.42,0.5,0.27,0.33,0.97,0.97
2,0.4,0.84,0.85,0.35,0.42,0.22,0.26,0.99,1.0


## Clustering on dimensionality reduced data

#### PCA

In [92]:
# Project the sampled data onto two principal components.
# NOTE(review): PCA is fitted on X_sample, i.e. on train and test rows
# together, before the split below.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_sample)

# Split with the same arguments (labels, test_size, random_state, stratify) as
# the X_train/X_test split so that y_train / y_test can be reused; the label
# outputs of this call are discarded.
X_train_pca, X_test_pca, _, _ = train_test_split(X_pca, fraud_idx, test_size=0.2, random_state=42, stratify=fraud_idx)

# k-means sweep on the 2-D projection. The results are only stored here.
# NOTE(review): this cell ends in an assignment, so the saved output shown
# under it appears to come from an earlier version of the cell; re-run to confirm.
pca_clustering, pca_classification = score_kmeans(X_train_pca, X_test_pca, y_train, y_test)

                                                   0  \
0  {'n_clusters': 2, 'Train_DB': 0.50917173404104...   
1  {'n_clusters': 2, 'Train_Acc': 0.9131961866532...   

                                                   1  \
0  {'n_clusters': 3, 'Train_DB': 0.63016932674601...   
1  {'n_clusters': 3, 'Train_Acc': 0.9573507275464...   

                                                   2  \
0  {'n_clusters': 5, 'Train_DB': 0.82322024542433...   
1  {'n_clusters': 5, 'Train_Acc': 0.9573507275464...   

                                                   3  
0  {'n_clusters': 10, 'Train_DB': 0.7415460606238...  
1  {'n_clusters': 10, 'Train_Acc': 0.965880582037...  


#### TSNE

In [23]:
# Embed the sampled data in two dimensions with t-SNE (seeded). TSNE only
# offers fit / fit_transform, so the embedding is computed on the whole sample
# and split afterwards.
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X_sample)

# Split with the same arguments (labels, test_size, random_state, stratify) as
# the X_train/X_test split so that y_train / y_test can be reused; the label
# outputs of this call are discarded.
X_train_tsne, X_test_tsne, _, _ = train_test_split(X_tsne, fraud_idx, test_size=0.2, random_state=42, stratify=fraud_idx)

# k-means sweep on the t-SNE embedding. This previously re-scored the PCA
# projection and overwrote pca_clustering / pca_classification, so the t-SNE
# embedding was never evaluated.
tsne_clustering, tsne_classification = score_kmeans(X_train_tsne, X_test_tsne, y_train, y_test)

   N_clusters  Train_DB  Train_Sil  Train_Pur   Test_DB  Test_Sil  Test_Pur
0           2  1.047768   0.381645   0.448394  1.032818  0.382833  0.462359
1           3  0.917799   0.405688   0.796482  0.909303  0.418108  0.815682
2           4  0.768266   0.454414   0.795188  0.772892  0.453318  0.815682
3           5  0.778189   0.437504   0.795188  0.765989  0.425675  0.815682
4           6  0.811373   0.435136   0.795188  0.807339  0.434319  0.815682
5           7  0.777682   0.435263   0.795188  0.806587  0.425364  0.818576
6           8  0.771253   0.432216   0.795188  0.781865  0.431731  0.822585
7           9  0.720862   0.456086   0.795188  0.748067  0.440399  0.822585
8          10  0.685635   0.465199   0.795188  0.691458  0.459602  0.820581
