In [23]:
import torch
from model import AutoEncoder
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans, DBSCAN
import pandas as pd
#import clustering metrics 
from sklearn.metrics import davies_bouldin_score, silhouette_score
import numpy as np
# load from ordered dict
# state_dict = torch.load('model.pth')
# # load from state dict
# model = AutoEncoder()
# model.load_state_dict(state_dict)
# model.eval()

#### Load and preprocess data

In [57]:
data = pd.read_csv('creditcard.csv')
amount = data['Amount']
time = data['Time']
X = data.drop(['Class', 'Amount', 'Time'], axis=1)
y = data['Class']

# sample data to reduce class imbalance
non_fraud_df = X[y == 0].iloc[:2000]
fraud_df = X[y == 1]

X_sample = pd.concat([non_fraud_df, fraud_df])
fraud_idx = np.zeros(len(X_sample))
fraud_idx[-len(fraud_df):] = 1

#### Clustering on raw data

In [106]:
def purity_score(X: pd.DataFrame):
    global_class_distribution = X['Fraud'].value_counts(normalize=True).to_dict()
    # Compute metrics for each cluster, including adjusted purity
    cluster_purity = []
    cluster_weight = []

    for cluster_id in np.unique(X['Cluster']):
        cluster_data = X[X['Cluster'] == cluster_id]
        total_in_cluster = len(cluster_data)
        
        # Compute class distribution within the cluster
        class_distribution = cluster_data['Fraud'].value_counts(normalize=True)
        
        # Purity: Fraction of the dominant class in the cluster
        dominant_class = class_distribution.idxmax()
        purity = class_distribution[dominant_class]
        
        # Expected purity based on global distribution
        expected_purity = global_class_distribution[dominant_class]
        
        # Adjusted Purity
        if purity > expected_purity:
            adjusted_purity = (purity - expected_purity) / (1 - expected_purity)
        else:
            adjusted_purity = 0  # Set to 0 if purity is less than or equal to expected purity
        
        # Weighted purity
        cluster_purity.append(adjusted_purity)
        cluster_weight.append(total_in_cluster / len(X_sample))
    
    # Compute the weighted average of cluster purity
    purity = np.sum(np.array(cluster_purity) * np.array(cluster_weight))
    return purity

def score_clustering(algorithm, X, class_labels):
    X = X.copy()
    clusters = algorithm.fit(X)
    cluster_labels = clusters.labels_
    X['Cluster'] = cluster_labels
    X['Fraud'] = class_labels
    # clustering scores
    david_score = davies_bouldin_score(X, cluster_labels)
    sil_score = silhouette_score(X, cluster_labels)
    purity = purity_score(X)
    return david_score, sil_score, purity

    # purity score

In [110]:
N_feature = [2,3,5,10,15,20]

results = []

for n_features in N_feature:
    kmeans = KMeans(n_clusters=5, random_state=0)
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    
    kmeans_scores = score_clustering(kmeans, X_sample.iloc[:, :n_features], X_sample['Fraud'])
    dbscan_scores = score_clustering(dbscan, X_sample.iloc[:, :n_features], X_sample['Fraud'])
    
    results.append({
        'N_features': n_features,
        'KMeans_Davies_Bouldin': kmeans_scores[0],
        'KMeans_Silhouette': kmeans_scores[1],
        'KMeans_Purity': kmeans_scores[2],
        'DBSCAN_Davies_Bouldin': dbscan_scores[0],
        'DBSCAN_Silhouette': dbscan_scores[1],
        'DBSCAN_Purity': dbscan_scores[2]
    })

results_df = pd.DataFrame(results)
results_kmeans = results_df[['N_features', 'KMeans_Davies_Bouldin', 'KMeans_Silhouette', 'KMeans_Purity']]
results_dbscan = results_df[['N_features', 'DBSCAN_Davies_Bouldin', 'DBSCAN_Silhouette', 'DBSCAN_Purity']]

print(results_kmeans)
print(results_dbscan)

   N_features  KMeans_Davies_Bouldin  KMeans_Silhouette  KMeans_Purity
0           2               0.542726           0.641630       0.461272
1           3               0.630811           0.562065       0.701850
2           5               0.841433           0.577800       0.793813
3          10               0.657906           0.608716       0.739337
4          15               1.236534           0.435483       0.804878
5          20               0.849672           0.623534       0.792683
   N_features  DBSCAN_Davies_Bouldin  DBSCAN_Silhouette  DBSCAN_Purity
0           2               1.196259           0.474127       0.349419
1           3               0.895155           0.330439       0.810146
2           5               0.831900           0.108728       0.507193
3          10               0.841945          -0.211458       0.208240
4          15               1.057017          -0.352267       0.036116
5          20               1.215708          -0.374148       0.027287
