In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from utils.clustering import test_dbscan

In [2]:
data = pd.read_csv('creditcard.csv')

# Normalize the data
scaler = MinMaxScaler()

X = data.drop('Class', axis=1)
y = data['Class']
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

X_train_fraud = X_train[y_train == 1]
X_train_non_fraud = pd.DataFrame(X_train[y_train == 0]).sample(10000).values
X_train_sample = np.concatenate([X_train_fraud, X_train_non_fraud])

In [3]:
def evaluate_clustering(X, labels):
    """Compute Silhouette Score and Davies-Bouldin Index for clustering."""
    if len(set(labels)) > 1:  # Ensure more than one cluster
        silhouette = silhouette_score(X, labels)
        davies_bouldin = davies_bouldin_score(X, labels)
        return silhouette, davies_bouldin
    else:
        return -1, float('inf')  # Invalid clustering

def tune_kmeans_dbscan(X, method, params, scoring='combined'):
    """Evaluate KMeans or DBSCAN with Silhouette and Davies-Bouldin Index."""
    best_score = float('-inf')
    best_params = None

    for param in params:
        print(param)
        if method == 'kmeans':
            kmeans = KMeans(**param, random_state=42)
            labels = kmeans.fit_predict(X)
        elif method == 'dbscan':
            dbscan = DBSCAN(**param)
            labels = dbscan.fit_predict(X)
        

        # Combine the two metrics (normalize DBI by its range for simplicity)
        if scoring == 'combined':
            silhouette, davies_bouldin = evaluate_clustering(X, labels)
            combined_score = silhouette - (1 / (1 + davies_bouldin))  # Example formula
            score = combined_score
        elif scoring == 'silhouette':
            score = silhouette_score(X, labels)
        elif scoring == 'davies_bouldin':
            # sign to make it maximization problem
            score = -davies_bouldin_score(X, labels)

        print(f"{scoring}, Params={param}, Score={score:.4f}")
        print()
        
        if score > best_score:
            best_score = score
            best_params = param

    print(f"Best Params: {best_params}, Best Combined Score: {best_score:.4f}")
    return best_params, best_score

In [4]:
print("Tuning K-Means...")

params = ({'n_clusters': x,} for x in range(2, 10))
tune_kmeans_dbscan(X_train_sample, method='kmeans', params=params, scoring='combined')  # Tune K (2 to 10)

Tuning K-Means...
{'n_clusters': 2}
combined, Params={'n_clusters': 2}, Score=-0.1013

{'n_clusters': 3}
combined, Params={'n_clusters': 3}, Score=-0.1130

{'n_clusters': 4}
combined, Params={'n_clusters': 4}, Score=-0.1541

{'n_clusters': 5}
combined, Params={'n_clusters': 5}, Score=-0.1956

{'n_clusters': 6}
combined, Params={'n_clusters': 6}, Score=-0.1944

{'n_clusters': 7}
combined, Params={'n_clusters': 7}, Score=-0.1980

{'n_clusters': 8}
combined, Params={'n_clusters': 8}, Score=-0.1992

{'n_clusters': 9}
combined, Params={'n_clusters': 9}, Score=-0.2101

Best Params: {'n_clusters': 2}, Best Combined Score: -0.1013


({'n_clusters': 2}, np.float64(-0.10127835968671722))

In [5]:
print("\nTuning DBSCAN...")
eps_values = np.linspace(0.1, 1.0, 10)
min_samples_values = range(2, 10)
dbscan_params = ({'eps': eps, 'min_samples': min_samples} for eps in eps_values for min_samples in min_samples_values)
tune_kmeans_dbscan(X_train_sample, method='dbscan', params=dbscan_params, scoring='combined')  # Tune eps and


Tuning DBSCAN...
{'eps': np.float64(0.1), 'min_samples': 2}
combined, Params={'eps': np.float64(0.1), 'min_samples': 2}, Score=-0.7507

{'eps': np.float64(0.1), 'min_samples': 3}
combined, Params={'eps': np.float64(0.1), 'min_samples': 3}, Score=-0.7588

{'eps': np.float64(0.1), 'min_samples': 4}
combined, Params={'eps': np.float64(0.1), 'min_samples': 4}, Score=-0.7521

{'eps': np.float64(0.1), 'min_samples': 5}
combined, Params={'eps': np.float64(0.1), 'min_samples': 5}, Score=-0.7532

{'eps': np.float64(0.1), 'min_samples': 6}
combined, Params={'eps': np.float64(0.1), 'min_samples': 6}, Score=-0.7311

{'eps': np.float64(0.1), 'min_samples': 7}
combined, Params={'eps': np.float64(0.1), 'min_samples': 7}, Score=-0.7365

{'eps': np.float64(0.1), 'min_samples': 8}
combined, Params={'eps': np.float64(0.1), 'min_samples': 8}, Score=-0.7215

{'eps': np.float64(0.1), 'min_samples': 9}
combined, Params={'eps': np.float64(0.1), 'min_samples': 9}, Score=-0.7329

{'eps': np.float64(0.2), 'min_

({'eps': np.float64(0.7000000000000001), 'min_samples': 5},
 np.float64(0.2519329799887736))

In [6]:
# fit best model kmeans
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train)
train_labels = kmeans.predict(X_train)
test_labels = kmeans.predict(X_test)

In [21]:
# fit best model dbscan
dbscan = DBSCAN(eps=0.7, min_samples=5)
train_labels = dbscan.fit_predict(X_train_sample)
test_labels = test_dbscan(dbscan, X_train_sample, X_test)

print(np.unique(test_labels, return_counts=True))

(array([-1,  0]), array([    7, 56955]))
