In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import train_test_split

In [16]:
data = pd.read_csv('creditcard.csv')

# Normalize the data
scaler = MinMaxScaler()

X = data.drop('Class', axis=1)
y = data['Class']
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [19]:
def evaluate_clustering(X, labels):
    """Compute Silhouette Score and Davies-Bouldin Index for clustering."""
    if len(set(labels)) > 1:  # Ensure more than one cluster
        silhouette = silhouette_score(X, labels)
        davies_bouldin = davies_bouldin_score(X, labels)
        return silhouette, davies_bouldin
    else:
        return -1, float('inf')  # Invalid clustering

def tune_kmeans_dbscan(X, method, params, scoring='combined'):
    """Evaluate KMeans or DBSCAN with Silhouette and Davies-Bouldin Index."""
    best_score = float('-inf')
    best_params = None

    for param in params:
        print(param)
        if method == 'kmeans':
            kmeans = KMeans(**param, random_state=42)
            labels = kmeans.fit_predict(X)
        elif method == 'dbscan':
            dbscan = DBSCAN(**param)
            labels = dbscan.fit_predict(X)
        

        # Combine the two metrics (normalize DBI by its range for simplicity)
        if scoring == 'combined':
            silhouette, davies_bouldin = evaluate_clustering(X, labels)
            combined_score = silhouette - (1 / (1 + davies_bouldin))  # Example formula
            score = combined_score
        elif scoring == 'silhouette':
            score = silhouette_score(X, labels)
        elif scoring == 'davies_bouldin':
            # sign to make it maximization problem
            score = -davies_bouldin_score(X, labels)

        print(f"{scoring}, Params={param}, Score={score:.4f}")
        print()
        
        if score > best_score:
            best_score = score
            best_params = param

    print(f"Best Params: {best_params}, Best Combined Score: {best_score:.4f}")
    return best_params, best_score

In [20]:
print("Tuning K-Means...")

params = ({'n_clusters': x,} for x in range(2, 10))
tune_kmeans_dbscan(X, method='kmeans', params=params, scoring='combined')  # Tune K (2 to 10)

Tuning K-Means...
{'n_clusters': 2}
combined, Params={'n_clusters': 2}, Score=-0.0959

{'n_clusters': 3}


KeyboardInterrupt: 

In [30]:
print("\nTuning DBSCAN...")
eps_values = np.linspace(0.1, 1.0, 10)
min_samples_values = range(2, 10)
dbscan_params = ({'eps': eps, 'min_samples': min_samples} for eps in eps_values for min_samples in min_samples_values)
tune_kmeans_dbscan(X, method='dbscan', params=dbscan_params, scoring='davies_bouldin')  # Tune eps and


Tuning DBSCAN...


0it [00:00, ?it/s]

{'eps': np.float64(0.1), 'min_samples': 2}


1it [00:32, 32.04s/it]

davies_bouldin, Params={'eps': np.float64(0.1), 'min_samples': 2}, Score=-2.4453
{'eps': np.float64(0.1), 'min_samples': 3}


In [8]:
# fit best model kmeans
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)
labels = kmeans.predict(X)

In [7]:
# fit best model dbscan
dbscan = DBSCAN(eps=0.6, min_samples=8)
labels = dbscan.fit_predict(X)