In [7]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np

In [8]:
# Path settings csv file
csv_filepath = '../data/interim/gower_imputed.csv'

# Loading the csv file
data_gower = pd.read_csv(csv_filepath)
print("Data loaded successfully")

Data loaded successfully


In [9]:
start = 0.1
end = 0.32
step = 0.02
eps_values = np.arange(start, end, step)
start = 0
end = 140
step = 20
min_samples = np.arange(start, end, step)

total_combinations = len(eps_values[1:]) * len(min_samples[1:])
total_combinations 

60

# Clustering the imputed data

In [10]:
df_analysis_gower = pd.DataFrame(columns=["eps_value", "min_sample", "score", "n_clusters", "cluster_types"])

current_combination = 0

for eps_value in eps_values[1:]:
    for min_sample in min_samples[1:]:
        current_combination += 1
        print(f"Processing combination {current_combination}/{total_combinations} (eps_value={eps_value}, min_sample={min_sample})")

        # Perform DBSCAN clustering
        dbscan_cluster = DBSCAN(eps=eps_value, min_samples=min_sample, metric="precomputed", n_jobs=-1)
        labels = dbscan_cluster.fit_predict(data_gower)
        n_clusters = len(set(labels))
        cluster_types = set(labels)
        try:
            score = silhouette_score(data_gower, labels)
        except ValueError:
            score = 0
        new_row = {'eps_value': eps_value, 'min_sample': min_sample, 'score': score, 'n_clusters': n_clusters,
                   'cluster_types': cluster_types}

        # Adding the new row without using append
        df_analysis_gower.loc[len(df_analysis_gower)] = new_row

Processing combination 1/60 (eps_value=0.12000000000000001, min_sample=20)
Processing combination 2/60 (eps_value=0.12000000000000001, min_sample=40)
Processing combination 3/60 (eps_value=0.12000000000000001, min_sample=60)
Processing combination 4/60 (eps_value=0.12000000000000001, min_sample=80)
Processing combination 5/60 (eps_value=0.12000000000000001, min_sample=100)
Processing combination 6/60 (eps_value=0.12000000000000001, min_sample=120)
Processing combination 7/60 (eps_value=0.14, min_sample=20)
Processing combination 8/60 (eps_value=0.14, min_sample=40)
Processing combination 9/60 (eps_value=0.14, min_sample=60)
Processing combination 10/60 (eps_value=0.14, min_sample=80)
Processing combination 11/60 (eps_value=0.14, min_sample=100)
Processing combination 12/60 (eps_value=0.14, min_sample=120)
Processing combination 13/60 (eps_value=0.16000000000000003, min_sample=20)
Processing combination 14/60 (eps_value=0.16000000000000003, min_sample=40)
Processing combination 15/60 (e

In [None]:
df_analysis_gower.sort_values(by="score", ascending=False)

Unnamed: 0,eps_value,min_sample,score,n_clusters,cluster_types
81,0.20,20,0.323211,2,"{0, -1}"
82,0.20,40,0.319070,2,"{0, -1}"
83,0.20,60,0.315165,2,"{0, -1}"
84,0.20,80,0.310424,2,"{0, -1}"
85,0.20,100,0.308021,2,"{0, -1}"
...,...,...,...,...,...
26,0.06,180,0.000000,1,{-1}
34,0.08,160,0.000000,1,{-1}
35,0.08,180,0.000000,1,{-1}
0,0.02,20,0.000000,1,{-1}


# Clustering the data, where the NaN where dropped

In [None]:
# Path settings csv file
csv_filepath = '../data/interim/gower_no_nan.csv'

# Loading the csv file
data_gower_no_nan = pd.read_csv(csv_filepath)
print("Data loaded successfully")

Data loaded successfully


In [None]:
df_analysis_gower_no_nan = pd.DataFrame(columns=["eps_value", "min_sample", "score", "n_clusters", "cluster_types"])

current_combination = 0

for eps_value in eps_values[1:]:
    for min_sample in min_samples[1:]:
        current_combination += 1
        print(f"Processing combination {current_combination}/{total_combinations} (eps_value={eps_value}, min_sample={min_sample})")

        # Perform DBSCAN clustering
        dbscan_cluster = DBSCAN(eps=eps_value, min_samples=min_sample, metric="precomputed", n_jobs=-1)
        labels = dbscan_cluster.fit_predict(data_gower_no_nan)
        n_clusters = len(set(labels))
        cluster_types = set(labels)
        try:
            score = silhouette_score(data_gower_no_nan, labels)
            print(score)
        except ValueError:
            score = 0
        new_row = {'eps_value': eps_value, 'min_sample': min_sample, 'score': score, 'n_clusters': n_clusters,
                   'cluster_types': cluster_types}

        # Adding the new row without using append
        df_analysis_gower_no_nan.loc[len(df_analysis_gower_no_nan)] = new_row

Processing combination 1/60 (eps_value=0.12000000000000001, min_sample=20)
0.17192738661847665
Processing combination 2/60 (eps_value=0.12000000000000001, min_sample=40)
0.1554297069956887
Processing combination 3/60 (eps_value=0.12000000000000001, min_sample=60)
0.13568154173426306
Processing combination 4/60 (eps_value=0.12000000000000001, min_sample=80)
0.10948730618062436
Processing combination 5/60 (eps_value=0.12000000000000001, min_sample=100)
0.08165184169311451
Processing combination 6/60 (eps_value=0.12000000000000001, min_sample=120)
0.052922357650790425
Processing combination 7/60 (eps_value=0.14, min_sample=20)
0.20821476007698062
Processing combination 8/60 (eps_value=0.14, min_sample=40)
0.20084377228365866
Processing combination 9/60 (eps_value=0.14, min_sample=60)
0.19483289920926972
Processing combination 10/60 (eps_value=0.14, min_sample=80)
0.18908380689993312
Processing combination 11/60 (eps_value=0.14, min_sample=100)
0.1835626265914732
Processing combination 12/

In [None]:
df_analysis_gower_no_nan.query("n_clusters > 1").sort_values(by="score", ascending=False)

Unnamed: 0,eps_value,min_sample,score,n_clusters,cluster_types
59,0.3,120,0.482274,2,"{0, -1}"
58,0.3,100,0.482274,2,"{0, -1}"
57,0.3,80,0.482274,2,"{0, -1}"
48,0.28,20,0.477062,2,"{0, -1}"
50,0.28,60,0.471439,2,"{0, -1}"
49,0.28,40,0.471439,2,"{0, -1}"
51,0.28,80,0.468304,2,"{0, -1}"
53,0.28,120,0.462379,2,"{0, -1}"
52,0.28,100,0.462379,2,"{0, -1}"
45,0.26,80,0.375174,2,"{0, -1}"


# PCA Clustering

In [None]:
start = 1
end = 2
step = 0.01
eps_values_pca = np.arange(start, end, step)
start = 0
end = 200
step = 20
min_samples_pca = np.arange(start, end, step)
total_combinations = len(eps_values_pca[1:]) * len(min_samples_pca[1:])
total_combinations 

891

In [None]:
# Path settings csv file
csv_filepath = '../data/interim/pca_non_nan.csv'

# Loading the csv file
data_pca = pd.read_csv(csv_filepath)
print("Data loaded successfully")

Data loaded successfully


In [None]:
df_analysis_pca = pd.DataFrame(columns=["eps_value", "min_sample", "score", "n_clusters", "cluster_types"])

current_combination = 0

for eps_value in eps_values_pca[1:]:
    for min_sample in min_samples_pca[1:]:
        current_combination += 1
        print(f"Processing combination {current_combination}/{total_combinations} (eps_value={eps_value}, min_sample={min_sample})")

        # Perform DBSCAN clustering
        dbscan_cluster = DBSCAN(eps=eps_value, min_samples=min_sample, metric="euclidean", n_jobs=-1)
        labels = dbscan_cluster.fit_predict(data_pca)
        n_clusters = len(set(labels))
        cluster_types = set(labels)
        try:
            score = silhouette_score(data_pca, labels)
            print(score)
        except ValueError:
            score = 0
        new_row = {'eps_value': eps_value, 'min_sample': min_sample, 'score': score, 'n_clusters': n_clusters,
                   'cluster_types': cluster_types}

        # Adding the new row without using append
        df_analysis_pca.loc[len(df_analysis_pca)] = new_row

Processing combination 1/891 (eps_value=1.01, min_sample=20)
-0.1773120552615291
Processing combination 2/891 (eps_value=1.01, min_sample=40)
Processing combination 3/891 (eps_value=1.01, min_sample=60)
Processing combination 4/891 (eps_value=1.01, min_sample=80)
Processing combination 5/891 (eps_value=1.01, min_sample=100)
Processing combination 6/891 (eps_value=1.01, min_sample=120)
Processing combination 7/891 (eps_value=1.01, min_sample=140)
Processing combination 8/891 (eps_value=1.01, min_sample=160)
Processing combination 9/891 (eps_value=1.01, min_sample=180)
Processing combination 10/891 (eps_value=1.02, min_sample=20)
-0.19362047172377123
Processing combination 11/891 (eps_value=1.02, min_sample=40)
Processing combination 12/891 (eps_value=1.02, min_sample=60)
Processing combination 13/891 (eps_value=1.02, min_sample=80)
Processing combination 14/891 (eps_value=1.02, min_sample=100)
Processing combination 15/891 (eps_value=1.02, min_sample=120)
Processing combination 16/891 (

In [None]:
df_analysis_pca.query("n_clusters > 1").sort_values(by="score", ascending=False)

Unnamed: 0,eps_value,min_sample,score,n_clusters,cluster_types
882,1.99,20,0.097094,2,"{0, -1}"
873,1.98,20,0.092684,2,"{0, -1}"
864,1.97,20,0.089422,2,"{0, -1}"
855,1.96,20,0.087300,2,"{0, -1}"
846,1.95,20,0.085329,2,"{0, -1}"
...,...,...,...,...,...
126,1.15,20,-0.222201,8,"{0, 1, 2, 3, 4, 5, 6, -1}"
99,1.12,20,-0.222372,8,"{0, 1, 2, 3, 4, 5, 6, -1}"
90,1.11,20,-0.222372,8,"{0, 1, 2, 3, 4, 5, 6, -1}"
81,1.10,20,-0.222741,8,"{0, 1, 2, 3, 4, 5, 6, -1}"


Long story short, clustering does not yield good usage possiblities as it the high sillhouette scores are achieved with just one cluster in all cases.