In [None]:
import inspect

import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [None]:

# Load the preprocessed data and keep a random 10% subsample of its rows.
SAMPLE_FRACTION = 0.1
# Fixed seed: without it every run draws a different subsample, so the
# "best" parameters reported at the end would change from run to run.
RANDOM_STATE = 42

X = pd.read_csv('global_preprocessed.csv', encoding='ISO-8859-1')
X = X.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)

# Define the parameter grid
n_clusters_values = range(2, 10)
# "l1" is an alias of "manhattan" and "l2" an alias of "euclidean"; searching
# both names of each pair only repeats identical (and expensive) fits, so a
# single name per distance is kept.
affinity_values = ["euclidean", "manhattan", "cosine"]  # Different types of affinity
linkage_values = ["complete", "average", "single", "ward"]  # Different types of linkage

# Initialize variables to store best parameters
best_score = -1  # Silhouette score ranges from -1 to 1
best_n_clusters = None
best_affinity = None
best_linkage = None

In [None]:

# Manual grid search: fit one AgglomerativeClustering model per
# (n_clusters, affinity, linkage) combination and remember the combination
# with the highest silhouette score.

# scikit-learn 1.2 renamed the constructor's `affinity` argument to `metric`
# and 1.4 removed the old name, so passing `affinity=` raises TypeError on
# current releases. Use whichever keyword the installed version accepts.
distance_kwarg = (
    "metric"
    if "metric" in inspect.signature(AgglomerativeClustering).parameters
    else "affinity"
)

for n_clusters in n_clusters_values:
    for affinity in affinity_values:
        for linkage in linkage_values:
            # 'ward' linkage can only work with 'euclidean' affinity
            if linkage == 'ward' and affinity != 'euclidean':
                continue

            model = AgglomerativeClustering(n_clusters=n_clusters,
                                            linkage=linkage,
                                            **{distance_kwarg: affinity})
            model.fit(X)
            # No `metric` is passed here, so every combination is scored with
            # silhouette_score's default metric, independent of `affinity`.
            score = silhouette_score(X, model.labels_)

            # Strict '>' keeps the first combination encountered on ties.
            if score > best_score:
                best_score = score
                best_n_clusters = n_clusters
                best_affinity = affinity
                best_linkage = linkage

In [None]:

# Report the winning configuration from the grid search, one value per line.
summary_lines = [
    f'Best n_clusters: {best_n_clusters}',
    f'Best affinity: {best_affinity}',
    f'Best linkage: {best_linkage}',
    f'Best silhouette score: {best_score}',
]
print('\n'.join(summary_lines))
