# Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score

# Hierarchical

In [None]:
# Function to calculate R² for a clustering solution
def get_rsq(df, features, label_column):
    """Calculates R² for a clustering solution.

    R² = SSb / SSt
    where:
    SSb = between-group sum of squares
    SSt = total sum of squares

    Parameters:
    df (DataFrame): The dataframe containing the features and cluster labels
    features (list): List of feature columns to consider
    label_column (str): The column containing the cluster labels

    Returns:
    float: R² value indicating homogeneity of the clusters
           (0.0 when the total sum of squares is zero)
    """
    feature_frame = df[features]

    # Total Sum of Squares (SSt)
    overall_mean = feature_frame.mean().to_numpy()
    # Reduce over a plain ndarray: np.sum on a DataFrame can return a
    # per-column Series (pandas-version dependent), which makes the zero
    # check below ambiguous. nansum keeps pandas' skip-NaN behaviour.
    squared_dev = ((feature_frame - overall_mean) ** 2).to_numpy(dtype=float)
    sst = float(np.nansum(squared_dev))

    # No variance at all: R² is undefined, report 0 instead of dividing by zero
    if sst == 0:
        return 0.0

    # Between-group Sum of Squares (SSb)
    ssb = 0.0
    for label in df[label_column].unique():
        group = feature_frame[df[label_column] == label]
        group_mean = group.mean().to_numpy()
        # Each group contributes its size times the squared distance
        # between its centroid and the overall centroid
        ssb += len(group) * float(np.sum((group_mean - overall_mean) ** 2))

    # R² Calculation
    return ssb / sst

# R² sweep: every linkage criterion, evaluated at 1..max_nclus clusters
hc_methods = ["ward", "complete", "average", "single"]
max_nclus = 10


def _r2_curve(linkage_name):
    """Return the list of R² values of `linkage_name` for 1..max_nclus clusters."""
    curve = []
    for n_groups in range(1, max_nclus + 1):
        model = AgglomerativeClustering(n_clusters=n_groups, metric="euclidean", linkage=linkage_name)

        # Cluster assignments, aligned to df's index so the concat lines up row by row
        assigned = pd.Series(model.fit_predict(df), name='labels', index=df.index)
        labelled = pd.concat([df, assigned], axis=1)

        # All original columns of df are used as features
        curve.append(get_rsq(labelled, df.columns.tolist(), 'labels'))
    return curve


# One row per linkage method, one column per cluster count
r2_hc = np.vstack([_r2_curve(method) for method in hc_methods])

## R²

In [None]:
# Plotting the R² values for Hierarchical Clustering Methods
sns.set()

# Create a figure for the plot
fig = plt.figure(figsize=(11, 5))

# Row i of r2_hc holds the R² for 1..max_nclus clusters, so plot against the
# actual cluster counts; plotting the bare array would use positions 0..9 and
# shift every point one tick to the left of its true cluster count.
cluster_counts = np.arange(1, max_nclus + 1)

# Plot the R² values for each clustering method
for i, link in enumerate(hc_methods):
    # marker="o" is forwarded to matplotlib so each evaluated count is visible
    sns.lineplot(x=cluster_counts, y=r2_hc[i], linewidth=2.5, marker="o", label=link)

# Finalize the plot
plt.legend(title="HC Methods", title_fontsize=11)
plt.xticks(cluster_counts)
plt.xlabel("Number of Clusters", fontsize=13)
plt.ylabel("R² Metric", fontsize=13)
fig.suptitle("$R^2$ Plot for Various Hierarchical Methods", fontsize=21)

plt.show()

## Dendrogram

In [None]:
# Plotting the Dendrogram for Hierarchical Clustering
# Using the 'ward' method for dendrogram plotting (you can adjust this method)
linkage_matrix = linkage(df.values, method="ward", metric="euclidean")

sns.set()
fig = plt.figure(figsize=(11, 5))
# Set a threshold for cutting the dendrogram
y_threshold = 5
# Links below the threshold are coloured per cluster; links above it are drawn in black
dendrogram(linkage_matrix, truncate_mode='level', p=5, color_threshold=y_threshold, above_threshold_color='k')
# axhline spans the full width of the axes, so the cut line does not depend on
# a hard-coded x-extent matching the number of leaves drawn
plt.axhline(y_threshold, color="r", linestyle="dashed")
plt.title('Hierarchical Clustering Dendrogram: Ward Linkage', fontsize=21)
plt.xlabel('Number of Points in Node')
plt.ylabel('Euclidean Distance', fontsize=13)
plt.show()

# Kmeans - with Hierarchical Clustering init

## Centroid Seeds

In [None]:
n_clusters = 4  # Example number of clusters chosen

# Fit the chosen hierarchical solution and keep its cluster assignments
cluster = AgglomerativeClustering(linkage="ward", metric="euclidean", n_clusters=n_clusters)  # Example params
hc_labels = cluster.fit_predict(df)

# One seed per hierarchical cluster: the column-wise mean of its member rows,
# stacked in label order (row k is the centroid of cluster k)
centroids = np.array(
    [df[hc_labels == k].mean(axis=0) for k in range(n_clusters)]
)


## Kmeans

In [None]:
# Hierarchical Clustering centroids as seeds for KMeans init
kmeans = KMeans(n_clusters=n_clusters, init=centroids, n_init=1)  # n_init=1 since we are providing initial centroids
kmeans.fit(df)

# Calculate the silhouette score to evaluate clustering.
# Labels are read straight from the fitted estimator: the `kmeans_labels`
# name is only assigned in a later cell, so using it here raises NameError
# when the cells run in order.
silhouette_avg = silhouette_score(df, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

In [None]:
# Cluster assignment of every row, taken from the fitted KMeans model
kmeans_labels = kmeans.labels_

# Final centroid coordinates after fitting, printed under a heading line
kmeans_centroids = kmeans.cluster_centers_
print("KMeans Centroids:", kmeans_centroids, sep="\n")