In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from the CSV in the working directory and preview
# its first five rows to confirm the columns were parsed as expected.
df = pd.read_csv("data.csv")
print(df.head())

               country  child_mort  exports  health  imports  income  \
0          Afghanistan        90.2     10.0    7.58     44.9    1610   
1              Albania        16.6     28.0    6.55     48.6    9930   
2              Algeria        27.3     38.4    4.17     31.4   12900   
3               Angola       119.0     62.3    2.85     42.9    5900   
4  Antigua and Barbuda        10.3     45.5    6.03     58.9   19100   

   inflation  life_expec  total_fer   gdpp  
0       9.44        56.2       5.82    553  
1       4.49        76.3       1.65   4090  
2      16.10        76.5       2.89   4460  
3      22.40        60.1       6.16   3530  
4       1.44        76.8       2.13  12200  


In [3]:

# Split the frame by position: column 0 is treated as the categorical
# field, every remaining column as a numerical feature.
categorical_col, numerical_cols = df.iloc[:, 0], df.iloc[:, 1:]

# Expand the categorical field into one 0/1 indicator column per distinct value.
# NOTE(review): the first column looks like a per-row identifier (country name).
# If every value is unique, each dummy column marks exactly one row and only
# adds the same constant to every pairwise distance -- confirm whether it
# should be used as a feature at all.
categorical_encoded = pd.get_dummies(categorical_col)

In [4]:
# Place the indicator columns and the numerical features side by side and
# convert the result straight to a float NumPy array.
# NOTE(review): no feature scaling is applied here, so columns with large
# magnitudes will dominate any Euclidean distance computed on X -- confirm
# this is intended.
X = pd.concat([categorical_encoded, numerical_cols], axis=1).values.astype(float)
print(X)

[[1.00e+00 0.00e+00 0.00e+00 ... 5.62e+01 5.82e+00 5.53e+02]
 [0.00e+00 1.00e+00 0.00e+00 ... 7.63e+01 1.65e+00 4.09e+03]
 [0.00e+00 0.00e+00 1.00e+00 ... 7.65e+01 2.89e+00 4.46e+03]
 ...
 [0.00e+00 0.00e+00 0.00e+00 ... 7.31e+01 1.95e+00 1.31e+03]
 [0.00e+00 0.00e+00 0.00e+00 ... 6.75e+01 4.67e+00 1.31e+03]
 [0.00e+00 0.00e+00 0.00e+00 ... 5.20e+01 5.40e+00 1.46e+03]]


In [6]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    a, b : objects supporting elementwise subtraction (e.g. NumPy arrays
        of equal shape).

    Returns
    -------
    The square root of the sum of squared elementwise differences.
    """
    delta = a - b
    sum_of_squares = np.sum(delta ** 2)
    return np.sqrt(sum_of_squares)

In [8]:
def compute_distance_matrix(X):
    """Build the symmetric matrix of pairwise distances between the rows of X.

    Parameters
    ----------
    X : array with a ``shape`` attribute, indexable by row number.

    Returns
    -------
    An (n, n) float array where entry [i, j] is
    ``euclidean_distance(X[i], X[j])`` and the diagonal is zero.
    """
    n_points = X.shape[0]
    pairwise = np.zeros((n_points, n_points))

    # Visit every unordered pair (row < col) exactly once, in row-major
    # order, and mirror the value across the diagonal.
    upper_rows, upper_cols = np.triu_indices(n_points, k=1)
    for r, c in zip(upper_rows, upper_cols):
        pairwise[r, c] = pairwise[c, r] = euclidean_distance(X[r], X[c])

    return pairwise

In [13]:
def average_linkage(cluster_a, cluster_b, dist_matrix):
    """Average-linkage distance between two clusters.

    Parameters
    ----------
    cluster_a, cluster_b : sequences of int
        Row indices (into ``dist_matrix``) of the members of each cluster.
    dist_matrix : 2-D NumPy array
        Pairwise distance matrix; ``dist_matrix[i, j]`` is the distance
        between points ``i`` and ``j``.

    Returns
    -------
    The mean of ``dist_matrix[i, j]`` over every ``i`` in ``cluster_a`` and
    every ``j`` in ``cluster_b``. NaN if either cluster is empty.
    """
    # np.ix_ selects the |a| x |b| block of cross-cluster distances in one
    # indexing operation instead of a Python-level double loop. ravel() keeps
    # the same (i outer, j inner) element order the loop would have produced.
    cross_distances = dist_matrix[np.ix_(cluster_a, cluster_b)].ravel()
    return np.mean(cross_distances)


In [14]:
def hierarchical_clustering(X):
    """Agglomerative clustering with average linkage, merged down to one cluster.

    Every row of X starts as its own cluster. Each round, the pair of
    clusters with the smallest average-linkage distance is merged; on ties
    the first pair found in index order wins.

    Parameters
    ----------
    X : array with a ``shape`` attribute, one sample per row.

    Returns
    -------
    A list with one ``(cluster_a, cluster_b, distance)`` tuple per merge, in
    merge order. Each cluster is a list of row indices into X.
    """
    n_samples = X.shape[0]
    pairwise = compute_distance_matrix(X)

    # Singleton clusters to begin with.
    active = [[idx] for idx in range(n_samples)]
    merges = []

    while len(active) > 1:
        best_dist = np.inf
        best_pair = (0, 1)
        n_active = len(active)

        # Exhaustive scan over all unordered cluster pairs.
        for left in range(n_active):
            for right in range(left + 1, n_active):
                candidate = average_linkage(active[left], active[right], pairwise)
                if candidate < best_dist:
                    best_dist, best_pair = candidate, (left, right)

        left, right = best_pair
        first, second = active[left], active[right]

        # Record the merge (for dendrogram-like output).
        merges.append((first, second, best_dist))

        # Drop both merged clusters, keeping the others in order, and put the
        # combined cluster at the end.
        active = [members for pos, members in enumerate(active)
                  if pos != left and pos != right]
        active.append(first + second)

    return merges


In [15]:
# Run the full agglomeration on X; linkage_history holds one
# (cluster_a, cluster_b, distance) tuple per merge, in merge order.
linkage_history = hierarchical_clustering(X)

# Display merge steps, numbered from 1, with the distance at which
# the two clusters were joined.
for step, (c1, c2, dist) in enumerate(linkage_history, 1):
    print(f"Step {step}: Merge {c1} and {c2} at distance {dist:.4f}")


Step 1: Merge [25] and [64] at distance 49.6352
Step 2: Merge [0] and [56] at distance 54.7063
Step 3: Merge [146] and [147] at distance 55.5273
Step 4: Merge [31] and [106] at distance 69.2352
Step 5: Merge [164] and [165] at distance 72.2939
Step 6: Merge [17] and [97] at distance 75.5184
Step 7: Merge [50] and [93] at distance 80.7371
Step 8: Merge [126] and [25, 64] at distance 82.1461
Step 9: Merge [142] and [166] at distance 96.2613
Step 10: Merge [28] and [40] at distance 100.4809
Step 11: Merge [12] and [27] at distance 100.6157
Step 12: Merge [69] and [164, 165] at distance 101.7339
Step 13: Merge [115] and [128] at distance 101.7365
Step 14: Merge [155] and [0, 56] at distance 105.0152
Step 15: Merge [37] and [88] at distance 107.8945
Step 16: Merge [132] and [150] at distance 119.3676
Step 17: Merge [112] and [31, 106] at distance 128.4475
Step 18: Merge [50, 93] and [126, 25, 64] at distance 132.2284
Step 19: Merge [57] and [62] at distance 134.3230
Step 20: Merge [26] and 

In [20]:
def silhouette(X, labels):
    """Mean silhouette coefficient of a clustering, using Euclidean distance.

    For each sample ``i``:

    * ``a`` is the mean distance from ``i`` to the other members of its own
      cluster,
    * ``b`` is the smallest mean distance from ``i`` to the members of any
      other cluster,
    * ``s = (b - a) / max(a, b)``.

    A sample that is alone in its cluster gets ``s = 0`` (the usual
    convention, since ``a`` is undefined there). A sample whose ``a`` and
    ``b`` are both zero also gets ``s = 0`` instead of dividing by zero.

    Parameters
    ----------
    X : array-like, one sample per row.
    labels : array-like of length ``len(X)``
        Cluster label of each sample.

    Returns
    -------
    The mean of ``s`` over all samples.

    Raises
    ------
    ValueError
        If ``labels`` does not have one entry per sample, or if fewer than
        two distinct labels are present (the score is undefined then).
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    n = len(X)

    if labels.shape != (n,):
        raise ValueError(
            f"labels must have shape ({n},) to match X, got {labels.shape}"
        )

    unique_clusters = np.unique(labels)
    if len(unique_clusters) < 2:
        raise ValueError(
            "silhouette is undefined for fewer than 2 clusters, "
            f"got {len(unique_clusters)}"
        )

    # Flatten each sample to a vector so the row-wise norm below is the plain
    # L2 distance regardless of how many trailing dimensions X has.
    X = X.reshape(n, -1)

    silhouette_vals = np.zeros(n)
    for i in range(n):
        # Distances from sample i to every sample, computed in one shot.
        dists = np.linalg.norm(X - X[i], axis=1)

        same_cluster = labels == labels[i]
        n_same = np.count_nonzero(same_cluster)
        if n_same == 1:
            # Singleton cluster: leave s at 0.
            continue

        # a = intra-cluster distance. dists[i] is 0, so summing over the
        # whole cluster and dividing by (size - 1) excludes i itself.
        a = dists[same_cluster].sum() / (n_same - 1)

        # b = nearest-cluster distance.
        b = min(
            dists[labels == cluster].mean()
            for cluster in unique_clusters
            if cluster != labels[i]
        )

        denom = max(a, b)
        # denom == 0 means i coincides with every point it was compared to.
        silhouette_vals[i] = 0.0 if denom == 0 else (b - a) / denom

    return np.mean(silhouette_vals)


In [26]:
def get_hierarchical_labels(X, k):
    """Cluster the rows of X into k groups by average-linkage agglomeration.

    Every row starts as its own cluster. The two clusters with the smallest
    average-linkage distance are merged repeatedly (first pair in index
    order wins ties) until only ``k`` clusters remain.

    Parameters
    ----------
    X : array with a ``shape`` attribute, one sample per row.
    k : int
        Number of clusters to stop at; must be at least 1. If ``k`` is
        larger than the number of rows, no merge happens and every row
        keeps its own cluster.

    Returns
    -------
    Integer NumPy array of length ``X.shape[0]``; entry ``i`` is the cluster
    id (0-based position in the final cluster list) of row ``i``.

    Raises
    ------
    ValueError
        If ``k`` is less than 1.
    """
    # Reject an impossible target before doing any distance work: with k < 1
    # the merge loop would run down to a single cluster and then fail while
    # trying to merge a second cluster that does not exist.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    n = X.shape[0]
    dist_matrix = compute_distance_matrix(X)

    # Start with each point as its own cluster
    clusters = [[i] for i in range(n)]

    while len(clusters) > k:
        min_dist = np.inf
        pair_to_merge = (0, 1)

        # Find closest pair of clusters
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                dist = average_linkage(clusters[i], clusters[j], dist_matrix)
                if dist < min_dist:
                    min_dist = dist
                    pair_to_merge = (i, j)

        i, j = pair_to_merge
        new_cluster = clusters[i] + clusters[j]

        # j > i, so removing j first keeps index i valid.
        clusters.pop(j)
        clusters.pop(i)
        clusters.append(new_cluster)

    # Create labels array: one fancy-indexed assignment per cluster.
    labels = np.zeros(n, dtype=int)
    for cluster_id, cluster in enumerate(clusters):
        labels[cluster] = cluster_id

    return labels


In [27]:
# Stop the average-linkage merging once k clusters remain;
# labels[i] is the integer cluster id assigned to row i of X.
k = 3  # choose number of clusters
labels = get_hierarchical_labels(X, k)


In [28]:
# Report the mean silhouette coefficient of the k-cluster assignment,
# computed on the same feature matrix X that was clustered.
print("Silhouette Score:", silhouette(X, labels))


Silhouette Score: 0.7160188906800354
