# Hierarchical Clustering of Countries' Economic Development

In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

In [2]:
# Array dimensions used by the loading code: `n` is the number of rows of
# every data matrix (presumably one row per country -- confirm against
# Data/names.txt); each T_* is a per-series column count.
n = 113
T_gdp = T_pop = 118
T_areas, T_labor, T_currency = 61, 64, 58

# Input files, given relative to the working directory.
names_path = "Data/names.txt"
locations_path = "Data/locations.csv"
gdp_path = "Data/yp_raw.csv"
population_path = "Data/pop_raw.csv"
currency_path = "Data/currency.csv"

In [3]:
def _read_matrix(path, shape, transposed=False):
    """Read a headerless numeric CSV file into a zero-initialised array.

    Args:
        path: Location of the CSV file.
        shape: Shape of the 2-D array to allocate.
        transposed: When true, the value found at file row ``r``, column
            ``c`` is stored at ``[c, r]`` instead of ``[r, c]``.

    Returns:
        The filled array. Cells that have no counterpart in the file keep
        their initial value of 0.0.

    Raises:
        ValueError: If a field cannot be converted with ``float``.
        IndexError: If the file holds more rows or columns than ``shape``
            can accommodate.
    """
    matrix = np.zeros(shape)
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(path, 'r', newline='') as handle:
        for r, record in enumerate(csv.reader(handle)):
            for c, field in enumerate(record):
                if transposed:
                    matrix[c, r] = float(field)
                else:
                    matrix[r, c] = float(field)
    return matrix


# One entry per line of the names file, keeping only the first three
# characters of each line (presumably a three-letter country code -- confirm).
with open(names_path, 'r') as file:
    names = [line[:3] for line in file]

# The GDP file is read transposed (file rows become array columns); only the
# last T_currency columns are kept so the slice is as wide as `currency`.
gdp = _read_matrix(gdp_path, (n, T_gdp), transposed=True)
gdp_data = gdp[:, -T_currency:]

# Only the first two fields of each locations row are used.
locations = np.zeros((n, 2))
with open(locations_path, 'r', newline='') as file:
    for i, row in enumerate(csv.reader(file)):
        locations[i, 0] = float(row[0])
        locations[i, 1] = float(row[1])

# Population is read and trimmed the same way as GDP.
pop = _read_matrix(population_path, (n, T_pop), transposed=True)
pop_data = pop[:, -T_currency:]

# The currency file is read as-is: file row i, column j -> currency[i, j].
currency = _read_matrix(currency_path, (n, T_currency))

## PCA and Data Preparation

In [4]:
# Shared objects for the feature-building cells:
#   scaler   - one StandardScaler instance, re-fitted by each fit_transform call
#   variance - value handed to PCA(n_components=...) for every feature group
#   df       - starts empty and collects one column per retained component
scaler = StandardScaler()
variance = 0.95
df = pd.DataFrame(data={})

### PCA for GDP

In [5]:
# Standardise every GDP column, then project onto principal components.
# `variance` is passed as n_components, so PCA chooses how many components
# to keep from that value.
# PCA only centres its input; it does not rescale it. The projection is
# therefore fitted on the standardised matrix. Fitting it on the raw
# gdp_data would leave gdp_scaled unused and let the columns with the
# largest magnitudes dominate the components.
gdp_scaled = scaler.fit_transform(gdp_data)
gdp_components = PCA(n_components=variance).fit_transform(gdp_scaled)
# Store each retained component as its own feature column.
for i in range(gdp_components.shape[1]):
    df[f"GDP #{i}"] = gdp_components[:, i]

### PCA for Population

In [6]:
# Standardise every population column, then project onto principal
# components. `variance` is passed as n_components, so PCA chooses how many
# components to keep from that value.
# PCA only centres its input; it does not rescale it. The projection is
# therefore fitted on the standardised matrix. Fitting it on the raw
# pop_data would leave pop_scaled unused and let the columns with the
# largest magnitudes dominate the components.
pop_scaled = scaler.fit_transform(pop_data)
pop_components = PCA(n_components=variance).fit_transform(pop_scaled)
# Store each retained component as its own feature column.
for i in range(pop_components.shape[1]):
    df[f"Population #{i}"] = pop_components[:, i]

### PCA for Currency

In [7]:
# Fill missing currency entries from the 3 nearest rows, standardise every
# column, then project onto principal components. `variance` is passed as
# n_components, so PCA chooses how many components to keep from that value.
imputer = KNNImputer(n_neighbors=3)
currency_imputed = imputer.fit_transform(currency)
currency_scaled = scaler.fit_transform(currency_imputed)
# PCA only centres its input; it does not rescale it. The projection is
# therefore fitted on the standardised matrix. Fitting it on
# currency_imputed would leave currency_scaled unused and let the columns
# with the largest magnitudes dominate the components.
currency_components = PCA(n_components=variance).fit_transform(currency_scaled)
# Store each retained component as its own feature column.
for i in range(currency_components.shape[1]):
    df[f"Currency #{i}"] = currency_components[:, i]

### Scaling the Data

In [8]:
# Re-fit the shared scaler on the assembled component table and standardise
# every column of it.
# NOTE(review): despite the name, the result is whatever the scaler returns
# (presumably a NumPy array rather than a DataFrame -- confirm).
scaled_df = scaler.fit(df).transform(df)

## Hierarchical Clustering Algorithms

### Complete Linkage

In [None]:
# Build the complete-linkage merge tree over the feature matrix using
# Euclidean distances, then draw it on a 15x7 figure.
complete1 = linkage(scaled_df, 'complete', 'euclidean')
plt.figure(figsize=(15, 7))
dendrogram(complete1)
plt.show()

In [None]:
# Cut the complete-linkage tree at 70% of its largest merge distance
# (column 2 of the linkage matrix), then report how many flat clusters
# result, their sizes, and the silhouette score of the partition.
complete_clusters1 = fcluster(
    complete1,
    t=0.7 * complete1[:, 2].max(),
    criterion='distance',
)
clusters, counts = np.unique(complete_clusters1, return_counts=True)
print("Number of Clusters:", clusters.size)
print("Cluster Sizes:", counts)
print("Silhouette Score:", silhouette_score(scaled_df, labels=complete_clusters1))

### Average Linkage

In [None]:
# Build the average-linkage merge tree over the feature matrix using
# Euclidean distances, then draw it on a 15x7 figure.
average1 = linkage(scaled_df, 'average', 'euclidean')
plt.figure(figsize=(15, 7))
dendrogram(average1)
plt.show()

In [None]:
# Cut the average-linkage tree at 70% of its largest merge distance
# (column 2 of the linkage matrix), then report how many flat clusters
# result, their sizes, and the silhouette score of the partition.
average_clusters1 = fcluster(
    average1,
    t=0.7 * average1[:, 2].max(),
    criterion='distance',
)
clusters, counts = np.unique(average_clusters1, return_counts=True)
print("Number of Clusters:", clusters.size)
print("Cluster Sizes:", counts)
print("Silhouette Score:", silhouette_score(scaled_df, labels=average_clusters1))

### Single Linkage

In [None]:
# Build the single-linkage merge tree over the feature matrix using
# Euclidean distances, then draw it on a 15x7 figure.
single1 = linkage(scaled_df, 'single', 'euclidean')
plt.figure(figsize=(15, 7))
dendrogram(single1)
plt.show()

In [None]:
# Cut the single-linkage tree at 70% of its largest merge distance
# (column 2 of the linkage matrix), then report how many flat clusters
# result, their sizes, and the silhouette score of the partition.
single_clusters1 = fcluster(
    single1,
    t=0.7 * single1[:, 2].max(),
    criterion='distance',
)
clusters, counts = np.unique(single_clusters1, return_counts=True)
print("Number of Clusters:", clusters.size)
print("Cluster Sizes:", counts)
print("Silhouette Score:", silhouette_score(scaled_df, labels=single_clusters1))

### Ward Linkage

In [None]:
# Build the Ward-linkage merge tree over the feature matrix using
# Euclidean distances, then draw it on a 15x7 figure.
ward1 = linkage(scaled_df, 'ward', 'euclidean')
plt.figure(figsize=(15, 7))
dendrogram(ward1)
plt.show()

In [None]:
# Cut the Ward-linkage tree at 70% of its largest merge distance
# (column 2 of the linkage matrix), then report how many flat clusters
# result, their sizes, and the silhouette score of the partition.
ward_clusters1 = fcluster(
    ward1,
    t=0.7 * ward1[:, 2].max(),
    criterion='distance',
)
clusters, counts = np.unique(ward_clusters1, return_counts=True)
print("Number of Clusters:", clusters.size)
print("Cluster Sizes:", counts)
print("Silhouette Score:", silhouette_score(scaled_df, labels=ward_clusters1))