In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')



Matplotlib is building the font cache; this may take a moment.


In [None]:
# Plot styling. Matplotlib 3.6 renamed its bundled 'seaborn' style sheet to
# 'seaborn-v0_8' and newer releases no longer accept the old name, so use
# whichever alias this installation actually ships instead of hard-coding one.
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if _seaborn_style is not None:
    plt.style.use(_seaborn_style)
sns.set_palette("husl")

# Source table, read directly from the remote CSV (requires network access).
DATA_URL = 'https://raw.githubusercontent.com/datasets/world-development-indicators/master/data/world-development-indicators.csv'
data = pd.read_csv(DATA_URL)


# Features used for clustering; each entry is looked up as a column of `data`.
indicators = [
    'GDP per capita (current US$)',
    'Life expectancy at birth, total (years)',
    'School enrollment, tertiary (% gross)',
    'CO2 emissions (metric tons per capita)',
    'Access to electricity (% of population)'
]

# NOTE(review): the code below assumes a wide table with 'Year', 'Country Name'
# and one column per indicator -- confirm against the actual CSV. Checking up
# front turns a bare KeyError into a message that lists what is missing.
required_columns = ['Year', 'Country Name'] + indicators
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing expected columns: {missing_columns}")

# Keep the 2020 rows that have a value for every indicator. A single .loc call
# replaces chained indexing, and .copy() makes `df` an independent frame so a
# column can be added to it later without any link back to `data`.
df = data.loc[data['Year'] == 2020, ['Country Name'] + indicators].dropna().copy()
print(f"Number of countries with complete data: {len(df)}")
if df.empty:
    # Scaling and clustering cannot run on zero rows; stop with a clear reason.
    raise ValueError("No rows for 2020 have complete data for all indicators")


print("\nFeature Statistics:")
print(df[indicators].describe())

# Preprocessing
# Rescale every indicator with MinMaxScaler so that no single feature dominates
# the distance computations purely because of its units.
scaler = MinMaxScaler()
# Carry over df's index: fit_transform returns a bare array, and rebuilding the
# frame without an index would give it a fresh 0..n-1 index that no longer
# lines up with `df` in any label-based assignment or join.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[indicators]),
    columns=indicators,
    index=df.index,
)

# Determine optimal number of clusters using elbow method and silhouette score
inertias = []
silhouette_scores = []
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# candidate range is capped by the number of rows actually available.
K = range(2, min(11, len(df_scaled)))

for k in K:
    # n_init is spelled out because its default differs between scikit-learn
    # releases; together with random_state this keeps the curves reproducible.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(df_scaled)

    # Inertia feeds the elbow curve; the silhouette score measures separation.
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(df_scaled, labels))

# Plot both diagnostics side by side
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(15, 5))

ax_elbow.plot(K, inertias, 'bx-')
ax_elbow.set_xlabel('k')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_title('Elbow Method for Optimal k')

ax_silhouette.plot(K, silhouette_scores, 'rx-')
ax_silhouette.set_xlabel('k')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Score vs. k')

fig.tight_layout()
plt.show()

# Number of clusters used for the final model.
# NOTE(review): 4 is hard-coded -- re-check it against the elbow and silhouette
# plots whenever the input data or the indicator list changes.
optimal_k = 4
# n_init is explicit because its default differs between scikit-learn
# releases; with random_state fixed the assignment is reproducible.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
# fit_predict returns a plain array, so labels are attached to df by position.
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Analyze clusters
print("\nCluster Sizes:")
print(df['Cluster'].value_counts())

# Cluster centers live in scaled space; map them back through the fitted scaler
# so they can be read in the indicators' original units.
cluster_centers = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=indicators
)

print("\nCluster Centers:")
print(cluster_centers)

# Project the scaled indicators onto two principal components so the cluster
# assignment can be inspected in a single 2-D scatter plot.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)

# Draw one point per country, coloured by its cluster label
fig, ax = plt.subplots(figsize=(12, 8))
first_component = df_pca[:, 0]
second_component = df_pca[:, 1]
scatter = ax.scatter(first_component, second_component, c=df['Cluster'], cmap='viridis')
ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_title('Country Clusters Visualization (PCA)')
fig.colorbar(scatter, ax=ax)
plt.show()

# Create parallel coordinates plot
# All features share one y-axis in this chart, so it is drawn from the scaled
# values: in raw units the largest-magnitude indicator would flatten the rest.
# Labels are attached by position (to_numpy) so the frame's index is irrelevant.
parallel_frame = df_scaled.assign(Cluster=df['Cluster'].to_numpy())

plt.figure(figsize=(15, 8))
pd.plotting.parallel_coordinates(
    parallel_frame,
    'Cluster',
    cols=indicators,
    # Passing the colormap by name avoids plt.cm.get_cmap, which recent
    # Matplotlib releases have deprecated and removed.
    colormap="Set2"
)
plt.xticks(rotation=45)
plt.ylabel('Scaled value')
plt.title('Parallel Coordinates Plot of Clusters')
plt.tight_layout()
plt.show()

# Per-cluster report: size, a few member countries, and the indicator means.
# The countries listed are simply the first five rows of each cluster in frame
# order; no ranking is applied.
print("\nCluster Characteristics:")
for cluster_id in range(optimal_k):
    members = df.loc[df['Cluster'] == cluster_id]
    first_names = members['Country Name'].iloc[:5].tolist()
    indicator_means = members[indicators].mean()

    print(f"\nCluster {cluster_id} ({len(members)} countries):")
    print("Top 5 countries:", ", ".join(first_names))
    print("\nAverage values:")
    print(indicator_means)

# Write the labelled table (countries, indicators, cluster id) next to the notebook
df.to_csv("world_bank_clusters.csv", index=False)

# Create boxplots for each feature by cluster
# The grid is sized from the indicator list (two panels per row, rounded up)
# so adding or removing indicators does not overflow a fixed layout.
n_cols = 2
n_rows = -(-len(indicators) // n_cols)  # ceiling division without importing math

plt.figure(figsize=(15, 10))
for position, indicator in enumerate(indicators, 1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(data=df, x='Cluster', y=indicator)
    plt.xticks(rotation=0)
    plt.title(f'{indicator} by Cluster')
# Lay the figure out once, after every panel exists, rather than per panel.
plt.tight_layout()
plt.show()

# Calculate and visualize feature importance
# Fit a PCA that keeps every component so all loadings can be inspected.
pca_full = PCA()
pca_full.fit(df_scaled)

# Feature importance based on PCA: absolute loading of each indicator on each
# component. Rows get explicit PC1..PCn labels so the heatmap's y-axis is not
# a column of anonymous 0-based integers.
component_labels = [
    f'PC{number}' for number in range(1, pca_full.components_.shape[0] + 1)
]
feature_importance = pd.DataFrame(
    np.abs(pca_full.components_),
    columns=indicators,
    index=component_labels,
)

plt.figure(figsize=(12, 6))
sns.heatmap(feature_importance, annot=True, cmap='YlOrRd')
plt.title('Feature Importance in Principal Components')
plt.tight_layout()
plt.show()