# **Importing Libraries**

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


In [3]:
# Location of the raw country-level dataset (Kaggle input directory layout).
data_path = '../input/unsupervised-learning-on-country-data/Country-data.csv'
df = pd.read_csv(data_path)

In [4]:
# Preview the first rows to check that the file was parsed as expected.
df.head()


In [5]:
# (rows, columns) of the loaded frame.
df.shape

In [6]:
# Column dtypes and non-null counts.
df.info()

In [7]:
# Summary statistics of the numeric columns.
df.describe()

In [8]:
# Number of missing values in each column.
df.isnull().sum()

# **Data distribution**

In [9]:
# Use the country name as the row index so that only feature columns remain
# as columns. The guard makes the cell safe to re-run: once 'country' has
# become the index it is no longer a column, and calling set_index again
# would raise a KeyError.
if 'country' in df.columns:
    df = df.set_index('country')

In [10]:
# Sort once and take BOTH the bar labels and the bar heights from the same
# sorted Series, so every bar stays paired with its own country. (Sorting only
# the y values while passing the unsorted index as x leaves the two in
# different orders.)
child_mort_sorted = df['child_mort'].sort_values()

plt.figure(figsize=(20,5))
sns.barplot(x=child_mort_sorted.index, y=child_mort_sorted.values)
plt.xlabel('country')
plt.ylabel('child_mort')
plt.xticks(rotation=90)
plt.show()

In [11]:
# One histogram per column, each drawn on its own figure and titled with the
# column name; all figures are rendered together after the loop.
columns = df.columns

for column_name in columns:
    fig, ax = plt.subplots()
    ax.hist(df[column_name])
    ax.set_title(column_name)

plt.show()

   
    
   


# Data evaluation and reduction

In [12]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlations = df.corr()

plt.figure(figsize=(15, 10))
sns.heatmap(correlations, annot=True, square=True)
plt.show()

life_expect, high correlation with child mortality

total_fertility, high correlation with child mortality

income, high correlation with gdpp

# Scale the data: MinMaxScaler (normalised)


In [14]:
# Rescale every column with MinMaxScaler so no single feature dominates the
# distance computations used by PCA and clustering.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(df)

# Keep the original row index on the scaled frame so each scaled row can
# still be traced back to the row it came from.
df_scaled = pd.DataFrame(data_scaled, columns=df.columns, index=df.index)
print(df_scaled.head())

# PCA: Principal Component Analysis

In [24]:
# Fit a PCA that keeps every component, to inspect how the variance is spread
# across components before choosing how many to retain.
model = PCA().fit(data_scaled)
transformed = model.transform(data_scaled)

for summary in (model.n_components_, transformed.shape, model.explained_variance_ratio_):
    print(summary)

In [20]:
# Explained variance per principal component. The ticks are labelled
# PC1..PCn: a principal component is a combination of all input features, so
# labelling the bars with the original column names would be misleading (and
# would fail whenever the column count differs from n_components_).
features = range(model.n_components_)
component_names = [f'PC{i + 1}' for i in features]

plt.figure(figsize =(12,10))
plt.bar(features, model.explained_variance_)
plt.xticks(features, component_names)
plt.xlabel('Principal component')
plt.ylabel('Explained variance')
plt.show()

**We can see that 4 principal components explain about 90% of the variance of the original data.**

In [23]:
# Refit PCA keeping only the first 4 components and project the scaled data
# onto them; pca_features is the reduced input used for clustering below.
model = PCA(n_components = 4)
model.fit(data_scaled)
pca_features = model.transform(data_scaled)
print(model.n_components_)
print(pca_features.shape)
# Share of the total variance retained by the kept components (replaces a
# stray bare `print` that did nothing).
print(model.explained_variance_ratio_.sum())


# Model: K-Means Clustering

In [30]:
# Elbow curve on the PCA-reduced data: fit K-Means for k = 1..9 and record the
# inertia of each fit.
# random_state and n_init are fixed so the curve is identical on every run;
# K-Means initialisation is random otherwise.
ks = range(1,10)
inertias = []
for k in ks:
    kn = KMeans(n_clusters=k, n_init=10, random_state=42)
    kn.fit(pca_features)
    inertias.append(kn.inertia_)

plt.plot(ks,inertias,marker='o')
plt.xticks(ks)
plt.title('Elbow curve: PCA features')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [31]:
# Elbow curve on the full scaled data (no PCA): fit K-Means for k = 1..9 and
# record the inertia of each fit.
# random_state and n_init are fixed so the curve is identical on every run;
# K-Means initialisation is random otherwise.
ks = range(1,10)
inertias = []
for k in ks:
    kn = KMeans(n_clusters=k, n_init=10, random_state=42)
    kn.fit(data_scaled)
    inertias.append(kn.inertia_)

plt.plot(ks,inertias,marker='o')
plt.xticks(ks)
plt.title('Elbow curve: scaled data')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

**After running the K-Means model on both the normalised dataset and its 4-component PCA projection (fitted on the same min-max-scaled data), we can see that the optimal number of clusters is still 3, with different levels of inertia. Two clusters could also be considered, based on the results for the PCA-reduced dataset.**

# Cluster analysis

In [32]:
# Final 3-cluster K-Means on the PCA features.
# The seed is fixed because cluster ids (0/1/2) are assigned arbitrarily by the
# random initialisation; without it the ids can change between runs and no
# longer match the cluster descriptions written below.
# NOTE(review): after seeding, re-check that the cluster ids in the narrative
# still match the cluster table.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# fit_predict fits the model and returns the label of every training row.
labels = kmeans.fit_predict(pca_features)

In [33]:
# Attach the K-Means label of each row as a 'cluster' column.
df = df.assign(cluster=labels)

In [34]:
# Pairwise scatter plots of all columns, coloured by K-Means cluster.
# The frame is passed exactly once: supplying it both positionally and as
# data= raises "got multiple values for argument 'data'".
sns.pairplot(df, hue='cluster')
plt.show()

# Cluster Table

In [35]:
# Per-cluster summary table: pivot_table's default aggregation (mean) of the
# columns, grouped by the K-Means cluster label.
cluster_key = df['cluster']
clusters_table = df.pivot_table(index=cluster_key)
clusters_table

In [36]:
# Index labels of the rows assigned to cluster 2.
in_cluster_2 = df['cluster'] == 2
print(df.loc[in_cluster_2].index)

**Cluster 2: This cluster is characterised by having the most negative values: high child mortality, lowest economic development, low gdpp, exports and imports, lowest life expectancy**

In [58]:
# Index labels of the rows assigned to cluster 1, followed by the size of
# every cluster.
in_cluster_1 = df['cluster'] == 1
print(df.loc[in_cluster_1].index)
print(df['cluster'].value_counts())

**Cluster 1: This cluster is characterised by showing really strong or positive values such as good economic development, high life expectancy, low child mortality**

In [38]:
# Index labels of the rows assigned to cluster 0.
in_cluster_0 = df['cluster'] == 0
print(df.loc[in_cluster_0].index)

**Cluster 0: This cluster is characterised by showing average values for all features when comparing with other clusters**

# Conclusion

Based on an initial assessment of the average values of each cluster, Cluster 2 could be the focus for further analysis. However, when we plot the clusters and look at the graphs, we see that some clusters overlap and others are widely spread out.

Utilising PCA as an alternative did not result in a significant difference.

We've been able to identify some patterns in the data and group countries into 3 clusters. However, we should not rely solely on this result to recommend which countries should receive funding. There are a few alternatives to explore before we can make this recommendation.

The clustering can be considered as a preprocessing step and further analysis is required.

# Hierarchical Clustering

In [47]:
# Complete-linkage agglomerative clustering on the scaled features, drawn as
# a dendrogram with one leaf per row of df.
plt.figure(figsize=(20,20))
mergings = linkage(data_scaled,method='complete')
# Pass the leaf labels as a plain list instead of a pandas Index.
# NOTE(review): believed to avoid an ambiguous-truth-value error on older
# SciPy releases — confirm against the SciPy version in use.
leaf_labels = df.index.tolist()
dendrogram(mergings,labels=leaf_labels,leaf_rotation=90,leaf_font_size=10)
plt.show()

In [62]:
# Cut the linkage tree at a fixed cophenetic distance to obtain flat cluster
# labels, then store them next to the K-Means labels.
# NOTE(review): 1.35 was presumably read off the dendrogram above — confirm.
cut_distance = 1.35
labels = fcluster(mergings, t=cut_distance, criterion='distance')
df['cluster_hier'] = labels

print(labels)
print(len(labels))

In [65]:
# Index labels of the rows placed in hierarchical cluster 1.
in_hier_cluster_1 = df['cluster_hier'] == 1
print(df.loc[in_hier_cluster_1].index)
