# K-Means Lab


## Import required packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import silhouette_score

In [None]:
# Load the preprocessed report; the pivots below rely on its 'Name',
# 'Continent' and 'year' columns.
report_df = pd.read_csv('WH Report_preprocessed.csv')

# Wide preview (displayed only, not stored): one row per (Name, Continent)
# pair and one column per (indicator, year) pair.
report_df.pivot(
    columns='year',
    index=['Name', 'Continent'],
    values=[
        'population',
        'Life_Ladder',
        'Log_GDP_per_capita',
        'Social_support',
        'Healthy_life_expectancy_at_birth',
        'Freedom_to_make_life_choices',
        'Generosity',
        'Perceptions_of_corruption',
        'Positive_affect',
        'Negative_affect',
    ],
)

In [None]:
# Wide table used by the rest of the notebook: one row per Name and one
# column per (indicator, year) pair.
report_pdf = report_df.pivot(
    columns='year',
    index=['Name'],
    values=[
        'population',
        'Life_Ladder',
        'Log_GDP_per_capita',
        'Social_support',
        'Healthy_life_expectancy_at_birth',
        'Freedom_to_make_life_choices',
        'Generosity',
        'Perceptions_of_corruption',
        'Positive_affect',
        'Negative_affect',
    ],
)
report_pdf

In [None]:
# Keep every year of the nine indicator groups; 'population' is left out.
Xs = report_pdf[[
    'Life_Ladder',
    'Log_GDP_per_capita',
    'Social_support',
    'Healthy_life_expectancy_at_birth',
    'Freedom_to_make_life_choices',
    'Generosity',
    'Perceptions_of_corruption',
    'Positive_affect',
    'Negative_affect',
]]
# Min-max scale each column to the [0, 1] range.
Xs = Xs.sub(Xs.min()).div(Xs.max() - Xs.min())
Xs.describe()

# Data Clusterability

## Hopkins Statistic

In [None]:
def hopkins(df,m):
    """Estimate the Hopkins statistic of ``df`` from ``m`` probe points.

    The data is rescaled column-wise, then two sets of nearest-neighbour
    distances are collected against the rescaled data:

    * ``u``: distance from each of ``m`` uniformly random points (drawn
      inside the bounding box of the rescaled data) to its nearest data point;
    * ``w``: distance from each of ``m`` sampled data points to its nearest
      *other* data point.

    The function returns ``sum(u) / (sum(u) + sum(w))``. With this form,
    values close to 0.5 mean the data looks like the uniform reference and
    values close to 1 mean the data points sit much closer to each other
    than to random locations.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data, one row per observation. Needs at least two rows
        because two neighbours are requested per query.
    m : int
        Number of probe points of each kind; must be at least 1.

    Returns
    -------
    float
        The statistic. It is random: neither the uniform points nor the
        sampled rows are seeded.

    Raises
    ------
    ValueError
        If ``m`` is smaller than 1.
    """
    from sklearn.neighbors import NearestNeighbors
    from pandas import DataFrame
    from numpy import random

    if m < 1:
        raise ValueError('m must be at least 1')

    d = len(df.columns) # columns
    n = len(df) # rows

    # Rescale each column to [-1, 1], then to unit standard deviation.
    df = (df - df.min())/(df.max()-df.min()) *2 -1
    df = df / df.std()

    knn = NearestNeighbors(n_neighbors=2).fit(df)

    # Uniform points in [-1, 1]^d stretched to the per-column extent of the
    # rescaled data.
    rand_df = DataFrame(random.rand(m,d),index = range(0,m),columns =df.columns )
    rand_df = rand_df*2-1
    rand_df = rand_df * df.abs().max()

    # Random points are not part of the fitted data, so their first
    # neighbour is the nearest real point.
    u_dist, _ = knn.kneighbors(rand_df)
    ujd = u_dist[:, 0]

    # Draw distinct real points whenever possible; drawing them one at a
    # time could pick the same row repeatedly. Fall back to sampling with
    # replacement only when more probes than rows are requested.
    real_sample = df.sample(m, replace=m > n)
    # A real point is its own nearest neighbour (distance 0), so take the
    # second neighbour.
    w_dist, _ = knn.kneighbors(real_sample)
    wjd = w_dist[:, 1]

    return(ujd.sum() / (ujd.sum() + wjd.sum()))

In [None]:
# Number of probe points of each kind used by the Hopkins estimate.
m = 10
hopkins(df=Xs, m=m)

# K-Means Clustering

In [None]:
# Partition the scaled data into seven clusters.
kmeans = KMeans(n_clusters=7)
kmeans.fit(Xs)

# Cluster membership: print the index labels assigned to each cluster id.
memb = pd.Series(kmeans.labels_, index=Xs.index)
for label, members in memb.groupby(memb):
    print(label, ': ', ', '.join(members.index))

# Clustering Comparison

In [None]:
def countpairs(Clustering1,Clustering2):
    """Count how two clusterings agree over every unordered pair of items.

    Parameters
    ----------
    Clustering1, Clustering2 : sequence of labels
        Cluster label of each item, in the same item order. Both must have
        the same length. They are read positionally.

    Returns
    -------
    pandas.Series
        Integer counts indexed by ``['N00', 'N01', 'N10', 'N11']``. The first
        digit is 1 when the pair shares a cluster in ``Clustering1``; the
        second digit is 1 when it shares a cluster in ``Clustering2``.

    Raises
    ------
    ValueError
        If the two clusterings do not have the same length.
    """
    from pandas import Series

    labels1 = list(Clustering1)
    labels2 = list(Clustering2)
    if len(labels1) != len(labels2):
        raise ValueError('Clustering1 and Clustering2 must have the same length')

    counts = {'N00': 0, 'N01': 0, 'N10': 0, 'N11': 0}

    # Visit each unordered pair (j < i) exactly once, over ALL items rather
    # than a fixed-size prefix.
    for i in range(len(labels1)):
        for j in range(i):
            same1 = labels1[i] == labels1[j]
            same2 = labels2[i] == labels2[j]
            counts['N{}{}'.format(int(same1), int(same2))] += 1

    return(Series(counts, index=['N00','N01','N10','N11']))

## Fowlkes–Mallows

In [None]:
def fowlkes_mallows(Clustering1,Clustering2):
    """Return the Fowlkes-Mallows index between two clusterings.

    The index is ``N11 / sqrt((N11 + N01) * (N11 + N10))``, where the pair
    counts come from ``countpairs``. It is 1 when both clusterings group
    exactly the same pairs together.

    Parameters
    ----------
    Clustering1, Clustering2 : sequence of labels
        Cluster label of each item, in the same item order.

    Returns
    -------
    float
        The index, or ``0.0`` when either clustering puts no pair of items
        in a common cluster (the ratio would otherwise be 0/0).
    """
    from numpy import sqrt

    p = countpairs(Clustering1,Clustering2)

    # Geometric mean of the "same cluster" pair counts of each clustering.
    denominator = sqrt((p.N11+p.N01)*(p.N11+p.N10))
    if denominator == 0:
        return 0.0

    return(p.N11/denominator)

# Measure K-Means consistency

In [None]:
# Fit the same estimator twice on the same data and compare the two label
# assignments; no random_state is set, so the two runs are free to differ.
kmeans = KMeans(n_clusters=7)
Clustering1 = kmeans.fit(Xs).labels_
Clustering2 = kmeans.fit(Xs).labels_

print(f'fowlkes_mallows: {fowlkes_mallows(Clustering1, Clustering2)}')

In [None]:
# Repeat the two-fit comparison 20 times to see how stable the result is.
for i in range(20):
    Clustering1 = kmeans.fit(Xs).labels_
    Clustering2 = kmeans.fit(Xs).labels_

    print(f'fowlkes_mallows: {fowlkes_mallows(Clustering1, Clustering2)}')
    print('----------------')

# Find the number of clusters using SSE

In [None]:
# Nine repetitions (R1..R9) for every candidate cluster count from 2 to 14.
repetitions = [f'R{i}' for i in range(1, 10)]

SSE_results = pd.DataFrame(0.0, columns=repetitions, index=range(2, 15))

for n_cluster in SSE_results.index:
    for col in repetitions:
        algort = KMeans(n_clusters=n_cluster)
        algort.fit(Xs)
        # inertia_: sum of squared distances of samples to their closest
        # cluster center.
        SSE_results.at[n_cluster, col] = algort.inertia_

# Summarise the repetitions per cluster count and list them by mean SSE.
SSE_results = SSE_results.assign(
    Mean=SSE_results[repetitions].mean(axis=1),
    Var=SSE_results[repetitions].var(axis=1),
)
SSE_results.sort_values('Mean')


In [None]:
# Mean SSE against the number of clusters (elbow plot).
SSE_results['Mean'].plot()
plt.show()

# Find the number of clusters using Silhouette Score

In [None]:
# Nine repetitions (R1..R9) for every candidate cluster count from 2 to 14.
repetitions = ['R{}'.format(i) for i in range(1,10)]

# Create the table as float (NaN until filled), like SSE_results. Without a
# dtype the frame is object-typed and the mean/variance/plot steps below
# depend on how pandas coerces object columns.
SIL_results = pd.DataFrame( index = range(2,15),
                       columns= repetitions, dtype=float)


for n_cluster in SIL_results.index:
    for col in SIL_results.columns:
        algort = KMeans(n_clusters=n_cluster).fit(Xs)
        SIL_results.at[n_cluster,col] = silhouette_score(Xs,algort.labels_)

# Summarise the repetitions per cluster count; higher silhouette is better,
# so list the best mean first.
SIL_results['Mean'] = SIL_results[repetitions].mean(axis=1)
SIL_results['Var'] = SIL_results[repetitions].var(axis=1)
SIL_results.sort_values('Mean',ascending=False)

In [None]:
# Mean silhouette score against the number of clusters.
SIL_results['Mean'].plot()
plt.show()

In [None]:
kmeans = KMeans(n_clusters=4)

# Stability check for k=4: compare two independent fits, 20 times over.
for i in range(20):
    Clustering1 = kmeans.fit(Xs).labels_
    Clustering2 = kmeans.fit(Xs).labels_

    print(f'fowlkes_mallows: {fowlkes_mallows(Clustering1, Clustering2)}')
    print('----------------')

In [None]:
# Final 4-cluster model on the scaled data. No random_state is set, so the
# numeric cluster ids can change between runs.
kmeans = KMeans(n_clusters=4)
kmeans.fit(Xs)
memb = pd.Series(data=kmeans.labels_, index=Xs.index)

# Centroid Analysis

In [None]:
clusters = [f'Cluster {k}' for k in range(4)]

# One row per cluster: medians of the original values and means of the
# scaled values.
Centroids_orig = pd.DataFrame(0.0, columns=report_pdf.columns, index=clusters)
Centroids_std = pd.DataFrame(0.0, columns=Xs.columns, index=clusters)

for i in range(len(clusters)):
    BM = memb == i  # boolean mask of the rows assigned to cluster i
    Centroids_std.iloc[i] = Xs[BM].mean(axis=0)
    Centroids_orig.iloc[i] = report_pdf[BM].median(axis=0)

Centroids_orig

In [None]:
# Wide figure: the scaled centroid table has one column per (indicator, year).
plt.figure(figsize=(30, 5))
sns.heatmap(Centroids_std, annot=True, cmap='Purples', linewidths=.5)
plt.show()

In [None]:
# Hand-written names for cluster ids 0..3, in that order.
# NOTE(review): the ids come from an unseeded KMeans fit, so this mapping
# presumably matches one particular run - re-check it after refitting.
replace_dic = dict(enumerate([
    'unhappy but generous',
    'generously happy but crime-ridden',
    'happy but crime-ridden',
    'Very happy',
]))
report_pdf['Cluster_noPreprocess'] = memb.replace(replace_dic)
report_pdf

# PCA Transformation

In [None]:
# Fit a PCA with all components on the scaled data.
pcs = PCA()
pcs.fit(Xs)


# One row per summary quantity once transposed. The last two entries repeat
# the variance ratio and its cumulative sum under alternative names.
pcsSummary_df = pd.DataFrame({'Standard deviation': np.sqrt(pcs.explained_variance_),
                           'Proportion of variance': pcs.explained_variance_ratio_,
                           'Cumulative proportion': np.cumsum(pcs.explained_variance_ratio_),
                             'Variance Explanation Ratio': pcs.explained_variance_ratio_,
                             'Cumulative Ratio' : np.cumsum(pcs.explained_variance_ratio_) })

pcsSummary_df = pcsSummary_df.transpose()
# Label by the number of components actually fitted: PCA keeps at most
# min(n_samples, n_features) components, which can be fewer than the number
# of columns in Xs.
pcsSummary_df.columns = ['PC{}'.format(i) for i in range(1, pcsSummary_df.shape[1] + 1)]
pcsSummary_df.round(4)

In [None]:
# Project the scaled data onto the principal components.
pc_values = pcs.fit_transform(Xs)
# Name the columns by the number of components returned, which can be
# smaller than the number of columns in Xs.
scores = pd.DataFrame(pc_values, index = report_pdf.index,
                      columns=[f'PC{i}' for i in range(1, pc_values.shape[1] + 1)])
# Keep the first eight components (PC1..PC8), or all of them if fewer exist.
scores = scores.iloc[:, :8]

scores.plot.scatter(x='PC1', y='PC2',c='blue')
plt.show()

In [None]:
# PC1 against PC2, with PC3 shown as the point colour.
scores.plot.scatter(c='PC3', cmap='YlOrRd', x='PC1', y='PC2')
plt.show()

# Find the number of clusters on the PCA scores

In [None]:
# Same SSE sweep as before, this time clustering the PCA scores.
repetitions = [f'R{i}' for i in range(1, 10)]

SSE_results = pd.DataFrame(0.0, columns=repetitions, index=range(2, 15))

for n_cluster in SSE_results.index:
    for col in repetitions:
        algort = KMeans(n_clusters=n_cluster)
        algort.fit(scores)
        # inertia_: sum of squared distances of samples to their closest
        # cluster center.
        SSE_results.at[n_cluster, col] = algort.inertia_

SSE_results = SSE_results.assign(
    Mean=SSE_results[repetitions].mean(axis=1),
    Var=SSE_results[repetitions].var(axis=1),
)
SSE_results.sort_values('Mean')


In [None]:
# Mean SSE against the number of clusters for the PCA scores.
SSE_results['Mean'].plot()
plt.show()

In [None]:
# Silhouette sweep on the PCA scores: nine repetitions for k = 2..14.
repetitions = ['R{}'.format(i) for i in range(1,10)]

# Float table (NaN until filled), like SSE_results. Without a dtype the
# frame is object-typed and the statistics/plot below depend on how pandas
# coerces object columns.
SIL_results = pd.DataFrame( index = range(2,15),
                       columns= repetitions, dtype=float)


for n_cluster in SIL_results.index:
    for col in SIL_results.columns:
        algort = KMeans(n_clusters=n_cluster).fit(scores)
        SIL_results.at[n_cluster,col] = silhouette_score(scores,algort.labels_)

SIL_results['Mean'] = SIL_results[repetitions].mean(axis=1)
SIL_results['Var'] = SIL_results[repetitions].var(axis=1)
SIL_results.sort_values('Mean',ascending=False)

In [None]:
# Mean silhouette score against the number of clusters for the PCA scores.
SIL_results['Mean'].plot()
plt.show()

In [None]:
kmeans = KMeans(n_clusters=4)

# Stability check on the PCA scores: compare two independent fits, 20 times.
for i in range(20):
    Clustering1 = kmeans.fit(scores).labels_
    Clustering2 = kmeans.fit(scores).labels_

    print(f'fowlkes_mallows: {fowlkes_mallows(Clustering1, Clustering2)}')
    print('----------------')

In [None]:
# Final 4-cluster model on the PCA scores; the labels are indexed like Xs.
kmeans = KMeans(n_clusters=4)
kmeans.fit(scores)
memb = pd.Series(data=kmeans.labels_, index=Xs.index)

# Centroid Analysis

In [None]:
clusters = ['Cluster {}'.format(i) for i in range(4)]

# report_pdf may already carry text columns (cluster names added by earlier
# cells). Medians are only defined for numeric columns, so restrict the
# "original values" table to those.
numeric_report = report_pdf.select_dtypes(include='number')

Centroids_orig = pd.DataFrame(0.0, index = clusters,
                        columns = numeric_report.columns)

Centroids_pca = pd.DataFrame(0.0, index =  clusters,
                        columns = scores.columns)
for i in range(4):
    BM = memb==i  # boolean mask of the rows assigned to cluster i
    Centroids_orig.iloc[i] = numeric_report[BM].median(axis=0)
    Centroids_pca.iloc[i] = scores[BM].mean(axis=0)

Centroids_orig

In [None]:
# Optional min-max rescaling of the centroids, left disabled:
#Centroids_pca = (Centroids_pca - Centroids_pca.min())/(Centroids_pca.max()-Centroids_pca.min())

# Heatmap of the mean PCA score of each cluster.
sns.heatmap(Centroids_pca, annot=True, cmap='Purples', linewidths=.5)
plt.show()

In [None]:
# Summary statistics of the retained principal-component scores.
scores.describe()

In [None]:
# Store the raw cluster ids from the PCA-based fit next to the report data
# (numeric ids, not the descriptive names used for Cluster_noPreprocess).
report_pdf['Cluster_pca'] = memb
report_pdf

In [None]:
# Cross-tabulate the two clusterings, then turn each column into proportions
# (each Cluster_pca column sums to 1).
contingency_tbl = pd.crosstab(report_pdf['Cluster_noPreprocess'],
                              report_pdf['Cluster_pca'])
probablity_tbl = contingency_tbl.div(contingency_tbl.sum(), axis='columns')
sns.heatmap(probablity_tbl, cmap="Greys", center=0.5, annot=True)
plt.show()

# Functional Data Reduction

In [None]:
# Worked example: summarise one country's Life_Ladder series by two numbers,
# its mean and the slope of a straight-line fit.
BM = report_df.Name =='United States'
# Build an independent frame (no in-place edits on a filtered slice) and sort
# by year so the 0..n-1 time index below follows chronological order.
wdf = (report_df.loc[BM, ['year','Life_Ladder']]
       .sort_values('year')
       .reset_index(drop=True)
       .drop(columns=['year']))
wdf['integer'] = range(len(wdf))
wdf['ones'] = 1
wdf.Life_Ladder.plot()
plt.title('USA Life Ladder')


# Linear Regression of Life_Ladder on the time index; 'integer' is the first
# predictor column, so coef_[0] is the slope per time step.
lm = LinearRegression()
lm.fit(wdf.drop(columns=['Life_Ladder']), wdf.Life_Ladder)

b = lm.intercept_
a = lm.coef_[0]

X = wdf.integer
y = b + a*X

plt.plot(X,y,label = 'Fitted regression')
plt.legend()

print('Feature one: Mean = {}'.format(wdf.Life_Ladder.mean()))
print('Feature two: Slope = {}'.format(a))

In [None]:
# Create a placeholder for preprocessing: one row per country and one column
# per (feature, indicator) pair, where the feature is 'Mean' or 'Slope'.
columns = ['Life_Ladder', 'Log_GDP_per_capita', 'Social_support',
           'Healthy_life_expectancy_at_birth', 'Freedom_to_make_life_choices',
           'Generosity', 'Perceptions_of_corruption', 'Positive_affect',
           'Negative_affect']
features = ['Mean','Slope']

my_column = pd.MultiIndex.from_product([features,columns],
                                     names=('features','columns'))

# Declare the table as float (filled with NaN). Without a dtype the frame is
# object-typed, and that object dtype carries over into the scaling,
# centroid and heatmap steps that consume it.
preprocess_df = pd.DataFrame(index = report_pdf.index,
                       columns=my_column, dtype=float)

preprocess_df

In [None]:
Countries = report_df.Name.unique()
columns = ['Life_Ladder', 'Log_GDP_per_capita', 'Social_support',
           'Healthy_life_expectancy_at_birth', 'Freedom_to_make_life_choices',
           'Generosity', 'Perceptions_of_corruption', 'Positive_affect',
           'Negative_affect']

for ct in Countries:
    # Select the country's rows once (not once per indicator) and sort them
    # by year so the 0..n-1 time index follows chronological order.
    country_rows = report_df[report_df.Name == ct].sort_values('year')
    # Predictors: the time index plus a constant column, matching the
    # single-country example; 'integer' comes first, so coef_[0] is the slope.
    time_index = pd.DataFrame({'integer': range(len(country_rows)), 'ones': 1})

    for col in columns:
        values = country_rows[col].reset_index(drop=True)
        lm = LinearRegression()
        lm.fit(time_index, values)
        preprocess_df.at[ct,('Slope',col)] = lm.coef_[0]
        preprocess_df.at[ct,('Mean',col)] = values.mean()
preprocess_df

In [None]:
# Spot-check one row: the mean/slope features computed for the United States.
preprocess_df.loc['United States']

# Find number of Clusters

In [None]:
# Min-max scale every mean/slope feature to [0, 1]; this rebinds Xs, which
# the following cells cluster.
Xs = preprocess_df.sub(preprocess_df.min()).div(
    preprocess_df.max() - preprocess_df.min()
)
Xs

In [None]:
# SSE sweep on the mean/slope features: nine repetitions for k = 2..14.
repetitions = [f'R{i}' for i in range(1, 10)]

SSE_results = pd.DataFrame(0.0, columns=repetitions, index=range(2, 15))

for n_cluster in SSE_results.index:
    for col in repetitions:
        algort = KMeans(n_clusters=n_cluster)
        algort.fit(Xs)
        # inertia_: sum of squared distances of samples to their closest
        # cluster center.
        SSE_results.at[n_cluster, col] = algort.inertia_

SSE_results = SSE_results.assign(
    Mean=SSE_results[repetitions].mean(axis=1),
    Var=SSE_results[repetitions].var(axis=1),
)
SSE_results.sort_values('Mean')


In [None]:
# Mean SSE against the number of clusters for the mean/slope features.
SSE_results['Mean'].plot()
plt.show()

In [None]:
# Silhouette sweep on the mean/slope features: nine repetitions for k = 2..14.
repetitions = ['R{}'.format(i) for i in range(1,10)]

# Float table (NaN until filled), like SSE_results. Without a dtype the
# frame is object-typed and the statistics/plot below depend on how pandas
# coerces object columns.
SIL_results = pd.DataFrame( index = range(2,15),
                       columns= repetitions, dtype=float)


for n_cluster in SIL_results.index:
    for col in SIL_results.columns:
        algort = KMeans(n_clusters=n_cluster).fit(Xs)
        SIL_results.at[n_cluster,col] = silhouette_score(Xs,algort.labels_)

SIL_results['Mean'] = SIL_results[repetitions].mean(axis=1)
SIL_results['Var'] = SIL_results[repetitions].var(axis=1)
SIL_results.sort_values('Mean',ascending=False)

In [None]:
# Mean silhouette score against the number of clusters (mean/slope features).
SIL_results['Mean'].plot()
plt.show()

# KMeans(4)

In [None]:
kmeans = KMeans(n_clusters=4)

# Stability check on the mean/slope features: two independent fits, 20 times.
for i in range(20):
    Clustering1 = kmeans.fit(Xs).labels_
    Clustering2 = kmeans.fit(Xs).labels_

    print(f'fowlkes_mallows: {fowlkes_mallows(Clustering1, Clustering2)}')
    print('----------------')

In [None]:
# Final 4-cluster model on the scaled mean/slope features.
kmeans = KMeans(n_clusters=4)
kmeans.fit(Xs)
memb = pd.Series(data=kmeans.labels_, index=Xs.index)

# Centroid Analysis

In [None]:
clusters = [f'Cluster {k}' for k in range(4)]

# One row per cluster: medians of the unscaled mean/slope features and means
# of their scaled counterparts.
Centroids_orig = pd.DataFrame(0.0, columns=preprocess_df.columns, index=clusters)
Centroids_std = pd.DataFrame(0.0, columns=Xs.columns, index=clusters)

for i in range(len(clusters)):
    BM = memb == i  # boolean mask of the rows assigned to cluster i
    Centroids_std.iloc[i] = Xs[BM].mean(axis=0)
    Centroids_orig.iloc[i] = preprocess_df[BM].median(axis=0)

Centroids_orig

In [None]:
# Heatmap of the scaled mean/slope centroids, one row per cluster.
plt.figure(figsize=(10, 3))
sns.heatmap(Centroids_std, annot=True, cmap='Purples', linewidths=.5)
plt.show()

In [None]:
# Hand-written names for the cluster ids of the mean/slope clustering.
# NOTE(review): the ids come from an unseeded KMeans fit, so this mapping
# presumably matches one particular run - re-check it after refitting.
replace_dic = {
    2: 'unhappy but generous',
    1: 'generously happy but crime-ridden',
    0: 'happy but crime-ridden',
    3: 'Very happy',
}
report_pdf['Cluster_functional'] = memb.replace(to_replace=replace_dic)
report_pdf

In [None]:
# Cross-tabulate the two named clusterings, then turn each column into
# proportions (each Cluster_functional column sums to 1).
contingency_tbl = pd.crosstab(report_pdf['Cluster_noPreprocess'],
                              report_pdf['Cluster_functional'])
probablity_tbl = contingency_tbl.div(contingency_tbl.sum(), axis='columns')
sns.heatmap(probablity_tbl, cmap="Greys", center=0.5, annot=True)
plt.show()