# Lab 6: Clustering


## Preprocess Data: check for missing values

In [70]:
import numpy as np
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import normalized_mutual_info_score

heartData = pd.read_csv("D:\Programming\Python_code\PrinciplesOfDS_Course\Labs\Data\heart.csv")
print(heartData.info()) # all data types int or float, no non-numeric features. No categorical features to convert 

# Check for missing values 
heartData.isna().sum()/heartData.shape[0] # all values 0. No missing values. 

X, y = heartData.drop(columns='target'), heartData.target

normalizer = StandardScaler()
X_norm = normalizer.fit_transform(X)
print("X", X.iloc[:10,])
print('X_norm',X_norm[:10,])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None
X    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1     

In [71]:
# no point in doing more than 2 clusters as we know there are only 2 possible categories already
cluster = KMeans(n_clusters=2, random_state=1).fit(X_norm) 

nmi = normalized_mutual_info_score(cluster.labels_, y, average_method='arithmetic')
print('The normalized mutual information score of the K-means method is {:.4f}'.format(nmi))

The normalized mutual information score of the K-means method is 0.3234




## Try different hyper-parameters


In [72]:
test_k = [2,3,4,5]
test_n_init = [10, 20, 30, 50]
test_init = ['k-means++', 'random']

best_nmi = 0
best_k = None
best_n_init = None
best_init = None

for k in test_k:
    for n in test_n_init:
        for init in test_init:
            temp_cluster = KMeans(n_clusters=k, init=init, n_init=n).fit(X_norm)
            temp_nmi = normalized_mutual_info_score(temp_cluster.labels_, y, average_method='arithmetic')
            print(temp_nmi)
            if(temp_nmi > best_nmi):
                best_nmi = temp_nmi
                best_k = k
                best_n_init = n
                best_init = init

print("Best of each category:\nBest nmi {}\nBest k cluster {}\nBest n_init {}\nBest init {}".format(best_nmi, best_k, best_n_init, best_init))

0.32341194603576945
0.3234119460357694
0.3234119460357694
0.32341194603576945
0.32967483885081544
0.32967483885081544
0.3234119460357694
0.3296748388508154
0.2749021353467294
0.25373964228288276
0.2749021353467294
0.2749021353467294
0.27136707411437866
0.2749021353467294
0.2749021353467294
0.2749021353467294
0.19537049518825259
0.2030474974165272
0.20093935290612572
0.20624278848512545
0.20486584603827884
0.21188736682468978
0.20624278848512545
0.19688080257753196
0.19580947694841144
0.19514005531513862
0.1989861459774026
0.195209873123672
0.1950388717106049
0.18810542151947232
0.197761964850533
0.1968382531471918
Best of each category:
Best nmi 0.32967483885081544
Best k cluster 2
Best n_init 30
Best init k-means++


### Thoughts on different parameters: 
As stated above, there really isnt a point in trying to use more than 2 clusters, as we only have 2 possible categories of data. We know this going in, so we know the optimal cluster amount going in. Results substantially change for the worse with greater cluster amounts, further, there is no real imrovement on the nmi score in testing other hyperparameters. 

## Agglomerative Clustering:


In [73]:
agg_cluster = AgglomerativeClustering(n_clusters=2).fit(X)

nmi = normalized_mutual_info_score(agg_cluster.labels_, y, average_method='arithmetic')
print('The normalized mutual information score of the Agglomerative clustering method is {:.4f}'.format(nmi))

The normalized mutual information score of the Agglomerative clustering method is 0.0111


### Test different hyperparameters: 


In [74]:
n_clusters = [2,3,4,5,6,7,8,9,10]
linkage_setting = ['ward','complete','average','single']

best_agg_nmi = 0
best_setting = None
best_n_clusters = 0
for n in n_clusters:
    for s in linkage_setting:
        agg_cluster_temp = AgglomerativeClustering(n_clusters=n, linkage=s).fit(X)
        temp_nmi = normalized_mutual_info_score(agg_cluster_temp.labels_, y, average_method='arithmetic')
        print(temp_nmi)
        if(temp_nmi > best_agg_nmi):
            best_agg_nmi = temp_nmi
            best_setting = s
            best_n_clusters = n

print("Best nmi {:.4f}\nBest setting {}\nBest n_clusters {}".format(best_agg_nmi,best_setting, best_n_clusters))

0.011063515436362354
0.00027129245492457866
0.00027129245492457866
0.005652750492624307
0.009001669557496405
0.03393886998938809
0.005272370527725735
0.012547879744986242
0.009229764064007892
0.036507355506030355
0.02674253534651711
0.01749723859550799
0.01326847734067055
0.04322159014456702
0.030500596599103175
0.016133747429041743
0.02871304161117195
0.06274143379776896
0.03129531303084799
0.021841921862053443
0.03690019438473182
0.06378177909188953
0.0414857169831583
0.025893914500633223
0.036909483511863635
0.05676875387176104
0.051571308995400744
0.026296269762886657
0.03828095912038763
0.0605508197088417
0.050286051160642456
0.031208774165090748
0.03866165426109681
0.060131912673690044
0.051291767771655096
0.03592698315155799
Best nmi 0.0638
Best setting complete
Best n_clusters 7


## Discussion: 
While it appears like 7 clusters performs better than lower numbers of them for the agglomerative clustering method, I don't think this is because there actually exist 7 distinct clusters in our data. The fact that at best we achieve a .06 NMI score also implies there is very little correlation between the clusters found and the base data. This score is substantially worse than the best score found for KMeans clustering, with ~.33 NMI. 