# Part 3: An Unsupervised Model / Clustering

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

### Import the dataset

In [2]:
# Read the pickled samples object from the project's data directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
with open('data/all_samples.pkl', mode='rb') as f:
    all_samples = pickle.load(f)

### Set Up X

In [9]:
# Feature matrix for clustering: every column except the 'is_cancer' column,
# which is excluded so the clustering does not see it.
X = all_samples.drop(labels=['is_cancer'], axis='columns')

In [11]:
# Standardize each feature column (zero mean, unit variance) so that no single
# feature dominates the Euclidean distances KMeans relies on.
scaler = StandardScaler()
X_sc = scaler.fit_transform(X)
X_sc

array([[-2.47188075,  0.62072527, -1.93928452, ..., -1.03836614,
         0.09949178, -0.3822872 ],
       [-1.32924643,  0.62072527,  0.4601481 , ...,  0.65474557,
        -0.31077208, -0.56461261],
       [-0.04378282,  0.62072527,  0.28876006, ..., -0.32204965,
         0.09949178, -0.56461261],
       ...,
       [ 0.67036363, -3.33252721, -1.76789648, ...,  1.2408227 ,
         0.09949178, -0.19996179],
       [-0.18661211, -2.34421409, -2.22493126, ...,  0.65474557,
        -0.31077208, -0.19996179],
       [ 0.95602221, -1.35590097, -1.0823443 , ..., -0.51740869,
        -0.31077208, -0.3822872 ]])

### Cluster with KMeans

In [13]:
# Baseline: KMeans with scikit-learn's default settings (n_clusters=8).
# random_state is pinned so the cluster assignments -- and the label ids shown
# in the tables further down -- are reproducible across kernel restarts.
kmeans = KMeans(random_state=123)
kmeans.fit(X_sc)

KMeans()

In [14]:
# Distinct cluster ids assigned by the fitted model, i.e. how many clusters
# the default KMeans produced.
default_cluster_ids = set(kmeans.labels_)
default_cluster_ids

{0, 1, 2, 3, 4, 5, 6, 7}

In [None]:
# Silhouette sweep: fit one KMeans per candidate k (2..14) and record its
# silhouette score so the candidates can be compared (higher is better).
#
# The fitted model is bound to a loop-local name on purpose: reusing `kmeans`
# here would overwrite the default model fitted earlier, and the later
# 'kmeans_default_labels' column would silently receive the k=14 labels when
# the notebook is run top to bottom.
kmeans_sil = []

for k in range(2, 15):
    km_k = KMeans(n_clusters=k, random_state=123)  # fixed seed for reproducibility
    km_k.fit(X_sc)
    kmeans_sil.append(silhouette_score(X_sc, km_k.labels_))


#### An elbow plot of the silhouette score:

In [None]:
# Silhouette score for each candidate k (same 2..14 range as the sweep).
# The Axes returned by seaborn is labelled so the figure can be read on its
# own; the trailing ';' suppresses the repr of ax.set(...) in the cell output.
ax = sns.lineplot(x=list(range(2, 15)), y=kmeans_sil)
ax.set(
    xlabel='Number of clusters (k)',
    ylabel='Silhouette score',
    title='KMeans silhouette score by k',
);

In [None]:
As seen in the plot, the optimal k is the value at which the silhouette score peaks. **TODO:** state the chosen k here.

In [20]:
# Build a labelled copy of the feature frame: X itself is left untouched and
# the cluster assignment is stored in a new 'kmeans_default_labels' column.
# NOTE(review): the labels come from whichever model `kmeans` refers to when
# this cell runs -- confirm it is still the default (8-cluster) fit.
cluster_labels = kmeans.labels_

all_samples_kmeans = X.copy()
all_samples_kmeans['kmeans_default_labels'] = cluster_labels

# Preview the first three rows, including the new label column.
all_samples_kmeans.head(3)

site,cg16045340,cg08036346,cg26702958,cg21774865,cg09516898,cg20055230,cg17863042,cg03200166,cg00912939,cg08789741,...,cg06045225,cg06969287,cg15545692,cg08989942,cg24076348,cg17754680,cg09460231,cg22502319,cg02078370,kmeans_default_labels
6005486021_R01C01,0.65,0.99,0.1,0.85,0.81,0.91,0.03,0.04,0.03,0.72,...,0.13,0.13,0.82,0.51,0.52,0.7,0.25,0.04,0.06,4
6005486021_R02C01,0.73,0.99,0.52,0.98,0.85,0.95,0.02,0.08,0.02,0.93,...,0.09,0.24,0.83,0.78,0.83,0.8,0.51,0.03,0.05,7
6005486021_R03C01,0.82,0.99,0.49,0.98,0.85,0.91,0.01,0.07,0.02,0.9,...,0.1,0.33,0.82,0.75,0.52,0.78,0.36,0.04,0.05,6


In [22]:
# Per-cluster profile: the mean of every feature column within each cluster.
by_cluster = all_samples_kmeans.groupby(by='kmeans_default_labels')
by_cluster.mean()

site,cg16045340,cg08036346,cg26702958,cg21774865,cg09516898,cg20055230,cg17863042,cg03200166,cg00912939,cg08789741,...,cg13156863,cg06045225,cg06969287,cg15545692,cg08989942,cg24076348,cg17754680,cg09460231,cg22502319,cg02078370
kmeans_default_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.806415,0.988302,0.555283,0.959811,0.848396,0.937547,0.011698,0.193302,0.030189,0.918774,...,0.122075,0.086887,0.473302,0.816887,0.527264,0.648113,0.826226,0.339811,0.031226,0.085
1,0.845377,0.98283,0.253208,0.96217,0.844811,0.908208,0.012736,0.063868,0.03217,0.94717,...,0.085755,0.089057,0.080566,0.810849,0.803962,0.788774,0.784434,0.418019,0.031321,0.074717
2,0.833947,0.983316,0.411263,0.957737,0.841421,0.919737,0.011211,0.140158,0.034632,0.938789,...,0.087579,0.092316,0.278526,0.802842,0.667421,0.692105,0.772316,0.456737,0.034789,0.073579
3,0.8725,0.983333,0.375,0.945833,0.864583,0.915833,0.004583,0.12625,0.015833,0.955417,...,0.15375,0.2075,0.373333,0.875417,0.858333,0.819167,0.823333,0.681667,0.158333,0.207917
4,0.79129,0.98086,0.430108,0.937634,0.82828,0.915591,0.013118,0.103763,0.039462,0.888387,...,0.094086,0.083441,0.301398,0.801505,0.525699,0.596452,0.773656,0.331505,0.032151,0.066774
5,0.7164,0.98,0.392,0.9672,0.8168,0.9216,0.0136,0.1516,0.0344,0.8496,...,0.0796,0.0924,0.4292,0.8016,0.3312,0.502,0.7828,0.3056,0.0328,0.0852
6,0.814328,0.978806,0.500149,0.962239,0.835522,0.90806,0.011045,0.092985,0.036716,0.91791,...,0.060149,0.080448,0.16194,0.797015,0.781343,0.473284,0.79806,0.321642,0.03597,0.058507
7,0.842195,0.986829,0.539919,0.958049,0.846016,0.933252,0.011951,0.209675,0.031463,0.947967,...,0.109756,0.092764,0.424715,0.814959,0.686748,0.744553,0.817317,0.463821,0.035122,0.091626
