# Lab | Unsupervised learning

## Libraries

In [None]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

## Importing data

## Clustering the data

### Preprocessing

In [None]:
# K-Means groups points by distance, so the features are standardised first
# to stop any single feature from dominating purely because of its scale.
X_prep = StandardScaler().fit(X).transform(X)

# Preview the first five scaled rows.
pd.DataFrame(X_prep).head(5)

### Model building (k-means)

In [None]:
# Baseline model: three clusters, seeded so that reruns are reproducible.
kmeans = KMeans(random_state=42, n_clusters=3)
kmeans.fit(X=X_prep)

In [None]:
# Label every scaled observation with the cluster the fitted model assigns it.
clusters = kmeans.predict(X=X_prep)
clusters

In [None]:
# Number of observations per cluster, listed in cluster-id order.
pd.Series(clusters).value_counts(sort=False).sort_index()

In [None]:
# Explore the cluster assignment next to the original (unscaled) features.
# NOTE(review): X is defined outside this section. If it is already a
# DataFrame, pd.DataFrame(X) can share its underlying data in some pandas
# versions, so adding a column here could also add it to X and leak the
# labels into the features on a rerun. An explicit copy rules that out.
X_df = pd.DataFrame(X).copy()
X_df["cluster"] = clusters
X_df.head()

In [None]:
# "performance metric"
# Inertia of the fitted model: scikit-learn documents it as the sum of squared
# distances from each sample to its closest cluster centre (lower = tighter).
kmeans.inertia_

### Choosing the best k

In [None]:
# Single-initialisation run with verbose logging switched on, so the
# optimisation output of one seeded start is printed while fitting.
kmeans = KMeans(
    n_clusters=3,
    n_init=1,
    random_state=1234,
    verbose=1,
)
kmeans.fit(X=X_prep)
kmeans.inertia_

In [None]:
# Elbow method: fit one K-Means model per candidate k, record its inertia and
# look for the k after which the curve stops dropping sharply.
# The original cell plotted `K` and `inertia` without ever computing them.
K = range(2, 20)

# A comprehension keeps the loop variable and the per-k models local, so the
# `kmeans` object fitted in the earlier cells is not overwritten here.
inertia = [
    KMeans(n_clusters=k, random_state=1234).fit(X_prep).inertia_
    for k in K
]

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
# One tick per candidate k.
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.show()

### Silhouette score

In [None]:
# Candidate cluster counts (the silhouette score needs at least 2 clusters).
K = range(2, 20)
silhouette = []

# Fit one seeded model per k and score its assignment of the scaled data.
for k in K:
    kmeans = KMeans(random_state=42, n_clusters=k).fit(X_prep)
    silhouette.append(
        silhouette_score(X_prep, kmeans.predict(X_prep))
    )

# Plot score against k, with one tick per candidate k.
plt.figure(figsize=(16, 8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(K.start, K.stop, 1.0))
plt.show()

In [None]:
# K-Means cluster ids are arbitrary: cluster 0 need not correspond to class 0,
# so comparing the raw ids with the true labels gives a meaningless report.
# Map each cluster to the true label that occurs most often inside it first.
# NOTE(review): y is defined outside this section; it is assumed to hold one
# true label per row of X_df, in the same row order - confirm.
y_true = np.asarray(y).ravel()
cluster_ids = X_df['cluster'].to_numpy()

# Rows: cluster id, columns: true label, values: co-occurrence counts.
# idxmax(axis=1) picks the majority true label for every cluster.
cluster_to_label = pd.crosstab(cluster_ids, y_true).idxmax(axis=1)
y_pred = pd.Series(cluster_ids).map(cluster_to_label).to_numpy()

print(classification_report(y_true, y_pred))