<a href="https://colab.research.google.com/github/Mukeshvudayagiri/upgrad-mlops-intro/blob/main/Clustering_KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Learning

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# use seaborn plotting defaults
import seaborn as sns; sns.set()

# Application of KMeans Clustering

## Dataset

- Hand-written digits dataset
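- A small MNIST-like dataset: the UCI Optical Recognition of Handwritten Digits set, with 8×8-pixel images ([uci link](https://archive.ics.uci.edu/ml/datasets/optical+recognition+of+handwritten+digits)). The full MNIST (Modified National Institute of Standards and Technology) database uses larger 28×28 images.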
- Commonly used for classification and clustering exercises

![](https://upload.wikimedia.org/wikipedia/commons/2/27/MnistExamples.png)

In [None]:
from sklearn.datasets import load_digits
# Load the bundled hand-written digits dataset; the returned object exposes
# the `.data` (features) and `.target` (labels) attributes used below.
digits = load_digits()

In [None]:
# Inspect the dimensions of the feature matrix.
digits.data.shape

In [None]:
# Peek at the first 20 ground-truth labels.
digits.target[:20]

In [None]:
# View the first sample's pixel values arranged as an 8x8 grid.
digits.data[0].reshape((8,8))

In [None]:
# Render the first sample as an 8x8 image (trailing ';' hides the repr).
first_digit = digits.data[0].reshape((8, 8))
plt.imshow(first_digit, cmap=plt.cm.binary);

In [None]:
# Draw the first five samples in a single row, each annotated with its true label.
fig, axes = plt.subplots(1, 5, figsize=(10, 3),
                         subplot_kw=dict(xticks=[], yticks=[]))
for ax, pixels, digit in zip(axes, digits.data, digits.target):
    ax.imshow(pixels.reshape((8, 8)), cmap=plt.cm.binary)
    ax.text(0, 0, digit)

In [None]:
# Draw the first 100 samples on a 10x10 grid (row-major order), each
# annotated with its true label in the top-left corner.
fig, axes = plt.subplots(10, 10, figsize=(10, 10),
                         subplot_kw=dict(xticks=[], yticks=[]))
for idx, ax in enumerate(axes.flat):
    ax.imshow(digits.data[idx].reshape((8, 8)), cmap=plt.cm.binary)
    ax.text(0, 0, digits.target[idx])

In [None]:
from sklearn.cluster import KMeans

# Partition the samples into 10 clusters (one per digit class, ideally).
# - `precompute_distances` no longer exists in scikit-learn (removed in 1.0),
#   so passing it raises a TypeError; it is dropped here.
# - algorithm='full' was renamed to 'lloyd' (scikit-learn >= 1.1); same
#   classic EM-style algorithm.
# - n_init=10 pins the number of random restarts, whose library default
#   changed in recent releases.
# - random_state makes the cluster ids reproducible under Restart & Run All.
est = KMeans(n_clusters=10, algorithm='lloyd', n_init=10, max_iter=100,
             random_state=0)
# Fit the model and get the cluster index assigned to every sample.
clusters = est.fit_predict(digits.data)

In [None]:
# Cluster ids assigned to the first 10 samples. These are arbitrary indices,
# not digit values, so they need not match the true labels.
clusters[:10]

In [None]:
# True digit labels of the same first 10 samples, for comparison.
digits.target[:10]

In [None]:
# Dimensions of the learned centroid array.
est.cluster_centers_.shape

In [None]:
# Visualise each learned centroid as an 8x8 image on a 2x5 grid.
fig, axes = plt.subplots(2, 5, figsize=(8, 3),
                         subplot_kw=dict(xticks=[], yticks=[]))
for ax, center in zip(axes.flat, est.cluster_centers_):
    ax.imshow(center.reshape((8, 8)), cmap=plt.cm.binary)

In [None]:
from scipy.stats import mode

# Cluster ids are arbitrary, so translate each one into a digit by majority
# vote: every member of a cluster receives the most frequent true label
# found among that cluster's members.
labels = np.zeros_like(clusters)
for cluster_id in range(10):
    members = clusters == cluster_id
    majority_digit = mode(digits.target[members])[0]
    labels[members] = majority_digit

In [None]:
# Show the first samples that were assigned `label`, annotated with the
# true digit (top) and the assigned label (bottom).
label = 0

# Compute the selection once instead of re-masking inside the loop.
selected = (labels == label)
selected_images = digits.data[selected]
selected_targets = digits.target[selected]

# Never index past the number of samples that actually carry this label.
n_show = min(10, len(selected_images))

fig = plt.figure(figsize=(20,3))
for i in range(n_show):
    ax = fig.add_subplot(1,10,i+1, xticks=[], yticks=[])
    ax.imshow(selected_images[i].reshape((8,8)), cmap=plt.cm.binary)
    ax.text(0,0,selected_targets[i])
    # Every selected sample has the same assigned label by construction.
    ax.text(0,7,label)

# Visualizing the clusters

- How can we visualize clusters that live in a 64-dimensional space?
    - Compress the data to a visualizable number of dimensions (here, 2D via PCA)

In [None]:
from sklearn.decomposition import PCA

# Project the samples onto their first two principal components so they
# can be drawn on a plane.
pca = PCA(n_components=2)
X = pca.fit_transform(digits.data)

In [None]:
# Shared scatter styling.
# - `plt.cm.get_cmap` was removed from Matplotlib (3.9); `plt.get_cmap`
#   takes the same (name, lut) arguments and still exists.
# - vmin/vmax pin the colour scale to the digit range 0..9 so that a given
#   digit gets the same colour in both panels, even if some digit is never
#   predicted on the left.
kwargs = dict(cmap=plt.get_cmap('rainbow', 10), edgecolor='none', alpha=0.6,
              vmin=0, vmax=9)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left: points coloured by the majority-vote label of their cluster.
ax1.scatter(X[:, 0], X[:, 1], c=labels, **kwargs)
ax1.set_title('learned cluster labels')

# Right: the same points coloured by ground truth.
sc = ax2.scatter(X[:, 0], X[:, 1], c=digits.target, **kwargs)
ax2.set_title('true labels')
# One legend for the whole figure, built from the right-hand scatter.
fig.legend(*sc.legend_elements(), loc='upper right');

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of samples whose majority-vote cluster label equals the true digit.
accuracy_score(digits.target, labels)

In [None]:
from sklearn.metrics import confusion_matrix
# Arguments are (true, predicted): true digits versus assigned labels.
print(confusion_matrix(digits.target, labels))

In [None]:
# Heatmap of the confusion matrix (true digit vs. assigned label).
conf_mat = confusion_matrix(digits.target, labels)
fig, ax = plt.subplots()
heatmap = ax.imshow(conf_mat, cmap='Blues', interpolation='nearest')
fig.colorbar(heatmap, ax=ax)
ax.grid(False)  # seaborn's default grid would obscure the cells
ax.set_ylabel('true')
ax.set_xlabel('predicted');