In [None]:
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
import numpy as np

In [None]:
def cluster_labels(model, true_labels, n_clusters=10):
    """Map each ground-truth label to the cluster ids it dominates.

    For every cluster id in ``range(n_clusters)`` the true labels of the
    points assigned to that cluster are counted, and the cluster is filed
    under its most frequent label. Ties go to the smallest label, because
    ``np.argmax`` returns the first maximum.

    Parameters
    ----------
    model : numpy.ndarray
        Cluster id assigned to each sample. Despite the name this is the
        array of cluster assignments, not an estimator object.
    true_labels : numpy.ndarray
        Non-negative integer ground-truth label of each sample, aligned
        with ``model``.
    n_clusters : int, optional
        Number of cluster ids to inspect (ids ``0 .. n_clusters - 1``).
        Defaults to 10.

    Returns
    -------
    dict
        ``{label: [cluster_id, ...]}``. Clusters that contain no samples
        are left out of the mapping.
    """
    label_to_clusters = {}

    for cluster_id in range(n_clusters):
        # true labels of the points that fell into this cluster
        members = true_labels[model == cluster_id].reshape(-1)

        # An empty cluster has no majority label; np.argmax would raise
        # ValueError on the empty bincount, so skip it.
        if members.size == 0:
            continue

        # most common true label inside the cluster
        majority = int(np.argmax(np.bincount(members)))

        # file the cluster under that label, creating the list on first use
        label_to_clusters.setdefault(majority, []).append(cluster_id)

    return label_to_clusters

In [None]:
def infer_labels(X_labels, cluster_labels):
    """Translate cluster ids into class labels.

    Parameters
    ----------
    X_labels : array-like of int
        Cluster id of each sample.
    cluster_labels : dict
        Mapping ``{label: [cluster_id, ...]}`` in the form returned by the
        ``cluster_labels`` function.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with one label per sample. A sample whose cluster id
        is listed under no label keeps the initial value 0.
    """
    clusters = np.asarray(X_labels)
    predicted_labels = np.zeros(len(clusters), dtype=np.uint8)

    # One vectorised mask per label instead of a Python loop over every
    # sample. Labels are applied in dict order, so if a cluster id were
    # listed under several labels the last one wins, as before.
    for key, value in cluster_labels.items():
        predicted_labels[np.isin(clusters, value)] = key

    return predicted_labels

In [None]:
import mnist_reader
# Load the 'train' and 't10k' splits (images and labels) from data/fashion.
X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

# Divide every pixel value by 255 (presumably mapping 0-255 intensities
# into [0, 1] - verify against the loader's dtype).
X_train = X_train / 255.
X_test = X_test / 255.

In [None]:
# Define the K-means model: 10 clusters, best of 20 initialisations.
# random_state pins the centroid seeding so that Restart & Run All yields the
# same clusters every time (13 matches the seed passed to train_test_split).
km = KMeans(n_clusters = 10, n_init = 20, random_state = 13)

In [None]:
# Fit K-means on the scaled training images; the result is kept as fit_model
# and used for prediction in the next cell.
fit_model = km.fit(X_train)

In [None]:
# Assign every test image to one of the fitted clusters.
test_clusters = fit_model.predict(X_test)

# Name each cluster after the most common true label among its test points.
# NOTE(review): the cluster -> label mapping is derived from y_test itself, so
# the accuracy computed from it is not a held-out estimate - confirm intended.
kmeans_labels = cluster_labels(test_clusters, y_test)

# Convert the cluster ids into label predictions. Despite the variable name,
# these are predictions for the test set.
predicted_train_labels = infer_labels(test_clusters, kmeans_labels)

In [None]:
# Fraction of test images whose inferred label equals the true label.
Accuracy = accuracy_score(y_test, predicted_train_labels)

print("Accuracy of K-means model is ", Accuracy)

In [None]:
# Reshape each split into a 4-D array of shape (n, 28, 28, 1), matching the
# default input_shape of auto_encoder.
# NOTE: this rebinds X_train / X_test, so cells that expect the original
# array layout (e.g. the K-means fit above) should not be re-run after this one.
X_train = X_train.reshape(-1, 28,28, 1)
X_test = X_test.reshape(-1, 28,28, 1)

In [None]:
# Hold out 20% of the training images as a validation set. X_train is passed
# as both arguments, so each image comes back paired with itself as its
# target (train_ground / valid_ground), which is what the autoencoder fit uses.
from sklearn.model_selection import train_test_split
train_X,valid_X,train_ground,valid_ground = train_test_split(X_train, X_train, test_size=0.2, random_state=13)

In [None]:
from keras import Model,Sequential
from keras.layers import Flatten,Conv2D,Dense,Reshape,Conv2DTranspose

In [None]:
def auto_encoder(input_shape=(28, 28, 1), filters=[32, 64, 128, 10]):
    """Build an uncompiled convolutional autoencoder as a Sequential model.

    Encoder: three stride-2 Conv2D layers (``filters[0..2]``), Flatten, then
    a Dense layer named ``'latent_space'`` with ``filters[3]`` units.
    Decoder: Dense, Reshape, then three stride-2 Conv2DTranspose layers; the
    last one has ``input_shape[2]`` filters and no activation.

    Only ``input_shape[0]`` is used to pick the padding of conv3/deconv3 and
    to size the decoder's Dense/Reshape layers.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, indexed as ``[0]`` and ``[2]`` here.
    filters : list
        Filter counts of the three conv layers followed by the latent size.

    Returns
    -------
    The assembled Sequential model.
    """
    model = Sequential()

    # conv3 / deconv3 use 'same' padding only when input_shape[0] is a
    # multiple of 8, otherwise 'valid'
    pad3 = 'same' if input_shape[0] % 8 == 0 else 'valid'
    # side length of the feature map fed into the decoder
    side = int(input_shape[0] / 8)

    stack = (
        # encoder
        Conv2D(filters[0], 5, strides=2, padding='same', activation='relu', name='conv1', input_shape=input_shape),
        Conv2D(filters[1], 5, strides=2, padding='same', activation='relu', name='conv2'),
        Conv2D(filters[2], 3, strides=2, padding=pad3, activation='relu', name='conv3'),
        Flatten(),
        # latent space
        Dense(units=filters[3], name='latent_space'),
        # decoder
        Dense(units=filters[2]*side*side, activation='relu'),
        Reshape((side, side, filters[2])),
        Conv2DTranspose(filters[1], 3, strides=2, padding=pad3, activation='relu', name='deconv3'),
        Conv2DTranspose(filters[0], 5, strides=2, padding='same', activation='relu', name='deconv2'),
        Conv2DTranspose(input_shape[2], 5, strides=2, padding='same', name='deconv1'),
    )
    for layer in stack:
        model.add(layer)

    return model

In [None]:
# Build the autoencoder with the default input shape and filter sizes.
autoencoder = auto_encoder()

In [None]:
# Print the autoencoder's summary.
autoencoder.summary()

In [None]:
# Compile the autoencoder with the RMSprop optimizer and mean-squared-error loss.
autoencoder.compile(optimizer='RMSprop', loss='mse')

In [None]:
# Train the autoencoder on the training split (inputs are their own targets)
# and evaluate on the validation split after every epoch.
# NOTE: despite its name, `model` holds the object returned by fit(); its
# .history dict is what the loss-plotting cell reads.
model = autoencoder.fit(train_X, train_ground, batch_size=256,epochs=100, validation_data = (valid_X, valid_ground))

In [None]:
from matplotlib import pyplot as plt
# Plot the training loss and validation loss recorded for each epoch.
loss = model.history['loss']
val_loss = model.history['val_loss']
# Take the x-axis from the recorded history instead of a hard-coded 100, so
# the plot still works if the epoch count changes or training stops early.
epochs = range(len(loss))
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()

In [None]:
# Build the encoder-only model: same input as the autoencoder, output taken
# from the layer named 'latent_space'.
intermediate_layer_model = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('latent_space').output)

In [None]:
# Print the encoder's summary.
intermediate_layer_model.summary()

In [None]:
# Compute the latent-space representation of the training data and the test
# data by running both through the encoder.
latent_space = intermediate_layer_model.predict(X_train)
latent_space_test = intermediate_layer_model.predict(X_test)

In [None]:
# New K-means model for the encoded data: 10 clusters, best of 50 initialisations.
# random_state pins the centroid seeding so that Restart & Run All yields the
# same clusters every time (13 matches the seed passed to train_test_split).
nm = KMeans(n_clusters = 10, n_init=50, random_state=13)
# Fit the model on the latent-space representation of the training data.
new_model = nm.fit(latent_space)

In [None]:
# Predict the cluster of every training and test point in latent space.
latent_clusters = new_model.predict(latent_space)
latent_clusters_test = new_model.predict(latent_space_test)
# Name each cluster after its most common true label. The mapping is built
# separately for each split, from that split's own labels.
# NOTE(review): the test mapping therefore uses y_test - confirm intended.
latent_labels = cluster_labels(latent_clusters, y_train)
latent_labels_test = cluster_labels(latent_clusters_test, y_test)
# Convert the cluster ids of each split into labels using its own mapping.
predicted_new_labels = infer_labels(latent_clusters, latent_labels)
predicted_new_labels_test = infer_labels(latent_clusters_test, latent_labels_test)

In [None]:
# Accuracy of the labels inferred from K-means on the latent space, per split.
Accuracy_train = accuracy_score(y_train, predicted_new_labels)
Accuracy_test = accuracy_score(y_test, predicted_new_labels_test)

print("Training Accuracy of convolution autoencoder with k_memans is", Accuracy_train)
print("Test Accuracy of convolution autoencoder with k_memans is", Accuracy_test)

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Create a Gaussian mixture model with 10 components.
# random_state pins the initialisation so that Restart & Run All yields the
# same components every time (13 matches the seed passed to train_test_split).
gmm = GaussianMixture(n_components = 10, random_state = 13)
# Fit the Gaussian mixture model on the latent-space representation of the training data.
g_model = gmm.fit(latent_space)

In [None]:
# Predict the mixture component (cluster) of every training and test point
# from its latent-space representation.
gmm_clusters = g_model.predict(latent_space)
gmm_clusters_test = g_model.predict(latent_space_test)
# Name each cluster after its most common true label. The mapping is built
# separately for each split, from that split's own labels.
# NOTE(review): the test mapping therefore uses y_test - confirm intended.
gmm_labels = cluster_labels(gmm_clusters, y_train)
gmm_labels_test = cluster_labels(gmm_clusters_test, y_test)
# Convert the cluster ids of each split into labels using its own mapping.
pred_gmm = infer_labels(gmm_clusters, gmm_labels)
pred_gmm_test = infer_labels(gmm_clusters_test, gmm_labels_test)

In [None]:
# Accuracy of the labels inferred from the Gaussian mixture model, per split.
# NOTE: this rebinds Accuracy_train / Accuracy_test, replacing the values
# computed for K-means on the latent space.
Accuracy_train = accuracy_score(y_train, pred_gmm)
Accuracy_test = accuracy_score(y_test, pred_gmm_test)

print("Training Accuracy of convolution autoencoder with gaussian_mixture_model is",Accuracy_train)
print("Test Accuracy of convolution autoencoder with gaussian_mixture_model is",Accuracy_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Build the test-set confusion matrix (true labels vs. inferred labels) for
# the latent-space K-means model and for the Gaussian mixture model.
cm_kmeans = confusion_matrix(y_test,predicted_new_labels_test)
cm_gmm = confusion_matrix(y_test, pred_gmm_test)

print("confusion matrix for encoded k-means model \n",cm_kmeans)
print("confusion matrix for encoded gmm model \n",cm_gmm)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Draw one annotated heatmap per confusion matrix. Rows are the true class and
# columns the predicted class; the ten classes are shown as letters A-J.
class_letters = list("ABCDEFGHIJ")

km_cm = pd.DataFrame(cm_kmeans, index = class_letters,
                  columns = class_letters)
plt.figure(figsize = (10,7))
# fmt='d' prints the counts as plain integers; the default '.2g' would render
# e.g. 850 as 8.5e+02.
sn.heatmap(km_cm, annot=True, fmt='d')
plt.title('Confusion matrix: K-means on latent space (test set)')
plt.xlabel('Predicted class')
plt.ylabel('True class')

gmm_cm = pd.DataFrame(cm_gmm, index = class_letters,
                  columns = class_letters)
plt.figure(figsize = (10,7))
sn.heatmap(gmm_cm, annot=True, fmt='d')
plt.title('Confusion matrix: Gaussian mixture on latent space (test set)')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()