In [1]:
# Import the libraries used throughout the notebook.
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation and convergence notices raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
# Load sign_mnist.csv from disk into a DataFrame.
# NOTE(review): the location is an absolute, machine-specific path.
csv_path = 'C:/Users/Hamza-Acer/Desktop/sign_mnist.csv'
mnist_signs = pd.read_csv(csv_path)

In [32]:
# Convert the table into a NumPy array.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely: the previous `mnist_signs.to_numpy()` raised AttributeError
# on a second execution because the name had already been rebound to an array.
mnist_signs = np.asarray(mnist_signs)
# Store the variables (every column after the first) in X...
X = mnist_signs[:, 1:]
# ...and the labels (the first column) in Y.
Y = mnist_signs[:, 0]

In [33]:
from sklearn.preprocessing import StandardScaler
# Standardising the features is good practice before cluster analysis.
# fit_transform learns the per-column statistics from X and applies them in
# a single call; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [34]:
# Display the standardised feature matrix.
X
# After StandardScaler each column has zero mean and unit variance. The values
# are NOT confined to [-1, 1]: any entry more than one standard deviation away
# from its column mean falls outside that interval.

array([[-0.92555597, -0.76586906, -0.62437311, ...,  0.66073648,
         0.65537724,  0.64993246],
       [ 0.21973624,  0.19996474,  0.10972487, ..., -0.95020514,
        -0.42394252, -0.18396987],
       [ 0.98326438,  0.96767879,  0.91976401, ...,  0.51718722,
         0.5125261 ,  0.53979441],
       ...,
       [-0.42449063, -0.39439452, -0.42186333, ...,  0.7723859 ,
         0.76648369,  0.7600705 ],
       [ 0.43447853,  0.3980845 ,  0.38817582, ...,  0.35768805,
         0.3538026 ,  0.35098634],
       [ 0.00499395,  0.02660996,  0.00846997, ...,  0.29388839,
         0.27444086,  0.27231631]])

In [6]:
#import the UMAP library
import umap.umap_ as umap
# Build the dimensionality reducer.
# - n_components=2 is UMAP's default, written out so the target dimensionality
#   is visible at the point of construction.
# - random_state pins UMAP's stochastic optimisation so the embedding (and every
#   cluster plot derived from it) is reproducible across kernel restarts.
#   Trade-off: with a fixed seed UMAP runs single-threaded, so fitting is slower.
reducer = umap.UMAP(n_components=2, random_state=42)

In [7]:
# Fit the reducer object on X and return the low-dimensional coordinates of
# every row (784 features -> 2, as confirmed by the printed shapes).
embedding = reducer.fit_transform(X)

In [8]:
# Report the dimensionality before and after the reduction.
for caption, array in (('The original ', X), ('The reduced shape ', embedding)):
    print(caption, array.shape)

The original  (10000, 784)
The reduced shape  (10000, 2)


In [9]:
# Collect the 2-D embedding and the labels in a single DataFrame.
umap_df = pd.DataFrame({
    'umap1': embedding[:, 0],
    'umap2': embedding[:, 1],
    # Labels are stored as strings, matching how the cluster ids are passed
    # to plotly (as strings) in the plotting cells.
    'color': Y.astype(str),
})
# One constant value (1.0) per embedded point. The length is taken from the
# embedding instead of the previously hard-coded 10000, so it stays correct if
# the number of rows in the dataset changes.
# NOTE(review): `size` is not used by any visible cell - presumably intended as
# a marker-size array for plotting; confirm before removing.
size = np.ones(len(embedding))

In [23]:
from sklearn.cluster import KMeans
# K-Means clustering of the 2-D UMAP embedding.
# 25 clusters are requested because the data is labelled into 25 classes.
# random_state pins the centroid initialisation so the cluster assignment and
# the saved plot are reproducible between runs.
model = KMeans(n_clusters=25, random_state=42)
# NOTE: X is rebound here from the standardised feature matrix to the 2-D
# embedding. The name is kept for backward compatibility with any cell that
# expects X to hold the embedding after this point; re-running the scaling or
# UMAP cells afterwards would operate on the embedding, not the original data.
X = umap_df[['umap1','umap2']].to_numpy()
# fit the model
model.fit(X)
# predict labels with KMeans and store the cluster index of every point in yhat
yhat = model.predict(X)
# Scatter plot in plotly; cluster ids are cast to str so each cluster gets its
# own discrete colour. The colours are K-Means cluster indices (legend title
# "Sign"), not the ground-truth labels in Y.
KMeansplot = px.scatter(x=X[:, 0], y=X[:, 1], color = yhat.astype(str), labels={"color": "Sign"})
KMeansplot.show()
# Export the interactive figure as a standalone HTML file.
KMeansplot.write_html("C:/Users/Hamza-Acer/Desktop/mnist_umap/KMeansplot_umap_mnist.html")

In [24]:
#lets try minibatch clustering
from sklearn.cluster import MiniBatchKMeans
# Read the 2-D coordinates straight from umap_df instead of relying on X having
# been rebound to the embedding by an earlier clustering cell. Without this,
# running the cell on its own would cluster whatever X currently holds.
umap_points = umap_df[['umap1','umap2']].to_numpy()
# random_state makes the mini-batch sampling and initialisation reproducible.
model = MiniBatchKMeans(n_clusters=25, random_state=42)
# fit the model
model.fit(umap_points)
# assign a cluster to each example
yhat = model.predict(umap_points)
# Scatter plot in plotly; cluster ids are cast to str so each cluster gets its
# own discrete colour. The colours are cluster indices (legend title "Sign"),
# not the ground-truth labels in Y.
minibatchkmeans = px.scatter(x=umap_points[:, 0], y=umap_points[:, 1], color = yhat.astype(str), labels={"color": "Sign"})
minibatchkmeans.show()
# Export the interactive figure as a standalone HTML file.
minibatchkmeans.write_html("C:/Users/Hamza-Acer/Desktop/mnist_umap/minibatchkmeans_umap_mnist.html")

In [25]:
#lets try spectral clustering...
from sklearn.cluster import SpectralClustering
# Read the 2-D coordinates straight from umap_df instead of relying on X having
# been rebound to the embedding by an earlier clustering cell. Without this,
# running the cell on its own would cluster whatever X currently holds.
umap_points = umap_df[['umap1','umap2']].to_numpy()
# random_state fixes the stochastic parts of the algorithm so the result is
# reproducible between runs.
model = SpectralClustering(n_clusters=25, random_state=42)
# fit model and predict clusters in one call
yhat = model.fit_predict(umap_points)
# Scatter plot in plotly; cluster ids are cast to str so each cluster gets its
# own discrete colour. The colours are cluster indices (legend title "Sign"),
# not the ground-truth labels in Y.
spectral_clustering = px.scatter(x=umap_points[:, 0], y=umap_points[:, 1], color = yhat.astype(str), labels={"color": "Sign"})
spectral_clustering.show()
# Export the interactive figure as a standalone HTML file.
spectral_clustering.write_html("C:/Users/Hamza-Acer/Desktop/mnist_umap/spectral_clustering_umap_mnist.html")

In [26]:
#lets try Gaussian mixture clustering
from sklearn.mixture import GaussianMixture
# Read the 2-D coordinates straight from umap_df instead of relying on X having
# been rebound to the embedding by an earlier clustering cell. Without this,
# running the cell on its own would cluster whatever X currently holds.
umap_points = umap_df[['umap1','umap2']].to_numpy()
# define the model; random_state fixes the initialisation so the fitted
# mixture is reproducible between runs
model = GaussianMixture(n_components=25, random_state=42)
# fit the model
model.fit(umap_points)
# assign each example to a mixture component
yhat = model.predict(umap_points)
# Scatter plot in plotly; component ids are cast to str so each component gets
# its own discrete colour. The colours are component indices (legend title
# "Sign"), not the ground-truth labels in Y.
gaussian_matrix = px.scatter(x=umap_points[:, 0], y=umap_points[:, 1], color = yhat.astype(str), labels={"color": "Sign"})
gaussian_matrix.show()
# Export the interactive figure as a standalone HTML file.
gaussian_matrix.write_html("C:/Users/Hamza-Acer/Desktop/mnist_umap/gaussian_matrix_umap_mnist.html")