In [5]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import rand_score
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from utils import *

In [6]:
# Download the MNIST dataset from OpenML and cast the labels to integers
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
y = y.astype(int)

# Divide the raw pixel values by 255, then standardise every column.
# NOTE: constant columns are NOT removed (that filter was disabled), so they
# keep a standard deviation of 0 after scaling.
scaler = StandardScaler()
X = scaler.fit_transform(X / 255)

# Sanity check: average of the per-column means (expected ~0) and of the
# per-column standard deviations (below 1 when some columns are constant)
print(X.mean(axis=0).mean())
print(X.std(axis=0).mean())

8.644688463608271e-19
0.9170918367346939


In [7]:
# Work on a 40% subsample of the full data (X_new, y_new); the other 60%
# (X_old, y_old) is set aside.
X_old, X_new, y_old, y_new = train_test_split(
    X, y, test_size=0.4, random_state=0
)

# Hold out 20% of the subsample as the test set ...
X_train_80, X_test, y_train_80, y_test = train_test_split(
    X_new, y_new, test_size=0.2, random_state=0
)

# ... and 25% of what remains as the validation set
# (i.e. 60% train / 20% validation / 20% test of the subsample).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_80, y_train_80, test_size=0.25, random_state=0
)

In [8]:
# Report the (features, labels) shapes of every split, one line per split
for title, features, targets in [
    ("original dataset: ", X, y),
    ("reduced dataset: ", X_new, y_new),
    ("train set: ", X_train, y_train),
    ("validation set: ", X_valid, y_valid),
    ("test set: ", X_test, y_test),
]:
    print(title, features.shape, targets.shape)

original dataset:  (70000, 784) (70000,)
reduced dataset:  (28000, 784) (28000,)
train set:  (16800, 784) (16800,)
validation set:  (5600, 784) (5600,)
test set:  (5600, 784) (5600,)


In [None]:
# Grid search over the PCA dimensionality (n_comp) and the number of clusters (k).
# For every (n_comp, k) pair we record:
#   * the time needed to fit spectral clustering on the projected training set,
#   * the time needed to cluster the projected validation set,
#   * the Rand index between the validation labels and the validation clustering.
# Results are stored in one list per k (one entry per n_comp value, in loop
# order) because the plotting cells read these lists by name.
five_train_time = []
five_test_time = []
five_rand = []

seven_train_time = []
seven_test_time = []
seven_rand = []

ten_train_time = []
ten_test_time = []
ten_rand = []

twelve_train_time = []
twelve_test_time = []
twelve_rand = []

fifteen_train_time = []
fifteen_test_time = []
fifteen_rand = []

# k -> (train-time list, test-time list, Rand-index list).
# Replaces the nested if/else chain; dict order fixes the order in which k is tried.
results_by_k = {
    5: (five_train_time, five_test_time, five_rand),
    7: (seven_train_time, seven_test_time, seven_rand),
    10: (ten_train_time, ten_test_time, ten_rand),
    12: (twelve_train_time, twelve_test_time, twelve_rand),
    15: (fifteen_train_time, fifteen_test_time, fifteen_rand),
}

for n_comp in [2, 10, 50, 100, 150, 200]:
    # Seeded so that repeated runs give the same projection (matches the
    # random_state=0 used for the data splits).
    pca = PCA(n_components=n_comp, random_state=0)
    X_train_proj = pca.fit_transform(X_train)
    # Project the validation set with the components learned on the training
    # set. Refitting PCA here would put train and validation in different
    # subspaces and leak validation data into the preprocessing.
    X_valid_proj = pca.transform(X_valid)
    print('Projected train and validation sets : ', X_train_proj.shape, X_valid_proj.shape)

    for k, (train_times, test_times, rand_scores) in results_by_k.items():
        sc = SpectralClustering(
            n_clusters=k,
            assign_labels='cluster_qr',
            n_jobs=-1,
            random_state=0,
        )

        # perf_counter is a monotonic, high-resolution clock meant for intervals
        start_train = time.perf_counter()
        sc.fit(X_train_proj)
        time_train = time.perf_counter() - start_train

        # SpectralClustering is transductive and exposes no predict():
        # labels for unseen points are obtained by clustering them directly.
        start_test = time.perf_counter()
        labels = sc.fit_predict(X_valid_proj)
        time_test = time.perf_counter() - start_test

        rand = rand_score(y_valid, labels)

        print(f'n_comp : {n_comp} --- n_clusters : {k} --- rand index : {rand}')
        print('Train time : ', time_train)
        print('Test_time : ', time_test)

        train_times.append(time_train)
        test_times.append(time_test)
        rand_scores.append(rand)

In [None]:
# PCA dimensionalities explored by the grid search (x axis of every plot).
# Kept as a notebook-level name because the following plotting cells read it.
n_comp = [2, 10, 50, 100, 150, 200]

# Train time as a function of the number of PCA components, one curve per k.
# Uses `plt`, the pyplot alias imported at the top of the notebook.
fig, ax = plt.subplots(figsize=(10, 5))
for series, color, label in [
    (five_train_time, 'blue', "k = 5"),
    (seven_train_time, 'red', "k = 7"),
    (ten_train_time, 'green', "k = 10"),
    (twelve_train_time, 'purple', "k = 12"),
    (fifteen_train_time, 'orange', "k = 15"),
]:
    ax.plot(n_comp, series, '--x', color=color, label=label)
ax.grid(True)
ax.set_xlabel("Number of components")
ax.set_ylabel("Train time (s)")
ax.legend()
ax.set_title('Train time versus the number of components')
plt.show()

In [None]:
# Test time as a function of the number of PCA components, one curve per k.
# Uses `plt`, the pyplot alias imported at the top of the notebook, and the
# `n_comp` list of component counts defined in the train-time plotting cell.
fig, ax = plt.subplots(figsize=(10, 5))
for series, color, label in [
    (five_test_time, 'blue', "k = 5"),
    (seven_test_time, 'red', "k = 7"),
    (ten_test_time, 'green', "k = 10"),
    (twelve_test_time, 'purple', "k = 12"),
    (fifteen_test_time, 'orange', "k = 15"),
]:
    ax.plot(n_comp, series, '--x', color=color, label=label)
ax.grid(True)
ax.set_xlabel("Number of components")
ax.set_ylabel("Test time (s)")
ax.legend()
ax.set_title('Test time versus the number of components')
plt.show()

In [None]:
# Rand index as a function of the number of PCA components, one curve per k.
# Uses `plt`, the pyplot alias imported at the top of the notebook, and the
# `n_comp` list of component counts defined in the train-time plotting cell.
fig, ax = plt.subplots(figsize=(10, 5))
for series, color, label in [
    (five_rand, 'blue', "k = 5"),
    (seven_rand, 'red', "k = 7"),
    (ten_rand, 'green', "k = 10"),
    (twelve_rand, 'purple', "k = 12"),
    (fifteen_rand, 'orange', "k = 15"),
]:
    ax.plot(n_comp, series, '--x', color=color, label=label)
ax.grid(True)
ax.set_xlabel("Number of components")
ax.set_ylabel("Rand index")
ax.legend()
ax.set_title('Rand index versus the number of components')
plt.show()