In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import scipy.sparse.linalg

In [44]:


# Load the data
data = pd.read_csv("/content/drive/MyDrive/data.csv")

print("Shape of data {}".format(data.shape))

Shape of data (42000, 785)


In [45]:
print(data.head())

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

In [46]:
# Convert into array
data = np.array(data)

# Split into samples and labels (X and Y)
X = data[:, 1:]
X = X.T

Y = data[:, 0]

print(X.shape, Y.shape)

d, N = X.shape

(784, 42000) (42000,)


In [47]:
a = [[[1, 2], [5, 6]], [[3, 4]]]
np.concatenate(a)

array([[1, 2],
       [5, 6],
       [3, 4]])

In [48]:
def get_data_from_index(X, Y, indexes):
    
    # start from empty lists for both samples and labels
    final_X = []
    final_Y = []

    # for each chosen label
    for k in indexes:
        # find which samples have label=k
        idxs_k = (Y == k)
        # slice the samples and append them to a list
        final_X.append( X[:, idxs_k] )
        # same thing to the labels
        final_Y.append( Y[idxs_k] )

    # concatenate together all the previous iterations
    X = np.concatenate(final_X, axis=1)
    Y = np.concatenate(final_Y)

    # return the new dataset and labels
    return X, Y

In [49]:
indeces = [0, 6, 9]
X, Y = get_data_from_index(X, Y, indeces)
print("X shape: {}, Y shape: {}".format(
    X.shape,
    Y.shape
))

print("Kept samples choosing indeces {}: {:.2f}%".format(
    indeces,
    X.shape[1]/N*100
))

X shape: (784, 12457), Y shape: (12457,)
Kept samples choosing indeces [0, 6, 9]: 29.66%


In [50]:
# 80% to train, the rest to test
train_split = 0.8
N_train = round(X.shape[1]*train_split)

In [51]:
def data_splits(X, Y, N_train):
    N = X.shape[1]

    # getting an array with indeces from 0 to N-1
    indeces = np.arange(N)
    # shuffling randomly
    np.random.shuffle(indeces)

    # get the first N_train for the train split (but now they are random)
    train_idx = indeces[:N_train]
    # the rest are for test split
    test_idx = indeces[N_train:]

    # slice the original datasets with an index array
    X_train = X[:, train_idx]  
    Y_train = Y[train_idx]
    
    X_test = X[:, test_idx]
    Y_test = Y[test_idx]

    # put in tuples the two splits
    return (X_train, Y_train), (X_test, Y_test)

In [52]:
(X_train, Y_train), (X_test, Y_test) = data_splits(X, Y, N_train)

print("X_train shape: {}, Y_train shape: {}".format(
    X_train.shape,
    Y_train.shape
))

X_train shape: (784, 9966), Y_train shape: (9966,)


In [53]:
k = 2

In [54]:
def PCA(X, k):

    # Find the centroid of the dataset
    centroid = np.mean(X, axis=1)

    # Translate the whole dataset so that its center is in 0
    X_c = X - centroid.reshape((d, 1))

    # Compute SVD on the shifted dataset matrix
    U, S, VT = np.linalg.svd(X_c, full_matrices=False)

    # Take only the first k columns on the U matrix: this is now the projection matrix for the PCA
    U_k = U[:, :k]

    print("Projection matrix shape: {}".format(U_k.shape))

    # Transpose the projection matrix and apply it to the dataset 
    return U_k.T @ X

In [55]:
X_PCA_train = PCA(X_train, k)

Projection matrix shape: (784, 2)


In [56]:
def LDA(X, Y, k):
    # get unique label values
    unique_idxs = np.unique(Y)
    
    # create the clusters divided by class
    clusters = []
    for i in unique_idxs:
        cluster = X[:, (Y==i)]
        clusters.append(cluster)

    # start with constructing the WITHIN-CLUSTER scatter matrix
    # compute the centroids for each cluster
    centroids = [np.mean(cluster, axis=1) for cluster in clusters]
    # shift each cluster by their centroid so their center is in 0
    shifted_clusters = [cluster - centroid.reshape((d, 1)) 
        for cluster, centroid in zip(clusters, centroids)]
    # concatenate the shifted clusters
    Xw = np.concatenate(shifted_clusters, axis=1)
    # compute the within-cluster scatter matrix (how far is each sample from its centroid, more or less)
    Sw = Xw @ Xw.T

    # second step: construction of the BETWEEN-CLUSTER scatter matrix
    # repeat each centroid as many times as the number of samples in their cluster
    repeated_centroids = [np.repeat(centroid.reshape(d, 1), cluster.shape[1], axis=1)
        for cluster, centroid in zip(clusters, centroids)]
    # concatenate them all
    Xbar = np.concatenate(repeated_centroids, axis=1)

    # find the global centroid of the data
    global_centroid = np.mean(X, axis=1)
    # shift the "repeated centroids matrix" by the global centroid
    Xbarc = Xbar - global_centroid.reshape((d, 1))
    # compute the between-cluster scatter matrix (how far is each centroid from the global one, more or less)
    Sb = Xbarc @ Xbarc.T

    try:
        # if the within-cluster scatter matrix is SPD, compute its cholesky decomposition
        L = np.linalg.cholesky(Sw)
    except:
        # otherwise, add a small perturbation in the form of the identity matrix
        epsilon = 1e-6
        # this shifts the eigenvalues to the right by epsilon
        # REMARK: for any matrix X, X@X^T is SPD (x^T @ A @ x >= 0), so only numerical error can make it non SPD
        # this is why epsilon is enough to bring it back to SPD
        Sw = Sw + epsilon * np.eye(Sw.shape[0])

        # once it is SPD, compute its cholesky decomposition
        L = np.linalg.cholesky(Sw)

    # Compute the first k eigenvector decomposition of L^-1 @ Sb @ L
    _, W = scipy.sparse.linalg.eigs(np.linalg.inv(L) @ Sb @ L, k=k)
    # Sb should be SPD and L^-1 @ Sb @ L is just a change of basis, so its eigenvalues should remain all POSITIVE and REAL
    # but numerical errors can add an imaginary component, so we assume that the latter is small and take only the real component
    W = np.real(W)
    
    # Compute Q, the projection matrix of LDA 
    Q = np.linalg.inv(L).T @ W

    print("Projection matrix shape: {}".format(Q.shape))

    # Compute the LDA projection on the initial dataset
    return Q.T @ X
    

In [57]:
X_LDA_train = LDA(X_train, Y_train, k)

Projection matrix shape: (784, 2)


In [65]:
def avg_distance_centroids(X, Y):
    # get unique label values
    unique_idxs = np.unique(Y)
    
    # create the clusters divided by class
    distances = []
    for i in unique_idxs:
        cluster = X[:, (Y==i)]
        d, N = cluster.shape
        centroid = np.mean(cluster, axis=1)

        distances.append([np.linalg.norm(cluster[:,j] - centroid) for j in range(N)])

    return np.mean(np.concatenate(distances))


In [66]:
avg_distance_centroids(X_PCA_train, Y_train)

539.7719925089318

In [67]:
avg_distance_centroids(X_LDA_train, Y_train)

0.01176687110870925

In [69]:
X_PCA_test = PCA(X_test, k)
avg_distance_centroids(X_PCA_test, Y_test)

Projection matrix shape: (784, 2)


573.7043782046113

In [70]:
X_LDA_test = LDA(X_test, Y_test, k)
avg_distance_centroids(X_LDA_test, Y_test)

Projection matrix shape: (784, 2)


0.02401642706352224