# [1] Run PCA
This notebook runs PCA on the downscaled training images to produce eigenvalues and eigenvectors, so that the model's performance can later be analyzed on eigenvector data rather than on raw pixel data.

In [1]:
# Import libraries
import numpy as np
import os
import pickle
from skimage.transform import resize

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the sample keys and cut them into three contiguous slices
with open('OFFICIAL_keys.pkl', 'rb') as keys_file:
    keys = pickle.load(keys_file)

train_end = int(0.8*len(keys))        # train = keys[:train_end]
test_end = int(0.9*len(keys)) + 1     # test = keys[train_end:test_end], validation = the rest
train_set, test_set, validation_set = keys[:train_end], keys[train_end:test_end], keys[test_end:]

scale = (162, 135)

# The image files carry a .npy suffix but are read with pickle.load
with open('side_imgs.npy', 'rb') as side_file:
    side_imgs = pickle.load(side_file)
with open('below_imgs.npy', 'rb') as below_file:
    below_imgs = pickle.load(below_file)
with open('concat_imgs.npy', 'rb') as concat_file:
    concat_imgs = pickle.load(concat_file)

In [3]:
## PCA function copy/pasted from Max's MP3 (with code added to reshape the data)

# Implement the PCA Function

# Input - unshaped_data with samples along the first axis, shape (N, ...);
#         pca() flattens each sample and transposes to MxN (M=features, N=samples)
#       - n_comps = number of PCs to find

# Output - eigenvalues of shape (n_comps,)
#        - eigenvectors of shape (n_comps, M)

from scipy.sparse.linalg import eigsh

def pca(unshaped_data, n_comps):
    """Run PCA on a stack of samples and return the requested eigenpairs.

    Parameters
    ----------
    unshaped_data : array with samples along axis 0. All remaining axes are
        flattened, then the result is transposed so features are on the rows
        (M features x N samples).
    n_comps : number of eigenpairs requested from ``eigsh`` (passed as ``k``).

    Returns
    -------
    (eigenvalues, eigenvectors)
        ``eigenvalues`` has shape (n_comps,); ``eigenvectors`` has shape
        (n_comps, M) with one eigenvector per row. Both are the ``eigsh``
        results in reversed order.

    Notes
    -----
    The intermediate shapes are printed along the way. The covariance matrix
    built here is M x M, so its size is driven by the number of features,
    not by the number of samples.
    """
    # Keep the sample axis, collapse everything else, then put features on rows.
    leading_axis = unshaped_data.shape[:1]
    data = unshaped_data.reshape(leading_axis + (-1,)).T
    print(data.shape)

    sample_count = data.shape[1]
    feature_count = data.shape[0]
    print(sample_count, feature_count)

    # Remove each feature's mean across the samples.
    centered = center_mean(data)
    print(centered.shape)

    # Covariance of the centered data, normalized by the sample count.
    inv_samples = 1/sample_count
    covariance = inv_samples * (centered @ centered.T)
    print(covariance.shape)

    # eigsh hands back eigenvectors as columns; return them as rows with both
    # outputs in reversed order.
    raw_vals, column_vecs = eigsh(covariance, n_comps)
    return raw_vals[::-1], column_vecs.T[::-1]

### helper function to center data (subtract each row's mean)
def center_mean(data, print_means=False):
    """Return ``data`` with every row shifted so that its mean is zero.

    Parameters
    ----------
    data : 2-D array; the mean is taken along axis 1, i.e. per row.
    print_means : when True, print the vector of row means before centering.

    Returns
    -------
    A new float64 array of the same shape as ``data`` holding
    ``data[r, c] - mean(data[r, :])``. If every row mean is already exactly
    zero, the input object itself is returned unchanged (no copy is made).
    """
    row_means = np.mean(data, axis=1)
    if print_means:
        print(row_means)

    # Nothing to do when every row is already centered exactly.
    if not np.any(row_means != 0):
        return data

    # Broadcast the per-row means across the columns in a single vectorized
    # subtraction instead of looping over columns in Python. The result is
    # always float64, matching the previous preallocated output buffer.
    return np.asarray(data - row_means[:, np.newaxis], dtype=np.float64)

In [3]:
## PCA is fit ONLY on the training split; the test and validation data should stay "unknown" to our code
train_side_imgs, train_below_imgs, train_concat_imgs = (
    image_stack[train_set] for image_stack in (side_imgs, below_imgs, concat_imgs)
)

In [4]:
# Display the shape of the side-image training subset (side_imgs indexed by train_set)
train_side_imgs.shape

(2060, 21870)

In [14]:
## Eigenvalues and eigenvectors for the side images.
## n_comps is the length of axis 1 of the training array, so every component is requested.
train_side_vals, train_side_vectors = pca(
    unshaped_data=train_side_imgs,
    n_comps=train_side_imgs.shape[1],
)

(21870, 2060)
2060 21870
(21870, 2060)
(21870, 21870)
Wall time: 14min 13s


In [15]:
## Write the side-image eigenvalues and eigenvectors to disk with pickle
for out_name, payload in (
    ('train_side_vals.npy', train_side_vals),
    ('train_side_vectors.npy', train_side_vectors),
):
    with open(out_name, 'wb') as sink:
        pickle.dump(payload, sink)

Wall time: 26.2 s


In [16]:
## Eigenvalues and eigenvectors for the below images.
## n_comps is the length of axis 1 of the training array, so every component is requested.
train_below_vals, train_below_vectors = pca(
    unshaped_data=train_below_imgs,
    n_comps=train_below_imgs.shape[1],
)

(21870, 2060)
2060 21870
(21870, 2060)
(21870, 21870)
Wall time: 11min 38s


In [17]:
## Write the below-image eigenvalues and eigenvectors to disk with pickle
for out_name, payload in (
    ('train_below_vals.npy', train_below_vals),
    ('train_below_vectors.npy', train_below_vectors),
):
    with open(out_name, 'wb') as sink:
        pickle.dump(payload, sink)

Wall time: 25.8 s


In [6]:
## Eigenvalues and eigenvectors for the concat images.
## n_comps is the length of axis 1 of the training array, so every component is requested.
train_concat_vals, train_concat_vectors = pca(
    unshaped_data=train_concat_imgs,
    n_comps=train_concat_imgs.shape[1],
)

(43740, 2060)
2060 43740
(43740, 2060)
(43740, 43740)




Wall time: 1h 54min 41s


In [7]:
## Write the concat-image eigenvalues and eigenvectors to disk with pickle
for out_name, payload in (
    ('train_concat_vals.npy', train_concat_vals),
    ('train_concat_vectors.npy', train_concat_vectors),
):
    with open(out_name, 'wb') as sink:
        pickle.dump(payload, sink)

Wall time: 2min 9s
