In [111]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

Load the dataset

In [5]:
# Read the feature matrix and the labels the same way: load the CSV, drop its
# 'Unnamed: 0' column and convert what is left to a NumPy array.
X, y = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0']).to_numpy()
    for csv_path in ("../Data/data.csv", "../Data/labels.csv")
)

In [117]:
class PCAObject:
    """
    Bundle a set of principal directions with the data they are evaluated on.

    Attributes
    - pcs: the principal directions exactly as passed in (one direction per row)
    - X: the input data after StandardScaler standardisation
    - cov: covariance matrix of the standardised data, np.cov(self.X.T)
    - label: legend label used by plotNonZeroLoadtoPEV
    - ev, pev: explained variance per direction and its share of their sum;
      both are None until calcExpVar() has run
    - nonZeroLoads: number of non-zero loadings per direction;
      None until calcNonZeroLoads() has run
    """
    def __init__(self, PCs, X, label):
        """
        - PCs: 2-D array-like of principal directions, one per row
        - X: feature matrix with samples in rows (it is standardised here)
        - label: string used as the plot legend entry
        """
        self.pcs = PCs
        self.X = StandardScaler().fit_transform(X)
        # The directions are meant to describe the standardised data, so the
        # variance along them has to be measured on that same data, not raw X.
        self.cov = np.cov(self.X.T)
        self.label = label
        # Created up front so the lazy checks in plotNonZeroLoadtoPEV can test
        # them instead of raising AttributeError.
        self.ev = None
        self.pev = None
        self.nonZeroLoads = None

    def calcExpVar(self):
        """
        Fill self.ev with pc @ cov @ pc for every direction and self.pev with
        each value divided by the sum of all of them. Both are stored as lists.
        """
        pcs = np.asarray(self.pcs)
        # Row-wise quadratic form in one matrix product: row i of (pcs @ cov)
        # dotted with row i of pcs equals pcs[i] @ cov @ pcs[i].
        self.ev = list(np.sum((pcs @ self.cov) * pcs, axis=1))
        total = sum(self.ev)
        self.pev = [ev/total for ev in self.ev]

    def calcNonZeroLoads(self):
        """Fill self.nonZeroLoads with the count of non-zero entries of each direction."""
        self.nonZeroLoads = [np.count_nonzero(pc) for pc in self.pcs]

    def plotNonZeroLoadtoPEV(self):
        """
        Plot PEV against the number of non-zero loadings (log-scaled x axis) on
        the current matplotlib axes, computing either quantity first if it has
        not been computed yet.
        """
        if self.pev is None: self.calcExpVar()
        if self.nonZeroLoads is None: self.calcNonZeroLoads()

        plt.plot(self.nonZeroLoads, self.pev, label=self.label)
        plt.xlabel('Number of non-zero loadings')
        plt.xscale('log')
        plt.ylabel('Percentage of explained variance (PEV)')
        plt.title('Percentage of explained variance(PEV) vs non-zero loadings')

In [134]:
def reg_PCA(X, k = "all"):
    """
    Ordinary PCA on the standardised version of an n x p feature matrix.

    - X: n x p feature matrix
    - k: number of principal directions to fit, or "all" for min(n, p)

    Returns three arrays, in this order:
    - explained_variance_ratio_ of the fitted PCA (share of variance per direction)
    - components_ of the fitted PCA (the principal directions)
    - explained_variance_ of the fitted PCA (variance per direction)
    """
    standardized = StandardScaler().fit_transform(X)
    n_directions = min(standardized.shape[0], standardized.shape[1]) if k == "all" else k
    fitted = PCA(n_components = n_directions).fit(standardized)

    return (
        fitted.explained_variance_ratio_,
        fitted.components_,
        fitted.explained_variance_,
    )

In [106]:
def threshold_PCA(X, thresh = 1e-1, k = "all"):
    """
    Hard-threshold the loadings of the principal directions of X.

    - X: n x p feature matrix
    - thresh: float cutoff; a loading is kept only if its absolute value is >= thresh
    - k: number of leading directions to return, or "all" for min(n, p)

    Returns one array holding the first k thresholded principal directions,
    one direction per row.
    """
    n_keep = min(X.shape[0], X.shape[1]) if k == "all" else k

    directions = reg_PCA(X)[1]
    keep_mask = np.abs(directions) >= thresh
    # Multiplying by the 0/1 mask zeroes the small loadings and leaves the rest untouched.
    thresholded = keep_mask.astype(int) * directions

    return thresholded[:n_keep]

In [107]:
def _keep_largest_loadings(pc, j):
    """Return direction pc with all but its j largest-magnitude loadings set to zero."""
    if j == 0:
        # pc[-0] is pc[0], so the cutoff lookup below cannot express "keep nothing".
        return np.zeros(pc.shape)
    magnitude = np.absolute(pc)
    cutoff = np.sort(magnitude)[-j]  # j-th largest magnitude; IndexError if j > len(pc)
    # ">=" also keeps ties with the cutoff, so more than j loadings can survive
    # when several loadings share the cutoff magnitude.
    return (magnitude >= cutoff).astype(int) * pc


def nonZeroLoad_PCA(X,j, k = "max"):
    """
    Sparsify principal directions by keeping only the largest loadings of each.

    - X: n x p feature matrix
    - j: integer number of non-zero loadings to keep per direction (0 gives all-zero rows)
    - k: integer number of leading principal directions wanted; "max" (the default)
      or "all" means min(n, p)

    Returns one array with one thresholded principal direction per row (k x p).

    Raises ValueError if j is negative and IndexError if j exceeds the number of
    loadings or k exceeds the number of available directions.
    """
    if j < 0:
        raise ValueError("j must be a non-negative number of loadings, got %r" % (j,))

    n_directions = min(X.shape[0], X.shape[1])
    # "max" is this function's own default; "all" is what the sibling functions use.
    if isinstance(k, str) and k in ("max", "all"):
        k = n_directions

    # reg_PCA standardises X itself, so no scaling is needed here.
    PCA_PC = reg_PCA(X, n_directions)[1]

    kept = [_keep_largest_loadings(PCA_PC[m], j) for m in range(k)]
    # The leading empty block fixes the (0, p) float result when no direction is
    # requested; stacking once avoids re-copying the result on every iteration.
    return np.vstack([np.empty((0, PCA_PC.shape[1]))] + kept)

In [139]:
# Second return value of reg_PCA: the principal directions of X (all of them).
PCs = reg_PCA(X, k="all")[1]
# Wrap the directions together with the data under the legend label "Regular".
reg = PCAObject(PCs=PCs, X=X, label="Regular")

In [142]:
# Fill reg.ev and reg.pev (explained variance per direction and its share of the total).
reg.calcExpVar()

In [144]:
# Small 3x3 matrix with 1s on the diagonal and 2s elsewhere, used as a toy input.
test = np.array([[1,2,2],[2,1,2],[2,2,1]])

In [145]:
# On a 2-D array np.diag extracts the main diagonal.
np.diag(test)

array([1, 1, 1])

In [146]:
%%timeit
# Times the matrix-product route: build PCs @ cov @ PCs.T in full and keep only its diagonal.
np.diag(PCs @ np.cov(X.T) @ PCs.T)

15.8 s ± 653 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [147]:
# Diagonal of PCs @ cov @ PCs.T: entry i is PCs[i] @ cov @ PCs[i], with cov taken from the raw X.
result1 = np.diag(PCs @ np.cov(X.T) @ PCs.T)

In [149]:
%%timeit
cache = []
cov = np.cov(X.T)
for pc in PCs:
    cache.append(pc @ cov @ pc.T)

KeyboardInterrupt: 

In [150]:
# One quadratic form pc @ cov @ pc.T per principal direction, with cov taken from the raw X.
cov = np.cov(X.T)
result2 = [direction @ cov @ direction.T for direction in PCs]

KeyboardInterrupt: 