# PCA

## Dependencies

In [1]:
import numpy as np
import pandas as pd

## Reading the dataset

In [2]:
df = pd.read_csv("data/data_pca_200x16.csv", sep=";", decimal=",")
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,y
0,-4.44,0.99,2.92,3.24,10.02,2.65,-2.1,-1.76,-3.24,6.95,-1.08,-2.11,-4.92,-13.5,-3.09,-0.47
1,13.92,1.51,2.93,3.81,-3.15,2.2,-5.39,1.84,-2.7,-0.39,0.02,5.01,-2.22,-0.55,-0.98,0.99
2,4.57,0.48,-0.62,4.52,1.41,1.35,4.95,1.51,4.12,-1.1,0.46,-0.85,2.38,-5.09,0.7,1.36
3,6.58,1.48,-0.9,1.74,0.79,1.98,4.1,0.93,0.71,-2.11,-1.8,5.43,2.15,-0.24,0.57,1.43
4,1.78,2.84,3.61,0.05,-0.33,2.49,0.72,1.28,2.06,3.86,0.57,3.33,-1.26,0.64,-6.14,1.03


## PCA Implementation

In [3]:
X = df.drop(columns=["y"])
X.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15
0,-4.44,0.99,2.92,3.24,10.02,2.65,-2.1,-1.76,-3.24,6.95,-1.08,-2.11,-4.92,-13.5,-3.09
1,13.92,1.51,2.93,3.81,-3.15,2.2,-5.39,1.84,-2.7,-0.39,0.02,5.01,-2.22,-0.55,-0.98
2,4.57,0.48,-0.62,4.52,1.41,1.35,4.95,1.51,4.12,-1.1,0.46,-0.85,2.38,-5.09,0.7
3,6.58,1.48,-0.9,1.74,0.79,1.98,4.1,0.93,0.71,-2.11,-1.8,5.43,2.15,-0.24,0.57
4,1.78,2.84,3.61,0.05,-0.33,2.49,0.72,1.28,2.06,3.86,0.57,3.33,-1.26,0.64,-6.14


### Standardization

In [4]:
X_std = (X - X.mean()) / X.std(ddof=0)

In [5]:
X_std.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15
0,-2.235019,0.4636,0.483222,0.434726,2.245338,0.457405,-1.237259,-1.581912,-2.806298,2.251705,-1.125015,-1.986186,-1.185078,-2.619375,-1.179561
1,1.624025,0.764451,0.487092,0.604389,-1.288367,0.184382,-2.250813,0.107288,-2.509887,0.055737,-0.166962,0.586004,-0.344089,-0.089022,-0.361216
2,-0.341229,0.168534,-0.886852,0.815725,-0.064852,-0.331329,0.934641,-0.047556,1.233673,-0.15668,0.216259,-1.530995,1.088707,-0.976111,0.290357
3,0.081248,0.747094,-0.995219,-0.011757,-0.231207,0.050904,0.672781,-0.319705,-0.638107,-0.45885,-1.752104,0.737735,1.017067,-0.028449,0.239938
4,-0.927652,1.533935,0.75027,-0.514795,-0.531719,0.36033,-0.368499,-0.155477,0.10292,1.327244,0.312064,-0.020917,-0.045071,0.143497,-2.362476


### Calculating Covariance Matrix ($\Sigma$)

In [6]:
X_matrix = X_std.values

n_samples = X_matrix.shape[0]

cov_matrix = (1 / (n_samples - 1)) * np.dot(X_matrix.T, X_matrix)

In [7]:
pd.DataFrame(data=cov_matrix, columns=X.columns, index = X.columns)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15
x1,1.005025,-0.005599,0.034246,-0.044132,-0.431448,-0.039461,0.05673,0.009708,0.088564,-0.09938,-0.053676,0.03517,-0.334585,0.385331,0.082552
x2,-0.005599,1.005025,0.006668,-0.025464,0.14017,0.048026,0.041874,-0.552227,-0.040499,0.330062,-0.002596,-0.034136,-0.120557,0.006781,-0.594338
x3,0.034246,0.006668,1.005025,-0.02915,-0.059646,-0.492418,-0.092849,-0.373881,-0.058573,0.032346,0.006696,0.039569,-0.015495,0.010664,-0.451772
x4,-0.044132,-0.025464,-0.02915,1.005025,0.002554,0.024129,-0.540517,0.01288,-0.108581,0.028746,-0.323212,-0.175902,0.095268,-0.018468,0.015287
x5,-0.431448,0.14017,-0.059646,0.002554,1.005025,-0.02992,-0.069102,0.017652,-0.096599,0.094534,-0.003304,-0.316824,-0.30713,-0.48779,-0.160511
x6,-0.039461,0.048026,-0.492418,0.024129,-0.02992,1.005025,0.062842,-0.029941,0.050136,0.169345,0.026418,0.019707,0.018112,0.058245,0.289427
x7,0.05673,0.041874,-0.092849,-0.540517,-0.069102,0.062842,1.005025,0.002318,0.457667,-0.026122,0.055193,0.109629,-0.014812,0.077207,0.033047
x8,0.009708,-0.552227,-0.373881,0.01288,0.017652,-0.029941,0.002318,1.005025,0.095835,-0.366124,-0.013495,-0.063274,-0.000346,-0.03551,0.320521
x9,0.088564,-0.040499,-0.058573,-0.108581,-0.096599,0.050136,0.457667,0.095835,1.005025,-0.016598,0.057345,0.066543,-0.033282,0.109837,0.067654
x10,-0.09938,0.330062,0.032346,0.028746,0.094534,0.169345,-0.026122,-0.366124,-0.016598,1.005025,-0.025176,-0.016339,0.009312,-0.046641,-0.612263


### Calculation of Eigenvalues and Eigenvectors of $\Sigma$

In [8]:
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

In [9]:
sorted_id = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_id]
eigenvectors = eigenvectors[:, sorted_id]

**Here, we are calculating Explained Variance By each EigenVector and then finding Total Explained Variance by top k vectors**

In [10]:
total_eigenvalues = sum(eigenvalues)
explained_variance_ratio = [eig / total_eigenvalues for eig in eigenvalues]

In [11]:
for i in range(len(explained_variance_ratio)):
    print(f"Variance Explained By {i+1} top Principal Components: ",sum(explained_variance_ratio[:i+1]))

Variance Explained By 1 top Principal Components:  0.17628641266195336
Variance Explained By 2 top Principal Components:  0.31718207263895265
Variance Explained By 3 top Principal Components:  0.43628896365121983
Variance Explained By 4 top Principal Components:  0.5398808632734743
Variance Explained By 5 top Principal Components:  0.6293090685656265
Variance Explained By 6 top Principal Components:  0.7039817897121133
Variance Explained By 7 top Principal Components:  0.7588235982843372
Variance Explained By 8 top Principal Components:  0.8117870670209071
Variance Explained By 9 top Principal Components:  0.8616041635258788
Variance Explained By 10 top Principal Components:  0.9055638359944721
Variance Explained By 11 top Principal Components:  0.9385077348808062
Variance Explained By 12 top Principal Components:  0.9622425503991617
Variance Explained By 13 top Principal Components:  0.9817501036619132
Variance Explained By 14 top Principal Components:  0.9963139086594862
Variance Exp

**We Choose 10 top EigenVectors because 10 of them explain 90% of variance in dataset**

In [12]:
eigenvectors = eigenvectors[:, :10]

In [13]:
X_pca = np.dot(X_std, eigenvectors)

### Calculating Principal Components

In [14]:
pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(10)])

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,4.578847,-3.848787,-0.541379,-0.634336,0.024733,0.638674,0.531792,-0.752758,-0.955924,1.841141
1,0.700449,-0.149085,2.766822,-0.285838,-0.977398,2.389334,0.062684,0.094682,-0.960747,0.005511
2,-0.316382,-0.702997,-0.772354,-0.412586,0.317753,-1.752143,0.720578,0.660028,1.072931,-0.216519
3,-0.309713,0.183985,0.359344,-1.028388,0.085893,-0.639136,0.468502,-1.770686,-1.007697,-0.738025
4,2.582146,1.014980,-0.194658,-0.569724,0.605275,-0.102109,-0.448854,0.847643,-0.507637,-0.310599
...,...,...,...,...,...,...,...,...,...,...
195,1.315034,2.113865,2.256166,-0.448177,-3.038597,0.564491,0.984751,0.275354,0.432974,0.072138
196,0.163221,-1.328622,-0.691175,-2.010488,0.056435,0.289135,-0.151212,-0.262841,-0.712698,0.128924
197,-0.586981,-1.639685,-3.452625,0.456339,-1.699563,0.870139,-0.718523,-0.116142,0.045282,1.151005
198,3.543372,0.235331,1.269946,-0.149015,-0.084944,-1.460793,1.184744,0.171195,0.783392,-1.717821


**PCA effectively summarized the dataset, eliminating redundancy by transforming correlated features into orthogonal axes. Components with high eigenvalues held most data variation.**