# PCA Practice

### PCA step by step, with restoration of the original data (by Tanweer Ashif)

Step 1: Create the data

In [1]:
import numpy as np
# Two variables observed 15 times: each inner list holds one variable, and the
# transpose puts observations on rows and variables on columns -> shape (15, 2).
# A plain ndarray is used because the `np.mat` alias was removed in NumPy 2.0;
# `@`, slicing and scalar arithmetic behave the same on a 2-D ndarray.
data = np.array([[3, 4, 6, 7, 8, 3, 1, 4, 6, 12, 12, 3, 6, 8, 90],
                 [12, 3, 5, 7, 8, 4, 2, 4, 12, 34, 56, 76, 5, 3, 5]]).T
print(f"shape of data = {data.shape}")
print(f"\nHead of data:\n{data[:5]}")

shape of data = (15, 2)

Head of data:
[[ 3 12]
 [ 4  3]
 [ 6  5]
 [ 7  7]
 [ 8  8]]


Step 2: Standardize the data

In [2]:
# Standardize with ONE mean and ONE std taken over every element of `data`
# (no `axis` argument is given), so both columns share the same shift/scale.
# The original statistics are kept so the data can be restored afterwards.
orig_mean = data.mean()
orig_std = data.std()
print(f"Old mean = {orig_mean}")
print(f"Old std = {orig_std}")

# np.mean(data) / np.std(data) would give the same two numbers.
data_std = (data - orig_mean) / orig_std
print(f"New mean = {data_std.mean()}")
print(f"New std = {data_std.std()}")

Old mean = 13.633333333333333
Old std = 21.412587160100223
New mean = 5.736152293896643e-17
New std = 0.9999999999999999


Step 3: Create the covariance matrix

In [3]:
# rowvar=False: each COLUMN of data_std is a variable, each row an observation.
cov_mat = np.cov(data_std, rowvar=False)
print(cov_mat, cov_mat.shape, sep="\n")

[[ 1.04965721 -0.09610035]
 [-0.09610035  1.07258919]]
(2, 2)


Step 4: Calculate the eigenvectors and eigenvalues

In [4]:
# Eigen-decomposition of the covariance matrix. np.linalg.eig returns the
# eigenvectors as COLUMNS; after sorting by decreasing eigenvalue they are
# transposed so that each ROW of eigvec_std is one eigenvector, in the same
# order as eigval_std.
eigval, eigvec = np.linalg.eig(cov_mat)
sorting_index = np.argsort(-eigval)  # negate to get descending order
eigval_std = eigval[sorting_index]
eigvec_std = eigvec.T[sorting_index]
del eigval, eigvec  # only the sorted versions are used from here on
print(f"eigval_std: {eigval_std}")
print(f"\neigvec_std:\n{eigvec_std}")

eigval_std: [1.15790516 0.96434125]

eigvec_std:
[[ 0.66390045 -0.74782096]
 [-0.74782096 -0.66390045]]


Step 5: Calculate PC1 and PC2

```PC1 = [data_std].[E1_std^T] - Result is colwise```

and

```PC2 = [data_std].[E2_std^T] - Result is colwise```

In [5]:
# Project the standardized data onto each eigenvector (a row of eigvec_std).
# Range slices (0:1, 1:2) keep each eigenvector 2-D, so its transpose is a
# column and every product below is a single column of scores.
print(f"data_std shape = {data_std.shape}")

# PC1: scores along the eigenvector with the largest eigenvalue
eigvec1_std = eigvec_std[0:1, :]
print(f"\neigvec1_std shape = {eigvec1_std.shape}")
PC1 = data_std @ eigvec1_std.T
print(f"PC1:\n{PC1}")

# PC2: scores along the second eigenvector
eigvec2_std = eigvec_std[1:2, :]
print(f"\neigvec2_std shape = {eigvec2_std.shape}")
PC2 = data_std @ eigvec2_std.T
print(f"PC2:\n{PC2}")

data_std shape = (15, 2)

eigvec1_std shape = (1, 2)
PC1:
[[-0.27264496]
 [ 0.07267946]
 [ 0.06484103]
 [ 0.02599745]
 [ 0.02207824]
 [ 0.00674994]
 [ 0.01458837]
 [ 0.03775509]
 [-0.17962951]
 [-0.7619346 ]
 [-1.53027058]
 [-2.50780417]
 [ 0.06484103]
 [ 0.19670005]
 [ 2.66927353]]

eigvec2_std shape = (1, 2)
PC2:
[[ 0.42200413]
 [ 0.66612611]
 [ 0.53426709]
 [ 0.43733243]
 [ 0.37140292]
 [ 0.67004532]
 [ 0.80190435]
 [ 0.63512096]
 [ 0.31723105]
 [-0.57442841]
 [-1.25654168]
 [-1.56232539]
 [ 0.53426709]
 [ 0.52642866]
 [-2.39937938]]


Step 6: Verification by restoring original matrix

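```
Since each row of E_std is one eigenvector,
[PC_i] = [data_std].[Ei_std^T]
=> [PC_all] = [data_std].[E_std^T]
The eigenvectors of the symmetric covariance matrix are orthonormal, so E_std^T = E_std^-1:
=> [PC_all] = [data_std].[E_std^-1]
=> [PC_all].[E_std] = [data_std]
=> [data_std] = [PC_all].[E_std]
and
[data] = [data_std]*Orig_std + Orig_Mean
```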

In [6]:
# Put the two score columns side by side, rotate back with the eigenvector
# matrix, then undo the standardization using the saved mean and std.
PC_all = np.concatenate((PC1, PC2), axis=1)
restored_std_data = PC_all @ eigvec_std
restored_orig_data = restored_std_data * orig_std + orig_mean
print(f"\nRestored_orig_std:\n{restored_orig_data}")
print(f"\nOrig data:\n{data}")


Restored_orig_std:
[[ 3. 12.]
 [ 4.  3.]
 [ 6.  5.]
 [ 7.  7.]
 [ 8.  8.]
 [ 3.  4.]
 [ 1.  2.]
 [ 4.  4.]
 [ 6. 12.]
 [12. 34.]
 [12. 56.]
 [ 3. 76.]
 [ 6.  5.]
 [ 8.  3.]
 [90.  5.]]

Orig data:
[[ 3 12]
 [ 4  3]
 [ 6  5]
 [ 7  7]
 [ 8  8]
 [ 3  4]
 [ 1  2]
 [ 4  4]
 [ 6 12]
 [12 34]
 [12 56]
 [ 3 76]
 [ 6  5]
 [ 8  3]
 [90  5]]


### PCA using function

In [25]:
import numpy as np

def mypca(data, pca_dimension):
    """Project `data` onto its leading principal components.

    Steps: standardize with a single mean/std computed over all elements of
    `data`, build the covariance matrix of the columns, eigen-decompose it,
    order the eigenvectors by decreasing eigenvalue and keep the first
    `pca_dimension` of them.

    Args:
        data: 2-D array/matrix exposing `.mean()` and `.std()`; columns are
            treated as the variables (`rowvar=False`).
        pca_dimension: number of leading eigenvectors to keep.

    Returns:
        A tuple `(pca, eigvec_std_selected)`: the projected scores (one column
        per kept component) and the kept eigenvectors, one per row.
    """
    standardized = (data - data.mean()) / data.std()
    covariance = np.cov(standardized, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eig(covariance)

    # Negate so that argsort yields largest-eigenvalue-first ordering.
    order = np.argsort(-1 * eigenvalues)

    # eig returns eigenvectors as columns; pick the leading ones and
    # transpose so each row is one eigenvector.
    components = eigenvectors[:, order[:pca_dimension]].T

    scores = standardized @ components.T
    return scores, components

def restoremat(std, mean, eigvec_std, pca_mat):
    """Rebuild the original matrix from its principal-component scores.

    Inverts the projection `pca_mat = data_std @ eigvec_std.T` by multiplying
    the scores with the eigenvector matrix, then undoes the standardization
    with `std` and `mean`.

    Args:
        std: standard deviation that was used to standardize the data.
        mean: mean that was used to standardize the data.
        eigvec_std: eigenvectors, one per row.
        pca_mat: component scores, one column per eigenvector.

    Returns:
        The restored matrix, or `None` (after printing a message) when the
        number of score columns differs from the number of eigenvector
        columns, i.e. when only some of the components were kept.
    """
    # Use the `pca_mat` argument here, not a global named `pca`: the function
    # must work no matter what the caller's variable is called.
    n_components = pca_mat.shape[1]
    n_features = eigvec_std.shape[1]
    if n_components != n_features:
        print("\nCan't restore orig mat from partial components!")
        return None
    data_std = pca_mat @ eigvec_std
    return data_std * std + mean

# Same 15x2 sample data as above (observations on rows, variables on columns).
# Built with np.array because the `np.mat` alias was removed in NumPy 2.0.
data = np.array([[3, 4, 6, 7, 8, 3, 1, 4, 6, 12, 12, 3, 6, 8, 90],
                 [12, 3, 5, 7, 8, 4, 2, 4, 12, 34, 56, 76, 5, 3, 5]]).T

# Keep only the first principal component; restoremat then reports that a
# full restore is not possible from partial components and returns None.
pca, eigvec_std = mypca(data, 1)
print(f"PCA:\n{pca}")
print(f"\nRestored Mat:\n{restoremat(data.std(), data.mean(), eigvec_std, pca)}")
print(f"\nOrig Mat:\n{data}")

PCA:
[[-0.27264496]
 [ 0.07267946]
 [ 0.06484103]
 [ 0.02599745]
 [ 0.02207824]
 [ 0.00674994]
 [ 0.01458837]
 [ 0.03775509]
 [-0.17962951]
 [-0.7619346 ]
 [-1.53027058]
 [-2.50780417]
 [ 0.06484103]
 [ 0.19670005]
 [ 2.66927353]]

Can't restore orig mat from partial components!

Restored Mat:
None

Orig Mat:
[[ 3 12]
 [ 4  3]
 [ 6  5]
 [ 7  7]
 [ 8  8]
 [ 3  4]
 [ 1  2]
 [ 4  4]
 [ 6 12]
 [12 34]
 [12 56]
 [ 3 76]
 [ 6  5]
 [ 8  3]
 [90  5]]
