# Principal Component Analysis (PCA)

In [1]:
import numpy as np

In [2]:
# Four hand-made feature vectors; position k in each one belongs to sample k
x = np.array([1.1, 1.3, 1.6, 1.4, 2.4, 2.8, 3.1, 3.4, 3.8, 4, 3.6, 4.1, 3.9, 4.6])
y = np.array([20.1, 30, 30.6, 30.2, 40.0, 40.5, 41, 43.4, 46.2, 48, 50.3, 60, 62, 64.2])
z = np.array([2.3, 3.3, 3.5, 3.2, 4.2, 4.5, 4, 4.1, 4.4, 4.8, 5.3, 6, 6.1, 6.3])
g = np.array([122.3, 130.3, 130.5, 137, 142.2, 146.5, 149, 152, 157.4, 159.8, 165.3, 168, 169.1, 176.3])

# Lay the features out as rows, then transpose so that every row of X is one
# sample and every column is one feature.
X = np.array([x, y, z, g]).T
X

array([[  1.1,  20.1,   2.3, 122.3],
       [  1.3,  30. ,   3.3, 130.3],
       [  1.6,  30.6,   3.5, 130.5],
       [  1.4,  30.2,   3.2, 137. ],
       [  2.4,  40. ,   4.2, 142.2],
       [  2.8,  40.5,   4.5, 146.5],
       [  3.1,  41. ,   4. , 149. ],
       [  3.4,  43.4,   4.1, 152. ],
       [  3.8,  46.2,   4.4, 157.4],
       [  4. ,  48. ,   4.8, 159.8],
       [  3.6,  50.3,   5.3, 165.3],
       [  4.1,  60. ,   6. , 168. ],
       [  3.9,  62. ,   6.1, 169.1],
       [  4.6,  64.2,   6.3, 176.3]])

In [3]:
# Keep the number of rows and columns of X around for the loops further down
rows, cols = np.shape(X)
np.shape(X)

(14, 4)

# Feature Scaling

In [4]:
# Min-max scaling: map every column of X onto the interval [0, 1].
col_min = X.min(axis=0)  # per-column minimum
col_span = X.max(axis=0) - col_min  # per-column range (max - min)
# A constant column has zero range; divide by 1 instead so it scales to 0
# rather than producing NaN from 0/0.
col_span = np.where(col_span == 0, 1.0, col_span)
X = (X - col_min) / col_span  # broadcasts over the rows and builds a new array

# Keep an independent snapshot of the scaled data for later use. A plain
# `Orginal_data = X` would only create a second name for the same array, so
# any later in-place edit of X would silently change this "original" too.
Orginal_data = X.copy()

# Mean Normalization

In [5]:
# Centre every feature on zero: take the mean of a column once, then write
# the shifted values back into that same column of X.
for feature in range(cols):
    column = X[:, feature]  # a view, so assigning into it updates X itself
    column[:] = column - column.mean()

X

array([[-0.5244898 , -0.52656301, -0.53214286, -0.52050265],
       [-0.46734694, -0.30207321, -0.28214286, -0.3723545 ],
       [-0.38163265, -0.28846777, -0.23214286, -0.36865079],
       [-0.43877551, -0.29753806, -0.30714286, -0.24828042],
       [-0.15306122, -0.07531584, -0.05714286, -0.15198413],
       [-0.03877551, -0.06397797,  0.01785714, -0.0723545 ],
       [ 0.04693878, -0.0526401 , -0.10714286, -0.0260582 ],
       [ 0.13265306,  0.00178167, -0.08214286,  0.02949735],
       [ 0.24693878,  0.06527373, -0.00714286,  0.12949735],
       [ 0.30408163,  0.10609006,  0.09285714,  0.1739418 ],
       [ 0.18979592,  0.15824425,  0.21785714,  0.27579365],
       [ 0.33265306,  0.3781989 ,  0.39285714,  0.32579365],
       [ 0.2755102 ,  0.42355037,  0.41785714,  0.34616402],
       [ 0.4755102 ,  0.47343699,  0.46785714,  0.47949735]])

In [6]:
print("now means of our features are equal to 0")
for j in range(cols):
    # NOTE: .round() rounds to the nearest integer, so this printout alone
    # would also show 0.0 for any mean whose magnitude is below 0.5.
    print(f"{X[:,j].mean().round() = }")

# Real check: every column mean must be zero up to floating-point error.
np.testing.assert_allclose(X.mean(axis=0), 0.0, atol=1e-12)

now means of our features are equal to 0
X[:,j].mean().round() = 0.0
X[:,j].mean().round() = -0.0
X[:,j].mean().round() = 0.0
X[:,j].mean().round() = 0.0


# Calculating Covariance Matrix

In [7]:
# np.cov treats each row as one variable by default, so flip X to
# (features, samples). X keeps this orientation from here on.
X = np.transpose(X)
# Covariance between the features. For zero-mean rows this equals
# X @ X.T / (n_samples - 1).
X_cov = np.cov(X)
np.shape(X_cov)

(4, 4)

# Calculating Eigenvectors and Eigenvalues

In [8]:
# Eigen-decomposition of the covariance matrix
# (np.linalg.svd(X_cov) could be used instead).
eig_val, eig_vec = np.linalg.eig(X_cov)

# np.linalg.eig does not promise any particular eigenvalue order, while the
# component selection below takes the *first* rows as the most important
# directions. Sort explicitly, largest eigenvalue first, instead of relying
# on whatever order happens to come back.
order = np.argsort(eig_val)[::-1]
eig_val = eig_val[order]
# Eigenvectors are returned as columns; reorder them to match the sorted
# eigenvalues, then transpose so that row k of eig_vec is the k-th direction.
eig_vec = eig_vec[:, order].T
eig_vec

array([[ 0.53950846,  0.48347554,  0.47423729,  0.50028094],
       [ 0.73149467, -0.30906677, -0.60237576,  0.08084998],
       [ 0.4041958 ,  0.11914566,  0.31559178, -0.85019521],
       [ 0.10233241, -0.81026752,  0.55914601,  0.14265471]])

# Transforming Original Data Into K Dimensions

In [9]:
N = 3  # number of output features (principal components) to keep

# The leading N rows of eig_vec form the projection basis.
# This is a view: editing `components` in place also edits eig_vec.
components = eig_vec[:N]

In [10]:
# Fixing the sign ambiguity of the eigenvectors.
# An eigenvector is only defined up to a factor of -1, so two correct PCA
# implementations can return mirrored axes. Instead of flipping hand-picked
# rows (which only suits one particular data set and toggles back when the
# cell is run twice), apply a deterministic rule: orient every component so
# that its projected value with the largest magnitude is positive.
# NOTE(review): this is believed to match the U-based sign convention of
# scikit-learn's svd_flip; newer scikit-learn releases may orient components
# differently - confirm against the installed version.
scores = components @ X  # one row of projected values per component
peak_idx = np.abs(scores).argmax(axis=1)  # position of the largest |score|
peak_val = scores[np.arange(scores.shape[0]), peak_idx]
signs = np.where(peak_val < 0, -1.0, 1.0)  # never 0, so no row is zeroed out
components *= signs[:, np.newaxis]  # in place; re-running changes nothing

components

array([[-0.53950846, -0.48347554, -0.47423729, -0.50028094],
       [ 0.73149467, -0.30906677, -0.60237576,  0.08084998],
       [-0.4041958 , -0.11914566, -0.31559178,  0.85019521]])

In [11]:
# Project the data onto the kept components, then transpose so that each row
# of the result is one sample described by its N new coordinates.
new_data1 = np.matmul(components, X).transpose()
new_data1

array([[ 1.05030655e+00,  5.75489687e-02,  1.45325590e-04],
       [ 7.18267157e-01, -1.08649838e-01, -2.64166147e-03],
       [ 6.39880920e-01, -7.99746290e-02, -5.15387629e-02],
       [ 6.50444035e-01, -6.40608729e-02,  9.86465219e-02],
       [ 2.22125228e-01, -6.65522879e-02, -4.03421011e-02],
       [ 7.95805532e-02, -2.51971836e-02, -4.38554182e-02],
       [ 6.39738948e-02,  1.13038226e-01, -1.04176880e-03],
       [-4.82305993e-02,  1.48350080e-01, -2.82796803e-03],
       [-2.26181459e-01,  1.75233000e-01,  4.76355993e-03],
       [-3.46402647e-01,  1.47773482e-01, -1.69691552e-02],
       [-4.20194015e-01, -1.90072877e-02,  7.01556887e-02],
       [-7.11614917e-01, -8.38619821e-02, -2.65120116e-02],
       [-7.24759030e-01, -1.53090761e-01,  6.10456628e-04],
       [-9.47195674e-01, -4.15489147e-02,  1.14072946e-02]])

# Testing Results Using PCA From Sklearn

In [12]:
from sklearn.decomposition import PCA

# Reference result from scikit-learn's PCA on the saved data.
# Use N instead of a literal 3 so that this run always keeps the same number
# of components as the hand-written pipeline above.
pca = PCA(n_components=N)
new_data2 = pca.fit_transform(Orginal_data)
new_data2

array([[ 1.05030655e+00,  5.75489687e-02,  1.45325590e-04],
       [ 7.18267157e-01, -1.08649838e-01, -2.64166147e-03],
       [ 6.39880920e-01, -7.99746290e-02, -5.15387629e-02],
       [ 6.50444035e-01, -6.40608729e-02,  9.86465219e-02],
       [ 2.22125228e-01, -6.65522879e-02, -4.03421011e-02],
       [ 7.95805532e-02, -2.51971836e-02, -4.38554182e-02],
       [ 6.39738948e-02,  1.13038226e-01, -1.04176880e-03],
       [-4.82305993e-02,  1.48350080e-01, -2.82796803e-03],
       [-2.26181459e-01,  1.75233000e-01,  4.76355993e-03],
       [-3.46402647e-01,  1.47773482e-01, -1.69691552e-02],
       [-4.20194015e-01, -1.90072877e-02,  7.01556887e-02],
       [-7.11614917e-01, -8.38619821e-02, -2.65120116e-02],
       [-7.24759030e-01, -1.53090761e-01,  6.10456628e-04],
       [-9.47195674e-01, -4.15489147e-02,  1.14072946e-02]])

In [13]:
# Checking equality of numbers

# Rounded copies are only used to keep the printout short.
data1 = np.round(new_data1, 6)
data2 = np.round(new_data2, 6)

print("Is equal: ")
# Compare the unrounded values with an absolute tolerance. Rounding first and
# then testing with == can report a mismatch for two values that differ only
# by floating-point noise but fall on opposite sides of a rounding boundary.
for raw1, raw2, shown1, shown2 in zip(new_data1, new_data2, data1, data2):
    print(np.isclose(raw1, raw2, rtol=0, atol=1e-6), shown1, shown2)

Is equal: 
[ True  True  True] [1.050307e+00 5.754900e-02 1.450000e-04] [1.050307e+00 5.754900e-02 1.450000e-04]
[ True  True  True] [ 0.718267 -0.10865  -0.002642] [ 0.718267 -0.10865  -0.002642]
[ True  True  True] [ 0.639881 -0.079975 -0.051539] [ 0.639881 -0.079975 -0.051539]
[ True  True  True] [ 0.650444 -0.064061  0.098647] [ 0.650444 -0.064061  0.098647]
[ True  True  True] [ 0.222125 -0.066552 -0.040342] [ 0.222125 -0.066552 -0.040342]
[ True  True  True] [ 0.079581 -0.025197 -0.043855] [ 0.079581 -0.025197 -0.043855]
[ True  True  True] [ 0.063974  0.113038 -0.001042] [ 0.063974  0.113038 -0.001042]
[ True  True  True] [-0.048231  0.14835  -0.002828] [-0.048231  0.14835  -0.002828]
[ True  True  True] [-0.226181  0.175233  0.004764] [-0.226181  0.175233  0.004764]
[ True  True  True] [-0.346403  0.147773 -0.016969] [-0.346403  0.147773 -0.016969]
[ True  True  True] [-0.420194 -0.019007  0.070156] [-0.420194 -0.019007  0.070156]
[ True  True  True] [-0.711615 -0.083862 -0.026