### Principal Component Analysis Example

In [1]:
import numpy as np
from sklearn import decomposition

# Show arrays with 3 decimal places and fixed-point (non-scientific) notation
np.set_printoptions(precision=3, suppress=True)

---

PCA without using scikit-learn

In [2]:
# Define the data matrix X: 5 rows (samples) by 3 columns (features)
X = np.array([[10, 20, 10],
              [2, 5, 2],
              [8, 17, 7],
              [9, 20, 10],
              [12, 22, 11]])

# Wrap X as an np.matrix so the following cells keep 2-D matrix semantics
# (e.g. the column means stay a 1 x 3 row). np.asmatrix is used because the
# np.mat alias was removed in NumPy 2.0; both construct the same object.
X = np.asmatrix(X)

In [3]:
# Column-wise mean (axis 0 collapses the rows, leaving one value per column)
mean_values = np.mean(X, axis=0)
print("Mean Values of Each Column: ", mean_values, sep="\n")

Mean Values of Each Column: 
[[ 8.2 16.8  8. ]]


In [4]:
# Center the data: remove each column's mean so every column averages to zero
centered_matrix = np.subtract(X, mean_values)
print("Centered Matrix: ", centered_matrix, sep="\n")

Centered Matrix: 
[[  1.8   3.2   2. ]
 [ -6.2 -11.8  -6. ]
 [ -0.2   0.2  -1. ]
 [  0.8   3.2   2. ]
 [  3.8   5.2   3. ]]


In [5]:
# Sample covariance between the columns of the centered data.
# rowvar=False tells np.cov that columns are variables and rows are observations.
covariance_X = np.cov(centered_matrix, rowvar=False)
print("Covariance Matrix: ", covariance_X, sep="\n")

Covariance Matrix: 
[[14.2  25.3  13.5 ]
 [25.3  46.7  24.75]
 [13.5  24.75 13.5 ]]


In [6]:
# Cross-check np.cov by hand: (Xc^T . Xc) / (N - 1), with Xc the centered data
N = X.shape[0]  # number of rows in X
print("Covariance Matrix: ",
      centered_matrix.T @ centered_matrix / (N - 1),
      sep="\n")

Covariance Matrix: 
[[14.2  25.3  13.5 ]
 [25.3  46.7  24.75]
 [13.5  24.75 13.5 ]]


In [7]:
# Eigen-decompose the covariance matrix: column i of eigen_vectors is the
# eigenvector that belongs to eigen_values[i].
eigen_values, eigen_vectors = np.linalg.eig(covariance_X)

# np.linalg.eig returns the eigenvalues in no guaranteed order. Sort the
# eigenpairs by decreasing eigenvalue so that column 0 is always the direction
# of largest variance, which the later "first principal component" cell
# relies on when it selects eigen_vectors[:, 0].
descending_order = np.argsort(eigen_values)[::-1]
eigen_values = eigen_values[descending_order]
eigen_vectors = eigen_vectors[:, descending_order]

print("Eigen Values: ")
print(eigen_values)
print()
print("Eigen Vectors: ")
print(eigen_vectors)

Eigen Values: 
[73.718  0.384  0.298]

Eigen Vectors: 
[[ 0.434  0.9   -0.044]
 [ 0.795 -0.406 -0.451]
 [ 0.424 -0.161  0.891]]


In [8]:
# Project the centered data onto every eigenvector.
# new_features holds one eigenvector per row, so the product has one row per
# component; it is transposed for display to give one row per sample.
new_features = eigen_vectors.T
principal_components = np.dot(new_features, centered_matrix.T)
print("All Principal Components: ", principal_components.T, sep="\n")

All Principal Components: 
[[  4.173   0.      0.259]
 [-14.615   0.172   0.252]
 [ -0.352  -0.1    -0.973]
 [  3.739  -0.9     0.303]
 [  7.055   0.828   0.159]]


In [9]:
# Keep only the first principal component: project the centered data onto
# the eigenvector stored in column 0 of eigen_vectors.
reduced_features = eigen_vectors[:, 0].T
reduced_principal_components = np.dot(reduced_features, centered_matrix.T)
print("First Principal Component: ", reduced_principal_components.T, sep="\n")

First Principal Component: 
[[  4.173]
 [-14.615]
 [ -0.352]
 [  3.739]
 [  7.055]]


---

PCA using scikit-learn

In [10]:
# Rebuild the same 5 x 3 data set as a plain NumPy array (not np.matrix)
# for the scikit-learn run.
X = np.asarray([
    [10, 20, 10],
    [2, 5, 2],
    [8, 17, 7],
    [9, 20, 10],
    [12, 22, 11],
])

In [11]:
# Fit PCA with the randomized SVD solver and project X onto all components.
# random_state pins the solver's random number generator so that re-running
# the cell reproduces the same result.
pca = decomposition.PCA(svd_solver = 'randomized', random_state = 0) # Create a PCA instance
principal_components = pca.fit_transform(X) # Fit PCA on the dataset and transform it
print("Principal Components: ")
print(principal_components)

Principal Components: 
[[-4.173 -0.    -0.259]
 [14.615 -0.172 -0.252]
 [ 0.352  0.1    0.973]
 [-3.739  0.9   -0.303]
 [-7.055 -0.828 -0.159]]


---

PCA for dimensionality reduction

In [12]:
# Define the data matrix X: 10 rows (samples) by 2 columns (features)
X = np.array(
    [
        [2.5, 2.4],
        [0.5, 0.7],
        [2.2, 2.9],
        [1.9, 2.2],
        [3.1, 3.0],
        [2.3, 2.7],
        [2.0, 1.6],
        [1.0, 1.1],
        [1.5, 1.6],
        [1.1, 0.9],
    ],
    dtype=float,
)

In [13]:
# Column-wise mean (axis 0 collapses the rows, leaving one value per column)
mean_values = np.mean(X, axis=0)
print("Mean Values of Each Column: ", mean_values, sep="\n")

Mean Values of Each Column: 
[1.81 1.91]


In [14]:
# Center the data: remove each column's mean so every column averages to zero
centered_matrix = np.subtract(X, mean_values)
print("Centered Matrix: ", centered_matrix, sep="\n")

Centered Matrix: 
[[ 0.69  0.49]
 [-1.31 -1.21]
 [ 0.39  0.99]
 [ 0.09  0.29]
 [ 1.29  1.09]
 [ 0.49  0.79]
 [ 0.19 -0.31]
 [-0.81 -0.81]
 [-0.31 -0.31]
 [-0.71 -1.01]]


In [15]:
# Sample covariance between the columns of the centered data.
# rowvar=False tells np.cov that columns are variables and rows are observations.
covariance_X = np.cov(centered_matrix, rowvar=False)
print("Covariance Matrix :", covariance_X, sep="\n")

Covariance Matrix :
[[0.617 0.615]
 [0.615 0.717]]


In [16]:
# Eigen-decompose the covariance matrix: column i of eigen_vectors is the
# eigenvector that belongs to eigen_values[i]. The pairs are kept in the
# order np.linalg.eig returns them.
eigen_values, eigen_vectors = np.linalg.eig(covariance_X)

print("Eigen Values: ", eigen_values, "", "Eigen Vectors: ", eigen_vectors, sep="\n")

Eigen Values: 
[0.049 1.284]

Eigen Vectors: 
[[-0.735 -0.678]
 [ 0.678 -0.735]]


In [17]:
# Keep only the first principal component, i.e. the eigenvector with the
# largest eigenvalue. np.linalg.eig does not order its results, so the column
# is looked up with argmax instead of being hard-coded.
top_index = np.argmax(eigen_values)
new_feature = eigen_vectors[:, top_index] # Eigenvector of the largest eigenvalue
principal_component = np.dot(new_feature, centered_matrix.T) # Project the centered data onto the single principal component
print("First Principal Component: ")
# Display as a column (one row per sample); reshape replaces np.mat(...).T
# because np.mat was removed in NumPy 2.0.
print(principal_component.reshape(-1, 1))

Second Principal Component: 
[[-0.828]
 [ 1.778]
 [-0.992]
 [-0.274]
 [-1.676]
 [-0.913]
 [ 0.099]
 [ 1.145]
 [ 0.438]
 [ 1.224]]


In [18]:
# Let scikit-learn reduce X to a single component, for comparison with the
# manual projection above.
pca = decomposition.PCA(n_components=1)
principal_component = pca.fit_transform(X)
print("Principal Component: ", principal_component, sep="\n")

Principal Component: 
[[-0.828]
 [ 1.778]
 [-0.992]
 [-0.274]
 [-1.676]
 [-0.913]
 [ 0.099]
 [ 1.145]
 [ 0.438]
 [ 1.224]]
