In [1]:
import numpy as np
import pandas as pd

In [2]:
def custom_pca(data, n_components=3):
    """
    Perform Principal Component Analysis (PCA) manually.

    The features are standardized (zero mean, unit population standard
    deviation), the covariance matrix of the standardized data is
    eigendecomposed, and the data is projected onto the leading eigenvectors.

    Parameters:
    - data (array-like): 2-D input of shape (n_samples, n_features) with
      numeric values and at least two samples.
    - n_components (int): Number of principal components to retain in the
      projection.

    Returns:
    - principal_components (numpy.ndarray): Standardized data projected onto
      the first `n_components` eigenvectors.
    - eigenvalues (numpy.ndarray): ALL eigenvalues of the covariance matrix,
      sorted in descending order (not truncated to `n_components`).
    - eigenvectors (numpy.ndarray): ALL eigenvectors, stored as columns in
      the same order as `eigenvalues`. The sign of each eigenvector is
      arbitrary.

    Raises:
    - ValueError: If `data` is not 2-D or has fewer than two samples.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[0] < 2:
        raise ValueError(
            "data must be a 2-D array with at least two samples (rows); "
            f"got shape {data.shape}"
        )

    # STEP 1:- Standardize the features
    mean_vec = np.mean(data, axis=0)
    std_dev = np.std(data, axis=0)
    # A constant feature has zero standard deviation; dividing by it would
    # produce NaNs that poison the covariance matrix. Dividing by 1 instead
    # leaves such a feature at all zeros after centering.
    safe_std = np.where(std_dev == 0, 1.0, std_dev)
    standardized_data = (data - mean_vec) / safe_std

    # STEP 2:- Calculate the covariance matrix
    # np.cov returns a 0-d array for a single feature; keep it a matrix.
    cov_matrix = np.atleast_2d(np.cov(standardized_data, rowvar=False))

    # STEP 3:- Finding Eigenvalues and corresponding Eigenvectors
    # The covariance matrix is symmetric, so use the symmetric solver: it
    # always returns real eigenvalues/eigenvectors, whereas the general
    # solver can return complex values because of round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # STEP 4:- Sort eigenvalues and corresponding eigenvectors in descending order
    # eigh returns them in ascending order, so reversing is enough.
    eigenvalues = eigenvalues[::-1]
    eigenvectors = eigenvectors[:, ::-1]

    # Choose the number of components
    selected_eigenvectors = eigenvectors[:, :n_components]

    # Project the standardized data onto selected eigenvectors
    principal_components = np.dot(standardized_data, selected_eigenvectors)

    return principal_components, eigenvalues, eigenvectors


In [3]:
def variation_captured(eigenvalues,num_components):
    """
    Share of the total variance explained by the leading principal components.

    Parameters:
    - eigenvalues (numpy.ndarray): Eigenvalues of the covariance matrix; the
      first `num_components` entries are treated as the retained ones.
    - num_components (int): Number of leading eigenvalues to count.

    Returns:
    - (float): Sum of the first `num_components` eigenvalues divided by the
      sum of all eigenvalues. This is a fraction (e.g. 0.75), so multiply by
      100 for a percentage.
    """
    retained = np.sum(eigenvalues[:num_components])
    return retained / np.sum(eigenvalues)

In [4]:
# Try the functions above on the data in iris.csv.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('datasets/iris.csv')
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [5]:
# Separate the class labels from the remaining columns, which form the feature matrix.
target = df['variety'].values
features = df.drop(columns='variety').values

In [6]:
# Project the features onto the first 3 principal components.
# The returned eigenvalues/eigenvectors cover ALL components (sorted by
# descending eigenvalue), not only the 3 retained for the projection.
principal_components, eigenvalues, eigenvectors = custom_pca(features, n_components=3)

In [7]:
# Cumulative variation captured by the first 1, 2 and 3 principal components
for i in range(1, 4):
    pct = round(variation_captured(eigenvalues, i) * 100, 2)
    print(f'Variation captured by {i} principal components: {pct}%')

Variation captured by 1 principal components: 72.96%
Variation captured by 2 principal components: 95.81%
Variation captured by 3 principal components: 99.48%


In [8]:
#Eigenvalues and Eigenvectors of the covariance matrix
# Column i of `eigenvectors` belongs to eigenvalues[i], so walk the pairs
# directly. Keying a dict by eigenvalue would silently drop an eigenvector
# whenever two eigenvalues are equal.
print("*"*100)
for eigenvalue, eigenvector in zip(eigenvalues, eigenvectors.T):
    print(f'Eigenvalue: {round(eigenvalue,3)}')
    print(f'Eigenvector: {eigenvector}')
    print("*"*100)

****************************************************************************************************
Eigenvalue: 2.938
Eigenvector: [ 0.52106591 -0.26934744  0.5804131   0.56485654]
****************************************************************************************************
Eigenvalue: 0.92
Eigenvector: [-0.37741762 -0.92329566 -0.02449161 -0.06694199]
****************************************************************************************************
Eigenvalue: 0.148
Eigenvector: [-0.71956635  0.24438178  0.14212637  0.63427274]
****************************************************************************************************
Eigenvalue: 0.021
Eigenvector: [ 0.26128628 -0.12350962 -0.80144925  0.52359713]
****************************************************************************************************


In [9]:
# Name the columns X_1..X_k from the actual width of the projection, so this
# cell keeps working if n_components in the PCA call is changed.
data_converted = pd.DataFrame(
    principal_components,
    columns=[f'X_{k}' for k in range(1, principal_components.shape[1] + 1)],
)
print("The updated features look like this:- ")
data_converted.head()

The updated features look like this:- 


Unnamed: 0,X_1,X_2,X_3
0,-2.264703,-0.480027,-0.127706
1,-2.080961,0.674134,-0.234609
2,-2.364229,0.341908,0.044201
3,-2.299384,0.597395,0.09129
4,-2.389842,-0.646835,0.015738


In [10]:
# Wrap the class labels in a single-column DataFrame so they can be joined
# to the projected features.
label_converted = pd.DataFrame(target,columns=['variety'])
label_converted.head()

Unnamed: 0,variety
0,Setosa
1,Setosa
2,Setosa
3,Setosa
4,Setosa


In [11]:
#Merging data_converted and label_converted side by side (axis=1 joins columns, aligned on the row index)
print("The final dataset looks like this:-")
final_df = pd.concat([data_converted, label_converted], axis=1)
final_df.head() 

The final dataset looks like this:-


Unnamed: 0,X_1,X_2,X_3,variety
0,-2.264703,-0.480027,-0.127706,Setosa
1,-2.080961,0.674134,-0.234609,Setosa
2,-2.364229,0.341908,0.044201,Setosa
3,-2.299384,0.597395,0.09129,Setosa
4,-2.389842,-0.646835,0.015738,Setosa
