In [1]:
# TP2: PCA - PRINCIPAL COMPONENT ANALYSIS (Manual Implementation)
# Purpose: Reduce dimensionality of data while preserving variance
# Use case: Simplify data, remove correlation, visualize high-dimensional data
# Method: Manual implementation using eigenvalue decomposition

# Import necessary libraries
import os  # For the optional dataset-path override read from the environment

import numpy as np  # For numerical operations and linear algebra
import pandas as pd  # For data manipulation

# STEP 1: Load the dataset
# The dataset should contain only numerical (continuous) features.
# The CSV location can be overridden with the ACP_CSV_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
DATA_PATH = os.environ.get("ACP_CSV_PATH", "C:/Users/ASUS/Downloads/acp.csv")
dataset = pd.read_csv(DATA_PATH)
print(dataset)

# STEP 2: Center the data (subtract the column mean from each column)
# PCA works on centered data: removing the mean takes out the location
# effect so that only the spread (variance) of the data remains.
m = dataset.mean()  # Per-column mean (a Series indexed by column name)
# NOTE: the spelling `x_cnetre` (sic) is kept because later cells refer to it.
x_cnetre = dataset - m  # The means are subtracted column by column
print(x_cnetre)  # Each column of the centered data now has mean ≈ 0


    x1   x2
0  2.5  2.4
1  0.5  0.7
2  2.2  2.9
3  1.9  2.2
4  3.1  3.0
5  2.3  2.7
6  2.0  1.6
7  1.0  1.1
8  1.5  1.6
9  1.1  0.9
     x1    x2
0  0.69  0.49
1 -1.31 -1.21
2  0.39  0.99
3  0.09  0.29
4  1.29  1.09
5  0.49  0.79
6  0.19 -0.31
7 -0.81 -0.81
8 -0.31 -0.31
9 -0.71 -1.01


In [2]:
# STEP 3: Covariance matrix of the centered data
# Entry (i, j) measures how feature i and feature j vary together:
#   - diagonal entries     -> variance of each individual feature
#   - off-diagonal entries -> covariance between two different features
# ddof=1 is pandas' default (sample covariance, divisor n - 1); it is spelled
# out here only to make the normalisation visible.
cov_matrix = x_cnetre.cov(ddof=1)
cov_matrix  # Last expression of the cell: rendered as the cell output


Unnamed: 0,x1,x2
x1,0.616556,0.615444
x2,0.615444,0.716556


In [3]:
# STEP 4: Calculate eigenvalues and eigenvectors
# Eigenvalues: variance explained by each principal component
# Eigenvectors: direction of each principal component (one per column)
# A covariance matrix is symmetric, so np.linalg.eigh is the appropriate
# routine: it always returns real eigenvalues and orthonormal eigenvectors,
# whereas the general-purpose np.linalg.eig gives no such guarantee.
# NOTE: the sign of an eigenvector is arbitrary; a flipped sign only mirrors
# the corresponding PC scores and does not change the reconstruction.
valp, vecp = np.linalg.eigh(np.asarray(cov_matrix))

# STEP 5: Sort eigenpairs by eigenvalue (descending order)
# We want principal components ordered by importance (variance explained).
idx = valp.argsort()[::-1]  # Indices that sort the eigenvalues descending
valp = valp[idx]  # Keep eigenvalues in the same order as their eigenvectors
vecp = vecp[:, idx]  # Reorder eigenvector columns accordingly

# Store eigenvectors as principal component directions.
# Column labels are generated (pca1, pca2, ...) so any number of features works.
pc_names = ['pca{}'.format(k + 1) for k in range(vecp.shape[1])]
v = pd.DataFrame(data=vecp, columns=pc_names)
v  # Display eigenvectors (principal component directions)


Unnamed: 0,pca1,pca2
0,-0.677873,-0.735179
1,-0.735179,0.677873


In [4]:
# STEP 6: Transform data to principal component space
# Project the centered data onto the principal directions:
#   scores = centered_data × eigenvectors
# Each column of the result holds the coordinates along one component.
pc_scores = np.dot(x_cnetre, v)

# Label the score columns z1, z2, ... according to how many components `v`
# actually contains, instead of assuming exactly two.
score_names = ['z{}'.format(k + 1) for k in range(pc_scores.shape[1])]
x_pca = pd.DataFrame(pc_scores, columns=score_names)
x_pca  # Display transformed data (PC scores)


Unnamed: 0,z1,z2
0,-0.82797,-0.175115
1,1.77758,0.142857
2,-0.992197,0.384375
3,-0.27421,0.130417
4,-1.675801,-0.209498
5,-0.912949,0.175282
6,0.099109,-0.349825
7,1.144572,0.046417
8,0.438046,0.017765
9,1.223821,-0.162675


In [5]:
# STEP 7: Dimensionality reduction (keep the first principal component only)
# 'z1' is the score column of the highest-variance component; all other
# score columns are dropped here.
# A list selector is used so the result stays a one-column DataFrame
# rather than collapsing to a Series.
x_pc1 = x_pca.loc[:, ['z1']]
x_pc1


Unnamed: 0,z1
0,-0.82797
1,1.77758
2,-0.992197
3,-0.27421
4,-1.675801
5,-0.912949
6,0.099109
7,1.144572
8,0.438046
9,1.223821


In [None]:
# STEP 8: Reconstruct data from first PC only
# Inverse transformation: go back to the original feature space.
# This shows the approximation of the original data obtained with only 1 PC.
# The first eigenvector is stored under its own name so the full eigenvector
# table `v` is left intact and earlier cells can be re-run safely.
v_pc1 = v[['pca1']]

# Inverse transformation: PC_scores × eigenvector_transpose
x_rec_centered = np.dot(x_pc1, v_pc1.transpose())

# Add back the mean (de-centering). Column labels are taken from the index of
# the mean Series `m`, so they match the original features whatever their
# names or number.
x_rec = pd.DataFrame(x_rec_centered, columns=m.index) + m
x_rec  # Display reconstructed data (approximation of original)
