# PCA from first principles: eigendecomposition of the covariance matrix

In [1]:
import pandas as pd
import numpy as np

## Step 1: Define your student marks

In [None]:
# Raw exam marks: one list per subject, one entry per student.
# Keyword order fixes the column order used later (math, eng, art).
data = dict(
    math=[90, 90, 60, 60, 30],
    eng=[60, 90, 60, 60, 30],
    art=[90, 30, 60, 90, 30],
)

In [3]:
# Build the marks table: one row per student, one column per subject.
student_labels = [f'Student_{n}' for n in range(5)]
subject_columns = ['math', 'eng', 'art']
df = pd.DataFrame(data, index=student_labels, columns=subject_columns)

# Header line, the table itself, then a blank separator line.
print("Step 1: Original data (5 students, 3 subjects):", df, "", sep="\n")

Step 1: Original data (5 students, 3 subjects):
           math  eng  art
Student_0    90   60   90
Student_1    90   90   30
Student_2    60   60   60
Student_3    60   60   90
Student_4    30   30   30



## Step 2: Center the data using pandas .mean()

In [None]:
# Average mark of each subject (one value per column).
mu = df.mean(axis='index')
# Subtract each subject's mean from every student's mark in that subject;
# mu is matched against df's column labels.
df_centered = df.sub(mu, axis='columns')

print("Step 2: Centered data (mean subtracted):")
print(df_centered, end="\n\n")

Step 2: Centered data (mean subtracted):
           math   eng   art
Student_0  24.0   0.0  30.0
Student_1  24.0  30.0 -30.0
Student_2  -6.0   0.0   0.0
Student_3  -6.0   0.0  30.0
Student_4 -36.0 -30.0 -30.0



## Step 3: Standardize using pandas .std()

In [None]:
# Sample standard deviation (n - 1 in the denominator) of each subject.
std = df_centered.std(axis='index', ddof=1)
# Scale every column by its own standard deviation.
# NOTE: a subject with identical marks for all students has std == 0 and
# would produce NaN here (0 / 0); this data set has no such column.
df_scaled = df_centered.div(std, axis='columns')

print("Step 3: Standardized data (zero mean, unit std):", df_scaled, "", sep="\n")

Step 3: Standardized data (zero mean, unit std):
               math       eng  art
Student_0  0.956183  0.000000  1.0
Student_1  0.956183  1.414214 -1.0
Student_2 -0.239046  0.000000  0.0
Student_3 -0.239046  0.000000  1.0
Student_4 -1.434274 -1.414214 -1.0



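## Step 4: Compute the covariance matrix using pandas df.cov()

Because the data was standardized in Step 3, this covariance matrix is also the correlation matrix of the original marks (note the 1.0 diagonal).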

In [None]:
# Subject-by-subject sample covariance (n - 1 denominator), kept as a
# labelled DataFrame for printing. The columns of df_scaled were scaled to
# unit standard deviation in Step 3, so the diagonal is 1.
Sigma_df = df_scaled.cov(ddof=1)
# Plain ndarray copy of the same numbers, needed by np.linalg.eigh below.
Sigma = Sigma_df.to_numpy()

print("Step 4: Covariance matrix Σ (from df_scaled.cov()):")
print(Sigma_df, end="\n\n")

Step 4: Covariance matrix Σ (from df_scaled.cov()):
          math       eng       art
math  1.000000  0.845154  0.298807
eng   0.845154  1.000000  0.000000
art   0.298807  0.000000  1.000000



## Step 5: Compute eigenvalues and eigenvectors (still need NumPy here)

In [None]:
# Eigendecomposition of the symmetric matrix Sigma.
# Column j of W is the eigenvector belonging to eigenvals[j].
eigenvals, W = np.linalg.eigh(Sigma)

# Reorder so the largest eigenvalue (most variance) comes first, and keep
# the eigenvector columns in step with their eigenvalues.
idx = eigenvals.argsort()[::-1]
eigenvals, W = eigenvals[idx], W[:, idx]

print("Step 5: Eigenvalues (variances along PCs):", eigenvals, "", sep="\n")
print("Step 5: Eigenvectors (principal components, columns):", W, "", sep="\n")

Step 5: Eigenvalues (variances along PCs):
[1.89642146 1.         0.10357854]

Step 5: Eigenvectors (principal components, columns):
[[-0.70710678  0.         -0.70710678]
 [-0.66666667 -0.33333333  0.66666667]
 [-0.23570226  0.94280904  0.23570226]]



## Step 6: Choose first k=2 principal components

In [None]:
# Number of principal components to keep.
k = 2
# The k leading columns of W, i.e. the eigenvectors with the largest
# eigenvalues after the descending sort in Step 5 (3 subjects x 2 PCs).
W_k = W[:, :k]

print("Step 6: Projection matrix W_k (first 2 PCs):", W_k, "", sep="\n")


Step 6: Projection matrix W_k (first 2 PCs):
[[-0.70710678  0.        ]
 [-0.66666667 -0.33333333]
 [-0.23570226  0.94280904]]



## Step 7: Project data onto 2D PCA space

In [None]:
# Standardized marks as a plain ndarray: one row per student, one column
# per subject.
X_scaled = df_scaled.to_numpy()
# Project every student onto the retained principal components:
# (students x subjects) @ (subjects x k) -> (students x k).
X_reduced = X_scaled @ W_k

# Derive the labels from the actual shape of the projection instead of
# hard-coding ['PC1', 'PC2']: with fixed labels, changing k in Step 6 makes
# the DataFrame constructor raise a shape-mismatch ValueError.
n_students, n_components = X_reduced.shape
pc_labels = [f'PC{i + 1}' for i in range(n_components)]

df_pca = pd.DataFrame(
    X_reduced,
    index=df.index,  # keep the original student labels
    columns=pc_labels,
)

# Caption reports the real dimensions; for 5 students and k = 2 the text is
# identical to the previous hard-coded caption.
print(f"Step 7: PCA‑reduced data ({n_students} students × {n_components} principal components):")
print(df_pca)
print()

Step 7: PCA‑reduced data (5 students × 2 principal components):
                PC1       PC2
Student_0 -0.911826  0.942809
Student_1 -1.383230 -1.414214
Student_2  0.169031  0.000000
Student_3 -0.066671  0.942809
Student_4  2.192696 -0.471405



## Step 8: Explained variance (optional)

In [None]:
# Total variance of the standardized data = sum of all eigenvalues.
total_var = np.sum(eigenvals)
# Share of the total variance carried by each of the first k components.
explained_ratio = eigenvals[:k] / total_var

print("Step 8: Explained variance ratio (PC1, PC2):", explained_ratio, "", sep="\n")

# Variance retained by the k-dimensional projection as a whole.
cumulative_ratio = explained_ratio.sum()
print("    Cumulative (PC1+PC2):", cumulative_ratio)

Step 8: Explained variance ratio (PC1, PC2):
[0.63214049 0.33333333]

    Cumulative (PC1+PC2): 0.965473819000265
