# PCA manual implementation

Here, we implement Principal Component Analysis (or dimensionality reduction) for feature selection.

## Initial functions

In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [None]:
class PCA:
    
    def __init__(self, num_components):
        self.num_components = num_components
        self.components     = None
        self.mean           = None
        self.variance_share = None
    
    
    def fit(self, X):
        """
        Find principal components
        """
        
        # data centering
        self.mean = np.mean(X, axis = 0)
        X        -= self.mean
        
        # calculate eigenvalues & vectors
        cov_matrix      = np.cov(X.T)
        values, vectors = np.linalg.eig(cov_matrix)
        
        # sort eigenvalues & vectors 
        sort_idx = np.argsort(values)[::-1]
        values   = values[sort_idx]
        vectors  = vectors[:, sort_idx]
        
        # store principal components & variance
        self.components = vectors[:self.num_components]
        self.variance_share = np.sum(values[:self.num_components]) / np.sum(values)
    
    
    def transform(self, X):
        """
        Transform data
        """
        
        # data centering
        X -= self.mean
        
        # decomposition
        return np.dot(X, self.components.T)

## Dataset Operations

In [21]:
# Importing dataset     
df = pd.read_csv( "../ML-Project-CS361/cleaned_shifted_data.csv" ) 
drop_cols = [0,1,2,12,14,16]
drop_cols = df.columns[drop_cols]
drop_cols # Dropping unecessary columns

Index(['Timestamp', 'Unnamed: 0', 'Station', 'Checks', 'AQI_bucket_calculated',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [22]:
# Drop the columns and make a new dataframe df1
df.drop(columns=drop_cols, inplace=True)

In [23]:
# The dataset has a size of 1,74,762 records, 10 features, 1 target variable
print(df.shape) 

(174762, 11)


In [24]:
# Separating the features and labels/target variables
X = df.drop('AQI_calculated_shifted',axis = 1)  # feature set
Y = df['AQI_calculated_shifted'] # target variable
print(X.shape)
print(Y.shape)

(174762, 10)
(174762,)


In [25]:
# Standardize the Data
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[-0.2284613 , -0.27947897, -0.40159894, ..., -0.46348286,
        -0.37032615, -0.70256623],
       [-0.2284613 , -0.27947897, -0.38000926, ..., -0.44749643,
        -0.38256664, -0.69301701],
       [-0.23480768, -0.2801947 , -0.37377224, ..., -0.44749643,
        -0.39087269, -0.69301701],
       ...,
       [ 0.18906354,  0.02470543, -0.16123382, ...,  0.06406919,
         1.86356376,  1.01629424],
       [ 0.22246553,  0.02470543, -0.16123382, ...,  0.11202847,
         0.88869596,  0.93035122],
       [ 0.17236255,  1.06251102, -0.16123382, ...,  0.15998775,
         0.53459599,  0.82530974]])

## PCA Functions

In [26]:
# carry out PCA to reduce the number of features from 10 to 7 components
pca = PCA(num_components = 7)  # initialize PCA object
pca.fit(X) # fit PCA on old data
X_pca = pca.transform(X) # transform datasets
X_pca

array([[ 0.15017421,  1.03560126, -1.25877373, ..., -0.14352518,
        -1.06010037, -0.4568703 ],
       [ 0.15744473,  1.08573516, -1.3682673 , ..., -0.23954184,
        -1.25922592, -0.63391133],
       [ 0.13229186,  1.0522327 , -1.28195604, ..., -0.17073515,
        -1.11831908, -0.55303796],
       ...,
       [ 1.15463999, -1.12783814,  0.85077822, ..., -0.31840259,
        -0.2475441 ,  0.22921917],
       [ 0.70140958, -0.83965458,  0.54882366, ..., -0.693188  ,
         0.60013108,  0.37158456],
       [ 0.12059157, -0.96928581,  1.00131221, ..., -0.10002021,
         0.38198446,  0.50255283]])

In [27]:
print(X_pca.shape)

(174762, 7)


## Explained Variance

In [28]:
# check explained variance
print(f"Explained variance: {pca.variance_share:.4f}")

Explained variance: 0.9473
