# PCA manual implementation

Here, we implement Principal Component Analysis (or dimensionality reduction) for feature selection.

## Initial functions

In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
class PCA:
    
    def __init__(self, num_components):
        self.num_components = num_components
        self.components     = None
        self.mean           = None
        self.variance_share = None
    
    
    def fit(self, X):
        """
        Find principal components
        """
        
        # data centering
        self.mean = np.mean(X, axis = 0)
        X        -= self.mean
        
        # calculate eigenvalues & vectors
        cov_matrix      = np.cov(X.T)
        values, vectors = np.linalg.eig(cov_matrix)
        
        # sort eigenvalues & vectors 
        sort_idx = np.argsort(values)[::-1]
        values   = values[sort_idx]
        vectors  = vectors[:, sort_idx]
        
        # store principal components & variance
        self.components = vectors[:self.num_components]
        self.variance_share = np.sum(values[:self.num_components]) / np.sum(values)
    
    
    def transform(self, X):
        """
        Transform data
        """
        
        # data centering
        X -= self.mean
        
        # decomposition
        return np.dot(X, self.components.T)

## Dataset Operations

In [3]:
from sklearn.preprocessing import OneHotEncoder
df = pd.read_csv("cleaned_shifted_data.csv")
    
oe = OneHotEncoder(sparse=False)
encoded = oe.fit_transform(pd.DataFrame(df['Station']))

In [4]:
oe.get_feature_names()

array(['x0_IITG ', 'x0_LGBI Airport ', 'x0_Pan Bazaar ',
       'x0_Railway Colony '], dtype=object)

In [5]:

one_hot_df = pd.DataFrame(encoded, columns=oe.get_feature_names())
df = pd.concat([df, one_hot_df], axis=1)

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['year'] = df['Timestamp'].dt.year
df['month'] = df['Timestamp'].dt.month
df['dayofweek'] = df['Timestamp'].dt.day_of_week

drop_cols = [0,1,2,12,14,16]
drop_cols = df.columns[drop_cols]
df.drop(drop_cols,axis=1,inplace=True)

X = df.drop('AQI_calculated_shifted',axis = 1)
y = df['AQI_calculated_shifted']

In [6]:
print(X.shape, y.shape)

(174762, 17) (174762,)


## PCA Functions

In [7]:
# carry out PCA to reduce the number of features from 10 to 7 components
pca = PCA(num_components = 7)  # initialize PCA object
pca.fit(X) # fit PCA on old data
X_pca = pca.transform(X) # transform datasets
X_pca

array([[ -1.13953143,  37.03042904, -58.15373783, ...,  11.9386136 ,
        -50.07669542,  13.07018644],
       [ -0.93831225,  36.90182928, -58.03797556, ...,  12.23933133,
        -49.83941661,   9.99124985],
       [ -0.61468367,  36.52321966, -58.09325186, ...,  12.29396125,
        -50.31117511,  12.02783208],
       ...,
       [  8.37122392,  30.68552213,  -1.69996473, ..., -13.40586962,
        -42.72771882,  39.25524837],
       [  7.9843526 ,  32.26506977, -36.15169875, ...,   3.02848392,
        -46.17233121,  36.62568228],
       [ -8.65880869, -40.55730187, -46.25307474, ...,  -0.26595098,
        -49.52923117,  34.90827246]])

In [8]:
print(X_pca.shape)

(174762, 7)


## Explained Variance

In [9]:
# check explained variance
print(f"Explained variance: {pca.variance_share:.4f}")

Explained variance: 0.9967
