# PCA manual implementation

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
class PCA:
    
    def __init__(self, num_components):
        self.num_components = num_components
        self.components     = None
        self.mean           = None
        self.variance_share = None
    
    
    def fit(self, X):
        """
        Find principal components
        """
        
        # data centering
        self.mean = np.mean(X, axis = 0)
        X        -= self.mean
        
        # calculate eigenvalues & vectors
        cov_matrix      = np.cov(X.T)
        values, vectors = np.linalg.eig(cov_matrix)
        
        # sort eigenvalues & vectors 
        sort_idx = np.argsort(values)[::-1]
        values   = values[sort_idx]
        vectors  = vectors[:, sort_idx]
        
        # store principal components & variance
        self.components = vectors[:self.num_components]
        self.variance_share = np.sum(values[:self.num_components]) / np.sum(values)
    
    
    def transform(self, X):
        """
        Transform data
        """
        
        # data centering
        X -= self.mean
        
        # decomposition
        return np.dot(X, self.components.T)

In [3]:
# Importing dataset     
df = pd.read_csv( "../ML-Project-CS361/cleaned_shifted_data.csv" ) 
drop_cols = [0,1,2,12,14,16]
drop_cols = df.columns[drop_cols]
drop_cols

Index(['Timestamp', 'Unnamed: 0', 'Station', 'Checks', 'AQI_bucket_calculated',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [4]:
# Drop the columns and make a new dataframe df1
df.drop(columns=drop_cols, inplace=True)

In [5]:
print(df.shape)

(174762, 11)


In [6]:
X_old = df.values #converting pandas df to np array

In [7]:
# Step 1: Standardize the Data
scaler = StandardScaler()
X_old = scaler.fit_transform(X_old)
X_old

array([[-0.2284613 , -0.27947897, -0.40159894, ..., -0.37032615,
        -0.70256623,  1.49358304],
       [-0.2284613 , -0.27947897, -0.38000926, ..., -0.38256664,
        -0.69301701,  1.50309952],
       [-0.23480768, -0.2801947 , -0.37377224, ..., -0.39087269,
        -0.69301701,  1.51261601],
       ...,
       [ 0.18906354,  0.02470543, -0.16123382, ...,  1.86356376,
         1.01629424,  0.76081353],
       [ 0.22246553,  0.02470543, -0.16123382, ...,  0.88869596,
         0.93035122,  0.77033001],
       [ 0.17236255,  1.06251102, -0.16123382, ...,  0.53459599,
         0.82530974,  0.77033001]])

In [8]:
# initialize PCA object
pca = PCA(num_components = 8)  # going from 11 components to 8

# fit PCA on old data
pca.fit(X_old)

# check explained variance
print(f"Explained variance: {pca.variance_share:.4f}")

Explained variance: 0.9557


In [9]:
# transform datasets
X_old = pca.transform(X_old)
print(X_old.shape)

(174762, 8)


In [10]:
X_old

array([[-0.00484615,  1.1270521 ,  0.87822209, ..., -0.37474863,
        -0.26732537,  0.67720497],
       [ 0.09766021,  1.24426063,  0.88052331, ..., -0.45278461,
        -0.42716843,  0.84328912],
       [ 0.02080232,  1.14134262,  0.88572797, ..., -0.40153736,
        -0.36330119,  0.73122791],
       ...,
       [ 1.18277395,  0.18107491,  0.06534939, ..., -0.66376994,
        -0.25179809, -1.05262943],
       [ 0.93074087, -0.34013869,  0.92170258, ...,  0.83614103,
         0.69411113, -0.75672463],
       [ 0.50473442, -0.70233713,  1.40261295, ...,  0.66622291,
         0.8220411 , -0.5789685 ]])