In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [96]:
X, y = load_digits(return_X_y=True)

In [97]:
class PCA_:
    def __init__(self,n_components):
        self.n_components=n_components
        self.components=None
        self.retained_variance=None
        
    def _centering(self,x):
        mean=np.mean(x,axis=0)
        return x- mean
        
    def fit(self,x):
        
        #center data by subtract the mean
        x_centered=self._centering(x)
        #get covariance matrix
        cov_matrix=np.cov(x_centered,rowvar=False)
        #get eigen values and eigen vectors
        eigen_vals,eigen_vectors =np.linalg.eig(cov_matrix)
    
        # eigen vectors already return sorted so i will not sort it again
        # sorted_indecies=np.argsort(eigen_vals)[::-1]
        # eigen_vals=eigen_vals[sorted_indecies]
        # eigen_vectors=eigen_vectors[:,sorted_indecies]
    
        pca_components=eigen_vectors[:,:self.n_components]
        self.components=pca_components
        
        total_variance=np.sum(eigen_vals)
        self.retained_variance=(np.sum(eigen_vals[:self.n_components])/total_variance)*100
        
    def fit_transform(self,x):
        self.fit(x)
        return self.transform(x)

        
    def transform(self,xt):
        x_centered=self._centering(xt)
        x_reduced=np.dot(x_centered,self.components)
        return x_reduced

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [99]:
pca_=PCA_(n_components=10)

In [100]:
pca_.fit(X_train)

In [101]:
xtrain_pca=pca_.transform(X_train)
xtest_pca=pca_.transform(X_test)

In [102]:
xtrain_pca.shape

(1347, 10)

In [103]:
xtest_pca.shape

(450, 10)

In [104]:
# prediction
clf = LogisticRegression(max_iter=10000)
clf.fit(xtrain_pca, y_train)

In [105]:
pca_.n_components

10

In [106]:
y_pred = clf.predict(xtest_pca)
print(accuracy_score(y_test, y_pred))

0.9177777777777778


In [107]:
pca_.retained_variance

np.float64(73.85968534346992)

In [108]:
# --- PCA ---
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

clf_pca = LogisticRegression(max_iter=1000)
clf_pca.fit(X_train_pca, y_train)
y_pred_pca = clf_pca.predict(X_test_pca)

# Accuracy
print("Accuracy with SKLearn PCA:", accuracy_score(y_test, y_pred_pca))

Accuracy with SKLearn PCA: 0.9244444444444444
