In [96]:
import pandas as pd
import numpy as np 

In [97]:
np.random.seed(23)

muVec1 = np.array([0, 0, 0])
covMat1 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
class1Sample = np.random.multivariate_normal(muVec1, covMat1, 20)

df = pd.DataFrame(class1Sample, columns=["feature1", "feature2", "feature3"])
df["target"] = 1
muVec2 = np.array([1, 1, 1])
covMat2 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
class2Sample = np.random.multivariate_normal(muVec1, covMat1, 20)

df1 = pd.DataFrame(class2Sample, columns=["feature1", "feature2", "feature3"])
df1["target"] = 0
df = pd.concat([df, df1], ignore_index=True)
df = df.sample(40)

In [98]:
df.head()

Unnamed: 0,feature1,feature2,feature3,target
2,-0.367548,-1.13746,-1.322148,1
34,-0.822939,-1.598109,0.226512,0
14,0.420623,0.41162,-0.071324,1
11,1.968435,-0.547788,-0.679418,1
12,-2.50623,0.14696,0.606195,1


In [99]:
import plotly.express as px

px.scatter_3d(x=df["feature1"], y=df["feature2"], z=df["feature3"],color=df["target"])

### Apply Principle Component Analysis (PCA)

In [100]:
# step 1 - standard scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df.iloc[:, :3] = scaler.fit_transform(df.iloc[:, :3])

In [101]:
# step 2 - Find Covariance Matrix
covarianceMatrix = np.cov([df.iloc[:, 0], df.iloc[:,1], df.iloc[:,2]])
covarianceMatrix

array([[ 1.02564103,  0.06781177, -0.12497686],
       [ 0.06781177,  1.02564103, -0.15241116],
       [-0.12497686, -0.15241116,  1.02564103]])

In [102]:
# step 3 - Find Eigen Vector and Eigen Values
eigenValues, eigenVector = np.linalg.eig(covarianceMatrix)

In [103]:
eigenValues

array([1.25911792, 0.95953081, 0.85827434])

In [104]:
eigenVector

array([[-0.51038783, -0.78846385,  0.34326234],
       [-0.569092  ,  0.60894401,  0.55255904],
       [ 0.64470037, -0.08667156,  0.75950607]])

In [105]:
# For 2D Choose 2 Eigen Vectoe with Maximum eigen values
pc = eigenVector[0:2]
pc

array([[-0.51038783, -0.78846385,  0.34326234],
       [-0.569092  ,  0.60894401,  0.55255904]])

In [106]:
# Transform The Data From 3D to 2D
new2DDf = df.iloc[:,:3].dot(pc.T)
new2DDf.rename(columns={0:"PC1",1:"PC2"}, inplace=True)
new2DDf["target"] =  df["target"]
new2DDf.head()

Unnamed: 0,PC1,PC2,target
2,0.560815,-1.471948,1
34,1.740278,-0.6434,0
14,-0.706342,-0.080489,1
11,-0.786474,-1.876564,1
12,1.105696,1.622028,1


In [107]:
new2DDf.iloc[:,:2].var(), new2DDf.iloc[:,:2].std()

(PC1    1.206510
 PC2    0.954675
 dtype: float64,
 PC1    1.098413
 PC2    0.977075
 dtype: float64)

In [108]:
px.scatter(x=new2DDf.iloc[:,0],y=new2DDf.iloc[:,1], color=new2DDf.iloc[:,2])

In [109]:
# Similarly From 3D to 1D
pc = eigenVector[1]
new1DDF = pd.DataFrame(df.iloc[:, :3].dot(pc.T))
new1DDF.rename(columns={0: "PC1"}, inplace=True)
new1DDF["target"] = df["target"]
new1DDF.head()

Unnamed: 0,PC1,target
2,-1.471948,1
34,-0.6434,0
14,-0.080489,1
11,-1.876564,1
12,1.622028,1


### PCA on MNIST Dataset

In [110]:
df = pd.read_csv(r"The Data\train.csv")
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
px.imshow(df.iloc[13051,1:].values.reshape(28,28))

In [112]:
x = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [113]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [114]:
xTrain.shape

(33600, 784)

In [115]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(xTrain,yTrain)

In [116]:
import time
start = time.time()
yPred = knn.predict(xTest)
print(time.time() - start)

16.025180339813232


In [117]:
from sklearn.metrics import accuracy_score
accuracy_score(yTest, yPred)

0.9648809523809524

In [118]:
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

In [119]:
pd.DataFrame(xTrain).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.034769,-0.026785,-0.018757,-0.012827,-0.010173,-0.007711,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.034769,-0.026785,-0.018757,-0.012827,-0.010173,-0.007711,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.034769,-0.026785,-0.018757,-0.012827,-0.010173,-0.007711,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.034769,-0.026785,-0.018757,-0.012827,-0.010173,-0.007711,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.034769,-0.026785,-0.018757,-0.012827,-0.010173,-0.007711,0.0,0.0,0.0,0.0


In [120]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
xTrain = pca.fit_transform(xTrain)
xTest = pca.transform(xTest)

In [121]:
xTrain.shape

(33600, 100)

In [122]:
knn = KNeighborsClassifier()
knn.fit(xTrain,yTrain)

In [123]:
start = time.time()
yPred = knn.predict(xTest) 
print(time.time() - start)

1.0874614715576172


In [124]:
accuracy_score(yTest, yPred)

0.9538095238095238

### Visualization

In [125]:
pca = PCA(n_components=2)
xTraint = pca.fit_transform(xTrain)
xTestt = pca.transform(xTest)

In [126]:
xTraint

array([[-2.71872136, -0.48899993],
       [-0.67710289, -6.75243327],
       [-3.03309418,  6.50877028],
       ...,
       [ 2.14891545,  0.7801802 ],
       [ 1.05953352,  0.94777986],
       [17.70252852,  1.96197397]])

In [127]:
px.scatter(x=xTraint[:,0],y=xTraint[:,1],color=yTrain)

In [128]:
pca = PCA(n_components=3)
xTrain = pca.fit_transform(xTrain)
xTest = pca.transform(xTest)

In [129]:
xTrain

array([[-2.71862133, -0.49005038,  1.13503075],
       [-0.67697936, -6.75301986, -2.33487245],
       [-3.03322685,  6.50962654,  7.49133528],
       ...,
       [ 2.14881923,  0.78097762, -0.74642225],
       [ 1.05958429,  0.94725787,  3.94879954],
       [17.70259569,  1.96212223, -4.94352699]])

In [130]:
px.scatter_3d(x=xTrain[:,0],y=xTrain[:,1],z=xTrain[:,2],color=yTrain)

In [131]:
# Eigen values 
pca.explained_variance_

array([40.67111198, 29.17023396, 26.74459601])

In [132]:
# Eigen Vectors
pca.components_.shape

(3, 100)

In [133]:
pca.explained_variance_ratio_

array([0.0805567 , 0.05777707, 0.05297265])

In [134]:
df

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
# 90% Variance Should be Explained
x = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [136]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [137]:
x.shape

(42000, 784)

In [138]:
pca = PCA(n_components=None)
x = pca.fit_transform(x)

In [139]:
pca.explained_variance_

array([4.06964787e+01, 2.91114657e+01, 2.67833371e+01, 2.08147194e+01,
       1.81000206e+01, 1.57876737e+01, 1.38244007e+01, 1.25432643e+01,
       1.10638975e+01, 1.00889267e+01, 9.63617203e+00, 8.65579470e+00,
       8.04120472e+00, 7.88086691e+00, 7.43637560e+00, 7.16743699e+00,
       6.73538375e+00, 6.61651973e+00, 6.42354578e+00, 6.26826675e+00,
       5.93960379e+00, 5.74928832e+00, 5.48826880e+00, 5.32649477e+00,
       5.15217038e+00, 4.94730998e+00, 4.88853571e+00, 4.70777145e+00,
       4.46528559e+00, 4.36351702e+00, 4.32543150e+00, 4.22712324e+00,
       4.08726514e+00, 4.06176768e+00, 3.99903435e+00, 3.86804997e+00,
       3.81925839e+00, 3.71256507e+00, 3.57437538e+00, 3.45887625e+00,
       3.41436841e+00, 3.36945857e+00, 3.25693182e+00, 3.24008824e+00,
       3.18312949e+00, 3.16286640e+00, 3.14244041e+00, 3.09287815e+00,
       3.06368054e+00, 3.02342271e+00, 2.96849737e+00, 2.91830693e+00,
       2.84948896e+00, 2.82806029e+00, 2.79589667e+00, 2.76696531e+00,
      

In [140]:
a =np.cumsum(sorted(pca.explained_variance_ratio_, reverse=True))

In [141]:
len(a[a<=0.9])
# maximum feature should be 228 = len(a[a<=0.9])

228

### PCA Using SVD (Singular Value Decomposition)

In [142]:
from sklearn.datasets import load_iris
iris = load_iris()
x = iris.data
x.shape

(150, 4)

In [143]:
# Mean Centering
xMean = np.mean(x, axis=0)
Xc = x - xMean
Xc.shape

(150, 4)

In [144]:
u, s, vT = np.linalg.svd(Xc)
vT

array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
       [-0.65658877, -0.73016143,  0.17337266,  0.07548102],
       [ 0.58202985, -0.59791083, -0.07623608, -0.54583143],
       [ 0.31548719, -0.3197231 , -0.47983899,  0.75365743]])

In [145]:
v = vT.T
v # These are Required Eigen Vectors

array([[ 0.36138659, -0.65658877,  0.58202985,  0.31548719],
       [-0.08452251, -0.73016143, -0.59791083, -0.3197231 ],
       [ 0.85667061,  0.17337266, -0.07623608, -0.47983899],
       [ 0.3582892 ,  0.07548102, -0.54583143,  0.75365743]])

In [146]:
xTransformed = np.dot(Xc, v[:,:2]) # The PC1 and PC2
xTransformed = pd.DataFrame(xTransformed, columns=["PC1","PC2"])
xTransformed

Unnamed: 0,PC1,PC2
0,-2.684126,-0.319397
1,-2.714142,0.177001
2,-2.888991,0.144949
3,-2.745343,0.318299
4,-2.728717,-0.326755
...,...,...
145,1.944110,-0.187532
146,1.527167,0.375317
147,1.764346,-0.078859
148,1.900942,-0.116628


In [147]:
px.scatter(x=xTransformed["PC1"], y=xTransformed["PC2"],color=iris.target)

# End