### Code used in Industry - Using Package

In [1]:
# import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

Load the dataset using `load_iris`

In [2]:
from sklearn.datasets import load_iris
X,y=load_iris(return_X_y=True)
X.shape

(150, 4)

Check if our data is in standard normal form

In [4]:
X_df=pd.DataFrame(X)

In [5]:
X_df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


Not in std normal form, => convert into std normal form

In [7]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_std=scaler.fit_transform(X_df)

In [8]:
X=pd.DataFrame(X_std)

In [9]:
X.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,-1.468455e-15,-1.823726e-15,-1.610564e-15,-9.473903e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


Now we can see that our data is in std normal form.
Mean -> `0`
Standard Deviation -> `1`

**We can perform PCA now .**

In [None]:
pca=PCA(n_components=2) #we are saying give us only PC1 and PC2, no need for PC3 and PC4
X_pca=pca.fit_transform(X)
print(X_pca.shape)
X_pca

(150, 2)


array([[-2.26470281,  0.4800266 ],
       [-2.08096115, -0.67413356],
       [-2.36422905, -0.34190802],
       [-2.29938422, -0.59739451],
       [-2.38984217,  0.64683538],
       [-2.07563095,  1.48917752],
       [-2.44402884,  0.0476442 ],
       [-2.23284716,  0.22314807],
       [-2.33464048, -1.11532768],
       [-2.18432817, -0.46901356],
       [-2.1663101 ,  1.04369065],
       [-2.32613087,  0.13307834],
       [-2.2184509 , -0.72867617],
       [-2.6331007 , -0.96150673],
       [-2.1987406 ,  1.86005711],
       [-2.26221453,  2.68628449],
       [-2.2075877 ,  1.48360936],
       [-2.19034951,  0.48883832],
       [-1.898572  ,  1.40501879],
       [-2.34336905,  1.12784938],
       [-1.914323  ,  0.40885571],
       [-2.20701284,  0.92412143],
       [-2.7743447 ,  0.45834367],
       [-1.81866953,  0.08555853],
       [-2.22716331,  0.13725446],
       [-1.95184633, -0.62561859],
       [-2.05115137,  0.24216355],
       [-2.16857717,  0.52714953],
       [-2.13956345,

**Let us extract the PC1**

In [17]:
print(X_pca[:,0].shape)
print(X_pca[:,0])

(150,)
[-2.26470281 -2.08096115 -2.36422905 -2.29938422 -2.38984217 -2.07563095
 -2.44402884 -2.23284716 -2.33464048 -2.18432817 -2.1663101  -2.32613087
 -2.2184509  -2.6331007  -2.1987406  -2.26221453 -2.2075877  -2.19034951
 -1.898572   -2.34336905 -1.914323   -2.20701284 -2.7743447  -1.81866953
 -2.22716331 -1.95184633 -2.05115137 -2.16857717 -2.13956345 -2.26526149
 -2.14012214 -1.83159477 -2.61494794 -2.44617739 -2.10997488 -2.2078089
 -2.04514621 -2.52733191 -2.42963258 -2.16971071 -2.28647514 -1.85812246
 -2.5536384  -1.96444768 -2.13705901 -2.0697443  -2.38473317 -2.39437631
 -2.22944655 -2.20383344  1.10178118  0.73133743  1.24097932  0.40748306
  1.0754747   0.38868734  0.74652974 -0.48732274  0.92790164  0.01142619
 -0.11019628  0.44069345  0.56210831  0.71956189 -0.0333547   0.87540719
  0.35025167  0.15881005  1.22509363  0.1649179   0.73768265  0.47628719
  1.2341781   0.6328582   0.70266118  0.87427365  1.25650912  1.35840512
  0.66480037 -0.04025861  0.13079518  0.02345

In [19]:
evr=pca.explained_variance_ratio_
evr

array([0.72962445, 0.22850762])

### We can clearly see that :
- PC1 contains `72.96%` of the data variance
and 
- PC2 contains `22.85%` of the data variance

**Thus, PC1 and PC2 together contain `95.81%` of the data variance**

**which is very good because we reduced the dimensionality from 4 to 2 and still retained `95.81%` of the data variance**