# Principle Component Analysis

In [2]:
import numpy as np  
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

### Load the iris dataset

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
df = pd.read_csv(url, names=names)
df.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Extract data from dataframe

In [4]:
x = df.drop('Class', axis=1)
y = df['Class']

### Split the data in 70:30 ratio

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)

### Scale the data

In [6]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### use Principle Component Analysis for transform x_train & x_test

In [7]:
pca = PCA()
x_train = pca.fit_transform(x_train)
x_test = pca.fit_transform(x_test)

### Take a look at variance explained by each Principle Component

In [8]:
explained_varience = pca.explained_variance_ratio_
explained_varience

array([0.77039307, 0.17850944, 0.0432369 , 0.00786059])

### takes no. of component and create RFC Model and calculate accurancy

In [9]:
def perform_pca(n):
    
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)
    
    pca = PCA(n_components=n)
    pca_x_train = pca.fit_transform(x_train)
    pca_x_test = pca.transform(x_test)

    rfc_model = RandomForestClassifier(max_depth=2, random_state=0)
    rfc_model.fit(pca_x_train, y_train)
    y_pred = rfc_model.predict(pca_x_test)

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(f"Accuracy is : {accuracy_score(y_test, y_pred)}\n")

for n in range(1,5): 
    perform_pca(n)

[[16  0  0]
 [ 0 15  3]
 [ 0  1 10]]
Accuracy is : 0.9111111111111111

[[16  0  0]
 [ 0 15  3]
 [ 0  1 10]]
Accuracy is : 0.9111111111111111

[[16  0  0]
 [ 0  9  9]
 [ 0  1 10]]
Accuracy is : 0.7777777777777778

[[16  0  0]
 [ 0 15  3]
 [ 0  1 10]]
Accuracy is : 0.9111111111111111

