# Principal Component Analysis

An attempt to reduce the dimensionality of data by effectively performing a change of basis. Reducing the number of features can decrease training time for some algorithms.
However, it can be computationally expensive and may discard some information.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [6]:
# Load the 'iris' example dataset through seaborn.
iris = sns.load_dataset('iris')

# Work on an independent copy so the frame that was loaded stays untouched.
CI = iris.copy(deep=True)

# Preview the first rows to see which columns hold measurements and which
# one holds the label.
CI.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [33]:
# Separate the frame into the feature columns and the label column.
data = CI.drop(columns='species')   # everything except the label
labels = CI.loc[:, 'species']       # the label column on its own

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#splitting the data and normalizing it.
def split_and_scale(data, labels, size, random_state=0):
    """Split the data into train/test parts and standardize the features.

    Parameters
    ----------
    data : features, passed straight to ``train_test_split``.
    labels : targets, passed straight to ``train_test_split``.
    size : forwarded as ``test_size``, i.e. the share of the data held out
        for testing (``0.2`` means 80% train / 20% test).
    random_state : seed forwarded to ``train_test_split``. The default of
        ``0`` reproduces the split this function always used.

    Returns
    -------
    data_train, data_test, labels_train, labels_test
        The two feature sets are scaled; the labels are returned as split.
    """
    data_train, data_test, labels_train, labels_test = train_test_split(
        data, labels, test_size=size, random_state=random_state
    )

    # PCA works best on standardized data, so scale before handing it on.
    scaler = StandardScaler()
    # Fit the scaler on the training part only ...
    data_train = scaler.fit_transform(data_train)
    # ... and reuse that fit for the test part so both are scaled consistently.
    data_test = scaler.transform(data_test)
    return data_train, data_test, labels_train, labels_test

In [12]:
from sklearn.decomposition import PCA

def PCA_initialization(n_components,train,test):
    """Fit a PCA on the training data and project both data sets with it.

    Parameters
    ----------
    n_components : forwarded unchanged to ``PCA(n_components=...)``.
    train : data the PCA is fitted on and that is then transformed.
    test : data transformed with the PCA fitted on ``train``.

    Returns
    -------
    (transformed train, transformed test, explained variance ratio of the fit)
    """
    reducer = PCA(n_components=n_components)
    # The fit happens on the training data only; the test data just reuses it.
    projected_train = reducer.fit_transform(train)
    projected_test = reducer.transform(test)
    return projected_train, projected_test, reducer.explained_variance_ratio_

In [16]:
# With n_components=None the PCA keeps every component: the data is still
# projected onto the principal axes, but no dimension is dropped, so the
# explained variance ratio of all components can be inspected.

scaled_train, scaled_test, labels_train, labels_test = split_and_scale(data, labels, size=0.2)

data_train, data_test, explained_variance = PCA_initialization(
    n_components=None, train=scaled_train, test=scaled_test
)
explained_variance

# Here we see that the first principal component explains about 0.72 of the
# variance, the second about 0.24, etc.

array([0.72229951, 0.2397406 , 0.03335483, 0.00460506])

In [35]:
data_train_collection = []
data_test_collection = []
labels_train_collection = []
labels_test_collection = []
EV_collection = []

# split_and_scale is called with the same arguments for every run and seeds
# its split, so the result never changes: compute it once, outside the loop.
scaled_train, scaled_test, labels_train, labels_test = split_and_scale(data, labels, 0.2)

# PCA can keep at most min(n_samples, n_features) components.
max_components = min(scaled_train.shape)

for i in range(1, max_components + 1):
    data_train, data_test, explained_variance = PCA_initialization(i, scaled_train, scaled_test)
    # Every PCA_initialization call fits a new PCA and returns new arrays,
    # so they can be stored as they are.
    data_train_collection.append(data_train)
    data_test_collection.append(data_test)
    # The labels are shared by all runs; store a copy per run so the entries
    # stay independent of each other.
    labels_train_collection.append(labels_train.copy())
    labels_test_collection.append(labels_test.copy())
    EV_collection.append(explained_variance)
    print(f'For {i} components the principle variance is {explained_variance}')

# Note: run i keeps only the first i principal components, so the number of
# features in the transformed data equals the number of components requested.

For 1 components the principle variance is [0.72229951]
For 2 components the principle variance is [0.72229951 0.2397406 ]
For 3 components the principle variance is [0.72229951 0.2397406  0.03335483]
For 4 components the principle variance is [0.72229951 0.2397406  0.03335483 0.00460506]


In [39]:
#example with classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def classifier_accuracy(data_train, data_test, labels_train, labels_test):
    """Train a random forest and report its accuracy on the test set.

    Parameters
    ----------
    data_train, labels_train : data and labels the classifier is fitted on.
    data_test, labels_test : data to predict on and the labels to compare
        the predictions against.

    Returns
    -------
    The value computed by ``accuracy_score(labels_test, predictions)``.
    It is also printed, as before; returning it lets callers collect and
    compare the scores instead of reading them off the output.
    """
    # Fixed depth and seed so repeated runs give the same model.
    classifier = RandomForestClassifier(max_depth=2, random_state=0)
    classifier.fit(data_train, labels_train)

    # Predict the test set and score the predictions against the true labels.
    labels_pred = classifier.predict(data_test)
    accuracy = accuracy_score(labels_test, labels_pred)
    print(f'Accuracy {accuracy}')
    return accuracy

In [40]:
# Evaluate the classifier once per stored PCA run. The four collections were
# filled in the same order, so zipping them pairs up the pieces of each run.
pca_runs = zip(
    data_train_collection,
    data_test_collection,
    labels_train_collection,
    labels_test_collection,
)
for data_train, data_test, labels_train, labels_test in pca_runs:
    classifier_accuracy(data_train, data_test, labels_train, labels_test)

Accuracy 0.9333333333333333
Accuracy 0.8
Accuracy 0.8
Accuracy 0.9
