Covers PCA & CVT

# Principle Component Analysis (PCA)
A dimensionality reduction technique that transforms data into a smaller set of uncorrelated variables (*principle components*) while preserving as much variance as possible.


In [10]:
''' Loading Data & Encoding Features '''
# Necessary Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load iris data into dataframe
loaded_data = load_iris(as_frame=True)
iris_data = loaded_data.frame

# Since PCA does not take non-numerical or boolean data, must label encode
iris_data.replace({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica':2}, inplace=True)

# Extract X, y
X = iris_data.iloc[:, :-1]
y = iris_data.iloc[:, -1]

## Using PCA
Once n_components is determined, use it as a PCA parameter to reduce dimensionality of X.

In [None]:
from sklearn.decomposition import PCA

# Run PCA on X
pca = PCA()
X_new = pca.fit_transform(X)

# Show the explained covariance of each feature
print(pca.explained_variance_ratio_)

# Calculate the number of features needed for minimum variance
minimum_variance = .95
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.90) + 1
print(f"Number of Components Needed for {minimum_variance} variance: {n_components}")

# Now, transform dataset with given n_components
pca = PCA(n_components=n_components)
X_new = pca.fit_transform(X)
print(X_new.shape)  # Showing reduced dimensionality

''' Now, this dataset's dimensionality has been reduced and is ready for training '''

# Cross Validation Test (CVT)
For CVT, the data is partitioned into ***k*** (roughly) equal sized segments/folds. Then, iterations of training and validation are performed on those folds, with **1 validation fold** and **k-1 training folds**

In [21]:
''' Using Data above, creating simple SVM model to test CVT '''
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_new, y, test_size=.3, random_state=42)
model = SVC()

In [22]:
''' Using Cross Validation Test (CVT) to test model '''
from sklearn.model_selection import cross_val_score

# Get the results from CVT & print them
results = cross_val_score(model, X_new, y, cv=5)  # 5 'folds'
print(f"Average Accuracy: {results.mean()}")


Average Accuracy: 0.9133333333333334
