

# This notebook was done as a refresher on using Support Vector Machine (SVM) classifiers and using the Principal Component Analysis.

## The data used was given from my advanced machine learning class. About 400 grayscale JPG images that are 64x64 pixels with 40 classes. The csv that came with the images contained the labels for each image.



In [1]:

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from sklearn.decomposition  import  PCA
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing


In [2]:

# folder path to the image data used for this keras neural network
# there are 400 images, with 40 unique individuals
folder_path = r"C:\Users\KennoHead\Desktop\Data Science and Machine Learning Refresher\Face\\"

# processing the image data before using it for the keras neural network
# each face image is 64 x 64 pixels, all in JPG files and in grayscale
pixel_matrix = []

for i in range (0, 400, 1):
    
    current_image_data = cv2.imread(folder_path + str(i) + ".jpg", 0)
    pixel_matrix.append(current_image_data.flatten())

# reading the csv file that contains the labels for the images
# and merging the pixel data with the labels
pixels_labels = pd.read_csv(r"C:\Users\KennoHead\Desktop\Data Science and Machine Learning Refresher\Face\label.csv")
pixels_df = pd.DataFrame(pixel_matrix)
pixels_df = pd.concat([pixels_df, pixels_labels], axis = 1)


In [3]:

# setting up the training and testing sets
pixels_features_cols = list(pixels_df.columns)
pixels_features_cols.remove('Label')

pixels_features = pixels_df[pixels_features_cols]
pixels_labels = pixels_df['Label']

# the features should be scaled, the range of values on the pixels (0-255) would affect
# the pca method
pixels_features_scaled = preprocessing.scale(pixels_features)
features_training, features_testing, labels_training, labels_testing = train_test_split(pixels_features_scaled, pixels_labels, test_size = .3, random_state = 0)


In [4]:

# k was picked to be 35 components in this case

k = 35
faces_PCA = PCA(n_components = k)

# PCA should be applied to both the training and testing sets of the features 
features_training_pca = faces_PCA.fit_transform(features_training)
features_testing_pca = faces_PCA.transform(features_testing)

print('Process done!')

Process done!


In [5]:

# C is ratio to allow "bad" values within classes/ or the misclassification of samples
# higher C values allow low bias and high variance, leading to overfitting
# vice versa, lower C values increase bias and lower variance, leading to underfitting.
# note that C and gamma have an inverse relationship
# rbf is the radial basis function
svmc = svm.SVC(C = 1.0, kernel = 'rbf', gamma = 5e-4, random_state = 0)

svmc.fit(features_training_pca, labels_training)
svmc_predictions = svmc.predict(features_testing_pca)
svmc_acc = accuracy_score(labels_testing, svmc_predictions)
svmc_cm = confusion_matrix(labels_testing, svmc_predictions)

print(svmc_acc)
print(svmc_cm)


0.9
[[0 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 3 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 4 0]
 [1 0 0 ... 0 0 1]]


In [6]:

# using grid search cv on svm classifier
# in this case, looking for the best C parameter
# while using k = 40 components from PCA
from sklearn.model_selection import GridSearchCV

c_values = [ 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
parameter_grid = dict(C = c_values)

svm_classifier = svm.SVC (kernel = 'rbf', gamma = 5e-4, random_state = 0)

k2 = 40
pca_for_cv = PCA(n_components = k2)
features_training_pca_cv = pca_for_cv.fit_transform(pixels_features_scaled)

# k = 10 rounds cross validaiton
grid_search_cv = GridSearchCV(svm_classifier, parameter_grid, cv = 10, scoring = 'accuracy')

start_time = time.perf_counter()
grid_search_cv.fit(features_training_pca_cv, pixels_labels)
end_time = time.perf_counter()

best_parameters = grid_search_cv.best_params_
best_score = grid_search_cv.best_score_

grid_search_cv_time_minutes = (end_time - start_time) / 60
print('Grid Search CV time in minutes: ' + str(grid_search_cv_time_minutes))

Grid Search CV time in minutes: 0.06034146926666668


In [7]:

print(best_parameters)
print(best_score)


{'C': 10.0}
0.9649999999999999


In [11]:

# testing out the new C value
# in this case, it seems the accuracy did not improve much
svm_classifier2 = svm.SVC(kernel = 'rbf', gamma = 5e-4, random_state = 0)
pca2 = PCA(n_components = k2)

features_training_pca2 = pca2.fit_transform(features_training)
features_testing_pca2 = pca2.transform(features_testing)

svmc2 = svm.SVC( C = 10, kernel = 'rbf', gamma = 5e-4, random_state = 0)

svmc2.fit(features_training_pca2, labels_training)
svmc_predictions2 = svmc2.predict(features_testing_pca2)
svmc_acc2 = accuracy_score(labels_testing, svmc_predictions2)
svmc_cm2 = confusion_matrix(labels_testing, svmc_predictions2)

print(svmc_acc)
print(svmc_cm)


0.9
[[0 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 3 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 4 0]
 [1 0 0 ... 0 0 1]]
