## PCA + Standard Classifiers

### Feature Extraction with PCA

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import decomposition
from sklearn.model_selection import train_test_split

In [None]:
#Load the label table (one row per image, with its id and breed)
df = pd.read_csv('data/labels.csv')

#Count the rows of each breed, order the counts from largest to smallest,
#and keep the names of the ten most frequent breeds
breed_counts = df.groupby('breed').count()
dist = breed_counts.rename(columns={'id':'freq'})
most_common = dist.sort_values(by='freq',ascending=False)
top_10 = list(most_common.index[:10])

In [None]:
#Keep only the rows whose breed is one of the top 10 and renumber them from 0
df = df[df['breed'].isin(top_10)].reset_index(drop=True)
data_length = len(df)

#Lookup tables between a breed name and its integer class value
breed = top_10
class_length = len(breed)
class_to_num = {name: idx for idx, name in enumerate(breed)}
num_to_class = dict(enumerate(breed))

#Side length (in pixels) every image is resized to
dim = 200

#Pre-allocated buffers: raw images, one-hot labels, and flattened images
X = np.zeros((data_length, dim, dim, 3), dtype=np.uint8)
y = np.zeros((data_length, class_length), dtype=np.uint8)
X_flat = np.zeros((data_length, dim * dim * 3), dtype=np.uint8)

In [None]:
#Fill X_flat with the flattened, resized pixels of every selected image and
#y with the matching one-hot breed label
for i in tqdm(range(data_length)):
    #Read in the image; cv2.imread returns None instead of raising when the
    #file is missing or unreadable, so fail here with the offending path
    image_path = 'data/train/{}.jpg'.format(df['id'][i])
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    #Resize to dim x dim so every image flattens to the same length
    resized = cv2.resize(image, (dim, dim))
    #Flatten into one row of X_flat
    X_flat[i] = resized.ravel()
    #Set the one-hot entry of the corresponding breed to 1
    y[i, class_to_num[df['breed'][i]]] = 1

In [None]:
#Check the shape; X_flat was allocated as (data_length, dim*dim*3)
X_flat.shape

In [None]:
#Number of principal components to keep; passed to PCA as n_components
n_components_ = 50

In [None]:
#Fit PCA on the flattened images. A fixed random_state keeps the fitted
#components (and the features derived from them) reproducible between runs
#whenever scikit-learn selects its randomized SVD solver.
pca = decomposition.PCA(n_components=n_components_, random_state=42)
#Alternative decomposition kept for reference:
#pca = decomposition.TruncatedSVD(n_components=n_components_, algorithm='randomized')
pca.fit(X_flat)

In [None]:
#Total fraction of the original variance captured by the kept components
np.sum(pca.explained_variance_ratio_)

In [None]:
#Project all flattened images onto the fitted components in one batched call
#instead of one pca.transform call per row, then store as float32 as before.
#NOTE(review): the batched call converts the whole of X_flat to floating
#point at once; pca.fit was already given the same array, so this is
#presumably within the available memory - confirm on large datasets.
X_reduced = pca.transform(X_flat).astype(np.float32)

In [None]:
#Write the PCA features to disk, then read them straight back so the
#following cells work from the stored copy
features_path = 'data/features/pca_train.npy'
np.save(features_path, X_reduced)
X_reduced = np.load(features_path)

In [None]:
#Integer class label of every row, looked up from its breed name
y_categorical = [class_to_num[name] for name in df['breed']]

In [None]:
#Split the data into train and test, then rescale them.
#random_state makes the split repeatable between runs; stratify keeps the
#breed proportions the same in the train and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y_categorical, test_size=0.2,
    random_state=42, stratify=y_categorical)

#NOTE(review): pca was fitted on every row of X_flat before this split, so
#the test rows also contributed to the components - confirm this is acceptable.
#NOTE(review): 200.0 matches the image side length dim, not the range of the
#PCA features; presumably a leftover pixel-style scaling - confirm intent.
X_train = X_train.astype('float32') / 200.0
X_test = X_test.astype('float32') / 200.0

## GridSearchCV

Add more classifiers and check the performance

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import GridSearchCV

In [None]:
#Candidate classifiers, all with default settings except the SGD stopping
#criteria (max_iter and tol)
clfs = [
    SVC(),
    GaussianNB(),
    SGDClassifier(max_iter=10000, tol=1e-3),
    AdaBoostClassifier(),
    RandomForestClassifier(),
]
#Individual names for the same instances, in list order
clf_svc, clf_nb, clf_sgd, clf_ada, clf_rf = clfs

In [None]:
#Find the baselines without parameter-tuning: fit each classifier on the
#training split and report its accuracy on the test split
for clf in clfs:
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    #accuracy_score takes (y_true, y_pred); label each score with the
    #classifier's class name so the printed lines can be told apart
    print(type(clf).__name__, accuracy_score(y_test, pred))

In [None]:
#Search the SVC hyper-parameters with GridSearchCV: two values of C, two
#kernels, and probability estimates always enabled
parameters = dict(
    C=[0.1, 1],
    kernel=['rbf', 'linear'],
    probability=[True],
)
grid = GridSearchCV(
    estimator=clf_svc,
    param_grid=parameters,
    n_jobs=-1,
    verbose=5,
)
grid.fit(X_train, y_train)

In [None]:
#Display the search object and the best estimator it found
print(grid)
print(grid.best_estimator_)

#Append both to the txt log. Each entry ends with a newline so that the two
#descriptions, and the results of repeated runs, do not fuse into one line.
with open('data/parameters/svc.txt', 'a') as file:
    file.write(str(grid) + '\n')
    file.write(str(grid.best_estimator_) + '\n')