## PCA + Standard Classifiers

### Feature Extraction with PCA

In [74]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import decomposition
from sklearn.model_selection import train_test_split

In [75]:
#Read the csv file
df = pd.read_csv('data/labels.csv')

#Sort by frequency and extract top 10
dist = df.groupby('breed').count().rename(columns={'id':'freq'})
most_common = dist.sort_values(by='freq',ascending=False)
#top_10 = [i for i in most_common[:10].index]
target_10 = ['siberian_husky', 'maltese_dog', 'afghan_hound', 'airedale',
             'bernese_mountain_dog', 'beagle', 'blenheim_spaniel', 'pomeranian',
             'basenji', 'samoyed']

In [76]:
#Select rows with breeds in target 10
df = df[df['breed'].isin(target_10)]
df.reset_index(drop=True, inplace=True)
data_length = len(df)

#Define dictionaries to convert between class value and the breed name
#breed = top_10
breed = target_10
class_length = len(breed)
class_to_num = dict(zip(breed, range(class_length)))
num_to_class = dict(zip(range(class_length), breed))

#Set the dimension at 200
dim = 200

X = np.zeros((data_length, dim, dim, 3),dtype=np.uint8)
y = np.zeros((data_length, class_length),dtype=np.uint8)

X_flat = np.zeros((data_length, dim*dim*3),dtype=np.uint8)

In [77]:
for i in tqdm(range(data_length)):
    #Read in the image
    image = cv2.imread('data/train/{}.jpg'.format(df['id'][i]))
    #Resize
    resized = cv2.resize(image,(dim, dim))
    #Remove single-dimensional entries
    np.squeeze(np.array(resized).astype(np.float32))        
    #Flatten
    flat_arr = resized.ravel()

    X_flat[i] = flat_arr
    #Increment the categorical value for the corresponding breed by 1
    y[i][class_to_num[df['breed'][i]]] = 1

100%|██████████| 1086/1086 [00:05<00:00, 210.43it/s]


In [78]:
#Check the shape
X_flat.shape

(1086, 120000)

In [79]:
n_components_ = 50

In [80]:
pca = decomposition.PCA(n_components=n_components_)
#pca = decomposition.TruncatedSVD(n_components=n_components_, algorithm='randomized')
pca.fit(X_flat)

PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [81]:
#Check how much variance of the original data is explained by the components
pca.explained_variance_ratio_.sum()

0.73050941052261409

In [82]:
X_reduced = np.zeros((X_flat.shape[0], n_components_),dtype=np.float32)

for i in tqdm(range(len(X_flat))):
    pca_arr = pca.transform(X_flat[i].reshape(1, -1))
    X_reduced[i] = pca_arr

100%|██████████| 1086/1086 [00:38<00:00, 28.51it/s]


In [37]:
np.save('data/features/pca_train.npy',X_reduced)
X_reduced = np.load('data/features/pca_train.npy')

In [83]:
y_categorical = np.array([None]*data_length)
for i in range(len(y)):
    y_categorical[i] = df['breed'][i]

y_categorical = [class_to_num[i] for i in y_categorical]

In [84]:
#Split the data into train and test, then normalise them
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_categorical, test_size=0.2)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255.0
X_test /= 255.0

## GridSearchCV

Add more classifiers and check the performance

In [85]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import GridSearchCV

In [88]:
clf_svc = SVC(kernel='linear',C=0.000125, gamma=100000)
clf_nb = GaussianNB()
clf_sgd = SGDClassifier(max_iter=10000, tol=1e-3)
clf_ada = AdaBoostClassifier()
clf_rf = RandomForestClassifier()
clfs = [clf_svc, clf_nb, clf_sgd, clf_ada, clf_rf]

In [90]:
#Find the baselines without parameter-tuning
for clf in clfs:
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(accuracy_score(pred, y_test))

0.238532110092
0.233944954128
0.211009174312
0.215596330275
0.183486238532


In [None]:
#Find the best parameters with GridSearchCV
parameters = {'C': [0.1, 1],
              'kernel': ['rbf', 'linear'],
              'probability': [True]
              }
grid = GridSearchCV(estimator=clf_svc, param_grid=parameters, verbose=5, n_jobs=-1)
grid.fit(X_train, y_train) 

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] C=0.1, kernel=rbf, probability=True .............................
[CV] C=0.1, kernel=rbf, probability=True .............................
[CV] C=0.1, kernel=rbf, probability=True .............................
[CV] C=0.1, kernel=linear, probability=True ..........................
[CV]  C=0.1, kernel=rbf, probability=True, score=0.1103448275862069, total=   0.5s
[CV] C=0.1, kernel=linear, probability=True ..........................
[CV]  C=0.1, kernel=rbf, probability=True, score=0.11188811188811189, total=   0.5s
[CV] C=0.1, kernel=linear, probability=True ..........................
[CV]  C=0.1, kernel=rbf, probability=True, score=0.11301369863013698, total=   0.5s
[CV] C=1, kernel=rbf, probability=True ...............................
[CV]  C=1, kernel=rbf, probability=True, score=0.11301369863013698, total=   0.5s
[CV] C=1, kernel=rbf, probability=True ...............................
[CV]  C=1, kernel=rbf, probability=True

[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:    5.2s remaining:    2.6s


[CV]  C=0.1, kernel=linear, probability=True, score=0.21328671328671328, total=   5.2s


In [24]:
#Display and write the result into a txt file
print(grid)
print(grid.best_estimator_)

with open('data/parameters/svc.txt', 'a') as file:
    file.write(str(grid))
    file.write(str(grid.best_estimator_)) 

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.1, 1], 'kernel': ['rbf', 'linear'], 'probability': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=5)
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
