## PCA + Standard Classifiers

### Feature Extraction with PCA

In [1]:
import cv2
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import decomposition, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
#Read the csv file
df = pd.read_csv('data/labels.csv')

#Sort by frequency and extract top 10
dist = df.groupby('breed').count().rename(columns={'id':'freq'})
most_common = dist.sort_values(by='freq',ascending=False)
top_10 = [i for i in most_common[:10].index]

In [3]:
#Select rows with breeds in top 10
df = df[df['breed'].isin(top_10)]
df.reset_index(drop=True, inplace=True)
data_length = len(df)

#Define dictionaries to convert between class value and the breed name
breed = top_10
class_length = len(breed)
class_to_num = dict(zip(breed, range(class_length)))
num_to_class = dict(zip(range(class_length), breed))

#Set the dimension at 150
dim = 150

X = np.zeros((data_length, dim, dim, 3),dtype=np.uint8)
y = np.zeros((data_length, class_length),dtype=np.uint8)

X_flat = np.zeros((data_length, dim*dim*3),dtype=np.uint8)

In [4]:
for i in tqdm(range(data_length)):
    #Read in the image
    image = cv2.imread('data/train/{}.jpg'.format(df['id'][i]))
    #Resize
    resized = cv2.resize(image,(dim, dim))
    #Remove single-dimensional entries
    np.squeeze(np.array(resized).astype(np.float32))        
    #Flatten
    flat_arr = resized.ravel()

    X_flat[i] = flat_arr
    #Increment the categorical value for the corresponding breed by 1
    y[i][class_to_num[df['breed'][i]]] = 1

100%|██████████| 1141/1141 [00:04<00:00, 254.56it/s]


In [5]:
#Check the shape
X_flat.shape

(1141, 67500)

In [6]:
n_components_ = 50

In [7]:
pca = decomposition.PCA(n_components=n_components_)
#pca = decomposition.TruncatedSVD(n_components=n_components_, algorithm='randomized')
pca.fit(X_flat)

PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
#Check how much variance of the original data is explained by the components
pca.explained_variance_ratio_.sum()

0.7286405952151277

In [9]:
X_reduced = np.zeros((X_flat.shape[0], n_components_),dtype=np.float32)

for i in tqdm(range(len(X_flat))):
    pca_arr = pca.transform(X_flat[i].reshape(1, -1))
    X_reduced[i] = pca_arr

100%|██████████| 1141/1141 [00:15<00:00, 73.44it/s]


In [10]:
np.save('data/features/pca_train.npy',X_reduced)
X_reduced = np.load('data/features/pca_train.npy')

In [11]:
y_categorical = np.array([None]*data_length)
for i in range(len(y)):
    y_categorical[i] = df['breed'][i]

y_categorical = [class_to_num[i] for i in y_categorical]

In [12]:
#Split the data into train and test, then normalise them
X_train, X_test, y_train, y_test = train_test_split(X_flat, y_categorical, test_size=0.2) #train_test_split(X_reduced, y_categorical, test_size=0.2)#train_test_split(X_flat, y_categorical, test_size=0.2)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 200.0
X_test /= 200.0

## GridSearchCV

Add more classifiers and check the performance

In [13]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss

from sklearn.model_selection import GridSearchCV

In [14]:
clf_svc = SVC(C = 0.000125,
              kernel = 'linear',
              probability = True,
              verbose = True)

clf_knn = KNeighborsClassifier(n_neighbors = 34)

clf_dt = DecisionTreeClassifier(max_depth = 5)

In [15]:

#Find the best parameters with GridSearchCV
cs = [0.000125]#[1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]
gammas = ['auto']#[0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]
kernels = ['linear']#, 'poly', 'sigmoid', 'rbf']

parameters_svc = {'C': cs, #0.000125
              'kernel': kernels,
              'probability': [True],
              'verbose': [True]
              }
grid_svc = GridSearchCV(error_score='accuracy_score', estimator=clf_svc, param_grid=parameters_svc, cv=10 , verbose=10, n_jobs=-1)
grid_svc.fit(X_train, y_train)
clf_svc = grid_svc.best_estimator_
print(grid_svc.best_estimator_)
print(grid_svc.best_score_)


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.29473684210526313, total=16.5min
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.2553191489361702, total=16.5min
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.3191489361702128, total=16.5min
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.3333333333333333, tot

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 38.0min remaining: 38.0min


[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.21978021978021978, total=16.9min
[CV] C=0.000125, kernel=linear, probability=True, verbose=True .......
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.26373626373626374, total=16.9min


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 38.2min remaining: 16.4min


[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.29213483146067415, total=17.1min
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.26436781609195403, total= 6.9min
[LibSVM][CV]  C=0.000125, kernel=linear, probability=True, verbose=True, score=0.3488372093023256, total= 6.9min


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 45.9min finished


[LibSVM]SVC(C=0.000125, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)
0.28399122807


In [16]:

with open('data/accuracies/grid_accuracies_svc.csv', 'wt') as csvfile:
    csvfile.truncate()
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow(['mean_fit_time', 'param_C', 'param_kernel', 'mean_test_score'])
    #for results in grid_svc.cv_results_:
        #print(results)
    for score in range(len(cs)*len(kernels)*len(gammas)):
        writer.writerow([grid_svc.cv_results_['mean_fit_time'][score], 
                            grid_svc.cv_results_['param_C'][score], 
                            grid_svc.cv_results_['param_kernel'][score], 
                            grid_svc.cv_results_['mean_test_score'][score]])


In [17]:
'''
scores_mean = grid_svc.cv_results_['mean_test_score']
scores_mean = np.array(scores_mean).reshape(len(kernels),len(cs))

scores_sd = grid_svc.cv_results_['std_test_score']
scores_sd = np.array(scores_sd).reshape(len(kernels),len(cs))

# Plot Grid search scores
_, ax = plt.subplots(1,1)

# Param1 is the X-axis, Param 2 is represented as a different curve (color line)
for idx, val in enumerate(kernels):
    ax.plot(cs, scores_mean[idx,:], '-o', label= 'kernels' + ': ' + str(val))

ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
ax.set_xlabel('C', fontsize=16)
ax.set_ylabel('CV Average Score', fontsize=16)
ax.legend(loc="best", fontsize=15)
ax.grid('on')
plt.show()
'''

'\nscores_mean = grid_svc.cv_results_[\'mean_test_score\']\nscores_mean = np.array(scores_mean).reshape(len(kernels),len(cs))\n\nscores_sd = grid_svc.cv_results_[\'std_test_score\']\nscores_sd = np.array(scores_sd).reshape(len(kernels),len(cs))\n\n# Plot Grid search scores\n_, ax = plt.subplots(1,1)\n\n# Param1 is the X-axis, Param 2 is represented as a different curve (color line)\nfor idx, val in enumerate(kernels):\n    ax.plot(cs, scores_mean[idx,:], \'-o\', label= \'kernels\' + \': \' + str(val))\n\nax.set_title("Grid Search Scores", fontsize=20, fontweight=\'bold\')\nax.set_xlabel(\'C\', fontsize=16)\nax.set_ylabel(\'CV Average Score\', fontsize=16)\nax.legend(loc="best", fontsize=15)\nax.grid(\'on\')\nplt.show()\n'

In [18]:

#Find the best parameters with GridSearchCV
max_depths = [5]#[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] #[3]#
min_samples_splits = [2]#[2, 3, 4, 5, 6] #[2]#
min_samples_leafs = [1]#[1, 2, 3, 4, 5] #[1]#
criterions = ['gini']#['entropy','gini'] #['gini']#
max_features = [None] #['auto', 'log2', None]
max_leaf_nodes = [None]



parameters_dt = {'max_depth': max_depths,
                 'min_samples_split': min_samples_splits,
                 'min_samples_leaf': min_samples_leafs,
                 'criterion': criterions, 
                 'max_features': max_features,
                 'max_leaf_nodes': max_leaf_nodes,
              }
grid_dt = GridSearchCV(error_score='accuracy_score', estimator=clf_dt, param_grid=parameters_dt, cv=10 , verbose=1, n_jobs=-1)
grid_dt.fit(X_train, y_train)
clf_dt = grid_dt.best_estimator_
print(grid_dt.best_estimator_)
print(grid_dt.best_score_)


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.0min finished


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.198464912281


In [19]:

with open('data/accuracies/grid_accuracies_dt.csv', 'wt') as csvfile:
    csvfile.truncate()
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow(['mean_fit_time', 'param_max_depth', 'param_min_samples_split', 'param_min_samples_leaf', 'param_criterion', 'mean_test_score'])
    #for results in grid_dt.cv_results_:
        #print(results)
    for score in range(len(max_depths)*len(min_samples_splits)*len(min_samples_leafs)*len(criterions)*len(max_features)*len(max_leaf_nodes)):
        writer.writerow([grid_dt.cv_results_['mean_fit_time'][score], 
                            grid_dt.cv_results_['param_max_depth'][score], 
                            grid_dt.cv_results_['param_min_samples_split'][score], 
                            grid_dt.cv_results_['param_min_samples_leaf'][score], 
                            grid_dt.cv_results_['param_criterion'][score],                          
                            grid_dt.cv_results_['mean_test_score'][score]])


In [20]:

#Find the best parameters with GridSearchCV
neighbours = [34]#[n+1 for n in range(50)] #[34]#

parameters_knn = {'n_neighbors': neighbours}

grid_knn = GridSearchCV(error_score='accuracy_score', estimator=clf_knn, param_grid=parameters_knn, cv=10 , verbose=1, n_jobs=-1)
grid_knn.fit(X_train, y_train)
clf_knn = grid_knn.best_estimator_
print(grid_knn.best_estimator_)
print(grid_knn.best_score_)


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.4min finished


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=34, p=2,
           weights='uniform')
0.186403508772


In [21]:

with open('data/accuracies/grid_accuracies_knn.csv', 'wt') as csvfile:
    csvfile.truncate()
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow(['mean_fit_time', 'param_n_neighbors', 'mean_test_score'])
    #for results in grid_knn.cv_results_:
        #print(results)
    for score in range(len(neighbours)):
        writer.writerow([grid_knn.cv_results_['mean_fit_time'][score], 
                            grid_knn.cv_results_['param_n_neighbors'][score], 
                            grid_knn.cv_results_['mean_test_score'][score]])


In [22]:
# fitting svc
clf_svc.fit(X_train, y_train)
clf_dt.fit(X_train, y_train)
clf_knn.fit(X_train, y_train)
  
#prediction with grid parameters         
prediction_svc = np.array(clf_svc.predict(X_test))
prediction_proba_svc = np.array(clf_svc.predict_proba(X_test))

prediction_dt = np.array(clf_dt.predict(X_test))
prediction_proba_dt = np.array(clf_dt.predict_proba(X_test))  

prediction_knn = np.array(clf_knn.predict(X_test))
prediction_proba_knn = np.array(clf_knn.predict_proba(X_test))   
        
#calculate accuracy
path_name = 'data/accuracies/accuracy_and_logloss.txt'
    
loss_svc = log_loss(y_test, prediction_proba_svc, labels=[class_to_num[b] for b in breed])
accuracy_svc = accuracy_score(y_test, prediction_svc)

    
loss_dt = log_loss(y_test, prediction_proba_dt, labels=[class_to_num[b] for b in breed])
accuracy_dt = accuracy_score(y_test, prediction_dt)

loss_knn = log_loss(y_test, prediction_proba_knn, labels=[class_to_num[b] for b in breed])
accuracy_knn = accuracy_score(y_test, prediction_knn)

with open(path_name, 'a') as file:
    file.truncate()
    file.write('Log Loss SVC '+ ':\t\t\t' + str(loss_svc)  + '\n')
    print('Log Loss SVC '+ ':\t' + str(loss_svc))
    file.write('Accuracy Score SVC '+ ':\t' + str(accuracy_svc) + '\n')
    print('Accuracy Score SVC '+ ':\t' + str(accuracy_svc))
    
    file.write('Log Loss DT '+ ':\t\t\t' + str(loss_dt)  + '\n')
    print('Log Loss DT '+ ':\t\t' + str(loss_dt))
    file.write('Accuracy Score DT '+ ':\t\t' + str(accuracy_dt) + '\n')
    print('Accuracy Score DT '+ ':\t' + str(accuracy_dt))
    
    file.write('Log Loss KNN '+ ':\t\t\t' + str(loss_knn)  + '\n')
    print('Log Loss KNN '+ ':\t\t' + str(loss_knn))
    file.write('Accuracy Score KNN '+ ':\t' + str(accuracy_knn) + '\n')
    print('Accuracy Score KNN '+ ':\t' + str(accuracy_knn))  

[LibSVM]Log Loss SVC :	1.9903648556
Accuracy Score SVC :	0.266375545852
Log Loss DT :		6.49124885138
Accuracy Score DT :	0.17903930131
Log Loss KNN :		3.13283876867
Accuracy Score KNN :	0.192139737991
