## KNN

In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import decomposition
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#Read the csv file
df = pd.read_csv('data/labels.csv')

#Sort by frequency and extract top 10
dist = df.groupby('breed').count().rename(columns={'id':'freq'})
most_common = dist.sort_values(by='freq',ascending=False)
top_10 = [i for i in most_common[:10].index]
#print(dist.head(),most_common.head(),top_10)

In [3]:
#Select rows with breeds in top 10
df = df[df['breed'].isin(top_10)]
df.reset_index(drop=True, inplace=True)
data_length = len(df)

#Define dictionaries to convert between class value and the breed name
breed = top_10
class_length = len(breed)
class_to_num = dict(zip(breed, range(class_length)))
num_to_class = dict(zip(range(class_length), breed))

#Set the dimension at 200
dim = 200

X = np.zeros((data_length, dim, dim, 3),dtype=np.uint8)
y = np.zeros((data_length, class_length),dtype=np.uint8)

X_flat = np.zeros((data_length, dim*dim*3),dtype=np.uint8)

In [4]:
for i in tqdm(range(data_length)):
    #Read in the image
    image = cv2.imread('data/train/{}.jpg'.format(df['id'][i]))
    #Resize
    resized = cv2.resize(image,(dim, dim))
    #Remove single-dimensional entries
    np.squeeze(np.array(resized).astype(np.float32))        
    #Flatten
    flat_arr = resized.ravel()

    X_flat[i] = flat_arr
    #Increment the categorical value for the corresponding breed by 1
    y[i][class_to_num[df['breed'][i]]] = 1

100%|██████████| 1141/1141 [00:05<00:00, 225.38it/s]


In [5]:
#Check the shape
X_flat.shape

(1141, 120000)

In [6]:
n_components_ = 50

In [7]:
pca = decomposition.PCA(n_components=n_components_)
#pca = decomposition.TruncatedSVD(n_components=n_components_, algorithm='randomized')
pca.fit(X_flat)

PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
#Check how much variance of the original data is explained by the components
pca.explained_variance_ratio_.sum()

0.72856398595015315

In [9]:
X_reduced = np.zeros((X_flat.shape[0], n_components_),dtype=np.float32)

for i in tqdm(range(len(X_flat))):
    pca_arr = pca.transform(X_flat[i].reshape(1, -1))
    X_reduced[i] = pca_arr

100%|██████████| 1141/1141 [00:35<00:00, 31.80it/s]


In [10]:
np.save('data/features/pca_train.npy',X_reduced)
X_reduced = np.load('data/features/pca_train.npy')

In [11]:
y_categorical = np.array([None]*data_length)
for i in range(len(y)):
    y_categorical[i] = df['breed'][i]

y_categorical = [class_to_num[i] for i in y_categorical]

In [12]:
#Split the data into train and test, then normalise them
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_categorical, test_size=0.2)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 200.0
X_test /= 200.0
#print(X_train,y_train)

#KNN

In [13]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import GridSearchCV

In [14]:
for n_neighbors in range(1,10):
    knn_estimator = KNeighborsClassifier(n_neighbors)
    knn_estimator.fit(X_train, y_train)
    prediction = knn_estimator.predict(X_test)
    print("k= {} acc: {}".format(n_neighbors, accuracy_score(y_test, prediction)))

k= 1 acc: 0.2314410480349345
k= 2 acc: 0.24890829694323144
k= 3 acc: 0.30131004366812225
k= 4 acc: 0.2794759825327511
k= 5 acc: 0.25327510917030566
k= 6 acc: 0.2183406113537118
k= 7 acc: 0.22707423580786026
k= 8 acc: 0.2314410480349345
k= 9 acc: 0.23580786026200873
