In [None]:

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt 
from tqdm import tqdm 
import numpy as np 
import pandas as pd
import cv2
import glob
import os


**Section 1 - Loading the Data**

In [None]:
# Label file (space-separated; read below through its `image` and `class`
# columns) and the directory that holds the image files themselves.
dirName = "../src/data/archive/zero-indexed-files.txt"
imgPath = "../src/data/archive/Garage_classification/load/"

df_raw = pd.read_csv(dirName,sep=' ')
# Turn bare file names into paths relative to the notebook.
df_raw['image'] = imgPath + df_raw['image'].astype(str)

# Count the samples per class. The bars are positioned at the labels that are
# actually present instead of a hard-coded range(6), so the plot keeps working
# if the label file contains a different number of classes.
class_labels,class_counts = np.unique(df_raw['class'],return_counts=True)

plt.clf()
plt.rc('axes', axisbelow=True)  # draw the grid behind the bars
plt.grid(linestyle='dotted')
plt.bar(class_labels,class_counts)
plt.title("Distribution of Classes in Raw Dataset")
plt.xlabel("Class")
plt.ylabel("Count")

plt.show()

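In our dataset the class labels are assigned in the following order (the list below is numbered from 1 for readability; the label file name suggests that the stored labels are zero-indexed, i.e. one lower than the list number):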

1. Glass

2. Paper

3. Cardboard

4. Plastic

5. Metal

6. Trash

Above, we've prepared a dataframe containing filepaths and their respective classes. Now we need to extract our design matrix.

Previously, we introduced SIFT, an algorithm for keypoint and image descriptor generation. For each image, the algorithm outputs a set of keypoint objects together with a matching array of image descriptors, which efficiently encode the salient features of that image. However, if we use KNN, we must transform our data into a discrete set of features whose entries can be evaluated with a distance metric: the number of keypoints differs from image to image, whereas KNN needs every sample to be a feature vector of the same length.

**Section 2 - Data Quantization**
The process of generating said feature space involves three steps.

1. **Extracting Keypoints & Descriptors**

2. **Clustering** (Feature Reduction)

3. **Normalization & Discretization**

**Section 2.1 - Keypoints and Descriptors**

Given the definitions of Keypoints and Descriptors in the previous challenge, the following function computes both for a single image:

In [None]:
# SIFT obtains & returns image keypoints and descriptors
def SIFT(img):
    """Detect SIFT keypoints in ``img`` and compute their descriptors.

    The image is first min-max normalised to the 0-255 range and stored as
    8-bit, then passed to OpenCV's SIFT detector.

    Parameters
    ----------
    img : numpy.ndarray
        Image array as returned by ``cv2.imread``.

    Returns
    -------
    kps : sequence of cv2.KeyPoint
        The detected keypoints (empty when none are found).
    des : numpy.ndarray
        One descriptor row per keypoint. When no keypoint is found an empty
        ``(0, descriptor_size)`` float32 array is returned instead of the
        ``None`` that OpenCV produces, so callers can always iterate over it.

    Raises
    ------
    ValueError
        If ``img`` is ``None`` (``cv2.imread`` returns ``None`` for files it
        cannot read).
    """
    if img is None:
        raise ValueError("SIFT received None instead of an image; "
                         "cv2.imread returns None for unreadable files")

    # Passing dst=None lets OpenCV allocate the output itself; the explicit
    # 8-bit dtype keeps the result in the format the detector expects.
    norm = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    sift = cv2.SIFT_create()
    kps,des = sift.detectAndCompute(norm,None)
    if (len(kps) < 1):
        print("NULL HERE")
        # detectAndCompute yields des=None here; substitute an empty array
        # so that looping over the descriptors is a no-op, not a TypeError.
        des = np.empty((0, sift.descriptorSize()), dtype=np.float32)

    return kps,des

**Section 2.2 - K-Means Clustering**

K-Means clustering groups similar descriptors together, which is an important component in understanding how images of the same class relate to one another through the clusters their descriptors fall into. We can express this with the following function, using sklearn:

In [None]:
# Feature reduction via K-Means clustering of the descriptors.
# The number of clusters is picked with the elbow method (see elbow_kmeans.py).
def cluster(descriptors, k=15):
    """Fit a K-Means model with ``k`` clusters to ``descriptors``.

    Parameters
    ----------
    descriptors : array-like
        The samples to cluster; handed to ``KMeans.fit`` unchanged.
    k : int, default 15
        Number of clusters.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model (seeded with ``random_state=42``).
    """
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(descriptors)
    return model

**Section 2.3 - Normalization & Discretization**

It's important for us to normalize our data, or in other words to rescale it onto a common range: according to the SIFT paper, keeping values within [0, 1] provides us with more floating-point precision. Given these descriptors, to mitigate noise and to reduce dimensionality, we can cluster them and apply discretization. In essence, each descriptor is dropped into the histogram bin of the cluster it belongs to, which replaces a variable number of descriptors per image with a single fixed-length histogram, decreasing dimensionality while also giving our features a consistent order.

In [None]:
# Data binning through normalized histograms.
def binData(keypoints,descriptors,clusters):
    """Build one normalised cluster-membership histogram per image.

    Parameters
    ----------
    keypoints : sequence
        One entry per image; each entry is the sequence of keypoints found
        in that image. Only its length is used.
    descriptors : sequence of 1-D arrays
        The descriptors of *all* images laid out flat, in the same image
        order as ``keypoints``: the first ``len(keypoints[0])`` rows belong
        to image 0, the next ``len(keypoints[1])`` rows to image 1, etc.
    clusters : fitted clustering model
        Must expose ``n_clusters`` and ``predict`` (e.g. a fitted
        ``sklearn.cluster.KMeans``).

    Returns
    -------
    list of numpy.ndarray
        One float histogram of length ``clusters.n_clusters`` per image.
        Each bin holds the fraction of the image's descriptors assigned to
        that cluster; an image without keypoints gets an all-zero histogram.

    Raises
    ------
    ValueError
        If the number of descriptors does not equal the total number of
        keypoints, i.e. the two inputs cannot be aligned.
    """
    # One bin per cluster. (``labels_`` has one entry per *training sample*,
    # so its length is not the number of clusters.)
    n_bins = clusters.n_clusters
    per_image = [len(kps) for kps in keypoints]
    n_descriptors = len(descriptors)

    if sum(per_image) != n_descriptors:
        raise ValueError(
            "binData: got %d descriptors for %d keypoints; descriptors must "
            "be the flat concatenation of every image's descriptors"
            % (n_descriptors, sum(per_image)))

    if n_descriptors == 0:
        return [np.zeros(n_bins) for _ in per_image]

    # Assign every descriptor to its cluster in a single predict call.
    words = np.asarray(clusters.predict(np.asarray(descriptors)), dtype=np.intp)

    hists = []
    start = 0
    for n_kps in per_image:
        stop = start + n_kps
        # Count the cluster assignments of this image's slice of descriptors.
        hist = np.bincount(words[start:stop], minlength=n_bins).astype(float)
        if n_kps:
            # Normalise by the keypoint count so bins are frequencies in [0, 1].
            hist /= n_kps
        hists.append(hist)
        start = stop
    return hists

**Section 3 - KNN Classification**

Now, we will classify using KNN. In this code block, we combine every topic introduced so far: obtaining the data, performing K-Means, and discretization, in preparation for cross-validation and KNN with an optimal K.

In [None]:
# Load the label file and read every image into memory.
df = pd.read_csv(dirName,sep=' ')

df['image'] = imgPath + df['image'].astype(str)
image_paths = df['image']
df['image'] = df['image'].apply(lambda x: cv2.imread(x))

# cv2.imread does not raise on a missing/corrupt file, it returns None.
# Fail here with the offending paths rather than later inside feature extraction.
unreadable = [path for path, img in zip(image_paths, df['image']) if img is None]
if unreadable:
    raise FileNotFoundError(
        "cv2.imread could not read %d image(s), e.g. %s"
        % (len(unreadable), unreadable[:5]))

print(df.head())

train_X,test_X,train_Y,test_Y = train_test_split(df['image'],df['class'],
                                                 test_size=0.33,random_state=42,stratify=df['class'])


def extractFeatures(images):
    """Run SIFT over ``images``.

    Returns a pair ``(keys, flat_des)``: ``keys`` holds the keypoints of each
    image (one entry per image), ``flat_des`` is a single flat list with the
    descriptors of all images, in image order.
    """
    keys = []
    flat_des = []
    for sample in images:
        kps,des = SIFT(sample)
        keys.append(kps)
        # An image without keypoints has no descriptors (des may be None);
        # it simply contributes nothing to the flat list.
        if des is not None:
            flat_des.extend(des)
    return keys,flat_des


# Fetch keypoints and descriptors from training data
train_keys,train_des = extractFeatures(train_X)

# cluster data with the optimal value (from elbow)
kmeans = cluster(train_des,k = 60)

# Histogram with new clusters
train_hists = binData(train_keys,train_des,kmeans)

# Now histogram the testing data using kmeans from training
test_keys,test_des = extractFeatures(test_X)

test_hists = binData(test_keys,test_des,kmeans)

**Section 3.1 - K-Fold Cross Validation**

We need to define a Cross Validation function:

In [None]:
def crossValidate(X,Y,folds=10,kmax = 10,n_jobs=8):
    """Plot mean cross-validated KNN accuracy for a range of neighbor counts.

    Parameters
    ----------
    X : array-like
        Feature vectors, one per sample.
    Y : array-like
        Class label of each sample.
    folds : int, default 10
        Number of cross-validation folds passed to ``cross_val_score``.
    kmax : int, default 10
        Exclusive upper bound on the neighbor count: the values
        ``1 .. kmax - 1`` are evaluated.
    n_jobs : int, default 8
        Number of parallel jobs given to each ``KNeighborsClassifier``.

    Returns
    -------
    list of float
        Mean accuracy for each neighbor count, in increasing order of k.

    The curve is also saved to ``Optimal_neighbors_sift.png`` and shown.
    """
    neighbor_counts = list(range(1,kmax))
    kscores = []
    for i in tqdm(neighbor_counts):
        knn = KNeighborsClassifier(n_neighbors=i,n_jobs=n_jobs) # parallel neighbor search to speed things up
        cv = cross_val_score(knn,X,Y,cv=folds,scoring="accuracy")
        kscores.append(cv.mean())

    plt.plot(neighbor_counts,kscores)
    plt.title("KNN Cross-Validation Accuracy")
    plt.xlabel("Number of neighbors (k)")
    plt.ylabel("Mean accuracy")
    plt.savefig("Optimal_neighbors_sift.png")
    plt.show()
    return kscores


print("CV")
# Keep the scores so the best k can be inspected without re-running the search.
cv_scores = crossValidate(train_hists,train_Y,kmax=50)

**Section 3.2 - Classifying with Optimized K**
From the cross-validation plot above, we determined that the optimal number of neighbors is K = 11. We therefore classify the test set with an 11-nearest-neighbor (11-NN) model.

In [None]:
# Train the final 11-nearest-neighbor model on the training histograms.
knn = KNeighborsClassifier(n_neighbors=11).fit(train_hists,train_Y)

# Predict the held-out test histograms and summarise per-class performance.
res = knn.predict(test_hists)

class_names = ["Glass","Paper","Cardboard","Plastic","Metal","Trash"]
report = classification_report(y_true=test_Y,y_pred=res,target_names=class_names)
print(report)