# Image Classification using Visual Bags of Words

#Setting up

In [1]:
import os
import pandas as pd
import numpy as np
import cv2
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split,cross_validate
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix,roc_curve,auc
from matplotlib.legend_handler import HandlerLine2D
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

#Read images and hyperparameters

The image sets and data files are not included here because they are too large to upload.

In [None]:
# Seed passed as random_state to KMeans / SVC / RandomForest, and the number
# of clusters requested from each KMeans fit.
rand_st = 75
k = 50

# Run switches for the classifier cells: a cell only runs when its flag is 1.
knn = 0
dt = 0
svm = 0
rf = 0

infile = pd.read_csv('all_bow.csv')
train_images = '../new_train_images'
test_images = '../new_test_images'

# Keep only regular files whose names do not start with a dot, in the order
# os.listdir reports them.
train_img = [
    entry
    for entry in os.listdir(train_images)
    if os.path.isfile(os.path.join(train_images, entry))
    and not entry.startswith('.')
]
test_img = [
    entry
    for entry in os.listdir(test_images)
    if os.path.isfile(os.path.join(test_images, entry))
    and not entry.startswith('.')
]


#Step one--Extract features and clustering

In [None]:
# Prefer the SIFT factory in the main cv2 namespace when the installed OpenCV
# provides it; fall back to the contrib location used originally otherwise.
if hasattr(cv2, 'SIFT_create'):
    extractor = cv2.SIFT_create()
else:
    extractor = cv2.xfeatures2d.SIFT_create()


def extract_words(path, im_set):
    """Compute SIFT descriptors and per-image visual-word labels.

    For every file name in ``im_set`` the image is loaded from ``path``,
    converted to grayscale, and described with SIFT. A KMeans model with
    ``k`` clusters (seeded with ``rand_st``) is then fit on that image's
    descriptors and used to label each descriptor.

    NOTE(review): a separate KMeans model is fit for each image, so cluster
    index j of one image comes from a different model than index j of
    another. A single codebook shared by all images (and reused for the
    test set) is probably what is intended -- confirm before relying on
    the resulting histograms.

    Args:
        path: Directory containing the images.
        im_set: Iterable of image file names inside ``path``.

    Returns:
        Tuple ``(des_list, pred_list)``: the descriptor array and the array
        of cluster labels for each image, in the order of ``im_set``.

    Raises:
        ValueError: If an image cannot be read, or yields fewer than ``k``
            descriptors (KMeans cannot form ``k`` clusters from them).
    """
    des_list = []
    pred_list = []
    for pet in im_set:
        im = os.path.join(path, pet)
        img = cv2.imread(im)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise ValueError('could not read image: %s' % im)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, des = extractor.detectAndCompute(img, None)
        # Images are not skipped: downstream code matches rows to labels by
        # position, so a silent skip would shift every later label.
        if des is None or len(des) < k:
            found = 0 if des is None else len(des)
            raise ValueError(
                'image %s produced %d SIFT descriptors; at least %d are needed'
                % (im, found, k)
            )
        kmeans = KMeans(n_clusters=k, random_state=rand_st).fit(des)
        prediction = kmeans.predict(des)
        des_list.append(des)
        pred_list.append(prediction)
    return des_list, pred_list


step1 = extract_words(train_images, train_img)
descriptors = step1[0]
words = step1[1]

#Step two--Vector quantization (new dataframe)

In [None]:
def count_table(word_list, n_words=None):
    """Turn per-image visual-word labels into fixed-length count histograms.

    Each histogram has one slot per word index, including words that never
    occur in that image (count 0). This keeps every histogram the same
    length and keeps column j tied to word j, so the result can be stacked
    into a matrix.

    Args:
        word_list: Sequence of arrays of non-negative integer word labels,
            one array per image.
        n_words: Histogram length. When ``None`` it is taken as the largest
            label found anywhere in ``word_list`` plus one.

    Returns:
        List of 1-D integer arrays, one per entry of ``word_list``.
    """
    label_arrays = [
        np.asarray(labels, dtype=np.intp).ravel() for labels in word_list
    ]
    if n_words is None:
        n_words = max(
            (int(labels.max()) + 1 for labels in label_arrays if labels.size),
            default=0,
        )
    # bincount (unlike np.unique counts) reports zero for absent labels.
    return [np.bincount(labels, minlength=n_words) for labels in label_arrays]

#Step three--Validation setup

In [None]:
# Build the training feature table: one row of word counts per training image.
df=count_table(words)
train= pd.DataFrame(np.vstack(df))
# Column 1 of train_list.csv is read as the class name ('cat' / 'dog').
# NOTE(review): class names are attached to rows purely by position, which
# assumes train_list.csv lists the images in the same order as train_img
# (os.listdir order) -- confirm.
target_train=pd.read_csv('train_list.csv',header=None)
train['target']=target_train.iloc[:,1]
# Features are the first k columns (the word counts).
x_train=train.iloc[:,0:k]
# Numeric label: cat -> 0, dog -> 1. Rows with any other target value are not
# assigned a label by these two statements.
train.loc[train['target']=='cat','label']=0
train.loc[train['target']=='dog','label']=1
y_train=train['label']
#sns.pairplot(x_train)
#correlations=x_train.corr().round(2)
# Word-count rows split by class, then summed per word column.
dogs=train[train.target=='dog'].iloc[:,:k]
cats=train[train.target=='cat'].iloc[:,:k]

dog_count=dogs.sum(axis = 0, skipna = True) 
cat_count=cats.sum(axis = 0, skipna = True)

#TEST SET
# Same pipeline for the test images: extract words, count them, attach the
# class names from column 1 of test_list.csv (again matched by row position).
step4=extract_words(test_images,test_img)
test_words=step4[1]
df2=count_table(test_words)
test = pd.DataFrame(np.vstack(df2))
target_test=pd.read_csv('test_list.csv',header=None)
test['target']=target_test.iloc[:,1]
x_test=test.iloc[:,0:k]
test.loc[test['target']=='cat','label']=0
test.loc[test['target']=='dog','label']=1
y_test=test['label']  
# Train and test rows stacked together; the classifier cells pass x_data /
# y_data to cross_validate.
data=pd.concat([train,test])  
x_data=data.iloc[:,0:k]
y_data=data['label']


#Step four--Training classifiers and results

K Nearest Neighbours

In [None]:
# K-nearest-neighbours: fit on the training split, report hold-out metrics,
# then 5-fold cross-validated accuracy and ROC AUC on train+test combined.
if knn==1:
    neigh = KNeighborsClassifier(n_neighbors=8)
    neigh.fit(x_train, y_train)
    pred = neigh.predict(x_test)
    scores = neigh.score(x_test, y_test)
    # Builtin round() works for both Python floats and NumPy scalars.
    print('KNN test accuracy', round(scores, 3))
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    # ROC needs a continuous score; hard 0/1 predictions give a curve with a
    # single operating point. Column 1 of predict_proba belongs to the second
    # entry of neigh.classes_, i.e. label 1 when both labels were seen in fit.
    pos_score = neigh.predict_proba(x_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, pos_score)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print('KNN test AUC:', round(roc_auc, 3))
    scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
    scores = cross_validate(neigh, x_data, y_data, cv=5, scoring=scorers)
    scores_Acc = scores['test_Accuracy']
    print("KNN CV Acc: %0.2f (+/- %0.2f)" % (scores_Acc.mean(), scores_Acc.std()))
    scores_AUC = scores['test_roc_auc']
    print("KNN CV AUC: %0.2f (+/- %0.2f)" % (scores_AUC.mean(), scores_AUC.std()))

Support Vector Machine

In [None]:
# RBF-kernel SVM: fit on the training split, report hold-out metrics, then
# 5-fold cross-validated accuracy and ROC AUC on train+test combined.
if svm==1:
    # The estimator gets its own name so the `svm` run flag is not
    # overwritten; otherwise re-running this cell would skip it silently.
    svm_clf = SVC(C=1, kernel='rbf', random_state=rand_st)
    svm_clf.fit(x_train, y_train)
    pred = svm_clf.predict(x_test)
    scores = svm_clf.score(x_test, y_test)
    # Builtin round() works for both Python floats and NumPy scalars.
    print('SVM test accuracy', round(scores, 3))
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    # ROC needs a continuous score; use the signed distance to the decision
    # boundary rather than the thresholded 0/1 predictions.
    margin = svm_clf.decision_function(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, margin)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print('SVM AUC:', roc_auc)
    scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
    scores = cross_validate(svm_clf, x_data, y_data, cv=5, scoring=scorers)
    scores_Acc = scores['test_Accuracy']
    print("SVM CV Acc: %0.2f (+/- %0.2f)" % (scores_Acc.mean(), scores_Acc.std()))
    scores_AUC = scores['test_roc_auc']
    print("SVM CV AUC: %0.2f (+/- %0.2f)" % (scores_AUC.mean(), scores_AUC.std()))

Random Forest

In [None]:
# Random forest: fit on the training split, report hold-out metrics, then
# 5-fold cross-validated accuracy and ROC AUC on train+test combined.
if rf==1:
    # The estimator gets its own name so the `rf` run flag is not
    # overwritten; otherwise re-running this cell would skip it silently.
    rf_clf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=rand_st)
    rf_clf.fit(x_train, y_train)
    pred = rf_clf.predict(x_test)
    scores = rf_clf.score(x_test, y_test)
    # Builtin round() works for both Python floats and NumPy scalars.
    print('Random Forest test accuracy', round(scores, 3))
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    # ROC needs a continuous score; hard 0/1 predictions give a curve with a
    # single operating point. Column 1 of predict_proba belongs to the second
    # entry of rf_clf.classes_, i.e. label 1 when both labels were seen in fit.
    pos_score = rf_clf.predict_proba(x_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, pos_score)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print('Random Forest AUC:', roc_auc)
    scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
    scores = cross_validate(rf_clf, x_data, y_data, cv=5, scoring=scorers)
    scores_Acc = scores['test_Accuracy']
    print("Random Forest CV Acc: %0.2f (+/- %0.2f)" % (scores_Acc.mean(), scores_Acc.std()))
    scores_AUC = scores['test_roc_auc']
    print("Random Forest CV AUC: %0.2f (+/- %0.2f)" % (scores_AUC.mean(), scores_AUC.std()))

SVM parameter tuning

In [None]:
def svc_param_selection(X, y, nfolds):
    """Grid-search ``C`` and ``gamma`` for an RBF-kernel SVC.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target labels passed to ``GridSearchCV.fit``.
        nfolds: Value forwarded as the ``cv`` argument of ``GridSearchCV``.

    Returns:
        The ``best_params_`` dict of the fitted search (keys ``'C'`` and
        ``'gamma'``).
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma': gammas}
    # Use the SVC class imported at the top of the file. The name `svm` in
    # this notebook is a run flag, not the sklearn.svm module, so `svm.SVC`
    # cannot be resolved.
    grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_
#svc_param_selection(x_train, y_train, 5)