# Exercise 4 - comparative experimentation

## Setup Image Data

In [56]:
import glob, os
from pathlib import Path

imagePath=".FIDS30/"
fileNames = glob.glob("FIDS30/*/*.jpg")
numberOfFiles=len(fileNames)
targetLabels=[]

print("Found " + str(numberOfFiles) + " files\n")

for fileName in fileNames:
    parts = Path(fileName).parts
    targetLabels.append(parts[-2])


from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(targetLabels) # this basically finds all unique class names, and assigns them to the numbers
print("Found the following classes: " + str(list(le.classes_)))

# now we transform our labels to integers
target = le.transform(targetLabels); 
print("Transformed labels (first elements: " + str(target[0:150]))

# If we want to find again the label for an integer value, we can do something like this:
# print list(le.inverse_transform([0, 18, 1]))

print("... done label encoding")


Found 971 files

Found the following classes: ['acerolas', 'apples', 'apricots', 'avocados', 'bananas', 'blackberries', 'blueberries', 'cantaloupes', 'cherries', 'coconuts', 'figs', 'grapefruits', 'grapes', 'guava', 'kiwifruit', 'lemons', 'limes', 'mangos', 'olives', 'oranges', 'passionfruit', 'peaches', 'pears', 'pineapples', 'plums', 'pomegranates', 'raspberries', 'strawberries', 'tomatoes', 'watermelons']
Transformed labels (first elements: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4]
... done label encoding


In [66]:
from PIL import Image
import numpy as np
import cv2
import datetime

data=[]
for index, fileName in enumerate(fileNames):
    imagePIL = Image.open(fileName)
    imagePIL = imagePIL.convert('RGB')     
    featureVector=imagePIL.histogram()
    
    if (len(featureVector) != 768): # just a sanity check; with the transformation to RGB, this should never happen
        print("Unexpected length of feature vector: " + str(len(featureVector)) + " in file: " + fileName)

    data.append((featureVector))

    
# Next, we extract a few more features using OpenCV

print ("Extracting features using OpenCV" + " (" + str(datetime.datetime.now()) + ")")
dataOpenCV_1D=[]
dataOpenCV_2D=[]
dataOpenCV_3D=[]

# use our own simple function to flatten the 2D arrays
flatten = lambda l: [item for sublist in l for item in sublist]

for fileName in fileNames:

    # the easiest way would to do the following:
    # imageOpenCV = cv2.imread(imagePath + fileName)

    # However, we have the same issue as before, and it is more difficult in OpenCV to convert to an RGB image
    # Thus we do this using PIL, and then convert to OpenCV ....
    imagePIL = Image.open(fileName)
    imagePIL = imagePIL.convert('RGB')
    imageOpenCV = np.array(imagePIL) 
    # Convert RGB to BGR 
    imageOpenCV = imageOpenCV[:, :, ::-1].copy() 

    # Now we split the image in the three channels, B / G / R
    chans = cv2.split(imageOpenCV)
    colors = ("b", "g", "r")

    # First we do also features per channel, but this time, we aggregate them into a smaller number of bins
    # I.e. we do not have 256 values per channel, but less
    featuresOpenCV_1D = []
    bins_1D=64
    for (chan, color) in zip(chans, colors): # we compute the histogram over each channel
        histOpenCV = cv2.calcHist([chan], [0], None, [bins_1D], [0, 256])
        featuresOpenCV_1D.extend(histOpenCV)
    featureVectorOpenCV_1D = flatten(featuresOpenCV_1D) # and append this to our feature vector
    
    dataOpenCV_1D.append(featureVectorOpenCV_1D) # now we append the feature vector to the dataset so far

    if (len(featureVectorOpenCV_1D) != bins_1D*3): # sanity check, in case we had a wrong number of channels...
        print("Unexpected length of feature vector: " + str(len(featureVectorOpenCV_1D)) + " in file: " + fileName)

    # Next - features that look at two channels at the same time
    # E.g. we look at when green and blue have both "high values"
    # We reduce the size of bins further, to not have a too long feature vector
    featuresOpenCV_2D = []
    bins2D=16
    # look at all combinations of channels (R & B, R & G, B & G)
    featuresOpenCV_2D.extend(cv2.calcHist([chans[1], chans[0]], [0, 1], None, [bins2D, bins2D], [0, 256, 0, 256]))
    featuresOpenCV_2D.extend(cv2.calcHist([chans[1], chans[2]], [0, 1], None, [bins2D, bins2D], [0, 256, 0, 256]))
    featuresOpenCV_2D.extend(cv2.calcHist([chans[0], chans[2]], [0, 1], None, [bins2D, bins2D], [0, 256, 0, 256]))
    # and add that to our dataset
    featureVectorOpenCV_2D = flatten(featuresOpenCV_2D)
    dataOpenCV_2D.append(featureVectorOpenCV_2D)

    # finally, we look at all three channels at the same time.
    # We further reduce our bin size, because otherwise, this would become very large...
    featuresOpenCV_3D = cv2.calcHist([imageOpenCV], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    # append to our dataset
    featureVectorOpenCV_3D = featuresOpenCV_3D.flatten()
    dataOpenCV_3D.append(featureVectorOpenCV_3D)

        
print( ".... done" + " (" + str(datetime.datetime.now()) + ")")

Extracting features using OpenCV (2019-01-20 23:36:12.743282)
.... done (2019-01-20 23:37:03.690376)


## Classify

In [109]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

from sklearn import neighbors
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import tree
from sklearn import ensemble
from sklearn import svm

scoring = ['precision_macro', 'recall_macro', 'accuracy']

In [151]:
trainingSets = [{'data': data, 'name': 'PIL_Histogram'},
                {'data': dataOpenCV_1D, 'name': 'dataOpenCV_1D'},
                {'data': dataOpenCV_2D,'name': 'dataOpenCV_2D'},
                {'data': dataOpenCV_3D, 'name': 'dataOpenCV_3D'}
               ]
classifiers = [{'classifier': neighbors.KNeighborsClassifier(n_neighbors=1), 'name': 'KNN/1'},
               {'classifier': neighbors.KNeighborsClassifier(n_neighbors=5), 'name': 'KNN/5'},
               {'classifier': neighbors.KNeighborsClassifier(n_neighbors=10), 'name': 'KNN/10'},
               {'classifier': naive_bayes.GaussianNB(), 'name': 'NaiveBayes'},
               {'classifier': tree.DecisionTreeClassifier() , 'name': 'DecisionTreeClassifier'},
               {'classifier': ensemble.RandomForestClassifier(n_estimators=10), 'name': 'RandomForest/10'},
               {'classifier': ensemble.RandomForestClassifier(n_estimators=20), 'name': 'RandomForest/20'},
               {'classifier': svm.SVC(), 'name': 'SVC'},              
               {'classifier': svm.LinearSVC(), 'name': 'LinearSVC'}
              ]

results = []

In [152]:
def CrossValidateWith(classifier, trainingSet, target):
    global scoring
    scores = cross_validate(classifier['classifier'], trainingSet['data'], target, cv=3, scoring=scoring, return_train_score=False)
    return {
        'Method' : classifier['name'],
        'Dataset' : trainingSet['name'],
        'Accuracy' : scores['test_accuracy'].mean(),
        'Precision' : scores['test_precision_macro'].mean(),
        'Training time' : scores['fit_time'].mean() ,
        'Testing time' : scores['score_time'].mean()
    }

In [153]:
for index, trainingSet in enumerate(trainingSets):
    print('TrainingSet: ' + trainingSet['name'])
    for classifier in classifiers:
        print('Using: ' + classifier['name'])
        result = CrossValidateWith(classifier, trainingSet, target)
        results.append(result)

TrainingSet: PIL_Histogram
Using: KNN/1
Using: KNN/5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: KNN/10
Using: NaiveBayes


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: DecisionTreeClassifier
Using: RandomForest/10


  'precision', 'predicted', average, warn_for)


Using: RandomForest/20
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: LinearSVC




TrainingSet: dataOpenCV_1D
Using: KNN/1
Using: KNN/5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: KNN/10


  'precision', 'predicted', average, warn_for)


Using: NaiveBayes
Using: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: RandomForest/10
Using: RandomForest/20
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: LinearSVC




TrainingSet: dataOpenCV_2D
Using: KNN/1
Using: KNN/5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: KNN/10
Using: NaiveBayes
Using: DecisionTreeClassifier
Using: RandomForest/10
Using: RandomForest/20
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: LinearSVC




TrainingSet: dataOpenCV_3D
Using: KNN/1
Using: KNN/5


  'precision', 'predicted', average, warn_for)


Using: KNN/10


  'precision', 'predicted', average, warn_for)


Using: NaiveBayes
Using: DecisionTreeClassifier
Using: RandomForest/10
Using: RandomForest/20
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Using: LinearSVC




In [144]:
results = []

In [142]:
results.append(1)

In [169]:
import pandas as pd
df = pd.DataFrame(results, columns=['Dataset','Method', 'Accuracy', 'Precision','Testing time','Training time'])
df

Unnamed: 0,Dataset,Method,Accuracy,Precision,Testing time,Training time
0,PIL_Histogram,KNN/1,0.100654,0.103504,0.333332,0.034668
1,PIL_Histogram,KNN/5,0.08321,0.115154,0.370668,0.031333
2,PIL_Histogram,KNN/10,0.085592,0.098181,0.380002,0.027331
3,PIL_Histogram,NaiveBayes,0.074141,0.104157,0.286668,0.023999
4,PIL_Histogram,DecisionTreeClassifier,0.127781,0.135339,0.054,0.528331
5,PIL_Histogram,RandomForest/10,0.200811,0.183683,0.059001,0.157666
6,PIL_Histogram,RandomForest/20,0.207067,0.194909,0.061,0.271665
7,PIL_Histogram,SVC,0.047375,0.001579,0.561,0.722666
8,PIL_Histogram,LinearSVC,0.156719,0.183974,0.032,4.946174
9,dataOpenCV_1D,KNN/1,0.116272,0.125878,0.087333,0.008665


In [172]:
df.sort_values('Accuracy',ascending=False)

Unnamed: 0,Dataset,Method,Accuracy,Precision,Testing time,Training time
33,dataOpenCV_3D,RandomForest/20,0.449372,0.444222,0.008335,0.064332
24,dataOpenCV_2D,RandomForest/20,0.386286,0.39541,0.036333,0.142999
32,dataOpenCV_3D,RandomForest/10,0.376532,0.37307,0.005002,0.031665
26,dataOpenCV_2D,LinearSVC,0.359318,0.401988,0.035334,1.738666
35,dataOpenCV_3D,LinearSVC,0.344533,0.378688,0.005,0.664666
23,dataOpenCV_2D,RandomForest/10,0.33869,0.350268,0.033334,0.079666
31,dataOpenCV_3D,DecisionTreeClassifier,0.28772,0.298982,0.002,0.080999
30,dataOpenCV_3D,NaiveBayes,0.263819,0.292638,0.173999,0.004666
22,dataOpenCV_2D,DecisionTreeClassifier,0.259323,0.255082,0.029665,0.253667
15,dataOpenCV_1D,RandomForest/20,0.22431,0.211641,0.015333,0.127337
