# Exercise 4 - comparative experimentation

## Setup Image Data

In [56]:
import glob, os
from pathlib import Path

imagePath=".FIDS30/"
fileNames = glob.glob("FIDS30/*/*.jpg")
numberOfFiles=len(fileNames)
targetLabels=[]

print("Found " + str(numberOfFiles) + " files\n")

for fileName in fileNames:
    parts = Path(fileName).parts
    targetLabels.append(parts[-2])


from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(targetLabels) # this basically finds all unique class names, and assigns them to the numbers
print("Found the following classes: " + str(list(le.classes_)))

# now we transform our labels to integers
target = le.transform(targetLabels); 
print("Transformed labels (first elements: " + str(target[0:150]))

# If we want to find again the label for an integer value, we can do something like this:
# print list(le.inverse_transform([0, 18, 1]))

print("... done label encoding")


Found 971 files

Found the following classes: ['acerolas', 'apples', 'apricots', 'avocados', 'bananas', 'blackberries', 'blueberries', 'cantaloupes', 'cherries', 'coconuts', 'figs', 'grapefruits', 'grapes', 'guava', 'kiwifruit', 'lemons', 'limes', 'mangos', 'olives', 'oranges', 'passionfruit', 'peaches', 'pears', 'pineapples', 'plums', 'pomegranates', 'raspberries', 'strawberries', 'tomatoes', 'watermelons']
Transformed labels (first elements: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4]
... done label encoding


In [66]:
from PIL import Image
import numpy as np
import cv2
import datetime

data=[]
for index, fileName in enumerate(fileNames):
    imagePIL = Image.open(fileName)
    imagePIL = imagePIL.convert('RGB')     
    featureVector=imagePIL.histogram()
    
    if (len(featureVector) != 768): # just a sanity check; with the transformation to RGB, this should never happen
        print("Unexpected length of feature vector: " + str(len(featureVector)) + " in file: " + fileName)

    data.append((featureVector))

    
# Next, we extract a few more features using OpenCV

print ("Extracting features using OpenCV" + " (" + str(datetime.datetime.now()) + ")")
dataOpenCV_1D=[]
dataOpenCV_2D=[]
dataOpenCV_3D=[]

# use our own simple function to flatten the 2D arrays
flatten = lambda l: [item for sublist in l for item in sublist]

for fileName in fileNames:

    # the easiest way would to do the following:
    # imageOpenCV = cv2.imread(imagePath + fileName)

    # However, we have the same issue as before, and it is more difficult in OpenCV to convert to an RGB image
    # Thus we do this using PIL, and then convert to OpenCV ....
    imagePIL = Image.open(fileName)
    imagePIL = imagePIL.convert('RGB')
    imageOpenCV = np.array(imagePIL) 
    # Convert RGB to BGR 
    imageOpenCV = imageOpenCV[:, :, ::-1].copy() 

    # Now we split the image in the three channels, B / G / R
    chans = cv2.split(imageOpenCV)
    colors = ("b", "g", "r")

    # First we do also features per channel, but this time, we aggregate them into a smaller number of bins
    # I.e. we do not have 256 values per channel, but less
    featuresOpenCV_1D = []
    bins_1D=64
    for (chan, color) in zip(chans, colors): # we compute the histogram over each channel
        histOpenCV = cv2.calcHist([chan], [0], None, [bins_1D], [0, 256])
        featuresOpenCV_1D.extend(histOpenCV)
    featureVectorOpenCV_1D = flatten(featuresOpenCV_1D) # and append this to our feature vector
    
    dataOpenCV_1D.append(featureVectorOpenCV_1D) # now we append the feature vector to the dataset so far

    if (len(featureVectorOpenCV_1D) != bins_1D*3): # sanity check, in case we had a wrong number of channels...
        print("Unexpected length of feature vector: " + str(len(featureVectorOpenCV_1D)) + " in file: " + fileName)

    # Next - features that look at two channels at the same time
    # E.g. we look at when green and blue have both "high values"
    # We reduce the size of bins further, to not have a too long feature vector
    featuresOpenCV_2D = []
    bins2D=16
    # look at all combinations of channels (R & B, R & G, B & G)
    featuresOpenCV_2D.extend(cv2.calcHist([chans[1], chans[0]], [0, 1], None, [bins2D, bins2D], [0, 256, 0, 256]))
    featuresOpenCV_2D.extend(cv2.calcHist([chans[1], chans[2]], [0, 1], None, [bins2D, bins2D], [0, 256, 0, 256]))
    featuresOpenCV_2D.extend(cv2.calcHist([chans[0], chans[2]], [0, 1], None, [bins2D, bins2D], [0, 256, 0, 256]))
    # and add that to our dataset
    featureVectorOpenCV_2D = flatten(featuresOpenCV_2D)
    dataOpenCV_2D.append(featureVectorOpenCV_2D)

    # finally, we look at all three channels at the same time.
    # We further reduce our bin size, because otherwise, this would become very large...
    featuresOpenCV_3D = cv2.calcHist([imageOpenCV], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    # append to our dataset
    featureVectorOpenCV_3D = featuresOpenCV_3D.flatten()
    dataOpenCV_3D.append(featureVectorOpenCV_3D)

        
print( ".... done" + " (" + str(datetime.datetime.now()) + ")")

Extracting features using OpenCV (2019-01-20 23:36:12.743282)
.... done (2019-01-20 23:37:03.690376)


## Classify

In [109]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

from sklearn import neighbors
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import tree
from sklearn import ensemble
from sklearn import svm

scoring = ['precision_macro', 'recall_macro', 'accuracy']

In [107]:
trainingSets = [{'data': data, 'name': 'PIL_Histogram'},
                {'data': dataOpenCV_1D, 'name': 'dataOpenCV_1D'},
                {'data': dataOpenCV_2D,'name': 'dataOpenCV_2D'},
                {'data': dataOpenCV_3D, 'name': 'dataOpenCV_3D'}
               ]
classifiers = [{'classifier': neighbors.KNeighborsClassifier(n_neighbors=1), 'name': 'KNN/1'},
               {'classifier': neighbors.KNeighborsClassifier(n_neighbors=5), 'name': 'KNN/5'},
               {'classifier': neighbors.KNeighborsClassifier(n_neighbors=10), 'name': 'KNN/10'},
               {'classifier': naive_bayes.GaussianNB(), 'name': 'NaiveBayes'},
               {'classifier': tree.DecisionTreeClassifier() , 'name': 'DecisionTreeClassifier'},
               {'classifier': ensemble.RandomForestClassifier(n_estimators=10), 'name': 'RandomForest/10'},
               {'classifier': ensemble.RandomForestClassifier(n_estimators=20), 'name': 'RandomForest/20'},
               {'classifier': svm.SVC(), 'name': 'SVC'},              
               {'classifier': svm.LinearSVC(), 'name': 'LinearSVC'}
              ]

results = []

In [124]:
def CrossValidateWith(classifier, trainingSet, target):
    global scoring
    scores = cross_validate(classifier['classifier'], trainingSet['data'], target, cv=3, scoring=scoring, return_train_score=False)
    return {
        'Method' : classifier['name'],
        'Dataset' : trainingSet['name'],
        'Accuracy' : scores['test_accuracy'].mean(),
        'Precision' : scores['test_precision_macro'].mean(),
        'Training time' : scores['fit_time'].mean() ,
        'Testing time' : scores['score_time'].mean()
    }

In [145]:
for index, trainingSet in enumerate(trainingSets):
    print('TrainingSet: ' + trainingSet['name'])
    for classifier in classifiers:
        print('Using: ' + classifier['name'])
        result = CrossValidateWith(classifier, trainingSet, target)
        results.append(result)

TrainingSet: PIL_Histogram
Using: KNN/1
{'Accuracy': 0.10065421163353094, 'Method': 'PIL_Histogram with KNN/1', 'Precision': 0.10350412042568906, 'Testing time': 0.32199939092000324, 'Training time': 0.029666821161905926}
Using: KNN/5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.08321004423527811, 'Method': 'PIL_Histogram with KNN/5', 'Precision': 0.11515352333381089, 'Testing time': 0.35900036493937176, 'Training time': 0.027664979298909504}
Using: KNN/10
{'Accuracy': 0.08559174397625291, 'Method': 'PIL_Histogram with KNN/10', 'Precision': 0.09818137209876976, 'Testing time': 0.38066673278808594, 'Training time': 0.02733302116394043}
Using: NaiveBayes


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.07414095407969215, 'Method': 'PIL_Histogram with NaiveBayes', 'Precision': 0.10415721022371911, 'Testing time': 0.29433361689249676, 'Training time': 0.03366645177205404}
Using: DecisionTreeClassifier
{'Accuracy': 0.13719539582138132, 'Method': 'PIL_Histogram with DecisionTreeClassifier', 'Precision': 0.13407207338966884, 'Testing time': 0.05833307902018229, 'Training time': 0.545332113901774}
Using: RandomForest/10
{'Accuracy': 0.17947818322255815, 'Method': 'PIL_Histogram with RandomForest/10', 'Precision': 0.16430723516249832, 'Testing time': 0.06100193659464518, 'Training time': 0.1569983959197998}
Using: RandomForest/20


  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.22392976835741996, 'Method': 'PIL_Histogram with RandomForest/20', 'Precision': 0.2148141877984948, 'Testing time': 0.062001307805379234, 'Training time': 0.28833166758219403}
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.047374603532256275, 'Method': 'PIL_Histogram with SVC', 'Precision': 0.0015791534510752093, 'Testing time': 0.5593323707580566, 'Training time': 0.729333241780599}
Using: LinearSVC




{'Accuracy': 0.1394836643126007, 'Method': 'PIL_Histogram with LinearSVC', 'Precision': 0.17513124584203013, 'Testing time': 0.031332810719807945, 'Training time': 4.939670244852702}
TrainingSet: dataOpenCV_1D
Using: KNN/1
{'Accuracy': 0.11627244465827151, 'Method': 'dataOpenCV_1D with KNN/1', 'Precision': 0.12587763906391358, 'Testing time': 0.08900268872578938, 'Training time': 0.00966485341389974}
Using: KNN/5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.094553684411243, 'Method': 'dataOpenCV_1D with KNN/5', 'Precision': 0.10926469674175193, 'Testing time': 0.09366909662882487, 'Training time': 0.007665157318115234}
Using: KNN/10


  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.09095806729931451, 'Method': 'dataOpenCV_1D with KNN/10', 'Precision': 0.10780343679999309, 'Testing time': 0.10266788800557454, 'Training time': 0.007665475209554036}
Using: NaiveBayes
{'Accuracy': 0.07310679394573373, 'Method': 'dataOpenCV_1D with NaiveBayes', 'Precision': 0.10523569985724245, 'Testing time': 0.025665124257405598, 'Training time': 0.0073350270589192705}
Using: DecisionTreeClassifier


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.1565123608342814, 'Method': 'dataOpenCV_1D with DecisionTreeClassifier', 'Precision': 0.1576528494945061, 'Testing time': 0.01100007692972819, 'Training time': 0.13533329963684082}
Using: RandomForest/10


  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.18833033652648948, 'Method': 'dataOpenCV_1D with RandomForest/10', 'Precision': 0.17766593220760196, 'Testing time': 0.012667576471964518, 'Training time': 0.06966574986775716}
Using: RandomForest/20
{'Accuracy': 0.20702560091035538, 'Method': 'dataOpenCV_1D with RandomForest/20', 'Precision': 0.19178635018805298, 'Testing time': 0.016000668207804363, 'Training time': 0.12799859046936035}
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.047374603532256275, 'Method': 'dataOpenCV_1D with SVC', 'Precision': 0.0015791534510752093, 'Testing time': 0.16233444213867188, 'Training time': 0.21699881553649902}
Using: LinearSVC




{'Accuracy': 0.13810634558595505, 'Method': 'dataOpenCV_1D with LinearSVC', 'Precision': 0.17849216054808714, 'Testing time': 0.008665164311726889, 'Training time': 1.3250171343485515}
TrainingSet: dataOpenCV_2D
Using: KNN/1
{'Accuracy': 0.15863693273729432, 'Method': 'dataOpenCV_2D with KNN/1', 'Precision': 0.20070779687519494, 'Testing time': 0.3236672878265381, 'Training time': 0.0323332150777181}
Using: KNN/5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.12858328623390616, 'Method': 'dataOpenCV_2D with KNN/5', 'Precision': 0.15115918765212918, 'Testing time': 0.38067126274108887, 'Training time': 0.03166659673055013}
Using: KNN/10
{'Accuracy': 0.13992157286027482, 'Method': 'dataOpenCV_2D with KNN/10', 'Precision': 0.16673210150386217, 'Testing time': 0.3900001049041748, 'Training time': 0.032000462214152016}
Using: NaiveBayes
{'Accuracy': 0.20666403949093096, 'Method': 'dataOpenCV_2D with NaiveBayes', 'Precision': 0.2774713496162094, 'Testing time': 0.2940022945404053, 'Training time': 0.024331092834472656}
Using: DecisionTreeClassifier
{'Accuracy': 0.2502434684136244, 'Method': 'dataOpenCV_2D with DecisionTreeClassifier', 'Precision': 0.2466433071476931, 'Testing time': 0.029999574025472004, 'Training time': 0.25300010045369464}
Using: RandomForest/10
{'Accuracy': 0.3213542543753262, 'Method': 'dataOpenCV_2D with RandomForest/10', 'Precision': 0.31368903408635096, 'Testing time': 0.0346686045328776, 'Training time': 0.

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.047374603532256275, 'Method': 'dataOpenCV_2D with SVC', 'Precision': 0.0015791534510752093, 'Testing time': 0.5473347504933676, 'Training time': 0.7176659901936849}
Using: LinearSVC




{'Accuracy': 0.3405922348495137, 'Method': 'dataOpenCV_2D with LinearSVC', 'Precision': 0.3923032806274284, 'Testing time': 0.035335540771484375, 'Training time': 1.7429980436960857}
TrainingSet: dataOpenCV_3D
Using: KNN/1
{'Accuracy': 0.20370083010722925, 'Method': 'dataOpenCV_3D with KNN/1', 'Precision': 0.23649620086277143, 'Testing time': 0.18999926249186197, 'Training time': 0.009000698725382486}
Using: KNN/5


  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.16020573927284468, 'Method': 'dataOpenCV_3D with KNN/5', 'Precision': 0.19770286770947473, 'Testing time': 0.2173341910044352, 'Training time': 0.008998632431030273}
Using: KNN/10


  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.14824039846784562, 'Method': 'dataOpenCV_3D with KNN/10', 'Precision': 0.1935205904395716, 'Testing time': 0.2310007413228353, 'Training time': 0.008998950322469076}
Using: NaiveBayes
{'Accuracy': 0.263818830213198, 'Method': 'dataOpenCV_3D with NaiveBayes', 'Precision': 0.29263779786519667, 'Testing time': 0.18800091743469238, 'Training time': 0.004332621892293294}
Using: DecisionTreeClassifier
{'Accuracy': 0.3058739736635277, 'Method': 'dataOpenCV_3D with DecisionTreeClassifier', 'Precision': 0.3116150667771755, 'Testing time': 0.0020018418629964194, 'Training time': 0.07999777793884277}
Using: RandomForest/10
{'Accuracy': 0.3840430998532261, 'Method': 'dataOpenCV_3D with RandomForest/10', 'Precision': 0.38263891556562174, 'Testing time': 0.006334861119588216, 'Training time': 0.03399991989135742}
Using: RandomForest/20


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.45198444667876414, 'Method': 'dataOpenCV_3D with RandomForest/20', 'Precision': 0.4480587435361519, 'Testing time': 0.008667071660359701, 'Training time': 0.06266578038533528}
Using: SVC


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'Accuracy': 0.047374603532256275, 'Method': 'dataOpenCV_3D with SVC', 'Precision': 0.0015791534510752093, 'Testing time': 0.34666816393534344, 'Training time': 0.4743309020996094}
Using: LinearSVC




{'Accuracy': 0.3530530545425144, 'Method': 'dataOpenCV_3D with LinearSVC', 'Precision': 0.3934420213371396, 'Testing time': 0.004666487375895183, 'Training time': 0.6483332316080729}




In [144]:
results = []

In [142]:
results.append(1)

In [150]:
import pandas as pd
pd.DataFrame(results)

Unnamed: 0,Accuracy,Method,Precision,Testing time,Training time
0,0.100654,PIL_Histogram with KNN/1,0.103504,0.321999,0.029667
1,0.08321,PIL_Histogram with KNN/5,0.115154,0.359,0.027665
2,0.085592,PIL_Histogram with KNN/10,0.098181,0.380667,0.027333
3,0.074141,PIL_Histogram with NaiveBayes,0.104157,0.294334,0.033666
4,0.137195,PIL_Histogram with DecisionTreeClassifier,0.134072,0.058333,0.545332
5,0.179478,PIL_Histogram with RandomForest/10,0.164307,0.061002,0.156998
6,0.22393,PIL_Histogram with RandomForest/20,0.214814,0.062001,0.288332
7,0.047375,PIL_Histogram with SVC,0.001579,0.559332,0.729333
8,0.139484,PIL_Histogram with LinearSVC,0.175131,0.031333,4.93967
9,0.116272,dataOpenCV_1D with KNN/1,0.125878,0.089003,0.009665
