In [1]:
# simple timer to time events
import time
class Timer:
    """Stopwatch that reports wall-clock seconds between successive readings."""

    def __init__(self):
        # reference instant (time.time()) that the next get() measures from
        self.start = time.time()

    def get(self):
        """Return the seconds elapsed since construction or the previous get().

        The reference instant is reset to the moment of this reading, so
        consecutive calls report back-to-back intervals.
        """
        # Read the clock once: reading it a second time to reset `start`
        # would drop the time between the two reads from the next interval.
        now = time.time()
        value = now - self.start
        self.start = now
        return value

In [2]:
# convert strings to ints
import numpy as np
def oneHotEncode(Y, classNames):
    """Translate every label in Y into its integer position within classNames.

    Despite the name, the result is a plain list of class indices, not
    one-hot vectors. A label missing from classNames propagates the error
    raised by classNames.index().
    """
    return [int(classNames.index(Y[pos])) for pos in range(len(Y))]

In [3]:
# convert ints to strings
def reverseOneHotEncode(Y, classNames):
    """Translate every integer index in Y back into the matching classNames entry.

    Inverse of oneHotEncode: Y holds positions into classNames and the result
    is a list of the corresponding names, in the same order as Y.
    """
    return [classNames[Y[pos]] for pos in range(len(Y))]

In [4]:
# standardize each column
from sklearn import preprocessing
def standardize(X, wholeImage=False):
    """Standardize X with sklearn's preprocessing.scale.

    Parameters
    ----------
    X : numpy array
        With wholeImage=False it is handed to preprocessing.scale unchanged.
        With wholeImage=True it is indexed as X[image, :, :, channel], i.e.
        it must be 4-dimensional.
    wholeImage : bool
        When True, every channel plane of every image is scaled on its own.

    Returns
    -------
    A new float array of the same shape as X (wholeImage=True), or whatever
    preprocessing.scale returns for X (wholeImage=False).
    """
    if wholeImage:
        scaled = np.zeros(X.shape, dtype=float)
        # Take the channel count from the data instead of assuming RGB:
        # a fixed 3 left any extra channel as zeros and failed on images
        # with fewer than three channels.
        nChannels = X.shape[3]
        for i in range(X.shape[0]):
            for j in range(nChannels):
                scaled[i, :, :, j] = preprocessing.scale(X[i, :, :, j])
        return scaled
    return preprocessing.scale(X)

In [5]:
# get Pearson correlation coefficients between each feature column and the class labels
from scipy.stats import pearsonr
def correlations(X, Y, classNames, thresh):
    """Plot the Pearson correlation of each column of X with the class index.

    Y is converted to integer class indices with oneHotEncode (and printed),
    each column of X is correlated against those indices, and the
    coefficients are drawn as a scatter plot over the feature number.

    Returns the list of column indices whose absolute correlation is
    strictly below thresh.
    """
    Y = np.array(oneHotEncode(Y, classNames))
    print(Y)
    nFeatures = X.shape[1]
    rs = []
    cols = []
    for col in range(nFeatures):
        coefficient = pearsonr(X[:, col:col + 1].flatten(), Y)[0]
        rs.append(coefficient)
        # weakly correlated features are the ones reported back
        if abs(coefficient) < thresh:
            cols.append(col)
    plt.scatter(list(range(nFeatures)), rs)
    plt.xlabel('Feature #')
    plt.ylabel('Pearson Correlation')
    return cols

In [6]:
# takes a numpy array image and displays
# the image is always drawn with the grayscale colormap (there is no gray parameter)
from matplotlib import pyplot as plt
def showImage(img):
    """Draw img with the gray colormap, hide both axes' ticks, and show the figure."""
    plt.imshow(img, cmap=plt.cm.gray)
    # hide tick values on X and Y axis
    for hideTicks in (plt.xticks, plt.yticks):
        hideTicks([])
    plt.show()

In [7]:
# grab some random indices of pictures to view
# pass in data['train'] or data['test']
from math import sqrt
from random import randrange
def viewFilters(filters):
    """Draw every entry of filters as a grayscale image on a near-square grid.

    Parameters
    ----------
    filters : sequence of 2-D arrays
        Kernels to display; filters[i] is drawn in grid cell i + 1.

    The previous body was a copy of viewRandomImages that read the undefined
    names `nImages` and `data` and never displayed `filters` at all.
    """
    from math import ceil, sqrt

    nFilters = len(filters)
    # integer side length large enough to hold every filter
    side = ceil(sqrt(nFilters))
    for i in range(nFilters):
        plt.subplot(side, side, i + 1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(filters[i], cmap=plt.cm.gray)
    plt.show()

In [8]:
# grab some random indices of pictures to view
# pass in data['train'] or data['test']
from math import sqrt
from random import randrange
def viewRandomImages(data, nImages, color=True):
    """Show nImages randomly picked images on a near-square grid.

    Parameters
    ----------
    data : dict
        Nested as data[className][leafNumber][ID] -> image.
    nImages : int
        Number of images to draw. Each draw is independent, so the same
        image can appear more than once.
    color : bool
        When False the image is drawn with the gray colormap.
    """
    from math import ceil, sqrt

    # plt.subplot needs integer grid dimensions; sqrt() alone yields a float
    # and, for non-square counts, a grid with too few cells.
    side = ceil(sqrt(nImages))
    plt.figure(figsize=(10,10))
    for i in range(nImages):
        plt.subplot(side, side, i + 1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        # pick a random class, then a random leaf in it, then a random image
        classes = list(data.keys())
        rClass = classes[randrange(len(classes))]
        leaves = list(data[rClass].keys())
        rLeaf = leaves[randrange(len(leaves))]
        ids = list(data[rClass][rLeaf].keys())
        rID = ids[randrange(len(ids))]
        rImg = data[rClass][rLeaf][rID]
        if color:
            plt.imshow(rImg)
        else:
            plt.imshow(rImg, cmap=plt.cm.gray)
        plt.xlabel(rClass + '_' + str(rID))
    plt.show()

In [9]:
# create a nested dictionary to store file names: 
#  maps a key <class_name> to sub-key <4-digit ID> to the value <file_name>
#  second nested level with sub-key <4-digit ID> will be sorted from smallest ID to largest
from collections import OrderedDict
from os import listdir, path
from os.path import isfile, join
import matplotlib.image as mpimg
import pickle
from IPython.display import display
from skimage.color import rgb2gray
from scipy.ndimage import zoom
def getData(img_folder, data_path, num_img_per_leaf, test_split, class_names, trailingName, color=True):
    """Load every class folder of images into a train/test dictionary, cached on disk.

    Parameters
    ----------
    img_folder : str
        Folder containing one sub-folder per entry of class_names.
    data_path : str
        Pickle cache. If it exists it is loaded and returned immediately;
        otherwise the freshly built dictionary is written to it.
    num_img_per_leaf : int
        Consecutive images (in ascending ID order) that make up one leaf.
    test_split : float
        Fraction of each class to place in the test set, rounded down to
        whole leaves.
    class_names : iterable of str
        Sub-folder names to read.
    trailingName : str
        Text that follows the 4-character numeric ID at the end of every
        file name (e.g. the extension).
    color : bool
        When False each image is converted with rgb2gray.

    Returns
    -------
    dict : data['train' | 'test'][class_name][leaf][ID] -> image array
    """
    # check if file exists already
    # NOTE: pickle.load can execute code embedded in the file; only point
    # data_path at caches this notebook wrote itself.
    if path.exists(data_path):
        with open(data_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # 'train' or 'test' : className : leafNumber : ID : image
    data = {'train': {}, 'test': {}}

    for class_name in class_names:
        print('fetching files from', class_name)

        data['train'][class_name] = {}
        data['test'][class_name] = {}

        # folder where images are saved for this class, and the files in it
        this_folder = img_folder + '/' + class_name
        file_names = [f for f in listdir(this_folder) if isfile(join(this_folder, f))]

        # the ID is the 4 characters immediately before trailingName
        temp_map = {}
        for file_name in file_names:
            id_end = len(file_name) - len(trailingName)
            temp_map[int(file_name[id_end - 4:id_end])] = file_name

        # order by ID and assign a leaf number to every run of
        # num_img_per_leaf consecutive IDs
        nLeaf = 0
        tempC = 0
        leaf_map = {}
        map_ID_fileName = OrderedDict()
        for ID in sorted(temp_map.keys()):
            map_ID_fileName[ID] = temp_map[ID]
            leaf_map[ID] = nLeaf
            tempC += 1
            if tempC - num_img_per_leaf == 0:
                tempC = 0
                nLeaf += 1

        # whole leaves, then images, to hold out for testing
        num_leaves = int(test_split * len(file_names) / num_img_per_leaf)
        num_images = num_img_per_leaf * num_leaves

        # Just take the first num_images IDs (the order pictures were taken in
        # was arbitrary, no need to randomize). Slicing yields an empty test
        # set when num_images is 0; the previous append-then-check loop always
        # moved at least one image into the test set. A set keeps the
        # per-image membership check below constant-time.
        test_set = set(list(map_ID_fileName)[:max(num_images, 0)])

        # read all images into data
        for ID in map_ID_fileName:

            # read with mpimg because opencv messes the colors up for RGB
            # NOTE(review): the division assumes imread returns 0-255 values;
            # confirm for the image format in use.
            img = mpimg.imread(this_folder + "/" + map_ID_fileName[ID]) / 255.
            if not color:
                img = rgb2gray(img)

            # save to training or test set
            dataset = 'test' if ID in test_set else 'train'
            data[dataset][class_name].setdefault(leaf_map[ID], {})[ID] = img

    with open(data_path, "wb") as cache_file:
        pickle.dump(data, cache_file)
    return data

In [10]:
# generate gabor filters
def gaborFilters(filter_path):
    """Load or build the bank of Gabor kernels, plot it, and return it.

    Parameters
    ----------
    filter_path : str
        Pickle cache. If it exists the kernels are loaded from it; otherwise
        they are generated and written to it.

    Returns
    -------
    list of 2-D arrays: the real part of gabor_kernel for every combination
    of 4 theta values, 4 offset values (both k/4 * pi for k in 0..3),
    sigma in (1, 3) and frequency in (0.05, 0.25) -- 64 kernels when generated.
    """
    from math import ceil, sqrt

    # check if file exists already
    # NOTE: pickle.load can execute code embedded in the file; only use
    # caches this notebook wrote itself.
    if path.exists(filter_path):
        with open(filter_path, "rb") as cache_file:
            kernels = pickle.load(cache_file)

    # create kernels
    else:
        kernels = []
        for thetaStep in range(4):
            theta = thetaStep / 4. * np.pi
            for offsetStep in range(4):
                offset = offsetStep / 4. * np.pi
                for sigma in (1, 3):
                    for freq in (0.05, 0.25):
                        kernel = np.real(gabor_kernel(freq, offset=offset, theta=theta, sigma_x=sigma, sigma_y=sigma))
                        kernels.append(kernel)
        with open(filter_path, "wb") as cache_file:
            pickle.dump(kernels, cache_file)

    # plot kernels; plt.subplot needs an integer grid, which bare sqrt()
    # does not provide
    side = ceil(sqrt(len(kernels)))
    plt.figure(figsize=(len(kernels), len(kernels)))
    for i, kernel in enumerate(kernels):
        plt.subplot(side, side, i + 1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(kernel, cmap=plt.cm.gray)
    plt.show()

    return kernels

In [11]:
# apply filters to images
from scipy import ndimage as ndi
from skimage import data
from skimage.util import img_as_float
from skimage.filters import gabor_kernel
from IPython.display import display, clear_output
def applyFilters(data, data_path, filters, color=True):
    """Convolve every image with every filter and keep three statistics per filter.

    Parameters
    ----------
    data : dict
        Nested as data[dataset][className][leaf][ID] -> image array.
    data_path : str
        Pickle cache. If it exists it is loaded and returned immediately;
        otherwise the computed features are written to it.
    filters : sequence of 2-D arrays
        Kernels passed to scipy.ndimage.convolve (mode='wrap').
    color : bool
        When True the first three channels (img[:, :, 0..2]) are convolved
        separately and pooled; when False the image is convolved as given.

    Returns
    -------
    dict with the same nesting as data, where each image is replaced by the
    flat list [sum, mean, var] for filter 0, then filter 1, and so on.
    """
    # check if file exists already
    # NOTE: pickle.load can execute code embedded in the file; only use
    # caches this notebook wrote itself.
    if path.exists(data_path):
        with open(data_path, "rb") as cache_file:
            return pickle.load(cache_file)

    filteredData = {}
    for dataset in data:
        filteredData[dataset] = {}
        for className in data[dataset]:
            dis = display('aplying filters to: ' + dataset + ', ' + className, display_id=True)
            filteredData[dataset][className] = {}
            leaves = data[dataset][className]
            nLeaves = len(leaves)
            for idx, leaf in enumerate(leaves):
                dis.update('aplying filters to: ' + dataset + ', ' + className
                           + f' {100. * float(idx) / float(nLeaves):.0f}' + "%")
                filteredData[dataset][className][leaf] = {}
                for ID in leaves[leaf]:
                    img = leaves[leaf][ID]
                    temp = []
                    for ffilter in filters:
                        if color:
                            # Pool the three channel responses into one vector.
                            # This used to start from np.empty(1), whose single
                            # uninitialised element leaked into sum/mean/var.
                            channels = [np.ravel(ndi.convolve(img[:, :, i], ffilter, mode='wrap'))
                                        for i in range(3)]
                            filtered = np.concatenate(channels).astype(float, copy=False)
                        else:
                            filtered = ndi.convolve(img, ffilter, mode='wrap')
                        temp.extend([filtered.sum(), filtered.mean(), filtered.var()])
                    filteredData[dataset][className][leaf][ID] = temp
            dis.update('filters applied to: ' + dataset + ', ' + className)

    with open(data_path, "wb") as cache_file:
        pickle.dump(filteredData, cache_file)
    return filteredData

In [12]:
# prepare data for machine learning
def prepML(data, data_path, code=''):
    """Flatten the nested image/feature dictionary into train/test arrays, cached on disk.

    Parameters
    ----------
    data : dict
        Nested as data[dataset][className][leaf][ID] -> sample. The 'train'
        dataset feeds the *_train outputs; every other dataset key feeds the
        *_test outputs.
    data_path : str
        Pickle cache. If it exists it is loaded and returned immediately;
        otherwise the result is written to it (pickle protocol 4).
    code : str
        'wholeImage' -- samples are image arrays stored as-is.
        'VGG'        -- samples are image arrays passed through keras'
                        vgg16 preprocess_input; labels are one-hot encoded.
        anything else -- samples are flat feature sequences.

    Returns
    -------
    dict with 'X_train', 'Y_train', 'L_train', 'X_test', 'Y_test', 'L_test'
    (samples, class names, leaf numbers) and, in feature mode only,
    'ID_train' / 'ID_test'. X_* are passed through standardize().

    Raises
    ------
    ValueError if data contains no samples.
    """
    # check if file exists already
    # NOTE: pickle.load can execute code embedded in the file; only use
    # caches this notebook wrote itself.
    if path.exists(data_path):
        with open(data_path, "rb") as cache_file:
            return pickle.load(cache_file)

    samples = list(_prepML_samples(data))
    if not samples:
        raise ValueError('prepML: data contains no samples')
    counts = {'train': 0, 'test': 0}
    for sample in samples:
        counts[sample[0]] += 1

    imageMode = code in ('wholeImage', 'VGG')
    if code == 'VGG':
        from keras.applications.vgg16 import preprocess_input
        from sklearn.preprocessing import LabelEncoder

    # per-sample shape: the image shape, or the feature count
    firstSample = samples[0][4]
    rowShape = list(firstSample.shape) if imageMode else [len(firstSample)]

    # create ml data
    mlData = {}
    for split in ('train', 'test'):
        n = counts[split]
        mlData['X_' + split] = np.zeros([n] + rowShape, dtype=float)
        mlData['Y_' + split] = [None] * n
        mlData['L_' + split] = [None] * n
        if not imageMode:
            # sized per split; ID_test used to be sized with the train count,
            # which overflowed whenever the test set was the larger one
            mlData['ID_' + split] = [None] * n

    nextRow = {'train': 0, 'test': 0}
    for split, className, leaf, ID, sample in samples:
        row = nextRow[split]
        nextRow[split] += 1
        if code == 'VGG':
            mlData['X_' + split][row] = preprocess_input(sample)
        elif imageMode:
            mlData['X_' + split][row] = sample
        else:
            mlData['X_' + split][row, :len(sample)] = sample
            mlData['ID_' + split][row] = ID
        mlData['Y_' + split][row] = className
        mlData['L_' + split][row] = leaf

    if code == 'VGG':
        # One-hot encode against the classes seen in training. np.eye rows
        # give the same dense float matrix as OneHotEncoder on the integer
        # codes, without depending on its `sparse` keyword.
        label_encoder = LabelEncoder()
        trainCodes = np.asarray(label_encoder.fit_transform(mlData['Y_train']), dtype=int)
        identity = np.eye(len(label_encoder.classes_))
        mlData['Y_train'] = identity[trainCodes]
        testCodes = np.asarray(label_encoder.transform(mlData['Y_test']), dtype=int)
        mlData['Y_test'] = identity[testCodes]

    # NOTE(review): in the image modes X_* is 4-D and standardize() is called
    # without wholeImage=True -- confirm sklearn's scale accepts that input,
    # or whether wholeImage=True was intended here.
    # NOTE(review): train and test are standardized independently of each other.
    mlData['X_train'] = standardize(mlData['X_train'])
    mlData['X_test'] = standardize(mlData['X_test'])

    with open(data_path, "wb") as cache_file:
        pickle.dump(mlData, cache_file, protocol=4)
    return mlData


def _prepML_samples(data):
    """Yield (split, className, leaf, ID, sample) for every sample in data, in dictionary order.

    split is 'train' for the 'train' dataset and 'test' for any other key.
    """
    for dataset in data:
        split = 'train' if dataset == 'train' else 'test'
        for className in data[dataset]:
            for leaf in data[dataset][className]:
                for ID in data[dataset][className][leaf]:
                    yield split, className, leaf, ID, data[dataset][className][leaf][ID]

In [13]:
# plots the results returned from crossValidation_class()
# @results are results returned from crossValidation_class()
# @mlas are list of mlas used in results to output
def plotResults_class(results, mlas):
    """Plot the per-run accuracies of each MLA as a column of green dashes.

    Parameters
    ----------
    results : dict
        results[mla] is a sequence of accuracies as fractions (they are
        multiplied by 100 for display).
    mlas : list of str
        Keys of results to plot, left to right.

    The y-axis spans 70-101 %. An MLA whose best accuracy does not exceed 70 %
    is drawn as a single red triangle at the bottom instead.
    """
    # each MLA owns this many x positions: one labelled tick plus blank spacers
    slotsPerMla = 4
    x_labels = []
    for mla in mlas:
        x_labels.extend([mla] + [' '] * (slotsPerMla - 1))
    # one tick position per label; the tick list used to hold 5 positions per
    # MLA against 4 labels, a length mismatch xticks() does not accept
    x_nums = list(range(slotsPerMla * len(mlas)))
    fig = plt.figure(figsize=(7.5,5))
    plt.xticks(x_nums, x_labels, rotation='vertical', fontsize=20)
    plt.yticks([x for x in range(70, 101, 5)], fontsize=20)
    plt.tick_params(axis='x', length=0)
    plt.ylabel('% Classification Accuracy', fontsize=20)
    plt.title('5-Fold Cross-Validation', fontsize=20)
    plt.grid(axis='y', linewidth=0.4)
    plt.ylim([70, 101])
    offset = 0
    for mla in mlas:
        x = [offset for _ in range(len(results[mla]))]
        y = 100. * np.array(results[mla])
        if int(max(y)) <= 70:
            # everything is below the visible range: flag it at the bottom
            plt.scatter(offset+0, 71, color='red', marker='v')
        else:
            plt.scatter(x, y, color='green', marker='_', s=1000)
        plt.axvline(x=offset, color='grey', linewidth=0.3, alpha=0.3)
        offset += slotsPerMla
    fig.text(0, -.20, 'Illustrates distribution of accuracies for each MLA.\nResults from 100 random runs.\nA red triangle indicates results below 70% accuracy'
             , fontsize=20)

In [14]:
# cross validates by taking entire leaves out each fold
from random import randint
from sklearn.metrics import accuracy_score
def crossValidation(mla, X, Y, L, nFolds=3):
    """Cross-validate mla while holding out whole leaves in every fold.

    Parameters
    ----------
    mla : estimator with fit(X, Y) and predict(X)
    X : array, one row per sample
    Y : sequence of labels, one per row of X
    L : sequence of leaf identifiers, one per row of X; rows sharing an
        identifier are always held out together
    nFolds : int
        Number of folds (default 3). Every fold but the last draws
        len(set(L)) // nFolds leaves at random without replacement; the last
        fold takes whatever leaves remain.

    Returns
    -------
    accuracy_score over the held-out predictions of all folds combined.

    Raises
    ------
    ValueError if nFolds < 1.
    """
    if nFolds < 1:
        raise ValueError('nFolds must be at least 1')
    leafs = list(set(L))
    Y = np.array(Y)
    nPer = int(len(leafs) / nFolds)
    # per-fold pieces are joined once at the end; seeding the accumulators
    # with [] made every concatenate promote the labels against an empty
    # float array
    predicted = []
    expected = []
    for f in range(nFolds):
        if f < nFolds - 1:
            fLeafs = [leafs.pop(randint(0, len(leafs) - 1)) for _ in range(nPer)]
        else:
            fLeafs = leafs
        held = set(fLeafs)
        # integer dtype keeps an empty fold usable as an index array
        fRows = np.array([idx for idx, l in enumerate(L) if l in held], dtype=int)

        mla.fit(np.delete(X, fRows, 0), np.delete(Y, fRows, 0))
        predicted.append(np.asarray(mla.predict(np.take(X, fRows, 0))))
        expected.append(np.take(Y, fRows, 0))
    return accuracy_score(np.concatenate(expected), np.concatenate(predicted))

In [15]:
# runs cross validation of classification MLAs
# @name is name of MLA (see if statements below)
# @X is matrix of training features
# @y are labels
# @nIters is the number of times to re-run cross-validation
# @nFolds is number of folds in cross-validation
# @layers is a list of layer sizes to use for MLP
# @disp will display results
# returns list with average accuracy for each iteration
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
def crossValidation_class(name, X, Y, L, disp=True, nIters=100):
    """Run crossValidation repeatedly for one kind of classifier.

    Parameters
    ----------
    name : str
        Selects the classifier by substring, checked in this order:
        'DTr', 'LRe', 'NBa', 'RFo', 'ETr', 'SVM', 'KNN', 'MLP'.
        If none matches, one unseeded DecisionTreeClassifier is reused for
        every iteration.
    X, Y, L :
        Features, labels and leaf identifiers, forwarded to crossValidation.
    disp : bool
        When True, progress is shown through an updating display handle.
    nIters : int
        Number of cross-validation runs (default 100). The iteration index
        is used as random_state for the classifiers that accept one.

    Returns
    -------
    list of nIters accuracies, one per run, as returned by crossValidation.
    """
    # classifier factories keyed by name tag; the argument is the iteration
    # index, used as the seed where the estimator takes one
    factories = (
        ('DTr', lambda seed: DecisionTreeClassifier(random_state = seed)),
        ('LRe', lambda seed: LogisticRegression()),
        ('NBa', lambda seed: GaussianNB()),
        ('RFo', lambda seed: RandomForestClassifier(random_state = seed, n_estimators=100)),
        ('ETr', lambda seed: ExtraTreesClassifier(random_state = seed, n_estimators=100)),
        ('SVM', lambda seed: SVC(random_state = seed, gamma='auto', probability=True)),
        ('KNN', lambda seed: KNeighborsClassifier(n_neighbors=2)),
        ('MLP', lambda seed: MLPClassifier(random_state = seed, hidden_layer_sizes=[32], max_iter=10000)),
    )

    # fallback Machine Learning Algorithm (MLA) object for unrecognised names
    mla = DecisionTreeClassifier()

    # keep track of results
    means = [None] * nIters

    # get results
    if disp:
        dis = display('Training ML: ' + name, display_id=True)
    for i in range(nIters):
        if disp:
            dis.update('Training ML: ' + name + f' {100. * i / nIters:.0f}%')
        # create MLA with new random seed
        for tag, build in factories:
            if tag in name:
                mla = build(i)
                break

        means[i] = crossValidation(mla, X, Y, L)

    if disp:
        # accuracies are fractions; scale so the value matches the '%' label
        average = 100. * sum(means) / len(means) if means else float('nan')
        dis.update('Trained ML: ' + name + ' with ' + str(average) + '% average accuracy')

    return means

In [16]:
# runs cross-validation multiple times for all data and mlas
import warnings
def mla_class_results(mlas, X, Y, L, results_path, disp=True):
    """Collect crossValidation_class results for every MLA, cached on disk.

    Parameters
    ----------
    mlas : iterable of str
        Classifier names understood by crossValidation_class.
    X, Y, L :
        Features, labels and leaf identifiers, forwarded unchanged.
    results_path : str
        Pickle cache. If it exists it is loaded instead of re-running;
        otherwise the fresh results are written to it.
    disp : bool
        Forwarded to crossValidation_class.

    Returns
    -------
    dict mapping each MLA name to the list returned by crossValidation_class.

    Side effect: when results are recomputed, all warnings are silenced for
    the rest of the session via warnings.filterwarnings('ignore').
    """
    # check if file exists already
    # NOTE: pickle.load can execute code embedded in the file; only use
    # caches this notebook wrote itself.
    if path.exists(results_path):
        with open(results_path, "rb") as cache_file:
            results = pickle.load(cache_file)

    # otherwise rerun results
    else:
        # ignore warnings that show when models do not converge
        warnings.filterwarnings('ignore')

        results = {}
        for mla in mlas:
            results[mla] = crossValidation_class(mla, X, Y, L, disp)

        with open(results_path, "wb") as cache_file:
            pickle.dump(results, cache_file)

    return results

In [24]:
from PIL import Image
from resizeimage import resizeimage
def resizeImgs(data, width, data_path, color=False): 
    """Resize every image to width x width with resizeimage.resize_cover, cached on disk.

    Parameters
    ----------
    data : dict
        Nested as data[dataset][className][leaf][ID] -> image array.
    width : int
        Side length of the square output.
    data_path : str
        Pickle cache. If it exists it is loaded and returned immediately;
        otherwise the resized dictionary is written to it.
    color : bool
        When True, channels 0..2 are resized one at a time into a
        (width, width, 3) float array; when False the image is resized as a
        single plane.

    Returns
    -------
    dict with the same nesting as data holding the resized arrays.
    """
    # check if file exists already
    # NOTE: pickle.load can execute code embedded in the file; only use
    # caches this notebook wrote itself.
    if path.exists(data_path):
        with open(data_path, "rb") as cache_file:
            return pickle.load(cache_file)

    resized = {}
    for dataset in data:
        resized[dataset] = {}
        for className in data[dataset]:
            dis = display('resizing: ' + dataset + ', ' + className, display_id=True)
            resized[dataset][className] = {}
            leaves = data[dataset][className]
            nLeaves = len(leaves)
            for idx, leaf in enumerate(leaves):
                dis.update('resizing: ' + dataset + ', ' + className
                           + f' {100. * float(idx) / float(nLeaves):.0f}' + "%")
                resized[dataset][className][leaf] = {}
                for ID in leaves[leaf]:
                    source = leaves[leaf][ID]
                    if color:
                        target = np.zeros((width, width, 3), dtype=float)
                        for i in range(3):
                            img = Image.fromarray(source[:,:,i])
                            target[:,:,i] = np.array(resizeimage.resize_cover(img, [width, width], validate=False))
                    else:
                        img = Image.fromarray(source)
                        target = np.array(resizeimage.resize_cover(img, [width, width], validate=False))
                    resized[dataset][className][leaf][ID] = target
            dis.update('resized: ' + dataset + ', ' + className)

    with open(data_path, "wb") as cache_file:
        pickle.dump(resized, cache_file)
    return resized

In [27]:
def getGT(ID, data):
    """Look up the class name (ground truth) that owns image ID.

    data is nested as data[dataset][className][leaf][ID]. The datasets,
    classes and leaves are scanned in dictionary order and the class of the
    first matching ID is returned; the string 'null' is returned when no
    image has that ID.
    """
    for classes in data.values():
        for className, leaves in classes.items():
            for images in leaves.values():
                if any(ID2 == ID for ID2 in images):
                    return className
    return 'null'

In [51]:
from collections import Counter
def hardVoter(ensemble_votes, data, nVoters=4):
    """Combine per-model predictions by majority vote.

    Parameters
    ----------
    ensemble_votes : dict
        ensemble_votes[modelKey][ID] -> predicted class for image ID.
    data : dict
        Nested image dictionary handed to getGT to find each ID's true class.
    nVoters : int
        Only IDs that received exactly this many votes are aggregated
        (default 4, the value that used to be hard-coded). Pass
        len(ensemble_votes) for ensembles of another size.

    Prints each model's own accuracy, and one line per aggregated ID.

    Returns
    -------
    (preds, gts, missed): majority predictions, matching ground truths, and
    the IDs whose majority prediction was wrong. Ties go to the class that
    was voted for first.
    """
    agg_votes = {}
    agg_GT = {}
    for key in ensemble_votes:
        local_votes = []
        local_GT = []
        for ID in ensemble_votes[key]:
            # getGT scans the whole dataset; reuse the answer for IDs that an
            # earlier model already voted on
            gt = agg_GT[ID] if ID in agg_GT else getGT(ID, data)
            if gt == 'null':
                print('NULL')
            pred = ensemble_votes[key][ID]
            local_votes.append(pred)
            local_GT.append(gt)
            if ID not in agg_votes:
                agg_votes[ID] = []
                agg_GT[ID] = gt
            agg_votes[ID].append(pred)
        print(key, accuracy_score(local_votes, local_GT))
    preds = []
    gts = []
    missed = []
    for agg in agg_votes:
        # skip images that not every expected voter predicted
        if len(agg_votes[agg]) != nVoters:
            continue
        most_common, num_most_common = Counter(agg_votes[agg]).most_common(1)[0]
        print(agg, agg_GT[agg], most_common, agg_votes[agg])
        preds.append(most_common)
        gts.append(agg_GT[agg])
        if most_common != agg_GT[agg]:
            missed.append(agg)
    return preds, gts, missed

In [50]:
from math import ceil
def viewIDImages(data, missed, color=True):
    """Draw the image behind every ID in missed on a near-square grid.

    data is nested as data[dataset][className][leafNumber][ID]. Each ID gets
    its own grid cell; every leaf that contains the ID draws its image into
    that cell and labels it '<className>_<ID>'. With color=False the gray
    colormap is used.
    """
    total = len(missed)
    gridSide = ceil(sqrt(total))
    # only the grayscale rendering passes an explicit colormap
    imshowOptions = {} if color else {'cmap': plt.cm.gray}
    plt.figure(figsize=(10,10))
    for cell, ID in enumerate(missed, start=1):
        plt.subplot(gridSide, gridSide, cell)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        for classes in data.values():
            for className, leaves in classes.items():
                for images in leaves.values():
                    if any(ID2 == ID for ID2 in images):
                        plt.imshow(images[ID], **imshowOptions)
                        plt.xlabel(className + '_' + str(ID))
    plt.show()