In [1]:
import glob

import logging

import numpy as np

import pandas as pd

import pickle

import matplotlib.pyplot as plt

from scipy import ndimage as ndi

from concurrent.futures import ProcessPoolExecutor

from skimage import measure
from skimage.measure import moments_hu, shannon_entropy
from skimage.measure import compare_ssim as ssim
from skimage.feature import canny, corner_harris, corner_peaks, peak_local_max, daisy, blob_doh, shape_index, hog, hessian_matrix_det, hessian_matrix, structure_tensor
from skimage.filters import rank, threshold_otsu, frangi, hessian, roberts, sobel, sobel_h, sobel_v, threshold_sauvola, gabor
from skimage.morphology import disk
from skimage.color import rgb2gray
from skimage.transform import hough_line, hough_line_peaks

from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics

To featurize each image, we first collect the paths to all images contained in each image category using .glob( ).

The path in .glob( ) should reflect the location of the image directory '50_categories'.

It is not necessary to run this cell, as the featurized image data has been saved and can be loaded.

In [None]:
#gather the path of every file found one level below the '50_categories' directory, in sorted order
paths = glob.glob('/Users/danslaughter/Desktop/AY250/hw6/50_categories/*/*')
paths.sort()

Next we define 3 functions:

1. im_x_corr: this function computes the cross correlation between the RGB channels of a given image
2. feature_func: this function computes a set of features of a given image
3. feature_func_set2: this function computes a second set of different features of a given image

The cells containing these 3 functions should be run, as the functions will be necessary to featurize the validation images.

In [None]:
def im_x_corr(im1, im2, im3):
    """Compute the pairwise cross correlations between three 2-D arrays via the FFT.

    Parameters
    ----------
    im1, im2, im3 : array_like
        Arrays accepted by ``np.fft.fft2``; they are multiplied element-wise
        in Fourier space, so they must share a shape.

    Returns
    -------
    list
        Three complex arrays, in the order (im1, im2), (im1, im3), (im2, im3),
        each being the fft-shifted inverse FFT of ``fft2(a) * conj(fft2(b))``.
    """
    #transform each channel exactly once; every transform is reused by two of the three products
    fft_1 = np.fft.fft2(im1)
    fft_2 = np.fft.fft2(im2)
    fft_3 = np.fft.fft2(im3)

    #compute the 3 products of the fft of each color channel
    products = [fft_1 * fft_2.conj(), fft_1 * fft_3.conj(), fft_2 * fft_3.conj()]

    #return the shifted ifft of the 3 products
    return [np.fft.fftshift(np.fft.ifft2(product)) for product in products]

In [None]:
def feature_func(img):
    """Compute the first set of features for a single image file.

    Parameters
    ----------
    img : str
        Path to the image file. The label is taken from the last '/'-separated
        component of the path: '.jpg' and underscores are removed and only the
        alphabetic characters are kept.

    Returns
    -------
    list or None
        52 feature values followed by the label (53 entries), in the order of
        the return statement below. ``None`` is returned when the array read
        from the file is not 3-dimensional (i.e. the image is not RGB), so
        callers can skip the image.
    """
    #get image label from image path
    file_name = img.split('/')[-1].replace('.jpg', '').replace('_', '')
    label = ''.join(x for x in file_name if x.isalpha())

    #read in color image
    im = plt.imread(img)

    #images that are not RGB are skipped; this check runs before rgb2gray so that
    #a non-RGB array is never handed to the color conversion
    if len(np.shape(im)) != 3:
        return None

    #create grayscale image
    im_gray = rgb2gray(im)

    im_R = im[:,:,0]
    im_G = im[:,:,1]
    im_B = im[:,:,2]

    #feature: mean value of RGB channels and grayscale image
    mean_R = np.mean(im_R)
    mean_G = np.mean(im_G)
    mean_B = np.mean(im_B)
    mean_gray = np.mean(im_gray)

    #feature: variance of RGB channels and grayscale image
    var_R = np.var(im_R)
    var_G = np.var(im_G)
    var_B = np.var(im_B)
    var_gray = np.var(im_gray)

    #feature: mean and variance of cross correlation between RGB channels
    cc_RG, cc_RB, cc_GB = im_x_corr(im_R, im_G, im_B)

    xcorr_RG_mean = np.mean(np.abs(cc_RG))
    xcorr_RB_mean = np.mean(np.abs(cc_RB))
    xcorr_GB_mean = np.mean(np.abs(cc_GB))

    xcorr_RG_var = np.var(np.abs(cc_RG))
    xcorr_RB_var = np.var(np.abs(cc_RB))
    xcorr_GB_var = np.var(np.abs(cc_GB))

    #feature: structural similarity index between RGB channels
    #(data_range is taken from the second channel of each pair only)
    ssim_RG = ssim(im_R, im_G, data_range = im_G.max() - im_G.min())
    ssim_RB = ssim(im_R, im_B, data_range = im_B.max() - im_B.min())
    ssim_GB = ssim(im_G, im_B, data_range = im_B.max() - im_B.min())

    #feature: number of corners in the image
    corner_count = len(corner_peaks(corner_harris(im_gray), min_distance=4))

    #feature: number of extrema in the image
    #(minimum peak separation is 1/8 of the number of image rows)
    size_filt = int(len(im_gray)/8)
    extrema_count = len(peak_local_max(im_gray, min_distance = size_filt))

    #feature: number of contours at the mean of the image
    contours_count = len(measure.find_contours(im_gray, mean_gray))

    #feature: mean and variance of entropy of the image
    #NOTE(review): if rgb2gray returns floats in [0, 1] (as scikit-image documents),
    #this uint8 cast truncates almost every pixel to 0, making the local entropy
    #nearly constant - confirm. Left unchanged so the feature values stay
    #compatible with the saved feature files and the pickled classifier.
    disk_size = int(len(im_gray)/8)
    shift = (im_gray).astype(np.uint8)
    im_entropy = rank.entropy(shift, disk(disk_size))
    im_entropy_mean = np.mean(im_entropy)
    im_entropy_var = np.var(im_entropy)

    #feature: shannon entropy of the image
    shannonEntropy = shannon_entropy(im_gray)

    #feature: mean and variance of thresholded image
    thresh = threshold_otsu(im_gray)
    binary = im_gray > thresh
    binary_mean = np.mean(binary)
    binary_var = np.var(binary)

    #feature: mean and variance of frangi filtered image
    im_frangi = frangi(im_gray)
    frangi_mean = np.mean(im_frangi)
    frangi_var = np.var(im_frangi)

    #feature: mean and variance of hessian filtered image
    im_hessian = hessian(im_gray)
    hessian_mean = np.mean(im_hessian)
    hessian_var = np.var(im_hessian)

    #feature: mean and variance of canny edge filtered image
    edge_canny = canny(im_gray, sigma=3)
    edge_canny_mean = np.mean(edge_canny)
    edge_canny_var = np.var(edge_canny)

    #feature: number of "blobs" in the image
    blobs_count = len(blob_doh(im_gray, max_sigma=30, threshold=.01)[:,0])

    #feature: number of straight lines in the image
    h, theta, d = hough_line(im_gray)
    peaks = hough_line_peaks(h, theta, d)
    peak_count = np.shape(peaks)[1]

    #feature: mean and variance of sobel edge filtered image
    edge_sobel = sobel(im_gray)
    edge_sobel_mean = np.mean(edge_sobel)
    edge_sobel_var = np.var(edge_sobel)

    #feature: mean and variance of vertical/horizontal sobel edge filtered image
    edge_sobel_h = sobel_h(im_gray)
    edge_sobel_v = sobel_v(im_gray)

    edge_sobel_h_mean = np.mean(edge_sobel_h)
    edge_sobel_v_mean = np.mean(edge_sobel_v)

    edge_sobel_h_var = np.var(edge_sobel_h)
    edge_sobel_v_var = np.var(edge_sobel_v)

    #feature: number of DAISY descriptors of the image
    descs = daisy(im_gray, step = int(len(im_gray)/20), radius = int(len(im_gray)/8), rings = 3,
                  histograms = 8, orientations = 8)
    descs_num = descs.shape[0] * descs.shape[1]

    #feature: mean and variance of 6 gabor filters applied to image
    #(2 frequencies (0.1, 0.5), 3 theta orientations (0, 45, 90))
    #this computation is particularly slow, so it has been parallelized using concurrent.futures.
    iters_im = [im_gray, im_gray, im_gray]
    iters_low_freq = [0.1, 0.1, 0.1]
    iters_high_freq = [0.5, 0.5, 0.5]
    iters_theta = [0, np.pi / 4, np.pi / 2]

    #pool of 4 worker processes; the with-block shuts the pool down even if a worker raises
    with ProcessPoolExecutor(4) as executor:
        filt_R_C_low = list(executor.map(gabor, iters_im, iters_low_freq, iters_theta))
        filt_R_C_high = list(executor.map(gabor, iters_im, iters_high_freq, iters_theta))

    #only element [0] of each gabor result is used (named "real" by the variables below)
    filt_real_1_0_mean = np.mean(filt_R_C_low[0][0])
    filt_real_1_1_mean = np.mean(filt_R_C_low[1][0])
    filt_real_1_2_mean = np.mean(filt_R_C_low[2][0])

    filt_real_1_0_var = np.var(filt_R_C_low[0][0])
    filt_real_1_1_var = np.var(filt_R_C_low[1][0])
    filt_real_1_2_var = np.var(filt_R_C_low[2][0])

    filt_real_5_0_mean = np.mean(filt_R_C_high[0][0])
    filt_real_5_1_mean = np.mean(filt_R_C_high[1][0])
    filt_real_5_2_mean = np.mean(filt_R_C_high[2][0])

    filt_real_5_0_var = np.var(filt_R_C_high[0][0])
    filt_real_5_1_var = np.var(filt_R_C_high[1][0])
    filt_real_5_2_var = np.var(filt_R_C_high[2][0])

    return [mean_R, mean_G, mean_B, mean_gray, var_R, var_G, var_B, var_gray, xcorr_RG_mean, xcorr_RB_mean,
            xcorr_GB_mean, xcorr_RG_var, xcorr_RB_var, xcorr_GB_var, ssim_RG, ssim_RB, ssim_GB, corner_count,
            extrema_count, contours_count, im_entropy_mean, im_entropy_var, shannonEntropy, binary_mean,
            binary_var, frangi_mean, frangi_var, hessian_mean, hessian_var, edge_canny_mean, edge_canny_var,
            blobs_count, peak_count, edge_sobel_mean, edge_sobel_var, edge_sobel_h_mean, edge_sobel_v_mean,
            edge_sobel_h_var, edge_sobel_v_var, descs_num, filt_real_1_0_mean, filt_real_1_1_mean, filt_real_1_2_mean,
            filt_real_1_0_var, filt_real_1_1_var, filt_real_1_2_var, filt_real_5_0_mean, filt_real_5_1_mean,
            filt_real_5_2_mean, filt_real_5_0_var, filt_real_5_1_var, filt_real_5_2_var, label]

In [None]:
def feature_func_set2(img):
    """Compute the second set of features for a single image file.

    Parameters
    ----------
    img : str
        Path to the image file. The label is taken from the last '/'-separated
        component of the path: '.jpg' and underscores are removed and only the
        alphabetic characters are kept.

    Returns
    -------
    list or None
        25 feature values followed by the label (26 entries), in the order of
        the return statement below. ``None`` is returned when the array read
        from the file is not 3-dimensional (i.e. the image is not RGB), so
        callers can skip the image.
    """
    #get image label from image path
    file_name = img.split('/')[-1].replace('.jpg', '').replace('_', '')
    label = ''.join(x for x in file_name if x.isalpha())

    #read in color image
    im = plt.imread(img)

    #images that are not RGB are skipped; this check runs before rgb2gray so that
    #a non-RGB array is never handed to the color conversion
    if len(np.shape(im)) != 3:
        return None

    #create grayscale image
    im_gray = rgb2gray(im)

    #feature: mean and variance of histogram of oriented gradients
    hist_grad = hog(im_gray, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1),
                    block_norm = 'L2-Hys')
    hist_grad_mean = np.mean(hist_grad)
    hist_grad_var = np.var(hist_grad)

    #feature: mean and variance of the image DAISY descriptors
    descs = daisy(im_gray, step = int(len(im_gray)/20), radius = int(len(im_gray)/8), rings = 3,
                  histograms = 8, orientations = 8)
    descs_mean = np.mean(descs)
    descs_var = np.var(descs)

    #features: mean and variance of the image hessian matrix determinant
    hess_det = hessian_matrix_det(im_gray, sigma=1)
    hess_det_mean = np.mean(hess_det)
    hess_det_var = np.var(hess_det)

    #feature: mean and variance of the 3 image structure tensor elements
    A00, A01, A11 = structure_tensor(im_gray, sigma=1, mode='constant', cval=0)
    A00_mean = np.mean(A00)
    A00_var = np.var(A00)
    A01_mean = np.mean(A01)
    A01_var = np.var(A01)
    A11_mean = np.mean(A11)
    A11_var = np.var(A11)

    #feature: mean and variance of the 3 image hessian matrix elements
    H00, H01, H11 = hessian_matrix(im_gray, sigma=1, mode='constant', cval=0, order='rc')
    H00_mean = np.mean(H00)
    H00_var = np.var(H00)
    H01_mean = np.mean(H01)
    H01_var = np.var(H01)
    H11_mean = np.mean(H11)
    H11_var = np.var(H11)

    #feature: the 7 hu moments of the image
    #NOTE(review): scikit-image documents moments_hu as taking normalized central
    #moments, not the image itself, so these values are presumably not true Hu
    #moments - confirm. Left unchanged so the feature values stay compatible with
    #the saved feature files and the pickled classifier.
    hu0, hu1, hu2, hu3, hu4, hu5, hu6 = moments_hu(im_gray)

    return [hist_grad_mean, hist_grad_var, descs_mean, descs_var, hess_det_mean, hess_det_var, A00_mean, A00_var,
            A01_mean, A01_var, A11_mean, A11_var, H00_mean, H00_var, H01_mean, H01_var, H11_mean, H11_var,
            hu0, hu1, hu2, hu3, hu4, hu5, hu6, label]

Next we create an empty pandas dataframe with columns named after the first set of features computed in feature_func( ), and another empty pandas dataframe with columns named after the second set of features computed in feature_func_set2( ). These dataframes are then populated with the features of each image by calling the two featurizing functions on each image path we collected earlier using .glob( ).

This cell does not need to be run, as the featurized data has been saved and can be loaded.

In [None]:
######### Note #########
# this cell takes ~8 hours to run and does not need to be run again
# the image features are saved and can be loaded in

#create two lists with the names of the image features contained in the two image feature sets
cols = ['mean_R', 'mean_G', 'mean_B', 'mean_gray', 'var_R', 'var_G', 'var_B', 'var_gray', 'xcorr_RG_mean', 'xcorr_RB_mean', 
        'xcorr_GB_mean', 'xcorr_RG_var', 'xcorr_RB_var', 'xcorr_GB_var', 'ssim_RG', 'ssim_RB', 'ssim_GB', 'corner_count', 
        'extrema_count', 'contours_count', 'im_entropy_mean', 'im_entropy_var', 'shannonEntropy', 'binary_mean', 
        'binary_var', 'frangi_mean', 'frangi_var', 'hessian_mean', 'hessian_var', 'edge_canny_mean', 'edge_canny_var', 
        'blobs_count', 'peak_count', 'edge_sobel_mean', 'edge_sobel_var', 'edge_sobel_h_mean', 'edge_sobel_v_mean', 
        'edge_sobel_h_var', 'edge_sobel_v_var', 'descs_num', 'filt_real_1_0_mean', 'filt_real_1_1_mean', 'filt_real_1_2_mean', 
        'filt_real_1_0_var', 'filt_real_1_1_var', 'filt_real_1_2_var', 'filt_real_5_0_mean', 'filt_real_5_1_mean', 
        'filt_real_5_2_mean', 'filt_real_5_0_var', 'filt_real_5_1_var', 'filt_real_5_2_var', 'label']

cols_set2 = ['hist_grad_mean', 'hist_grad_var', 'descs_mean', 'descs_var', 'hess_det_mean', 'hess_det_var', 'A00_mean', 'A00_var', 
        'A01_mean', 'A01_var', 'A11_mean', 'A11_var', 'H00_mean', 'H00_var', 'H01_mean', 'H01_var', 'H11_mean', 'H11_var', 
        'hu0', 'hu1', 'hu2', 'hu3', 'hu4', 'hu5', 'hu6', 'label']

#featurize every image path, collecting the rows in plain lists; the dataframes are
#built once at the end instead of being grown one row at a time
#images for which a featurizing function returns nothing (non-RGB images) are skipped,
#and each kept row remembers its position in paths so the two sets stay aligned
rows_set1, index_set1 = [], []
rows_set2, index_set2 = [], []
for index, path in enumerate(paths):
    features_set1 = feature_func(path)
    if features_set1 is not None and len(features_set1) == len(cols):
        rows_set1.append(features_set1)
        index_set1.append(index)
    features_set2 = feature_func_set2(path)
    if features_set2 is not None and len(features_set2) == len(cols_set2):
        rows_set2.append(features_set2)
        index_set2.append(index)

#create the two pandas dataframes containing the 2 sets of features of
#each image with columns named using cols and cols_set2
im_feature_DF = pd.DataFrame(rows_set1, index = index_set1, columns = cols)
im_feature_DF_set2 = pd.DataFrame(rows_set2, index = index_set2, columns = cols_set2)

#save the pandas dataframes as .csv files, so they can be loaded later
#NOTE(review): the cell that loads the features reads 'image_features_set1.csv' and
#'image_features_set_2.csv', which are not the names written here - confirm which
#names are intended before relying on a fresh run of this cell
im_feature_DF.to_csv('/Users/danslaughter/Desktop/image_features_set_1_.csv')
im_feature_DF_set2.to_csv('/Users/danslaughter/Desktop/image_features_set_2_.csv')

We can load in the two data sets containing the featurized images using .read_csv( ). These dataframes are then reformatted and concatenated to form a single dataframe containing all 77 features (52 from the first set and 25 from the second). In order to assign which images will be used for training and which for testing, we can simply add a column to the dataframe with a boolean value to distinguish between training and testing. This boolean is assigned by sampling a number from a uniform distribution between 0 and 1 and checking if the number is less than or equal to 0.5. This randomly assigns a ~50/50 split to the featurized images. The testing and training data is then further split into the image feature data and the image category label.

This cell does not need to be run, as an optimized random forest classifier has been pickled and can be loaded in to use on the validation images.

In [2]:
#seed for the train/test assignment, so the split is the same on every run
SPLIT_SEED = 0

#read in the .csv file containing the first set of features of all the images to a dataframe and reformat it
#the path in .read_csv() should reflect the location of the first image feature dataset on the machine
#(the label column is dropped here because the second feature set carries the same label)
im_feature_DF_set1 = pd.read_csv('/Users/danslaughter/Desktop/image_features_set1.csv')
im_feature_DF_set1 = im_feature_DF_set1.drop(['Unnamed: 0', 'label'], axis = 1)

#read in the .csv file containing the second set of features of all the images to a dataframe and reformat it
#the path in .read_csv() should reflect the location of the second image feature dataset on the machine
im_feature_DF_set2 = pd.read_csv('/Users/danslaughter/Desktop/image_features_set_2.csv')
im_feature_DF_set2 = im_feature_DF_set2.drop(['Unnamed: 0'], axis = 1)

#the two sets are joined row by row, so they must describe the same number of images
assert len(im_feature_DF_set1) == len(im_feature_DF_set2), "feature sets have different numbers of rows"

#concatenate the two dataframes to make a single dataframe
im_feature_DF = pd.concat([im_feature_DF_set1, im_feature_DF_set2], axis = 1)

#assign a boolean value to a new column called 'is_train'
#boolean value generated by sampling a uniform distribution and checking if the value is <= 0.5
#a local, seeded generator is used so the global numpy random state is left untouched
split_rng = np.random.RandomState(SPLIT_SEED)
im_feature_DF['is_train'] = split_rng.uniform(0, 1, len(im_feature_DF)) <= 0.5

#split dataframe into testing and training data
im_train = im_feature_DF[im_feature_DF['is_train']]
im_test = im_feature_DF[~im_feature_DF['is_train']]

#split training data into raw data and truth value (image category)
X_im_train = im_train.drop(['is_train', 'label'], axis = 1)
Y_im_train = im_train['label']
print("training size: " + str(len(Y_im_train)))

#split testing data into raw data and truth value (image category)
X_im_test = im_test.drop(['is_train', 'label'], axis = 1)
Y_im_test = im_test['label']
print("testing size: " + str(len(Y_im_test)))

training size: 2121
testing size: 2089


Below we execute a .GridSearchCV( ) using 5-fold cross validation to locate an optimum random forest classifier. We vary 4 parameters: the number of estimators (number of trees in the forest), the maximum number of features to be considered when finding the optimum split, the criterion used to determine the optimum split, and the minimum number of samples required to split a node. Once the best parameter combination is found, an estimator with those parameters is refit on the whole training set. Some metrics on the model's cross validated performance are also displayed. This optimum random forest classifier is then pickled, so that it can be loaded later for use.

This cell does not need to be run, since a random forest classifier is saved and can be loaded in to use on the validation images.

In [3]:
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

#search grid: 3 forest sizes x 3 choices of max features per split x
#2 split criteria x 3 min sample splits
parameters = {
    'n_estimators': [100, 125, 150],
    'max_features': ['auto', 10, 12],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 4],
}
rf_tune = model_selection.GridSearchCV(estimator = RandomForestClassifier(), param_grid = parameters,
                                       cv = 5, n_jobs = -1, verbose = 1)

#run the 5-fold cross validated grid search on the training data
#.fit() returns the search object itself, so rf_opt and rf_tune are the same object
rf_tune.fit(X_im_train, Y_im_train)
rf_opt = rf_tune

#report the best score, the best estimator, and the mean test score of every candidate
print("Best zero-one score: " + str(rf_opt.best_score_) + "\n",
      "Optimal Model:\n" + str(rf_opt.best_estimator_) + "\n",
      "Cross Validation Results - Mean Test Scores:\n" + str(rf_opt.cv_results_['mean_test_score']),
      sep = "\n")

#pickle the fitted grid search (which wraps the optimum random forest classifier)
with open('/Users/danslaughter/Desktop/random_forest_opt.pickle', 'wb') as f:
    pickle.dump(rf_opt, f)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 11.1min finished


Best zero-one score: 0.314474304573

Optimal Model:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=12, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Cross Validation Results - Mean Test Scores:
[ 0.29891561  0.30221594  0.30221594  0.30174446  0.29891561  0.30740217
  0.30504479  0.29561528  0.3107025   0.30221594  0.30410184  0.3107025
  0.30551627  0.3088166   0.31164545  0.3088166   0.30834512  0.30645922
  0.30551627  0.3125884   0.3144743   0.30740217  0.30787364  0.30504479
  0.31164545  0.31164545  0.30787364  0.29184347  0.29090052  0.29608675
  0.2932579   0.30457331  0.3107025   0.29844413  0.30410184  0.31305988
  0.29608675  0.2951438   0.28524281  0.2942008

Next, the classifier is deployed on the test data and scored based on its zero-one loss and accuracy. The confusion matrix (a small portion of it) is also displayed. This random forest classifier seems to correctly classify test images ~31% of the time. Randomly guessing would have a success rate of 2%, so the random forest is ~15 times better than randomly guessing, but performs poorly compared to scores that could be expected from a convolutional neural network.

This cell does not need to be run.

In [4]:
#predict a category for every test image with the tuned random forest
pred_rf = rf_opt.predict(X_im_test)

#score the predictions against the true categories
rf_01 = metrics.zero_one_loss(y_true = Y_im_test, y_pred = pred_rf) # zero-one loss
rf_01_score = metrics.accuracy_score(y_true = Y_im_test, y_pred = pred_rf) # zero-one score
rf_confmat = metrics.confusion_matrix(y_true = Y_im_test, y_pred = pred_rf) # conf mat

#print the three metrics, one item per line
print("Zero-One Loss: " + str(rf_01),
      "Zero-One Score: " + str(rf_01_score),
      "Confusion Matrix:",
      "[i, j] is the # of objects truly in group i but predicted to be in group j",
      rf_confmat,
      sep = "\n")

Zero-One Loss: 0.687888942078
Zero-One Score: 0.312111057922
Confusion Matrix:
[i, j] is the # of objects truly in group i but predicted to be in group j
[[251   0   0 ...,   0   0   0]
 [  0   5   0 ...,   1   0   1]
 [  2   0   0 ...,   1   1   0]
 ..., 
 [  1   0   0 ...,   6   0   0]
 [  2   1   0 ...,   1   7   0]
 [  0   0   0 ...,   1   0  17]]


Below we display the image features sorted by their importances. The number of DAISY descriptors is the most important predictor in classifying an image, followed by the shannon entropy of the image and the mean of the H00 Hessian matrix element of the image.

This cell does not need to be run.

In [5]:
#(importance, feature name) pairs of the best estimator, most important first
sorted(zip(rf_opt.best_estimator_.feature_importances_, X_im_train.columns), reverse = True)

[(0.058580500682587126, 'descs_num'),
 (0.021384766978738375, 'shannonEntropy'),
 (0.01722454761311901, 'H00_mean'),
 (0.016638203755440555, 'hu3'),
 (0.016298386238601924, 'hessian_var'),
 (0.016102559968904317, 'hu4'),
 (0.016081986999648416, 'peak_count'),
 (0.016060960247314567, 'H11_mean'),
 (0.0158547160023868, 'hessian_mean'),
 (0.015623469597169046, 'ssim_RG'),
 (0.015511549275647069, 'hu5'),
 (0.01549280976654221, 'edge_sobel_h_mean'),
 (0.015257156522049004, 'ssim_GB'),
 (0.015192764000698519, 'descs_var'),
 (0.015147775347590162, 'ssim_RB'),
 (0.015044313431258012, 'filt_real_1_2_var'),
 (0.014877348791670176, 'contours_count'),
 (0.014801275303393334, 'filt_real_1_0_var'),
 (0.01441442969406545, 'hu0'),
 (0.014384054609466188, 'hu1'),
 (0.01424748126001244, 'edge_sobel_v_mean'),
 (0.014115114466849281, 'filt_real_1_1_var'),
 (0.014046178189347813, 'A01_mean'),
 (0.014023482866490571, 'mean_B'),
 (0.013937640608535971, 'extrema_count'),
 (0.013928880415925094, 'hist_grad_var

Last, we define a function, run_final_classifier( ), to be called on the validation image directory. This function takes as its argument the path to the directory containing the validation images. The path should end with the character '/', i.e. '/Users/me/Desktop/validation/'. The images contained in the directory are then featurized using the two featurizing functions defined earlier. Pandas dataframes are then populated with the resulting data. The optimum random forest classifier that was pickled is then loaded in and deployed on the featurized validation images. The predictions, along with the filenames, are then saved to a .txt file. 

This cell should be run after two changes are made. There are two paths that need to be edited to reflect the locations of the files on your machine: 
1. the location of the pickled random forest classifier
2. the location to save the output .txt file containing the predicted labels and image names

In [None]:
def run_final_classifier(dir_path,
                         model_path = '/Users/danslaughter/Desktop/random_forest_opt.pickle',
                         output_path = '/Users/danslaughter/Desktop/validation_prediction.txt'):
    """Featurize every image in a directory, classify it, and save the predictions.

    Parameters
    ----------
    dir_path : str
        Path to the directory containing the validation images. A trailing '/'
        is added if it is missing.
    model_path : str, optional
        Location of the pickled classifier. Defaults to the path that was
        previously hard-coded in this function.
    output_path : str, optional
        Location of the .txt file to write. Defaults to the path that was
        previously hard-coded in this function.

    Returns
    -------
    None
        The result is the text file written to ``output_path``: a header row,
        a separator row, then one "filename predicted_class" row per image
        that could be featurized. Images the featurizing functions reject
        (non-RGB images) are logged as warnings and left out of the file.
    """
    #load in pickled random forest classifier first, so a wrong path fails
    #before the slow featurization starts
    #NOTE: pickle.load can execute arbitrary code - only load a file you created or trust
    with open(model_path, 'rb') as f:
        random_forest_opt = pickle.load(f)

    #collect the paths to all images contained in the validation directory
    if dir_path and not dir_path.endswith('/'):
        dir_path = dir_path + '/'
    final_test_paths = sorted(glob.glob(dir_path + '*'))

    #create two lists with the names of the image features contained in the two image feature sets
    cols_set1 = ['mean_R', 'mean_G', 'mean_B', 'mean_gray', 'var_R', 'var_G', 'var_B', 'var_gray', 'xcorr_RG_mean', 
                 'xcorr_RB_mean', 'xcorr_GB_mean', 'xcorr_RG_var', 'xcorr_RB_var', 'xcorr_GB_var', 'ssim_RG', 
                 'ssim_RB', 'ssim_GB', 'corner_count', 'extrema_count', 'contours_count', 'im_entropy_mean', 
                 'im_entropy_var', 'shannonEntropy', 'binary_mean', 'binary_var', 'frangi_mean', 'frangi_var', 
                 'hessian_mean', 'hessian_var', 'edge_canny_mean', 'edge_canny_var', 'blobs_count', 'peak_count', 
                 'edge_sobel_mean', 'edge_sobel_var', 'edge_sobel_h_mean', 'edge_sobel_v_mean', 'edge_sobel_h_var', 
                 'edge_sobel_v_var', 'descs_num', 'filt_real_1_0_mean', 'filt_real_1_1_mean', 'filt_real_1_2_mean', 
                 'filt_real_1_0_var', 'filt_real_1_1_var', 'filt_real_1_2_var', 'filt_real_5_0_mean', 
                 'filt_real_5_1_mean', 'filt_real_5_2_mean', 'filt_real_5_0_var', 'filt_real_5_1_var', 
                 'filt_real_5_2_var', 'label']
    
    cols_set2 = ['hist_grad_mean', 'hist_grad_var', 'descs_mean', 'descs_var', 'hess_det_mean', 'hess_det_var', 
                 'A00_mean', 'A00_var', 'A01_mean', 'A01_var', 'A11_mean', 'A11_var', 'H00_mean', 'H00_var', 
                 'H01_mean', 'H01_var', 'H11_mean', 'H11_var', 'hu0', 'hu1', 'hu2', 'hu3', 'hu4', 'hu5', 'hu6', 
                 'label']

    #featurize each image; the file name is recorded only for images that were
    #actually featurized, so predictions and file names cannot drift out of step
    rows_set1, rows_set2, filenames = [], [], []
    for path in final_test_paths:
        features_set1 = feature_func(path)
        features_set2 = feature_func_set2(path)
        usable = (features_set1 is not None and len(features_set1) == len(cols_set1) and
                  features_set2 is not None and len(features_set2) == len(cols_set2))
        if not usable:
            logging.warning('skipping %s: it could not be featurized (not an RGB image?)', path)
            continue
        rows_set1.append(features_set1)
        rows_set2.append(features_set2)
        filenames.append(path.split('/')[-1])

    #build the two dataframes in one step each; the label of the first set is
    #dropped because the second set carries the same label
    im_feature_DF_set1 = pd.DataFrame(rows_set1, columns = cols_set1).drop(['label'], axis = 1)
    im_feature_DF_set2 = pd.DataFrame(rows_set2, columns = cols_set2)

    #concatenate the two dataframes to make a single dataframe
    im_feature_DF = pd.concat([im_feature_DF_set1, im_feature_DF_set2], axis = 1)

    X_validation_im_test = im_feature_DF.drop('label', axis = 1)

    #deploy the random forest on the featurized validation image data
    #(nothing to predict if no image could be featurized)
    if len(X_validation_im_test) > 0:
        predict_random_forest = random_forest_opt.predict(X_validation_im_test)
    else:
        predict_random_forest = []

    #pair each file name with its predicted class, in the order announced by the header row
    predictions = list(zip(filenames, predict_random_forest))

    formatted_predictions = [('filename', 'predicted_class')] + [('-'*25,'')] + predictions

    #save predicted labels and corresponding image file names as a .txt file
    np.savetxt(output_path, formatted_predictions, fmt="%s")

Below the final classifier can be called on the directory containing the validation images. The path in the function run_final_classifier( ) should reflect the location of the validation image directory on your machine. This cell may take some time to run.

In [None]:
#run the final classifier on the validation images
#change the path passed to run_final_classifier() to the location of the validation image
#directory on your machine; the function globs dir_path + '*', so keep the trailing '/'
run_final_classifier('/Users/danslaughter/Desktop/validation/')