In this notebook, we evaluate the major machine learning methods that predate neural networks on the centered images. We do this to establish a performance baseline for any classifier that is developed later.

In [1]:
import numpy as np
from scipy import *
import os
import h5py
from keras.utils import np_utils
import matplotlib.pyplot as plt
import pickle 
from skimage.transform import rescale
from keras.models import model_from_json
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV

Using TensorFlow backend.


In [2]:
def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in binary mode inside a `with` block, so the handle
    is closed even when unpickling raises.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing. Only load .dat files that you produced yourself or
    # otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training and test splits consumed by the models below.
train_x = load_pickle("train_x.dat")
train_y = load_pickle("train_y.dat")

test_x = load_pickle("test_x.dat")
test_y = load_pickle("test_y.dat")

# presumably the unprocessed counterparts of train_x / test_x -- TODO confirm
raw_train_x = load_pickle("raw_train_x.dat")
raw_test_x = load_pickle("raw_test_x.dat")

In [3]:
##### HOG Images #####

In [None]:
# Hyperparameter search space for the random forest classifier

# Forest sizes to try: ten evenly spaced tree counts from 200 to 2000
n_estimators = [int(count) for count in np.linspace(200, 2000, 10)]

# Candidate settings for how many features each split may consider
# NOTE(review): newer scikit-learn releases deprecate 'auto' for
# max_features -- confirm that the installed version still accepts it.
max_features = ['auto', 'sqrt']

# Depth limits from 10 to 110 in eleven even steps, plus None as a final option
max_depth = [int(depth) for depth in np.linspace(10, 110, 11)] + [None]

# Smallest number of samples a node needs before it may be split
min_samples_split = [2, 5, 10]

# Smallest number of samples allowed in a leaf node
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Collect every candidate list under its RandomForestClassifier parameter name
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Randomized hyperparameter search for the random forest classifier

# Base model with default settings; the search tunes it over random_grid
rf = RandomForestClassifier()

# Search settings: try 20 sampled parameter combinations, score each with
# 3-fold cross validation, fix the seed so the sampling is repeatable,
# and use all available cores (n_jobs = -1)
search_settings = dict(
    n_iter=20,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    **search_settings
)

# Run the search on the training split
rf_random.fit(train_x, train_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 467.3min


In [None]:
# Display the hyperparameter combination selected by the randomized search
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return the classification accuracy of `model` on a test set.

    The previous implementation computed a mean absolute percentage error,
    which is a regression metric: it divides by the labels (undefined for a
    class label of 0) and treats the numeric distance between class ids as
    meaningful. Accuracy is now the share of samples whose prediction
    matches the label exactly.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Inputs passed unchanged to ``model.predict``.
    test_labels : array-like
        True labels, one entry (or one row) per sample.

    Returns
    -------
    float
        Accuracy as a percentage in the range 0-100.

    Raises
    ------
    ValueError
        If the test set is empty or the predictions and labels do not have
        the same shape.
    """
    predictions = np.atleast_1d(np.asarray(model.predict(test_features)))
    labels = np.atleast_1d(np.asarray(test_labels))

    # Flatten (n, 1) column vectors so they compare element-wise with flat
    # arrays instead of broadcasting to an (n, n) grid.
    if predictions.ndim == 2 and predictions.shape[1] == 1:
        predictions = predictions.ravel()
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    if predictions.shape != labels.shape:
        raise ValueError(
            'predictions have shape {} but labels have shape {}'.format(
                predictions.shape, labels.shape))
    if labels.size == 0:
        raise ValueError('cannot evaluate on an empty test set')

    correct = predictions == labels
    if correct.ndim > 1:
        # Multi-column labels (e.g. one row of indicators per sample): a
        # sample counts as correct only when every column matches.
        correct = correct.reshape(len(correct), -1).all(axis=1)

    n_samples = len(correct)
    n_wrong = int(n_samples - np.count_nonzero(correct))
    accuracy = 100 * np.mean(correct)
    print('Model Performance')
    print('Misclassified: {} of {} samples.'.format(n_wrong, n_samples))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Baseline: a small random forest classifier with fixed settings, used as
# the reference point for the tuned model. RandomForestClassifier replaces
# the previously referenced RandomForestRegressor, which this notebook
# never imports and which is not the estimator type being tuned.
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(train_x, train_y)
# Score on the held-out split: test features paired with test labels.
base_accuracy = evaluate(base_model, test_x, test_y)


# Best estimator found by the randomized search, scored on the same
# held-out split so the two accuracies are directly comparable.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_x, test_y)