Importing the dataset and its corresponding labels

In [1]:
import pickle
import numpy as np
# Load the pickled dataset and its labels. The files are opened with `with`
# so the handles are closed as soon as each load finishes.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only run this against files from a trusted source.
filename = 'data.sav'
with open(filename, 'rb') as data_file:
    loaded_data = pickle.load(data_file)

filename = 'label.sav'
with open(filename, 'rb') as label_file:
    loaded_label = pickle.load(label_file)

# Sanity check: both arrays should agree on the number of samples.
print(np.shape(loaded_data))
print(np.shape(loaded_label))

(50000, 224, 224)
(50000,)


Shuffling the dataset

In [2]:
# Shuffle samples and labels with ONE shared permutation so that every sample
# stays paired with its label. The permutation length is taken from the data
# instead of a hard-coded 50000, and a seeded local generator makes the
# resulting train/test split reproducible across kernel restarts.
SHUFFLE_SEED = 42
assert len(loaded_data) == len(loaded_label), "data and labels differ in length"
indices = np.random.RandomState(SHUFFLE_SEED).permutation(len(loaded_label))

data = loaded_data[indices]
label = loaded_label[indices]

Splitting the dataset. One fifth of the dataset, i.e. 10000 images, is allocated as test data, while the remaining images are used for training and validation.

In [3]:
# `m` is the index of the first test sample: the leading four fifths of the
# shuffled data are kept for training/validation, the tail becomes the test set.
m = len(label) * 4 // 5

test_x, test_y = data[m:], label[m:]

## Convolutional Neural Network

Training a convolutional neural network for 5 epochs with a mini-batch size of 64 using five-fold cross validation.

The architecture is as follows:
- One hidden convolutional layer with a kernel size of 3 and with 32 filters followed by a MaxPooling layer
- One hidden dense (fully connected) layer with 50 units
- One output layer
- The activation of the hidden layers is a Relu
- The activation of the output layer is a Softmax
- The loss function is a categorical cross-entropy function
- The optimizer of this model is Adam

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten

def createmodel():
    """Build and compile a fresh, untrained network.

    Layer stack: Conv1D (32 filters, kernel size 3, ReLU) -> MaxPooling1D ->
    Flatten -> Dense (50 units, ReLU) -> Dense (2 units, softmax).
    The model is compiled with categorical cross-entropy, the Adam optimizer
    and accuracy as the reported metric.

    Returns:
        The compiled keras ``Sequential`` model.
    """
    layers = [
        Conv1D(32, kernel_size=3, activation='relu', input_shape=(224, 224)),
        MaxPooling1D(),
        Flatten(),
        Dense(50, activation='relu'),
        Dense(2, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
from sklearn.model_selection import KFold
from keras.utils.np_utils import to_categorical

# Five-fold cross-validation over the training/validation portion of the data.
# A new network is trained per fold; the one with the highest validation
# accuracy is kept in `best_model` for the final test-set evaluation.
X = data[:m]
y = label[:m]
# One-hot encode once, pinning the class count to the 2 output units of the
# network so the target width never depends on which labels a fold contains.
y_onehot = to_categorical(y, num_classes=2)
kf = KFold(n_splits=5)
# Start below any attainable accuracy so the first fold is always recorded;
# this removes the need to build an untrained placeholder network.
best_accuracy = -1.0
best_loss = None
best_model = None

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_onehot[train_index], y_onehot[test_index]
    mod = createmodel()
    mod.fit(X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=5, batch_size=64, shuffle=True)
    loss, accuracy = mod.evaluate(X_test, y_test, verbose=0)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_loss = loss
        best_model = mod

print("Best accuracy: ", best_accuracy)
print("Best loss: ", best_loss)

TRAIN: [ 8000  8001  8002 ... 39997 39998 39999] TEST: [   0    1    2 ... 7997 7998 7999]
Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN: [    0     1     2 ... 39997 39998 39999] TEST: [ 8000  8001  8002 ... 15997 15998 15999]
Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN: [    0     1     2 ... 39997 39998 39999] TEST: [16000 16001 16002 ... 23997 23998 23999]
Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN: [    0     1     2 ... 39997 39998 39999] TEST: [24000 24001 24002 ... 31997 31998 31999]
Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
TRAIN: [    0     1     2 ... 31997 31998 31999] TEST: [32000 32001 32002 ... 39997 39998 39999]
Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Best accuracy:  0.508
Best

Testing the best model gotten from cross validation on the test data and reporting the accuracy and loss

In [6]:
# Score the model selected during cross-validation on the held-out test split.
test_y_onehot = to_categorical(test_y)
loss, accuracy = best_model.evaluate(test_x, test_y_onehot, verbose=0)
print("Accuracy = ", accuracy)
print("Loss = ", loss)

Accuracy =  0.5006
Loss =  8.04937689666748


## Random Forest Classifier

Using randomized search to tune the hyperparameters of a random forest classifier.

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint

def randomsearch(hyperparameter_grid, n_estimators=20, n_iter=20, cv=5, random_state=None):
    """Build an unfitted randomized hyperparameter search over a random forest.

    Args:
        hyperparameter_grid: Mapping of parameter name to a list of values or
            a scipy distribution; passed as ``param_distributions``.
        n_estimators: Number of trees in the forest (default 20).
        n_iter: Number of parameter settings sampled by the search (default 20).
        cv: Number of cross-validation folds (default 5).
        random_state: Optional seed forwarded to both the forest and the
            search; leave as ``None`` for the original unseeded behaviour.

    Returns:
        The configured ``RandomizedSearchCV`` instance; call ``.fit`` on it.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rs = RandomizedSearchCV(estimator=clf,
                            param_distributions=hyperparameter_grid,
                            n_iter=n_iter,
                            cv=cv,
                            random_state=random_state)
    return rs

# Search space sampled by the randomized search.
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Flatten every image into one feature vector. The shape is derived from X
# itself rather than hard-coded, so the cell keeps working if the dataset
# size or image resolution changes.
rs = randomsearch(param_dist).fit(X.reshape(len(X), -1), y)

Testing the best estimator gotten from the randomized search and reporting the accuracy and f1 score.

In [8]:
import sklearn
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Evaluate the best estimator found by the search on the held-out test split.
# Images are flattened using the actual array length instead of a hard-coded
# (10000, 50176) shape.
pred = rs.best_estimator_.predict(test_x.reshape(len(test_x), -1))
# For a binary problem, confusion_matrix(...).ravel() yields tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(test_y, pred).ravel()
print("%22s | %15s | %15s" % ('Confusion Matrix', 'Actual Positive', 'Actual Negative'))
print("%22s | %15d | %15d" % ('Classified as Positive', tp, fp))
print("%22s | %15d | %15d" % ('Classified as Negative', fn, tn))
accuracy = sklearn.metrics.accuracy_score(test_y, pred)
fscore = sklearn.metrics.f1_score(test_y, pred, average='macro')
print("Accuracy = ", accuracy)
print("F score = ", fscore)

      Confusion Matrix | Actual Positive | Actual Negative
Classified as Positive |            2861 |            1347
Classified as Negative |            2145 |            3647
Accuracy =  0.6508
F score =  0.6486292455337377


## Support Vector Machine (SVM)

Tried SVM models with RBF and polynomial kernels, but neither of them converged. The methodology used is shown below:

In [None]:
import sklearn
from sklearn.svm import SVC
from sklearn.model_selection import KFold

# Five-fold cross-validation of a polynomial-kernel SVM. The fold with the
# highest macro F1 score on its validation split is kept in `best_model`.
# (The same procedure was also tried with kernel='rbf'.)
X = data[:m]
y = label[:m]
kf = KFold(n_splits=5)
best_accuracy = 0
# Start below any attainable F1 so the first fold is always recorded and
# `best_model` is guaranteed to be a fitted estimator after the loop.
best_f1score = -1.0
best_suppvec = 0
best_model = None

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # NOTE(review): C=1e10 is an extremely weak regularisation and the inputs
    # are not scaled here; both are plausible reasons for the non-convergence
    # reported above -- confirm before relying on this configuration.
    svm = SVC(C=1e10, kernel='poly')
    # Flatten each image into one feature vector (shape derived from the fold).
    svm.fit(X_train.reshape(len(X_train), -1), y_train)
    # Predict with the estimator fitted in THIS fold and score it against this
    # fold's validation labels (previously referenced the undefined names
    # `model` and `valid_y`).
    pred = svm.predict(X_test.reshape(len(X_test), -1))
    supportvectors_RSVC = svm.support_vectors_
    accuracy_RSVC = sklearn.metrics.accuracy_score(y_test, pred)
    f1_score_RSVC = sklearn.metrics.f1_score(y_test, pred, average='macro')
    if f1_score_RSVC > best_f1score:
        best_accuracy = accuracy_RSVC
        best_f1score = f1_score_RSVC
        best_suppvec = len(supportvectors_RSVC)
        best_model = svm

# Report the metrics of the best fold, not whichever fold happened to run last.
print("Best Total Number of Support Vectors = ", best_suppvec)
print("Best Accuracy = ", best_accuracy)
print("Best F1 Score = ", best_f1score)

TRAIN: [ 8000  8001  8002 ... 39997 39998 39999] TEST: [   0    1    2 ... 7997 7998 7999]


In [None]:
# Evaluate the SVM selected during cross-validation on the held-out test split.
# Images are flattened using the actual array length instead of a hard-coded
# (10000, 50176) shape.
pred = best_model.predict(test_x.reshape(len(test_x), -1))
print("Predicted class: ", pred)
print()
# For a binary problem, confusion_matrix(...).ravel() yields tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(test_y, pred).ravel()
print("%22s | %15s | %15s" % ('Confusion Matrix', 'Actual Positive', 'Actual Negative'))
print("%22s | %15d | %15d" % ('Classified as Positive', tp, fp))
print("%22s | %15d | %15d" % ('Classified as Negative', fn, tn))
print()
accuracy = sklearn.metrics.accuracy_score(test_y, pred)
f1_score = sklearn.metrics.f1_score(test_y, pred, average='macro')
print("Accuracy = ", accuracy)
print("F score = ", f1_score)