In [91]:
# imports:
import numpy as np
import pickle
import random
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm, tree
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from skimage import feature
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode
import matplotlib.pyplot as plt # for data visualization purposes

In [78]:
# functions:

dir = 'G:/University/Ph.D/AI for Sybersecurity/Project/Cleaned Datasets/'

def save_obj(obj, name):
    with open(dir + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open(dir + name + '.pkl', 'rb') as f:
        return pickle.load(f)


In [3]:
# Load the cleaned dataset form the Pickle files:

dataset = load_obj('trainX')
labels = load_obj('trainY')
test_dataset = load_obj('testX')
test_labels = load_obj('testY')

print('shape of the train set: ' + str(dataset.shape))
print('shape of the train labels: ' + str(labels.shape))
print('shape of the test set: ' + str(test_dataset.shape))
print('shape of the test labels: ' + str(test_labels.shape))

shape of the train set: (1796, 224, 224)
shape of the train labels: (1796,)
shape of the test set: (599, 224, 224)
shape of the test labels: (599,)


In [22]:
# extract local binary pattern (LBP) features from the dataset images (train set)

lbp_features = []
num_points = 100
radius = 8
eps=1e-7
for img in dataset:
    lbp = feature.local_binary_pattern(img, num_points, radius, method="uniform")
    (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, num_points + 3), range=(0, num_points + 2))
    # normalize the histogram
    hist = hist.astype("float")
    hist /= (hist.sum() + eps)
    lbp_features.append(hist)
    
lbp_features = np.array(lbp_features)
print('shape of the resulted feature vectors: ' + str(lbp_features.shape))
    
# save the result into a pickle file:
save_obj(lbp_features, 'train_lbp_featurs')

shape of the resulted feature vectors: (1796, 102)


In [23]:
# 5-fold cross-validation for a support vector machine (SVM) model:

kf = KFold(n_splits=5, shuffle=True, random_state=12)
accuracy_sum = 0
for train_index, test_index in kf.split(lbp_features):
    # split tha data based on the current fold:
    trainX = lbp_features[train_index]
    trainY = labels[train_index]
    testX = lbp_features[test_index]
    testY = labels[test_index]
    
    # Train the model:
    model = svm.SVC(kernel='rbf', probability=True)
    model.fit(trainX, trainY)
    
    predY = model.predict(testX) # use the trained model for prediction
    accuracy = metrics.accuracy_score(testY, predY) # calculate tha accuracy metric for the predictions
    accuracy_sum += accuracy
    print("the accuracy score for this fold is: " + str(accuracy))
    
print("The average accuracy score is ", accuracy_sum / 5)

the accuracy score for this fold is: 0.08055555555555556
the accuracy score for this fold is: 0.09749303621169916
the accuracy score for this fold is: 0.08356545961002786
the accuracy score for this fold is: 0.08356545961002786
the accuracy score for this fold is: 0.10027855153203342
The average accuracy score is  0.08909161250386878


In [59]:
# 5-fold cross-validation for the KNN model:

kf = KFold(n_splits=5, shuffle=True, random_state=24)
accuracy_sum = 0
for train_index, test_index in kf.split(lbp_features):
    # split tha data based on the current fold:
    trainX = lbp_features[train_index]
    trainY = labels[train_index]
    testX = lbp_features[test_index]
    testY = labels[test_index]
    
    # Train the model:
    model = KNeighborsClassifier()
    model.fit(trainX, trainY)
    
    predY = model.predict(testX) # use the trained model for prediction
    accuracy = metrics.accuracy_score(testY, predY) # calculate tha accuracy metric for the predictions
    accuracy_sum += accuracy
    print("the accuracy score for this fold is: " + str(accuracy))
    
print("The average accuracy score is ", accuracy_sum / 5)

the accuracy score for this fold is: 0.041666666666666664
the accuracy score for this fold is: 0.1
the accuracy score for this fold is: 0.06666666666666667
the accuracy score for this fold is: 0.041666666666666664
the accuracy score for this fold is: 0.058823529411764705
The average accuracy score is  0.06176470588235293


In [26]:
# 5-fold cross-validation for the Random Forest model:

kf = KFold(n_splits=5, shuffle=True, random_state=12)
accuracy_sum = 0
for train_index, test_index in kf.split(lbp_features):
    # split tha data based on the current fold:
    trainX = lbp_features[train_index]
    trainY = labels[train_index]
    testX = lbp_features[test_index]
    testY = labels[test_index]
    
    # Train the model:
    model = RandomForestClassifier(n_estimators=100)
    model.fit(trainX, trainY)
    
    predY = model.predict(testX) # use the trained model for prediction
    accuracy = metrics.accuracy_score(testY, predY) # calculate tha accuracy metric for the predictions
    accuracy_sum += accuracy
    print("the accuracy score for this fold is: " + str(accuracy))
    
print("The average accuracy score is ", accuracy_sum / 5)

the accuracy score for this fold is: 0.09166666666666666
the accuracy score for this fold is: 0.09192200557103064
the accuracy score for this fold is: 0.07799442896935933
the accuracy score for this fold is: 0.06963788300835655
the accuracy score for this fold is: 0.08635097493036212
The average accuracy score is  0.08351439182915506


In [55]:
# 5-fold cross-validation for the Decision Tree model:

kf = KFold(n_splits=5, shuffle=True, random_state=12)
accuracy_sum = 0
for train_index, test_index in kf.split(lbp_features):
    # split tha data based on the current fold:
    trainX = lbp_features[train_index]
    trainY = labels[train_index]
    testX = lbp_features[test_index]
    testY = labels[test_index]
    
    # Train the model:
    model = tree.DecisionTreeClassifier()
    model.fit(trainX, trainY)
    
    predY = model.predict(testX) # use the trained model for prediction
    accuracy = metrics.accuracy_score(testY, predY) # calculate tha accuracy metric for the predictions
    accuracy_sum += accuracy
    print("the accuracy score for this fold is: " + str(accuracy))
    
print("The average accuracy score is ", accuracy_sum / 5)

the accuracy score for this fold is: 0.05
the accuracy score for this fold is: 0.06666666666666667
the accuracy score for this fold is: 0.058333333333333334
the accuracy score for this fold is: 0.09166666666666666
the accuracy score for this fold is: 0.058823529411764705
The average accuracy score is  0.06509803921568627


In [108]:
# 5-fold cross-validation for the Logistic Regression model:

kf = KFold(n_splits=5, shuffle=True, random_state=12)
accuracy_sum = 0
for train_index, test_index in kf.split(lbp_features):
    # split tha data based on the current fold:
    trainX = lbp_features[train_index]
    trainY = labels[train_index]
    testX = lbp_features[test_index]
    testY = labels[test_index]
    
    # Train the model:
    model = LogisticRegression()
    model.fit(trainX, trainY)
    
    predY = model.predict(testX) # use the trained model for prediction
    accuracy = metrics.accuracy_score(testY, predY) # calculate tha accuracy metric for the predictions
    accuracy_sum += accuracy
    print("the accuracy score for this fold is: " + str(accuracy))
    
print("The average accuracy score is ", accuracy_sum / 5)

the accuracy score for this fold is: 0.09166666666666666
the accuracy score for this fold is: 0.03333333333333333
the accuracy score for this fold is: 0.05
the accuracy score for this fold is: 0.11666666666666667
the accuracy score for this fold is: 0.08403361344537816
The average accuracy score is  0.07514005602240896


In [31]:
# parameter tuning for SVM

trainX = load_obj('train_lbp_featurs')
trainY = load_obj('trainY')

param_grid={'C':[0.1,1,10,100],'gamma':[0.0001,0.001,0.1,1],'kernel':['rbf','poly']}
svc=svm.SVC(probability=True)
model=GridSearchCV(svc,param_grid)
model.fit(trainX, trainY)
print('Best Score : ', model.best_score_)
print('Best Parameters : ', model.best_params_)



Best Score :  0.10579077684927267
Best Parameters :  {'C': 0.1, 'gamma': 0.0001, 'kernel': 'rbf'}


In [27]:
# extract local binary pattern (LBP) features from the dataset images (test set)

lbp_features = []
num_points = 100
radius = 8
eps=1e-7
for img in test_dataset:
    lbp = feature.local_binary_pattern(img, num_points, radius, method="uniform")
    (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, num_points + 3), range=(0, num_points + 2))
    # normalize the histogram
    hist = hist.astype("float")
    hist /= (hist.sum() + eps)
    lbp_features.append(hist)
    
lbp_features = np.array(lbp_features)
print('shape of the resulted feature vectors: ' + str(lbp_features.shape))
    
# save the result into a pickle file:
save_obj(lbp_features, 'test_lbp_featurs')

shape of the resulted feature vectors: (599, 102)


In [63]:
# load all the extracted features and the labels from the final dataset:

trainX = load_obj('final_train_lbp_featurs') 
testX = load_obj('final_test_lbp_featurs')
trainY = load_obj('final_train_labels')
testY = load_obj('final_test_labels')

In [66]:
# Train on the whole train set using Random Forest, then test on the test set and save the predictions: 

model = RandomForestClassifier(n_estimators=100, random_state = 12)
model.fit(trainX, trainY)
predicted_RF = model.predict(testX)
accuracy = metrics.accuracy_score(testY, predicted_RF)
print(accuracy)
save_obj(predicted_RF, 'predicted_RF')

0.0881542699724518


In [67]:
# Train on the whole train set using SVM, then test on the test set and save the predictions: 

model = svm.SVC(kernel='rbf', C=0.1, gamma= 0.0001)
model.fit(trainX, trainY)
predicted_SVM = model.predict(testX)
accuracy = metrics.accuracy_score(testY, predicted_SVM)
print(accuracy)
save_obj(predicted_SVM, 'predicted_SVM')

0.09641873278236915


In [83]:
# Train on the whole train set using Logistic Regression, then test on the test set and save the predictions: 

model = LogisticRegression()
model.fit(trainX, trainY)
predicted_LR = model.predict(testX)
accuracy = metrics.accuracy_score(testY, predicted_LR)
print(accuracy)
save_obj(predicted_LR, 'predicted_LR')

0.08402203856749312


In [105]:
# ensemble the results of all the selected models:

predicted_densenet = np.genfromtxt(dir + 'densenet_test_predictions_ensemble.csv', delimiter=',')
predicted_resnet = np.genfromtxt(dir + 'resnet152_test_predictions_ensemble.csv', delimiter=',')
predicted_vgg = np.genfromtxt(dir + 'vgg_test_predictions_ensemble.csv', delimiter=',')

combined = np.array([predicted_densenet, predicted_resnet, predicted_vgg, predicted_LR, predicted_SVM, predicted_RF])
majority_vote = mode(combined)[0][0]
accuracy = metrics.accuracy_score(testY, majority_vote)
print(accuracy)

0.09641873278236915
