In [1]:
## Import for jupiter notebook styling // Ignore if using colab ##

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
##  Imports ##

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# Pre-processing

This following sections of code are for the pre-processing of data. In doing so I will use KNN classifier to judge whether pre-processing steps are advantageous and fine tune parameters to improve the performance of the training set. The training2 dataset will be split in to a smaller dataset. I will then use KNN imputation to fill in the nan gaps for training2. I will then test this by removing values from training1 and saving the nan filled training1 within a seperate document called training3. We can then use the same KNN imputation model on training3, whilst knowing the training1 ground truth data. I can then measure the distance from original values, calculate the mean percentage of nan values within an image and change the weight of the image accordingly using the confidence rating. Once this has been completed, I can use PCA to reduce dimensionality on both training1 and training2. To choose the correct number of principal components I have the PCA measured in total to 95% variance of the input data. Once PCA is complete, I will combine the two training sets and randomise the image samples to avoid any future bias.




In [3]:
## reading training data to variables ##
    
data=pd.read_csv("training1.csv") 
dataWithMissing=pd.read_csv("training2.csv")
training1WithDataRemoved=pd.read_csv("training3.csv") #this is a copy of training1 with 'n' values removed to test KNNimputation

## Normalisation step
Normalise all of the training sets including the training3 which has been created from training1 with missing values. This is used to test imputation as will be explained further in the code


In [4]:
from sklearn.preprocessing import MinMaxScaler

#normalise training1, training2 and training3(manipulated training1 with missing values)

dataFeat = data.iloc[:,:4096]
dataGistFeat = data.iloc[:,4096:-2] #normalise gist data seperately as will be on seperate scale
dataLabel = data.iloc[:,-2:]

dataWMFeat = dataWithMissing.iloc[:,:4096]
dataGistWMFeat = dataWithMissing.iloc[:,4096:-2]
dataWMLabel = dataWithMissing.iloc[:,-2:]

dataT1WMFeat = training1WithDataRemoved.iloc[:,:4096]
dataGistT1WMFeat = training1WithDataRemoved.iloc[:,4096:-2]
dataT1WMLabel = training1WithDataRemoved.iloc[:,-2:]
    
scaler = MinMaxScaler()
Gistscaler = MinMaxScaler()

#training data without missing
dataFeat = pd.DataFrame(scaler.fit_transform(dataFeat))
GistFeat = pd.DataFrame(Gistscaler.fit_transform(dataGistFeat))
data = np.column_stack((dataFeat,dataGistFeat,dataLabel))

#training2 data with missing
dataWMFeat = pd.DataFrame(scaler.transform(dataWMFeat))
dataGistWMFeat = pd.DataFrame(Gistscaler.fit_transform(dataGistWMFeat))
dataWithMissing = np.column_stack((dataWMFeat,dataGistWMFeat,dataWMLabel))

#training1 data with missing
dataT1WMFeat = pd.DataFrame(scaler.transform(dataT1WMFeat))
dataGistT1WMFeat = pd.DataFrame(Gistscaler.fit_transform(dataGistT1WMFeat))
training1WithDataRemoved = np.column_stack((dataT1WMFeat,dataGistT1WMFeat,dataT1WMLabel))

my_data_array=data
my_dataWithMissing_array=dataWithMissing 
my_training1WithMissing_array = training1WithDataRemoved

## KNN classifier (not part of main classifier or Pre-process)
Please note this is not the main classifier, nor does this contribute to any results. This is purely used to test whether the training sets are working as they should during pre-processing stages. Another KNN classifier is also also for the same purposes above to test training2 after imputation. The main classifier from which predictions will be made is the Multi-Layer Perceptron Regressor

In [5]:
## KNN classifier on complete training set 1 ##

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


#seperate training and validation sets of 70% and 30% of data set respectively
lengthTraining = len(my_data_array) * 0.70
my_data_array_train = my_data_array[:int(lengthTraining)]
my_data_array_valid = my_data_array[int(lengthTraining):]

#variables for storing manipulated data
data_features = []
data_labels = []
data_features_valid = []
data_labels_valid = []
Y_train = []
X_train = []

#seperate the training labels and features
for i in my_data_array_train:
    data_features.append(i[:-2])
    data_labels.append(i[-2:-1])

#seperate the validation labels and features
for i in my_data_array_valid:
    data_features_valid.append(i[:-2])
    data_labels_valid.append(i[-2:-1])

#add training data to singular list
Y_train = np.stack(data_labels)
X_train = np.stack(data_features)

#add testing data and ground truth labels
Y_test_actual = np.stack(data_labels_valid)
X_test = np.stack(data_features_valid)

#create KNeighbours object,choose amount of neighbours,input the training, validation data - validation results
knearest = KNeighborsClassifier(n_neighbors=5)
knearest.fit(X_train,Y_train)
Y_predict_knearest=knearest.predict(X_test)

#simplify label array to singular list to be passed through classifier
Y_test_actual = np.squeeze(Y_test_actual)
accuracy_knearest = round(accuracy_score(Y_predict_knearest,Y_test_actual)*100,2)
print("Accuracy score: "+str(accuracy_knearest)+ "%")
confusion_matrix(Y_test_actual, Y_predict_knearest)


Accuracy score: 75.56%


  return self._fit(X, y)


array([[76, 16],
       [28, 60]], dtype=int64)

## Check Nans
This function is used to check for nan values before and after imputation

In [6]:
## function for checking whether array contains any nan ##

def contains_nan(arrayL):

    array_has_nan = np.isnan(arrayL)
    hasMissing = False
    for i in array_has_nan:
        if i.any() == True:
            hasMissing = True
    return hasMissing    

## KNN Imputer

Using the trained KNN imputer, we can fill the nan gaps in training2 with feature data which will is likely to be more accurate than the feature data that can obtained by a simple imputer utilising just mean,mode,median of a single column.

In [7]:
## KNN imputer for filling in missing values in array ##

#convert data with missing values to a list for use in KNNimputer
from sklearn.impute import KNNImputer

dwm = dataWithMissing.tolist()
dwm = dwm[:800] #accuracy increased and helps to not negate value of training1
dwmLabels = []
dwmFeat = []

#seperate labels ready for imputation
for i in dwm:
    dwmFeat.append(i[:-2])
    dwmLabels.append(i[-2:])

#create imputer object, fit and transform the features 
imputer = KNNImputer(n_neighbors=5)
dwmFeat = imputer.fit_transform(dwmFeat)

#recombine labels and features
dwm = np.column_stack((dwmFeat,dwmLabels)) 

print("done") #this may take time to run

done


In [8]:
## proof imputation has removed nan values ##

#for use when adjusting confidence score of training2
counterMissing = np.count_nonzero(np.isnan(dataWithMissing[:800]))
                                
print(np.count_nonzero(np.isnan(dataWithMissing[:800])))
dwn = pd.DataFrame(dwm)
print(np.count_nonzero(np.isnan(dwm)))

735941
0


In [9]:
## Testing accuracy of the KNN imputer Part1 (using created imputer object on training3 which is training1 with missing values) ##

#KNN imputer on training 1 (from manipulated training 1 called training 3) with values removed to emulate training 2
#By knowing the original values for training 1 we can test our imputers accuracy against ground truth

t1dataWithMissing = training1WithDataRemoved.tolist()

t1dataWithMissingFeatures = []
t1dataWithMissingLabels = []

#print(t1dataWithMissing[0])
for i in t1dataWithMissing:
    t1dataWithMissingFeatures.append(i[:-2])
    t1dataWithMissingLabels.append(i[-2:])
 
t1dataWithMissingFeatures = imputer.transform(t1dataWithMissingFeatures)
t1dataWithMissing = np.column_stack((t1dataWithMissingFeatures,t1dataWithMissingLabels))

In [10]:
## testing KNN imputation accuracy Part2 (compare accuracy of imputed nan values in training3 against ground truth for training1) ##

data_actual = data.tolist() #holds original training1 data

counter = 0      #mean difference between original training1 values and predicted nan
totalCounter = 0 #total value of original training1 
meanDif = 0
counter2 = 0     #total number of nan values

#iterates through original training1 with ground truth and training1 with imputed values to test accuracy
for x in range(10):
        for i in range(len(data_actual)):
            if t1dataWithMissing[x][i] != data_actual[x][i]:
                counter = counter + (data_actual[x][i] -t1dataWithMissing[x][i]) #keeps total of difference in values
                totalCounter = totalCounter + data_actual[x][i] #counts total value from within training1 features where value removed
                counter2 = counter2 + 1 #counts amount of nan cells from training3
        
if (counter < 0):
    counter = counter * counter
percentageDif = (totalCounter /counter2) / counter
percentageDif = 100 - percentageDif

print("The KNN imputation fills incomplete nan values to approximately "+str(percentageDif)+ "% percent accuracy of original value")

The KNN imputation fills incomplete nan values to approximately 99.19242051295566% percent accuracy of original value


# Change confidence ratings according to accuracy
The features that have been filled by imputation in training2 are likely to be less accurate than the features in training1 by the nature of not knowing the ground truth of those values for training2. However we have managed to obtain a rough guideline of accuracy from testing the imputer on training1 and the training1 with nans added (training3). We can then measure the percentage of nan values that are contained in training2 and measure the impact of this loss of accuracy. The confidence rating is then adjusted to reflect this. 

In [11]:
## change confidence rating for training 2 to reflect innacuracies in imputed missing data ##

from collections import Counter

#firstly check proportion of data with is nan so we have a fair reflection of how innacurate whole dataset is
counterLen = 0
for i in dwm:
    for c in i:
        counterLen = counterLen +1
proportionOfMissing = counterMissing/counterLen

#use accuracy of filled nan gaps along with which proportion of data cells which are filled nan gaps
percentageDif = percentageDif * proportionOfMissing

#update confidence rating of training2         
for i in dwm:
    percent = (i[-1]/100)*percentageDif
    i[-1] = (i[-1] - percent)

This is the second KNN classifier I have used. This is not necessarily part of the pre-processing nor is part of the overall classification. I am just using this to test that training2 is showing trends resembling training1 after imputation.

In [12]:

## KNN classifier on complete training set 2 ##

#seperate training and validation sets of 70% and 30% of data set respectively
lengthTraining = len(dwm) * 0.70
my_training2_train = dwm[:int(lengthTraining)]
my_training2_valid = dwm[int(lengthTraining):]

#variables for storing manipulated data
data_features2 = []
data_labels2 = []
data_features_valid2 = []
data_labels_valid2 = []
Y_train2 = []
X_train2 = []

#seperate the training labels and features
for i in my_training2_train:
    data_features2.append(i[:-2])
    data_labels2.append(i[-2:-1])

#seperate the validation labels and features
for i in my_training2_valid:
    data_features_valid2.append(i[:-2])
    data_labels_valid2.append(i[-2:-1])

#add training data to singular list
Y_train2 = np.stack(data_labels2)
X_train2 = np.stack(data_features2)

#add testing data and ground truth labels
Y_test_actual2 = np.stack(data_labels_valid2)
X_test2 = np.stack(data_features_valid2)

#create KNeighbours object,choose amount of neighbours,input the training, validation data - validation results
knearest2 = KNeighborsClassifier(n_neighbors=5)
knearest2.fit(X_train2,Y_train2)
Y_predict_knearest2=knearest.predict(X_test2)

#simplify label array to singular list to be passed through classifier
Y_test_actual2 = np.squeeze(Y_test_actual2)
accuracy_knearest2 = round(accuracy_score(Y_predict_knearest2,Y_test_actual2)*100,2)
print("Accuracy score: "+str(accuracy_knearest2)+ "%")
confusion_matrix(Y_test_actual2, Y_predict_knearest2)

Accuracy score: 70.0%


  return self._fit(X, y)


array([[101,  24],
       [ 48,  67]], dtype=int64)

# PCA (Principal Components Analysis)
For both the training1 and training2, we will use PCA to reduce the dimensionality. We will fit the PCA to training1 and then use the PCA transform method to reduce the amount of features in training1 and training2. We will then add the labels to both sets again.

In [13]:
##pca on training sets##

from sklearn.decomposition import PCA

# placeholders for features, labels and confidence from training set1 and 2 before pca
data_feat = []
data_lab = []
dwm_feat = []
dwm_lab = []

#seperate the training labels and features
for i in my_data_array:
        data_feat.append(i[:-2])
        data_lab.append(i[-2:])
        
#seperate the training labels and features
for i in dwm:
    dwm_feat.append(i[:-2])
    dwm_lab.append(i[-2:])
    
#pca on training set1 
pca = PCA(n_components = 0.95) #number of primary components to account for 95% variance
pca_data = pca.fit_transform(data_feat)

#pca on training set2 (using pca trained on training1 instead of both as more trustworthy having had no nan features)
pca_data_withMissing = pca.transform(dwm_feat)

# append labels on end of features for both lists 
train1 = np.column_stack((pca_data,data_lab))
train2 = np.column_stack((pca_data_withMissing,dwm_lab))


# Merge the training sets
We will now merge the two training sets as part of the last step of pre-processing. We will also shuffle the two sets in order to avoid any bias.

In [14]:
## create a single training set from training1 and training2

import random

#create two lists of training1 and training2 subset, combine and shuffle
trainlist1 = train1.tolist()
trainlist2 = train2.tolist()
trainingData = trainlist1 + trainlist2
random.shuffle(trainingData)

# Multilayer perceptron Regressor

Using the single dataset containing features,labels and confidence from training1 and imputed training2, the multilayer Perceptron regressor will be trained. Once tuned, we will use the trained multilayer perceptron model on our test data set.

In [15]:
#Convert training and data in to array format

testData=pd.read_csv("test.csv") 

trainingData_array = np.array(trainingData) #test data from both training1 and imputed training2
testData_array = np.array(testData) #test data 

print(trainingData_array.shape)

(1400, 375)


## Test data Pre-Processing
We will normalise, use imputation and pca on the test data so that it is scaled and in the same format as the training data used to train the MLP

In [16]:
#impute missing values for test features using trained imputer from training data

testWithMissing = testData_array.tolist()

test = imputer.transform(testWithMissing)

In [17]:
#normalise test data using same scaler as training data

test1 = test[:,:4096]
testGistFeat = test[:,4096:] #the divide between gist and cnn features

test1 = pd.DataFrame(scaler.transform(test1)) #Scaling for Gist and CNN data seperately
testGistFeat = pd.DataFrame(Gistscaler.transform(testGistFeat))
test = np.column_stack((test1,testGistFeat))



In [18]:
#use trained pca on test data to remove same features

testInput = pca.transform(test)

# Merge confidence and label
So that we can use the confidence rating we will then merge the confidence and labels together. On the training data we will change the label to be the same value as the confidence, however positive for memorable and negative for non memorable (for instance -66 will be a 0 label with 66 confidence).  By doing so, this will convert the classifier in to a regression model that once tested against our test data and validation set, will provide a resulting output of any floating point number between -1 to 1. We shall then be able to take a binary result from the variable scales of label data by converting positive labels back to 1 and negative labels back to 0. 

In [19]:
#Combine confidence and label data 

#for 1 labels value is positive, for 0 labels value is negative. Allows for binary classification on confidence
#convert initially to regression task

trainingInputLabels = trainingData_array[:,-2:-1] #seperate labels
trainingInputConf = trainingData_array[:,-1:]

trainingInputConf = trainingInputConf * 100 

for i in range(len(trainingInputLabels)):
    if trainingInputLabels[i] > 0:
        trainingInputLabels[i] = trainingInputConf[i] #keep positive for label 1
    else:
        trainingInputLabels[i] = -abs(trainingInputConf[i]) #convert to negative for label 0

# Multi-Layer Perceptron Regressor
We will now use the MLPRegressor to use the training data which has labels which now scale from -1 to 1 to train the model. By using the regressor, we will use a split of training/validation and check the validation for accuracy. To check for accuracy we will convert back to binary, giving labels 1 for positive intergers and 0 for negative. We can then compare the original binary values, tune the number of hidden layers, random states and learning rate and retrain the MLPR to see if we can improve. Once the accuracy has reached what is deemed a suitable level, this model can be used to classify out test set and produce predictions.. Please be aware that the random seed causes results to differ on re-run. 

In [20]:
#split training data in to training/validation sets
    
trainingData_feat = trainingData_array[:,:-2]
lengthTraining = len(trainingData_feat) * 0.70
my_train = trainingData_feat[:int(lengthTraining)].tolist()
my_valid = trainingData_feat[int(lengthTraining):].tolist()
my_train_lab = trainingInputLabels[:int(lengthTraining)].tolist()
my_valid_actual = trainingInputLabels[int(lengthTraining):].tolist()
Y_pred_lab = []

In [21]:
from sklearn.neural_network import MLPRegressor

MLPR = MLPRegressor(hidden_layer_sizes=(300,100,200,300),
                    random_state=5,
                    verbose=True,
                    learning_rate_init=0.01,
                    #activation='tanh',
                    #batch_size = 23
                  )

MLPR.fit(my_train,my_train_lab)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 2217.62560297
Iteration 2, loss = 1498.01580754
Iteration 3, loss = 1119.73904895
Iteration 4, loss = 840.58768593
Iteration 5, loss = 618.01338998
Iteration 6, loss = 411.38667735
Iteration 7, loss = 251.60640609
Iteration 8, loss = 150.20324010
Iteration 9, loss = 88.52587942
Iteration 10, loss = 81.46123204
Iteration 11, loss = 68.47529417
Iteration 12, loss = 60.51701818
Iteration 13, loss = 44.20719142
Iteration 14, loss = 41.32096885
Iteration 15, loss = 27.07836175
Iteration 16, loss = 25.36606703
Iteration 17, loss = 15.11928019
Iteration 18, loss = 11.95400664
Iteration 19, loss = 10.49452227
Iteration 20, loss = 8.50216478
Iteration 21, loss = 7.10527079
Iteration 22, loss = 4.85886125
Iteration 23, loss = 4.07552114
Iteration 24, loss = 3.38279970
Iteration 25, loss = 2.51887649
Iteration 26, loss = 1.92241756
Iteration 27, loss = 1.20574042
Iteration 28, loss = 0.92979502
Iteration 29, loss = 0.56464595
Iteration 30, loss = 0.42811048
Iteration 31, loss 

MLPRegressor(hidden_layer_sizes=(300, 100, 200, 300), learning_rate_init=0.01,
             random_state=5, verbose=True)

In [22]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

Y_pred_labels=MLPR.predict(my_valid)

#convert regression values to binary classification (<0 = non-memorable >0 = memorable)

#convert predicted labels
for i in range(len(Y_pred_labels)):
    if Y_pred_labels[i] > 0:
        Y_pred_labels[i] = 1
    else:
        Y_pred_labels[i] = 0
        
my_valid_actual = np.array(my_valid_actual)

#convert actual labels
for i in range(len(my_valid_actual)):
    if my_valid_actual[i] > 0:
        my_valid_actual[i] = 1
    else:
        my_valid_actual[i] = 0
    
accuracy_score(Y_pred_labels, my_valid_actual)

0.7054631828978623

In [23]:
#create predictions on test set

Y_pred_labels=MLPR.predict(testInput)

for i in range(len(Y_pred_labels)):
    if Y_pred_labels[i] > 0:
        Y_pred_labels[i] = 1
    else:
        Y_pred_labels[i] = 0

for i in Y_pred_labels:
    print(i)

1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
