In [60]:
### ALL GROUP MEMBERS - Pre-processing ###

# Load Data

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

from pandas import read_csv

file_path = 'datasets/train_imperson_without4n7_balanced_data.csv'

# Read data removing the first row that is a header as numbers
data = read_csv(file_path, header=None, skiprows=1)

print(data)

       0    1    2         3         4         5         6    7    8    9    \
0        0    0    0  0.000066  0.000066  0.009150  0.009150    0    0    0   
1        0    0    0  0.000014  0.000014  0.000000  0.000000    0    0    0   
2        0    0    0  0.035528  0.035528  0.070588  0.070588    0    0    0   
3        0    0    0  0.005128  0.005128  0.094771  0.094771    0    0    0   
4        0    0    0  0.035116  0.035116  0.070588  0.070588    0    0    0   
...    ...  ...  ...       ...       ...       ...       ...  ...  ...  ...   
97039    0    0    0  0.035953  0.035953  0.038562  0.038562    0    0    0   
97040    0    0    0  0.018075  0.018075  0.038562  0.038562    0    0    0   
97041    0    0    0  0.041889  0.041889  0.038562  0.038562    0    0    0   
97042    0    0    0  0.004191  0.004191  0.038562  0.038562    0    0    0   
97043    0    0    0  0.002892  0.002892  0.038562  0.038562    0    0    0   

       ...  143  144  145  146  147  148  149  150 

In [61]:
### ALL GROUP MEMBERS - Pre-processing ###

# Clean Data

# Remove columns where mean is zero
data = data.loc[:, (data != 0).any(axis = 0)]

print(data)

            3         4         5         6    11   12   13   14   15   17   \
0      0.000066  0.000066  0.009150  0.009150    1    1    1    1    1    1   
1      0.000014  0.000014  0.000000  0.000000    1    1    1    1    1    1   
2      0.035528  0.035528  0.070588  0.070588    1    1    1    1    1    1   
3      0.005128  0.005128  0.094771  0.094771    1    1    1    1    1    1   
4      0.035116  0.035116  0.070588  0.070588    1    1    1    1    1    1   
...         ...       ...       ...       ...  ...  ...  ...  ...  ...  ...   
97039  0.035953  0.035953  0.038562  0.038562    1    1    1    1    1    1   
97040  0.018075  0.018075  0.038562  0.038562    1    1    1    1    1    1   
97041  0.041889  0.041889  0.038562  0.038562    1    1    1    1    1    1   
97042  0.004191  0.004191  0.038562  0.038562    1    1    1    1    1    1   
97043  0.002892  0.002892  0.038562  0.038562    1    1    1    1    1    1   

       ...  136  137  138  139  140  141  142  143 

In [62]:
### ALL GROUP MEMBERS - Pre-processing ###

# Separate Input and Output Data

data_array = data.values

input_data = data_array[:,0:80]
output_data = data_array[:,80]

In [63]:
### ALL GROUP MEMBERS - Pre-processing ###

# Rescale Data

from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions

min_max_scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_data = min_max_scaler.fit_transform(input_data)

# Summarize transformed data
set_printoptions(precision = 3)
print(rescaled_data)

[[6.422e-05 6.422e-05 9.150e-03 ... 0.000e+00 0.000e+00 0.000e+00]
 [1.169e-05 1.169e-05 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [3.631e-02 3.631e-02 7.059e-02 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [4.281e-02 4.281e-02 3.856e-02 ... 0.000e+00 0.000e+00 0.000e+00]
 [4.281e-03 4.281e-03 3.856e-02 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.953e-03 2.953e-03 3.856e-02 ... 0.000e+00 0.000e+00 0.000e+00]]


In [64]:
### ALL GROUP MEMBERS - Pre-processing ###

# Standardize Data

from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler().fit(rescaled_data)
standardized_data = standard_scaler.transform(rescaled_data)

# Summarize standardized data
set_printoptions(precision = 3)
print(standardized_data[0:1,:])

[[-0.398 -0.398 -0.521 -0.521  0.015  0.015  0.015  0.     0.015  0.015
   0.015  0.015 -3.143  0.015  1.569  0.015 -1.533  1.534  0.015  0.709
   0.015  0.641  0.594 -0.175 -0.412 -0.285  1.391 -0.032 -1.542  0.02
  -0.103 -0.163 -0.036 -0.138  0.03   2.654 -0.005 -0.008 -0.009 -0.006
   0.015 -0.308 -0.237 -0.17  -0.003 -0.306 -0.059 -0.005 -0.029 -0.151
  -0.294 -0.061 -0.257 -0.021 -0.069 -0.006 -0.007 -0.315 -0.029 -0.295
  -0.068 -0.183 -0.025 -0.039 -0.148 -0.148 -0.147 -0.148 -0.148 -0.003
  -0.003  0.    -1.05  -0.104 -0.996 -0.092 -0.026 -0.453 -0.453 -0.496]]


In [65]:
### ALL GROUP MEMBERS - Pre-processing ###

# Normalize Data

from sklearn.preprocessing import Normalizer

normalizer = Normalizer().fit(rescaled_data)
normalized_data = normalizer.transform(rescaled_data)

# Summarize normalized data
set_printoptions(precision = 3)
print(normalized_data[0:1:])

[[1.500e-05 1.500e-05 2.138e-03 2.138e-03 2.336e-01 2.336e-01 2.336e-01
  0.000e+00 2.336e-01 2.336e-01 2.336e-01 2.336e-01 9.225e-02 2.336e-01
  2.072e-01 2.308e-01 0.000e+00 2.336e-01 2.336e-01 1.645e-01 2.336e-01
  1.911e-01 2.336e-01 7.188e-02 1.168e-01 0.000e+00 2.336e-01 0.000e+00
  0.000e+00 7.089e-04 2.656e-04 2.995e-03 3.385e-04 3.379e-04 6.390e-04
  2.211e-01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 2.336e-01 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00]]


In [66]:
### NAHOM - Selecting features ###

# Feature Selection - Univariate method

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

select_k_best = SelectKBest(score_func = chi2, k = 20)
selected_k_best_raw_data = select_k_best.fit(input_data, output_data)
selected_k_best_normalized_data = select_k_best.fit(normalized_data, output_data)

univariate_selection_raw_data_result = selected_k_best_raw_data.transform(input_data)
univariate_selection_normalized_data_result = selected_k_best_normalized_data.transform(normalized_data)

# Summarize scores
set_printoptions(precision = 3)
print(univariate_selection_raw_data_result)
print(univariate_selection_normalized_data_result)

[[0.009 0.009 0.887 ... 0.    0.    0.   ]
 [0.    0.    0.434 ... 0.    0.    0.   ]
 [0.071 0.071 0.    ... 0.    0.    0.   ]
 ...
 [0.039 0.039 0.    ... 0.    0.    0.   ]
 [0.039 0.039 0.    ... 0.    0.    0.   ]
 [0.039 0.039 0.    ... 0.    0.    0.   ]]
[[0.002 0.002 0.207 ... 0.    0.    0.   ]
 [0.    0.    0.11  ... 0.    0.    0.   ]
 [0.016 0.016 0.    ... 0.    0.    0.   ]
 ...
 [0.009 0.009 0.    ... 0.    0.    0.   ]
 [0.009 0.009 0.    ... 0.    0.    0.   ]
 [0.009 0.009 0.    ... 0.    0.    0.   ]]


In [None]:
### NAHOM - Selecting features ###

# Feature Selection - RFE method

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
rfe = RFE(model, 20)
rfe_data = rfe.fit(input_data, output_data)

# Summarize scores
print("Num Features: %d" % rfe_data.n_features_)
print("Selected Features: %s" % rfe_data.support_)
print("Feature Ranking: %s" % rfe_data.ranking_)

In [82]:
### LIUTAURAS VILDA - Exploring and selecting ML algorithms ###

# Trying various datasets: [1]raw, [2]standardized, [3]normalized, [4]k best features selected
# In order to define, which fits the best and contributes mostly to algorithm's accuracy
# Trying with LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression()
scoring = 'accuracy'

raw_data_result = cross_val_score(model, input_data, output_data, cv=kfold, scoring=scoring) # Accuracy: 0.947 (0.102)
standardized_data_result = cross_val_score(model, standardized_data, output_data, cv=kfold, scoring=scoring) # Accuracy: 0.945 (0.106)
normalized_data_result = cross_val_score(model, normalized_data, output_data, cv=kfold, scoring=scoring) # Accuracy: 0.958 (0.084)
k_best_selected_raw_data_result = cross_val_score(model, univariate_selection_raw_data_result, output_data, cv=kfold, scoring=scoring) # Accuracy: 0.956 (0.067)
k_best_selected_normalized_data_result = cross_val_score(model, univariate_selection_normalized_data_result, output_data, cv=kfold, scoring=scoring) # Accuracy: 0.953 (0.092)

print("Accuracy with Raw Data: %.1f%% (%.3f)" % (raw_data_result.mean() * 100, raw_data_result.std()))
print("Accuracy with Standardized Data: %.1f%% (%.3f)" % (standardized_data_result.mean() * 100, standardized_data_result.std()))
print("Accuracy with Normalized Data: %.1f%% (%.3f)" % (normalized_data_result.mean() * 100, normalized_data_result.std()))
print("Accuracy with K best features selected from Raw Data: %.1f%% (%.3f)" % (k_best_selected_raw_data_result.mean() * 100, k_best_selected_raw_data_result.std()))
print("Accuracy with K best features selected from Normalized Data: %.1f%% (%.3f)" % (k_best_selected_normalized_data_result.mean() * 100, k_best_selected_normalized_data_result.std()))

Accuracy with Raw Data: 94.7% (0.102)
Accuracy with Standardized Data: 94.5% (0.106)
Accuracy with Normalized Data: 95.8% (0.084)
Accuracy with K best features selected from Raw Data: 95.6% (0.078)
Accuracy with K best features selected from Normalized Data: 95.3% (0.092)


In [86]:
### LIUTAURAS VILDA - Exploring and selecting ML algorithms ###

# For a second iteration trying further two best datasets: [1]univariate_selection_input_data_result, [2]univariate_selection_normalized_data_result
# Trying with KNeighborsClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

kfold = KFold(n_splits=10, random_state=7) 
model = KNeighborsClassifier()
scoring = 'accuracy'

k_best_selected_raw_data_result = cross_val_score(model, univariate_selection_raw_data_result, output_data, cv=kfold, scoring=scoring) # Accuracy: 98.4% (0.031)
k_best_selected_normalized_data_result = cross_val_score(model, univariate_selection_normalized_data_result, output_data, cv=kfold, scoring=scoring) # Accuracy: 99.0% (0.020)

print("Accuracy: %.1f%% (%.3f)" % (k_best_selected_raw_data_result.mean() * 100, k_best_selected_raw_data_result.std()))
print("Accuracy: %.1f%% (%.3f)" % (k_best_selected_normalized_data_result.mean() * 100, k_best_selected_normalized_data_result.std()))

Accuracy: 98.4% (0.031)
Accuracy: 99.0% (0.020)


In [79]:
### LIUTAURAS VILDA - Exploring and selecting ML algorithms ###

# Potential candidate algorithms comparison

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

import time

models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Linear Discriminant', LinearDiscriminantAnalysis()))
models.append(('k-Nearest Neighbors', KNeighborsClassifier()))
models.append(('Decision Trees', DecisionTreeClassifier()))
models.append(('Naive Bayes', GaussianNB()))

results = []
names = []
scoring = 'accuracy'
print("### Classification Algorithms Accuracy ###\n")
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)
    start_time = time.time()
    cv_results = cross_val_score(model, univariate_selection_normalized_data_result, output_data, cv=kfold, scoring=scoring)
    run_time = time.time() - start_time
    results.append(cv_results)
    names.append(name)
    print("%s: %.1f%% (std: %.3f), runtime: %.1fs" % (name, cv_results.mean() * 100, cv_results.std(), run_time))
    
# Based on the obtained accuracy results and algorithms runtimes, for further tunning I recommend such algorithms:
# [1]Logistic Regression, [2]k-Nearest Neighbors, [3]Gaussian Naive Bayes and possibly [4]Decision Trees

### Classification Algorithms Accuracy ###

Logistic Regression: 95.3% (std: 0.092), runtime: 3.1s
Linear Discriminant: 94.7% (std: 0.111), runtime: 3.2s
k-Nearest Neighbors: 99.0% (std: 0.020), runtime: 33.1s
Decision Trees: 96.7% (std: 0.067), runtime: 4.5s
Naive Bayes: 96.0% (std: 0.082), runtime: 0.6s


In [81]:
### LIUTAURAS VILDA - Exploring and selecting ML algorithms ###

# Neural Network Algorithm assessment

from keras.models import Sequential
from keras.layers import Dense
import numpy

numpy.random.seed(7)

model = Sequential()
model.add(Dense(12, input_dim=20, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(univariate_selection_normalized_data_result, output_data, epochs=10, batch_size=300)

scores = model.evaluate(univariate_selection_normalized_data_result, output_data)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# Due to this algorithm's provisional accuracy of 99.37%, it seems to be a good candidate

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

acc: 99.37%


In [None]:
### NELLA - Refining algorithms ###

# For tuning part we are working on: DecisionTreeClassifier, GaussianNB, and one of Neural Network algorithm
# This part is DecisionTreeClassifier tuning

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix
from numpy import linspace


kfold = KFold(n_splits=10, random_state=7) 

print()
print("With MAX Depths")
print("--------------------------")

max_depths = linspace(1, 11, 11, endpoint=True)

for depth in max_depths:
    decision_tree_model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=depth,
     max_features=None, max_leaf_nodes=None,
     min_impurity_decrease=1e-07, min_samples_leaf=1,
     min_samples_split=2, min_weight_fraction_leaf=0.0,
     presort=False, random_state=None, splitter='best')

    decision_tree_model.fit(univariate_selection_normalized_data_result, output_data)
    output_prediction = decision_tree_model.predict(univariate_selection_normalized_data_result)

    matrix = confusion_matrix(output_data, output_prediction)
    print()
    print("with: " + str(depth))
    print(matrix)
    print()
    cv_results = cross_val_score(decision_tree_model, univariate_selection_normalized_data_result, output_data, cv=kfold, scoring=scoring)
    print(cv_results.mean())
    print("-----------")

print()
print("With MIN Sample Split")
print("--------------------------")

min_samples_splits = linspace(2, 11, 10, endpoint=True)

for sample_split in min_samples_splits:
    decision_tree_model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
     max_features=None, max_leaf_nodes=None,
     min_impurity_decrease=1e-07, min_samples_leaf=1,
     min_samples_split=int(sample_split), min_weight_fraction_leaf=0.0,
     presort=False, random_state=None, splitter='best')

    decision_tree_model.fit(univariate_selection_normalized_data_result, output_data)
    output_prediction = decision_tree_model.predict(univariate_selection_normalized_data_result)

    matrix = confusion_matrix(output_data, output_prediction)
    print("with: " + str(sample_split))
    print(matrix)
    print()
    cv_results = cross_val_score(decision_tree_model, univariate_selection_normalized_data_result, output_data, cv=kfold, scoring=scoring)
    print(cv_results.mean())
    print("-----------")

print()
print("With MIN Sample Leafs")
print("--------------------------")

min_samples_leafs = linspace(0.1, 0.5, 5, endpoint=True)

for sample_leaf in min_samples_leafs:
    decision_tree_model = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
     max_features=None, max_leaf_nodes=None,
     min_impurity_decrease=1e-07, min_samples_leaf=sample_leaf,
     min_samples_split=5, min_weight_fraction_leaf=0.0,
     presort=False, random_state=None, splitter='best')

    decision_tree_model.fit(univariate_selection_normalized_data_result, output_data)
    output_prediction = decision_tree_model.predict(univariate_selection_normalized_data_result)

    matrix = confusion_matrix(output_data, output_prediction)
    print("with: " + str(sample_leaf))
    print(matrix)
    print()
    cv_results = cross_val_score(decision_tree_model, univariate_selection_normalized_data_result, output_data, cv=kfold, scoring=scoring)
    print(cv_results.mean())
    print("-----------")

In [73]:
### NELLA - Refining algorithms ###

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy

def create_model():
    # create model
    model = Sequential()
    model.add(Dense(4, input_dim=20, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# fix random seed for reproducibility
seed = 8
numpy.random.seed(seed)

final_model = KerasClassifier(build_fn=create_model, epochs=8, batch_size=300, verbose=0)

In [74]:
### JO - Evaluating model and analysing the results ###

# Import and separate test data, fit model and make prediction using selected models

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from pandas import read_csv
import time

filename = 'datasets/test_imperson_without4n7_balanced_data.csv'
# Read data removing the first row that is a header as numbers
data = read_csv(filename, header=None, skiprows=1)

# Separate input and target data
test_data = data.values
input_test_data = test_data[:,0:152]
output_test_data = test_data[:,152]

test = select_k_best.fit(input_test_data, output_test_data)
# Transform test data using preferred feature selection method
input_test_data_fs = test.transform(input_test_data)

# Use selected model to make predictions on input test data 
start_time = time.time()
final_model.fit(univariate_selection_normalized_data_result, output_data)
run_time = time.time() - start_time
predicted = final_model.predict(input_test_data_fs)

print('Time taken to build model: ' + str(run_time))

Time taken to build model: 11.63680100440979


In [75]:
### JO - Evaluating model and analysing the results ###

# Evaluate using Classification Accuracy

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(output_test_data, predicted)
print("Accuracy: " + str(accuracy))

Accuracy: 0.9649882962298919


In [76]:
### JO - Evaluating model and analysing the results ###

# Evaluate using Confusion Matrix

from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(output_test_data, predicted)
print(matrix)

[[18702  1377]
 [   29 20050]]


In [77]:
### JO - Evaluating model and analysing the results ###

# Evaluate using Classification Report

from sklearn.metrics import classification_report

report = classification_report(output_test_data, predicted)
print(report)

              precision    recall  f1-score   support

         0.0       1.00      0.93      0.96     20079
         1.0       0.94      1.00      0.97     20079

    accuracy                           0.96     40158
   macro avg       0.97      0.96      0.96     40158
weighted avg       0.97      0.96      0.96     40158



In [78]:
### JO - Evaluating model and analysing the results ###

# Compute Matthews correlation coefficient

from sklearn.metrics import matthews_corrcoef

mcc = matthews_corrcoef(output_test_data, predicted)
print('Matthews correlation coefficient: ' + str(mcc))

Matthews correlation coefficient: 0.9320794446121159
