In [1]:
import numpy as np
import pandas as pd
import glob
import time

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

In [2]:
def extract_data_from_path(folder_path, label):
    """extracts data from a set of CSV files into one DataFrame"""
    
    # get path to all files
    all_files = glob.glob(folder_path + "/*.csv")
    
    # read each file into a DataFrame
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        li.append(df)
    
    # concatenate all DataFrames
    concat_df = pd.concat(li, axis=0, ignore_index=True)
    
    # remove unncessary column
    concat_df.drop('Unnamed: 0', axis=1, inplace=True)
    
    # add the label to the DataFrame
    concat_df['label'] = label
    
    return concat_df

In [3]:
def avg_num_imposter_actions(y_test, y_pred):
    """calculates the average number of actions required to identiy an imposter"""
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    ta = y_test.size
    return ta / (1 - (fp / ta))

User 2

In [4]:
# extract data for user 2
user2_df = extract_data_from_path(r'C:\Users\DELL\Documents\Research\Mouse Dynamics\Dataset\user2\extracted', 1)

User 4

In [5]:
# extract data for user 4
user4_df = extract_data_from_path(r'C:\Users\DELL\Documents\Research\Mouse Dynamics\Dataset\user4\extracted', 0)

ML

In [6]:
# concatenate DataFrames into one
df = pd.concat([user2_df, user4_df], axis=0, ignore_index=True)

In [7]:
# split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(np.nan_to_num(df.loc[:, df.columns != 'label']), df['label'] ,test_size=0.2, random_state=42)

Logistic Regression

In [8]:
# create Logistic Regression model
start = time.time() 

logistic_regression_classifier = LogisticRegression()
logistic_regression_classifier.fit(np.nan_to_num(x_train.astype(np.float32)) ,y_train)

end = time.time()
process = round(end-start,2)
print("Logistic Regression Classifier has fitted, this process took {} seconds".format(process))

Logistic Regression Classifier has fitted, this process took 0.08 seconds


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# evaluate Logistic Regression model
logistic_regression_classifier.score(np.nan_to_num(x_test.astype(np.float32)), y_test)

0.3768553459119497

SVM

In [10]:
# create SVM model
start = time.time() 

svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Support Vector Machine Classifier has fitted, this process took {} seconds".format(process))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)


Support Vector Machine Classifier has fitted, this process took 13.48 seconds


In [11]:
# evaluate SVM model
svm_classifier.score(x_test,y_test)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


0.6231446540880503

Naive Bayes

In [12]:
# create Naive Bayes model
start = time.time() 

naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(x_train, y_train)

end = time.time()
process = round(end-start,2)
print("Naive Bayes Classifier has fitted, this process took {} seconds".format(process))

Naive Bayes Classifier has fitted, this process took 0.02 seconds


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  ret = umr_sum(arr, axis, dtype, out, keepdims)


In [13]:
# evaluate Naive Bayes model
naive_bayes_classifier.score(x_test,y_test)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /


0.3768553459119497

K Neighbours Classifier

In [14]:
# create K Neighbours Classifier model
start = time.time() 

k_neighbors_classifier = KNeighborsClassifier(n_neighbors=5)
k_neighbors_classifier.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("K Neighbors Classifier has fitted, this process took {} seconds".format(process))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


K Neighbors Classifier has fitted, this process took 0.35 seconds


In [15]:
# evaluate K Neighbours Classifier model
k_neighbors_classifier.score(x_test,y_test)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


0.8744654088050314

Decision Trees

In [16]:
# create Decision Trees model
start = time.time() 

decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(np.nan_to_num(x_train.astype(np.float32)), y_train)

end = time.time()
process = round(end-start,2)
print("Decision Tree Classifier has fitted, this process took {} seconds".format(process))

Decision Tree Classifier has fitted, this process took 0.2 seconds


In [17]:
# evaluate Decision Trees model
decision_tree_classifier.score(np.nan_to_num(x_test.astype(np.float32)),y_test)

0.9647798742138365

In [18]:
# make predictions using the Decision Trees model
y_pred = decision_tree_classifier.predict(np.nan_to_num(x_test.astype(np.float32)))

In [19]:
# calculate the average number of imposter required for Decision Trees model
avg_num_imposter_actions(y_test, y_pred)

4074.426250644662