In [41]:
import numpy as np
import pandas as pd
from log_parser import parse_keyboard_log
from keyboard_heatmap import KeyboardHeatmap

# Segment length of each data sample
seg_length = 30

# List of parsed logfiles
keyboard = []
keyboard.append(parse_keyboard_log("keylogs/joseph/keyboard.log"))
keyboard.append(parse_keyboard_log("keylogs/jonathan/keyboard.log"))

# Hack to get the number of segments contained in each file
lengths = [
    keyboard[0].time.iloc[-1] / seg_length,
    keyboard[1].time.iloc[-1] / seg_length,
]

# Empty lists for inserting data
X_actual = []
Y_actual = []

for k in range(len(keyboard)):
    for i in range(int(lengths[k])):
        # For each segment in each logfile
        # Create a heatmap for that segment
        heatmap = KeyboardHeatmap(keyboard[k], i, seg_length)
        # If the heatmap isn't blank
        if heatmap.class_label() != 'Null':
            X_actual.append(heatmap.heatmap_data().ravel().tolist())
            Y_actual.append(heatmap.class_label())

# Display features
for i in range(len(X_actual)):
    print(X_actual[i][:10], Y_actual[i])
print(f"Data samples: {len(Y_actual)}")


[0.067, 0.133, 0.0, 0.2, 0.067, 0.0, 0.0, 11.752, 0.316, 0.0] Joseph
[0.367, 0.2, 0.367, 0.1, 0.367, 0.0, 0.0, 1.844, 0.997, 0.269] Joseph
[0.267, 0.267, 0.033, 0.3, 0.133, 0.0, 0.0, 1.816, 1.426, 0.864] Joseph
[0.333, 0.333, 0.233, 0.433, 0.0, 0.0, 0.0, 0.688, 0.41, 0.377] Joseph
[0.367, 0.4, 0.133, 0.433, 0.0, 0.0, 0.0, 1.38, 0.43, 0.596] Joseph
[0.367, 0.567, 0.233, 0.3, 0.0, 0.0, 0.0, 1.012, 0.567, 0.969] Joseph
[0.633, 0.533, 0.2, 0.567, 0.0, 0.0, 0.0, 0.92, 0.624, 0.384] Joseph
[0.333, 0.433, 0.2, 0.5, 0.033, 0.0, 0.0, 1.04, 0.763, 0.725] Joseph
[0.367, 0.367, 0.1, 0.4, 0.1, 0.0, 0.0, 1.616, 0.416, 0.555] Joseph
[0.5, 0.4, 0.133, 0.433, 0.0, 0.0, 0.0, 0.704, 0.864, 1.224] Joseph
[0.333, 0.5, 0.067, 0.333, 0.033, 0.0, 0.0, 0.832, 0.526, 1.4] Joseph
[0.0, 0.033, 0.0, 0.033, 0.0, 0.0, 0.0, 0.0, 0.272, 0.0] Joseph
[0.1, 0.167, 0.0, 0.267, 0.033, 0.0, 0.0, 4.96, 0.336, 0.0] Joseph
[0.233, 0.367, 0.033, 0.3, 0.1, 0.0, 0.0, 2.917, 0.88, 0.512] Joseph
[0.133, 0.267, 0.067, 0.167, 0.0, 0.

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_actual, Y_actual, random_state=0)

# Create random forests classifier object
# n_jobs controls multithreading, -1 uses all available threads
# criterion determines how well a feature splits data, we use gini
# max_features determines the number of candidate features available for a best split
# n_estimators determines the number of trees in our forest
# oob_score determines whether or not our forest uses out of bag error
rfc = RandomForestClassifier(n_jobs=-1, criterion='gini', max_features= 'sqrt', n_estimators = 100, oob_score = True) 

model = rfc.fit(X_train, Y_train)

# Print best score
print( 
    f"Train score: {rfc.score(X_train, Y_train)} " +
    f"Test score: {rfc.score(X_test, Y_test)}"
)

# Predict on test set
Y_predict = rfc.predict(X_test)

# Ouput predictions vs ground truth values
for i in range(len(Y_predict)):
    print(f"Actual: {Y_test[i]:<10}  -  Predicted: {Y_predict[i]}")

Train score: 1.0 Test score: 0.9230769230769231
Actual: Joseph      -  Predicted: Joseph
Actual: Joseph      -  Predicted: Joseph
Actual: Joseph      -  Predicted: Joseph
Actual: JONSOL      -  Predicted: Joseph
Actual: Joseph      -  Predicted: Joseph
Actual: Joseph      -  Predicted: Joseph
Actual: JONSOL      -  Predicted: JONSOL
Actual: Joseph      -  Predicted: Joseph
Actual: Joseph      -  Predicted: Joseph
Actual: Joseph      -  Predicted: Joseph
Actual: JONSOL      -  Predicted: JONSOL
Actual: JONSOL      -  Predicted: JONSOL
Actual: Joseph      -  Predicted: Joseph


In [43]:
import matplotlib.pyplot as plt

# We want to plot the feature importance of all features to see how our classifier is splitting data
importances = model.feature_importances_
indices = np.argsort(importances[::-1])
names = [heatmap.heatmap_data[i] for i in indices]

# Barplot
plt.bar(range(X_train.shape[1]), importances[indices])
# Add feature names as x-axis labels
plt.xticks(range(X_train.shape[1]), names, rotation=20, fontsize = 8)
# Create plot title
plt.title("Feature Importance")
# Show plot
plt.show()

TypeError: 'method' object is not subscriptable