# Keystroke Dynamics

### Importing libraries

In [48]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

## Data Loading and Preprocessing

### Loading data

In [49]:
df = pd.read_csv('data/StrongPasswordData.csv')
df.sum()

subject            s002s002s002s002s002s002s002s002s002s002s002s0...
sessionIndex                                                   91800
rep                                                           520200
H.period                                                   1904.9262
DD.period.t                                                5388.6217
UD.period.t                                                3483.6955
H.t                                                         1748.833
DD.t.i                                                     3449.3299
UD.t.i                                                     1700.4969
H.i                                                        1663.9357
DD.i.e                                                     3251.1835
UD.i.e                                                     1587.2478
H.e                                                        1818.4228
DD.e.five                                                  7699.6484
UD.e.five                         

In [50]:
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


Anomaly detection per user (not general): it will allow us to detect if a user has a different typing pattern than usual. Keystroke dynamics are useful for this purpose because they are unique to each user.
However, we don't need the sessionIndex and rep columns, so we can drop them. They only indicate the number of times a user has logged in and typed their password within a session, respectively.

In [51]:
drop_cols = ['sessionIndex', 'rep']
df = df.drop(drop_cols, axis=1)
df.head()

Unnamed: 0,subject,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,DD.i.e,UD.i.e,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,0.2212,0.1043,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,0.1357,0.0449,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,0.1542,0.0721,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,0.2038,0.0998,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,0.1589,0.0686,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


### Feature engineering

As the subject column is categorical , we need to encode it. We can use one-hot encoding for this purpose.
We choose one-hot encoding because we don't want to introduce any kind of ordinality in the data. Subjects are just labels that identify a user but are only nominal. They don't share relationship between them.

In [52]:
enc = preprocessing.OneHotEncoder()
enc.fit(df[['subject']])
onehotlabels = enc.transform(df[['subject']]).toarray()
onehotlabels.shape

(20400, 51)

We have 51 subjects, so we have 51 columns. Now, we are replacing the subject column with the one-hot encoded columns and dropping the original subject column thanks to the ColumnTransformer class from scikit-learn, we want to keep the rest of the columns as they are already numerical features.

In [53]:
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
unique_cols=df['subject'].nunique()

data=np.array(ct.fit_transform(df))

In [54]:
data[:5]

array([[1.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.1491, 0.3979, 0.2488, 0.1069, 0.1674,
        0.0605, 0.1169, 0.2212, 0.1043, 0.1417, 1.1885, 1.0468, 0.1146,
        1.6055, 1.4909, 0.1067, 0.759 , 0.6523, 0.1016, 0.2136, 0.112 ,
        0.1349, 0.1484, 0.0135, 0.0932, 0.3515, 0.2583, 0.1338, 0.3509,
        0.2171, 0.0742],
       [1.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
        0.    , 0.    , 0.    , 0.    ,

In [55]:
X=data[:,unique_cols:]
y=data[:,:unique_cols]
print(y.shape)

(20400, 51)


## Data splitting

As a usual practice, we will split the dataset into 80% for training and 20% for testing. We want to stratify the split by subject, so we can have the same proportion of subjects in both sets.

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df['subject'], test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (16320, 31)
X_test shape: (4080, 31)
y_train shape: (16320, 51)
y_test shape: (4080, 51)


In [57]:
X_train[:5]

array([[ 0.0515,  0.1755,  0.124 ,  0.032 ,  0.0471,  0.0151,  0.0254,
         0.3914,  0.366 ,  0.0665,  0.5834,  0.5169,  0.0404,  0.3648,
         0.3244,  0.0541,  0.217 ,  0.1629,  0.0547,  0.1279,  0.0732,
         0.0591,  0.0602,  0.0011,  0.0462,  0.1528,  0.1066,  0.0599,
         0.1704,  0.1105,  0.0547],
       [ 0.1309,  0.4092,  0.2783,  0.0945,  0.1122,  0.0177,  0.1349,
         0.1338, -0.0011,  0.0843,  0.6247,  0.5404,  0.071 ,  0.3906,
         0.3196,  0.1307,  0.2212,  0.0905,  0.0953,  0.116 ,  0.0207,
         0.1241,  0.1598,  0.0357,  0.0892,  0.1528,  0.0636,  0.095 ,
         0.2073,  0.1123,  0.0961],
       [ 0.0679,  0.1718,  0.1039,  0.0856,  0.0787, -0.0069,  0.0914,
         0.0167, -0.0747,  0.0813,  0.1869,  0.1056,  0.076 ,  0.2118,
         0.1358,  0.0837,  0.1231,  0.0394,  0.076 ,  0.1152,  0.0392,
         0.1038,  0.0117, -0.0921,  0.0781,  0.0932,  0.0151,  0.0805,
         0.1804,  0.0999,  0.0478],
       [ 0.0963,  0.1825,  0.0862,  0.06

## Single models

### Random Forest Classifier

In [58]:
from sklearn.ensemble import RandomForestClassifier

In [59]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.predict(X_test)

print(f"Training set score: {rf.score(X_train, y_train)}")
print(f"Test set score: {rf.score(X_test, y_test)}")

Training set score: 0.9998774509803922
Test set score: 0.7


### K-Nearest Neighbors Classifier

In [60]:
from sklearn.neighbors import KNeighborsClassifier

In [61]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.predict(X_test)

print(f"Training set score: {knn.score(X_train, y_train)}")
print(f"Test set score: {knn.score(X_test, y_test)}")

Training set score: 0.7469362745098039
Test set score: 0.6877450980392157


### Multi-Layer Perceptron Classifier

In [62]:
from sklearn.neural_network import MLPClassifier

In [63]:
mlp = MLPClassifier(random_state=42, max_iter=1000)
mlp.fit(X_train, y_train)
mlp.predict(X_test)

print(f"Training set score: {mlp.score(X_train, y_train)}")
print(f"Test set score: {mlp.score(X_test, y_test)}")

Training set score: 0.8969975490196078
Test set score: 0.8254901960784313


