# Key Stoke Biometrics
## Goal is to identify a person based on the way they type
> The data consist of keystroke-timing information from 51 subjects (typists), each typing a password (.tie5Roanl) 400 times.

In [40]:
# conda install tensorflow

In [41]:
# conda install scikit-learn

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data
df = pd.read_csv('DSL-StrongPasswordData.csv')

In [43]:
df.head(100)

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.2340,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.0560,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.1040,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.0270,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,s002,2,46,0.1080,0.1758,0.0678,0.0932,0.1392,0.0460,0.0771,...,0.1180,0.1405,0.0225,0.0708,0.1901,0.1193,0.0826,0.2171,0.1345,0.0755
96,s002,2,47,0.1053,0.1715,0.0662,0.0839,0.1219,0.0380,0.0718,...,0.1220,0.1120,-0.0100,0.0723,0.1987,0.1264,0.0723,0.2337,0.1614,0.0942
97,s002,2,48,0.1059,0.1882,0.0823,0.0913,0.1309,0.0396,0.0718,...,0.0942,0.1051,0.0109,0.0686,0.2180,0.1494,0.0715,0.3572,0.2857,0.1069
98,s002,2,49,0.1262,0.2715,0.1453,0.0934,0.1744,0.0810,0.0721,...,0.1146,0.1279,0.0133,0.0639,0.1808,0.1169,0.0871,0.2192,0.1321,0.0821


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20400 entries, 0 to 20399
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subject          20400 non-null  object 
 1   sessionIndex     20400 non-null  int64  
 2   rep              20400 non-null  int64  
 3   H.period         20400 non-null  float64
 4   DD.period.t      20400 non-null  float64
 5   UD.period.t      20400 non-null  float64
 6   H.t              20400 non-null  float64
 7   DD.t.i           20400 non-null  float64
 8   UD.t.i           20400 non-null  float64
 9   H.i              20400 non-null  float64
 10  DD.i.e           20400 non-null  float64
 11  UD.i.e           20400 non-null  float64
 12  H.e              20400 non-null  float64
 13  DD.e.five        20400 non-null  float64
 14  UD.e.five        20400 non-null  float64
 15  H.five           20400 non-null  float64
 16  DD.five.Shift.r  20400 non-null  float64
 17  UD.five.Shif

# columns what do they mean?
- subject_id: the subject id (eg s002 or s057)
- not all subjects came for the session example s001 did not come so he/she is not included in dataset
- session_index is the session which the password was typed , person took breaks in between each session ranging from 1 to 8

## remaining are timing information
- rep is the repetition of the password typed in the session
- H.period: the duration where the period key was held down
- DD.period.t: the duration between pressing the period key and then pressing the t key
- UD.period.t: the duration between releasing the period key and then pressing the t key

# note
- UD can be negative if the user does not release the key before pressing the next key
- H + UD = DD

Consider the following one-line example of what you will see in the data:
  subject  sessionIndex  rep      H.period   DD.period.t   UD.period.t     ...
     s002             1    1        0.1491        0.3979        0.2488     ...
The example presents typing data for subject 2, session 1, repetition 1. The period key was held down for 0.1491 seconds (149.1 milliseconds); the time between pressing the period key and the t key (keydown-keydown time) was 0.3979 seconds; the time between releasing the period and pressing the t key (keyup-keydown time) was 0.2488 seconds; and so on.

In [45]:
df['subject'].unique()

array(['s002', 's003', 's004', 's005', 's007', 's008', 's010', 's011',
       's012', 's013', 's015', 's016', 's017', 's018', 's019', 's020',
       's021', 's022', 's024', 's025', 's026', 's027', 's028', 's029',
       's030', 's031', 's032', 's033', 's034', 's035', 's036', 's037',
       's038', 's039', 's040', 's041', 's042', 's043', 's044', 's046',
       's047', 's048', 's049', 's050', 's051', 's052', 's053', 's054',
       's055', 's056', 's057'], dtype=object)

In [46]:
# convert the subject column to a id column with numbers
df['subject'] = df['subject'].astype('category')
df['id'] = df['subject'].cat.codes



In [47]:
df

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return,id
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742,0
1,s002,1,2,0.1111,0.3451,0.2340,0.0694,0.1283,0.0589,0.0908,...,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747,0
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.0560,0.0821,...,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945,0
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.1040,...,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813,0
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1582,0.0270,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20395,s057,8,46,0.0884,0.0685,-0.0199,0.1095,0.1290,0.0195,0.0945,...,0.1383,0.0164,0.0820,0.1329,0.0509,0.1005,0.2054,0.1049,0.1047,50
20396,s057,8,47,0.0655,0.0630,-0.0025,0.0910,0.1148,0.0238,0.0916,...,0.0512,-0.0496,0.1037,0.0868,-0.0169,0.1445,0.2206,0.0761,0.1198,50
20397,s057,8,48,0.0939,0.1189,0.0250,0.1008,0.1122,0.0114,0.0721,...,0.1169,0.0256,0.0689,0.1311,0.0622,0.1034,0.2017,0.0983,0.0905,50
20398,s057,8,49,0.0923,0.1294,0.0371,0.0913,0.0990,0.0077,0.0992,...,0.0821,-0.0061,0.0576,0.0697,0.0121,0.0979,0.1917,0.0938,0.0931,50


In [48]:
df['id'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
      dtype=int8)

In [49]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets

X = df.drop(['subject','id'],axis=1)
y = df['id']



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)



In [50]:
max(y)

50

In [51]:
y

0         0
1         0
2         0
3         0
4         0
         ..
20395    50
20396    50
20397    50
20398    50
20399    50
Name: id, Length: 20400, dtype: int8

In [52]:
# convert X_train and X_test to numpy array
X_train = np.array(X_train)
X_test = np.array(X_test)
X_val = np.array(X_val)
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)



In [53]:
X_train

array([[ 6.    ,  2.    ,  0.0567, ...,  0.1548,  0.084 ,  0.0768],
       [ 5.    , 25.    ,  0.1183, ...,  0.3972,  0.2515,  0.0903],
       [ 4.    , 24.    ,  0.0562, ...,  0.4659,  0.391 ,  0.067 ],
       ...,
       [ 3.    , 35.    ,  0.1182, ...,  0.2033,  0.1083,  0.095 ],
       [ 5.    ,  3.    ,  0.0847, ...,  0.618 ,  0.5473,  0.0786],
       [ 7.    ,  5.    ,  0.0942, ...,  0.2294,  0.1173,  0.0984]])

In [54]:
y_train

array([46, 13, 42, ..., 46, 42, 40], dtype=int8)

In [55]:
# reshape the data to 3D
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
X_val = X_val.reshape(X_val.shape[0], 1, X_val.shape[1])


In [56]:
max(y)

50

In [57]:
# Build LSTM model

model = Sequential()
model.add(LSTM(128,activation='relu',return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(max(y)+1, activation='softmax'))
opt = keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

mc = ModelCheckpoint('Securythm.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
early_stop = EarlyStopping(monitor='val_loss', patience=10)


model.fit(X_train, y_train, epochs=200, validation_data=(X_val, y_val), callbacks=[mc, early_stop])
model.summary()




Epoch 1/200
Epoch 1: val_accuracy improved from -inf to 0.06373, saving model to Securythm.h5
Epoch 2/200
Epoch 2: val_accuracy improved from 0.06373 to 0.08027, saving model to Securythm.h5
Epoch 3/200
Epoch 3: val_accuracy improved from 0.08027 to 0.10570, saving model to Securythm.h5
Epoch 4/200
Epoch 4: val_accuracy improved from 0.10570 to 0.20251, saving model to Securythm.h5
Epoch 5/200
Epoch 5: val_accuracy improved from 0.20251 to 0.28002, saving model to Securythm.h5
Epoch 6/200
Epoch 6: val_accuracy improved from 0.28002 to 0.39185, saving model to Securythm.h5
Epoch 7/200
Epoch 7: val_accuracy improved from 0.39185 to 0.43444, saving model to Securythm.h5
Epoch 8/200
Epoch 8: val_accuracy improved from 0.43444 to 0.45343, saving model to Securythm.h5
Epoch 9/200
Epoch 9: val_accuracy improved from 0.45343 to 0.52298, saving model to Securythm.h5
Epoch 10/200
Epoch 10: val_accuracy improved from 0.52298 to 0.55270, saving model to Securythm.h5
Epoch 11/200
Epoch 11: val_accu

In [58]:
test_loss , test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

Test accuracy: 0.8855392336845398
Test loss: 0.8882290124893188


In [59]:
# Build LSTM model

model = Sequential()
model.add(LSTM(128,activation='relu',return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128,activation='relu',return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(64, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(max(y)+1, activation='softmax'))
opt = keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

mc = ModelCheckpoint('Securythm1.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
early_stop = EarlyStopping(monitor='val_loss', patience=20)


model.fit(X_train, y_train, epochs=200, validation_data=(X_val, y_val), callbacks=[mc, early_stop])
model.summary()




Epoch 1/200
Epoch 1: val_accuracy improved from -inf to 0.05025, saving model to Securythm1.h5
Epoch 2/200
Epoch 2: val_accuracy improved from 0.05025 to 0.08854, saving model to Securythm1.h5
Epoch 3/200
Epoch 3: val_accuracy improved from 0.08854 to 0.10202, saving model to Securythm1.h5
Epoch 4/200
Epoch 4: val_accuracy improved from 0.10202 to 0.10999, saving model to Securythm1.h5
Epoch 5/200
Epoch 5: val_accuracy improved from 0.10999 to 0.12960, saving model to Securythm1.h5
Epoch 6/200
Epoch 6: val_accuracy improved from 0.12960 to 0.14062, saving model to Securythm1.h5
Epoch 7/200
Epoch 7: val_accuracy improved from 0.14062 to 0.15809, saving model to Securythm1.h5
Epoch 8/200
Epoch 8: val_accuracy improved from 0.15809 to 0.19822, saving model to Securythm1.h5
Epoch 9/200
Epoch 9: val_accuracy improved from 0.19822 to 0.23928, saving model to Securythm1.h5
Epoch 10/200
Epoch 10: val_accuracy improved from 0.23928 to 0.28646, saving model to Securythm1.h5
Epoch 11/200
Epoch 11

In [60]:
test_loss , test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
print('Test loss:', test_loss)

Test accuracy: 0.8835784196853638
Test loss: 2.3001418113708496


In [66]:
X_test.shape

(4080, 1, 33)

In [75]:
# load model
from keras.models import load_model
ScoresDF = pd.DataFrame(columns=['ModelName','TestAccuracy','TestLoss'])

#load model
model = load_model('Securythm.h5')


test_loss , test_acc = model.evaluate(X_test, y_test , batch_size=16)

ScoresDF = pd.concat([ScoresDF, pd.DataFrame({'ModelName':['Securythm.h5'],'TestAccuracy':[test_acc],'TestLoss':[test_loss]})], ignore_index=True)





In [76]:
second_model = load_model('Securythm1.h5')
test_loss , test_acc = second_model.evaluate(X_test, y_test, batch_size=16)
ScoresDF = ScoresDF.append({'ModelName':'Second Model','TestAccuracy':test_acc,'TestLoss':test_loss},ignore_index=True)




  ScoresDF = ScoresDF.append({'ModelName':'Second Model','TestAccuracy':test_acc,'TestLoss':test_loss},ignore_index=True)


In [77]:

ScoresDF

Unnamed: 0,ModelName,TestAccuracy,TestLoss
0,Securythm.h5,0.876961,0.92394
1,Second Model,0.88799,5.035567
