# Keystroke Biometrics
## Goal: identify a person based on the way they type
> The data consist of keystroke-timing information from 51 subjects (typists), each typing a password (.tie5Roanl) 400 times.

In [9]:
# conda install tensorflow

In [None]:
conda install scikit-learn

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw keystroke-timing records (one row per typed password repetition)
df = pd.read_csv(filepath_or_buffer='DSL-StrongPasswordData.csv')

In [7]:
# Preview the first 100 rows of the dataset
df.iloc[:100]

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.2340,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.0560,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.1040,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.0270,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,s002,2,46,0.1080,0.1758,0.0678,0.0932,0.1392,0.0460,0.0771,...,0.1180,0.1405,0.0225,0.0708,0.1901,0.1193,0.0826,0.2171,0.1345,0.0755
96,s002,2,47,0.1053,0.1715,0.0662,0.0839,0.1219,0.0380,0.0718,...,0.1220,0.1120,-0.0100,0.0723,0.1987,0.1264,0.0723,0.2337,0.1614,0.0942
97,s002,2,48,0.1059,0.1882,0.0823,0.0913,0.1309,0.0396,0.0718,...,0.0942,0.1051,0.0109,0.0686,0.2180,0.1494,0.0715,0.3572,0.2857,0.1069
98,s002,2,49,0.1262,0.2715,0.1453,0.0934,0.1744,0.0810,0.0721,...,0.1146,0.1279,0.0133,0.0639,0.1808,0.1169,0.0871,0.2192,0.1321,0.0821


In [3]:
# Summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20400 entries, 0 to 20399
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subject          20400 non-null  object 
 1   sessionIndex     20400 non-null  int64  
 2   rep              20400 non-null  int64  
 3   H.period         20400 non-null  float64
 4   DD.period.t      20400 non-null  float64
 5   UD.period.t      20400 non-null  float64
 6   H.t              20400 non-null  float64
 7   DD.t.i           20400 non-null  float64
 8   UD.t.i           20400 non-null  float64
 9   H.i              20400 non-null  float64
 10  DD.i.e           20400 non-null  float64
 11  UD.i.e           20400 non-null  float64
 12  H.e              20400 non-null  float64
 13  DD.e.five        20400 non-null  float64
 14  UD.e.five        20400 non-null  float64
 15  H.five           20400 non-null  float64
 16  DD.five.Shift.r  20400 non-null  float64
 17  UD.five.Shif

# What do the columns mean?
- subject: the subject id (e.g. s002 or s057)
- Not every subject id is present: for example, s001 did not attend, so he/she is not included in the dataset
- sessionIndex: the session (1 to 8) in which the password was typed; the subject took a break between sessions

## The remaining columns are the repetition counter and timing information
- rep: the repetition number of the password typed within the session
- H.period: the duration where the period key was held down
- DD.period.t: the duration between pressing the period key and then pressing the t key
- UD.period.t: the duration between releasing the period key and then pressing the t key

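# Note
- UD can be negative if the user does not release the key before pressing the next key
- H + UD = DD for each key pair, e.g. H.period + UD.period.t = DD.period.t (0.1491 + 0.2488 = 0.3979 in the first row)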

Consider the following one-line example of what you will see in the data:
  subject  sessionIndex  rep      H.period   DD.period.t   UD.period.t     ...
     s002             1    1        0.1491        0.3979        0.2488     ...
The example presents typing data for subject 2, session 1, repetition 1. The period key was held down for 0.1491 seconds (149.1 milliseconds); the time between pressing the period key and the t key (keydown-keydown time) was 0.3979 seconds; the time between releasing the period and pressing the t key (keyup-keydown time) was 0.2488 seconds; and so on.

In [None]:
# List the distinct subject ids present in the dataset
pd.unique(df['subject'])

In [8]:
import tensorflow as tf
from tensorflow import keras
# Import layers/models from the same `tensorflow.keras` namespace as `keras`
# above; mixing it with the standalone `keras` package can pair incompatible
# versions of the two libraries.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Activation, Flatten
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features: keep only the keystroke-timing columns (H.*, DD.*, UD.*).
# `subject` is the target, and `sessionIndex` / `rep` are bookkeeping columns
# that say when a sample was recorded, not how the password was typed.
X = df.drop(columns=['subject', 'sessionIndex', 'rep'])

# Target: integer-encode the subject ids (e.g. 's002') so they can be used
# with a sparse categorical loss.
le = LabelEncoder()
y = le.fit_transform(df['subject'])

# Hold out 20% of the samples for testing. Stratifying on the label keeps
# every subject represented in the same proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Display the integer-encoded subject labels produced by the LabelEncoder
y

In [None]:
# Build LSTM model

# LSTM layers consume 3-D input shaped (samples, timesteps, features), but the
# split data is a 2-D table. Treat each column as one timestep carrying a
# single feature by appending a trailing axis.
X_train_seq = np.asarray(X_train, dtype='float32')[..., np.newaxis]
X_test_seq = np.asarray(X_test, dtype='float32')[..., np.newaxis]

# The softmax layer needs the NUMBER of distinct subjects, not the array of ids.
n_classes = df['subject'].nunique()

model = Sequential()
model.add(LSTM(128, input_shape=X_train_seq.shape[1:], activation='relu', return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(n_classes, activation='softmax'))

# `lr=` and `decay=` are not accepted by current Keras optimizers. Use
# `learning_rate=` and express the same time-based decay,
# lr / (1 + 1e-6 * step), as an explicit schedule.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001, decay_steps=1, decay_rate=1e-6
)
opt = keras.optimizers.Adam(learning_rate=lr_schedule)
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model.summary()

model.fit(X_train_seq, y_train, epochs=10, validation_data=(X_test_seq, y_test))

