In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models

In [47]:
# Load the data
data = pd.read_csv('key_press_data.csv')

In [48]:
data.head()

Unnamed: 0,User ID,Key,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration
0,1,Key.enter,0.0,,0.0,0,1.555494
1,1,'j',1.007231,,0.39021,0,2.562725
2,1,'s',0.248771,,1.778412,0,2.811496
3,1,'k',0.00984,,1.77221,0,2.821336
4,1,'d',0.021683,,2.110432,0,2.843019


In [49]:
data.tail()

Unnamed: 0,User ID,Key,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration
70,1,'f',0.029834,,10.281413,0,7.100191
71,1,'d',0.023279,,10.247814,0,7.123471
72,1,'i',0.080198,,10.133725,0,7.203669
73,1,'u',0.012482,,10.116196,0,7.216151
74,1,Key.esc,1.398343,,8.590173,0,8.614494


In [25]:
# Preprocess the data
def preprocess_data(data, scaler=None, template_columns=None):
    # Convert key column to string
    data['Key'] = data['Key'].astype(str)

    # One-hot encode the 'Key' column
    data = pd.get_dummies(data, columns=['Key'])

    # Align with template columns if provided
    if template_columns is not None:
        for col in template_columns:
            if col not in data.columns:
                data[col] = 0
        data = data[template_columns]
    else:
        template_columns = data.columns.tolist()

    # Fill any NaN values with 0
    data.fillna(0, inplace=True)

    # Normalize numerical columns
    if scaler is None:
        scaler = StandardScaler()
        data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']] = scaler.fit_transform(
            data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']]
        )
    else:
        data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']] = scaler.transform(
            data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']]
        )

    return data, scaler, template_columns

In [26]:
preprocessed_data, scaler, template_columns = preprocess_data(data)

In [86]:
preprocessed_data.head()

Unnamed: 0,User ID,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration,Key_'b',Key_'c',Key_'d',Key_'f',...,Key_'j',Key_'k',Key_'r',Key_'s',Key_'u',Key_'v',Key_'w',Key_'x',Key_Key.enter,Key_Key.esc
0,1,-0.475639,0.0,-2.552988,0.0,-2.468609,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,4.61444,0.0,-2.402033,0.0,-1.792014,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,1,0.781534,0.0,-1.864999,0.0,-1.624905,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,1,-0.425912,0.0,-1.867399,0.0,-1.618295,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
4,1,-0.366063,0.0,-1.736555,0.0,-1.60373,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [87]:
template_columns

['User ID',
 'Duration',
 'Time Between Keys',
 'Typing Speed (KPS)',
 'Backspace Count',
 'Typing Session Duration',
 "Key_'b'",
 "Key_'c'",
 "Key_'d'",
 "Key_'f'",
 "Key_'h'",
 "Key_'i'",
 "Key_'j'",
 "Key_'k'",
 "Key_'r'",
 "Key_'s'",
 "Key_'u'",
 "Key_'v'",
 "Key_'w'",
 "Key_'x'",
 'Key_Key.enter',
 'Key_Key.esc']

In [27]:
# Ensure all columns are numeric
assert preprocessed_data.applymap(np.isreal).all().all()

In [88]:
preprocessed_data.head()

Unnamed: 0,User ID,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration,Key_'b',Key_'c',Key_'d',Key_'f',...,Key_'j',Key_'k',Key_'r',Key_'s',Key_'u',Key_'v',Key_'w',Key_'x',Key_Key.enter,Key_Key.esc
0,1,-0.475639,0.0,-2.552988,0.0,-2.468609,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,4.61444,0.0,-2.402033,0.0,-1.792014,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,1,0.781534,0.0,-1.864999,0.0,-1.624905,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,1,-0.425912,0.0,-1.867399,0.0,-1.618295,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
4,1,-0.366063,0.0,-1.736555,0.0,-1.60373,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [28]:
# Split the data into training and test sets
X = preprocessed_data.drop(columns=['User ID'])
y = preprocessed_data['User ID']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Ensure there are no infinite values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [30]:
# Convert to numpy arrays
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

In [31]:
# Ensure the data types are float32
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [33]:
# Build the autoencoder model
input_dim = X_train.shape[1]
encoding_dim = 32  # Size of the encoding layer

autoencoder = models.Sequential([
    layers.InputLayer(input_shape=(input_dim,)),
    layers.Dense(encoding_dim, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')

In [64]:
autoencoder.add(tf.keras.layers.Dense(256, input_shape=(X_train.shape[1],), activation='sigmoid'))

In [65]:
# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7a01f84c5ea0>

In [66]:
# Evaluate the model
reconstructions = autoencoder.predict(X_test)
reconstruction_errors = np.mean(np.square(reconstructions - X_test), axis=1)



In [67]:
# Set a threshold for anomaly detection
threshold = np.mean(reconstruction_errors) + 2 * np.std(reconstruction_errors)

In [91]:
def is_anomaly(new_data, autoencoder, threshold, scaler, template_columns):
    # Preprocess the new data with the same template columns and scaler
    new_data, _, _ = preprocess_data(new_data, scaler, template_columns)

    # Ensure new_data columns match template_columns
    for col in template_columns:
        if col not in new_data.columns:
            new_data[col] = 0

    # Ensure the order of columns matches template_columns
    new_data = new_data[template_columns]

    print("new_data.head()", new_data.head())
    print("New data shape before reshape:", new_data.shape)

    # Convert to numpy array and ensure the data type is float32
    new_data = new_data.to_numpy().astype(np.float32)

    # Reshape the new data to match the expected input shape
    new_data = new_data[:, :autoencoder.input_shape[1]]  # Ensure correct number of features
    new_data = new_data.reshape(-1, autoencoder.input_shape[1])  # Reshape to match the model input shape

    # Check shapes after reshape
    print("New data shape after reshape:", new_data.shape)

    # Predict using the autoencoder
    reconstructions = autoencoder.predict(new_data)
    reconstruction_errors = np.mean(np.square(reconstructions - new_data), axis=1)

    # Check if the reconstruction error exceeds the threshold
    return reconstruction_errors > threshold


In [92]:
# Example usage
new_data = pd.read_csv('new_key_press_data.csv')
anomalies = is_anomaly(new_data, autoencoder, threshold, scaler, template_columns)
print("Anomalies detected:", np.sum(anomalies))


new_data.head()    User ID  Duration  Time Between Keys  Typing Speed (KPS)  Backspace Count  \
0        1 -0.475639                0.0           -2.552988              0.0   
1        1  4.614440                0.0           -2.402033              0.0   
2        1  0.781534                0.0           -1.864999              0.0   
3        1 -0.425912                0.0           -1.867399              0.0   
4        1 -0.366063                0.0           -1.736555              0.0   

   Typing Session Duration  Key_'b'  Key_'c'  Key_'d'  Key_'f'  ...  Key_'j'  \
0                -2.468609        0        0    False    False  ...    False   
1                -1.792014        0        0    False    False  ...     True   
2                -1.624905        0        0    False    False  ...    False   
3                -1.618295        0        0    False    False  ...    False   
4                -1.603730        0        0     True    False  ...    False   

   Key_'k'  Key_'r'  K