In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten,LSTM, Dense, UpSampling1D,Reshape,MaxPooling1D,Conv1D,Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# tran and save model start
def train_and_save_model(model, data, username, epochs=10, batch_size=32):
    # Compile the model with a binary cross-entropy loss function and accuracy metric
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model on the keystroke data
    model.fit(data, data, epochs=epochs, batch_size=batch_size)
    
    # Save the trained model to a file and return the file path
    model_path = f'models/{username}_model.h5'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    
    return model_path

# tran and save model start




# generative function start
def create_generator_model(input_dim):
    # Initialize the generator model
    model = Sequential()
    
    # Add dense layers to the generator
    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(input_dim, activation='tanh'))  # Output layer
    
    return model

def create_discriminator_model(input_dim):
    # Initialize the discriminator model
    model = Sequential()
    
    # Add dense layers to the discriminator
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    
    return model

# generative function end

# Long Short-Term Memory (LSTM) Network start
def create_lstm_model(input_shape):
    # Initialize the LSTM model
    model = Sequential()
    
    # Add LSTM layers
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50))
    
    # Add the output layer for binary classification
    model.add(Dense(1, activation='sigmoid'))
    
    return model
# Long Short-Term Memory (LSTM) Network end


# Multi-Layer Perceptron (MLP) start

def create_mlp_model(input_dim):
    # Initialize the MLP model
    model = Sequential()
    
    # Add dense layers to the MLP
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    
    return model

# Multi-Layer Perceptron (MLP) end


# random forest layer start
def train_random_forest_model(filepath):
    # Load the keystroke data from the CSV file
    df = pd.read_csv(filepath)
    
    # Separate the features (flight time, delay time) and the target (keystroke type)
    X = df[['hold_time', 'flight_time']].fillna(0)
    y = df['key']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train the Random Forest classifier
    rf_model = RandomForestClassifier(n_estimators=100)
    rf_model.fit(X_train, y_train)
    
    # Evaluate the model on the testing set
    y_pred = rf_model.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    
    return rf_model, rf_accuracy
# random forest layer end

# Convolutional Neural Network (CNN) start
def create_cnn_model(input_shape):
    # Initialize the CNN model
    model = Sequential()
    
    # Add convolutional and pooling layers
    model.add(Conv2D(9, kernel_size=(9), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Flatten the output and add dense layers
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    
    return model

# Convolutional Neural Network (CNN) end


def process_data(data = []):
    rf = np.array(data)
    keystrokes = []
    press_times = []
    release_times = []
    
    for k in rf:
        keystrokes.append(k[0])
        if k[2] == 'keydown':
            press_times.append(k[1])
        elif k[2] == 'keyup':
            release_times.append(k[1])
    
    print('keystrokes: ',len(keystrokes))
    print('\n')
    print('press times: ',len(press_times))
    print('\n')
    print('released times: ',len(release_times))
    flight_times = np.array(release_times) - np.array(press_times)
    
    print('released times: ',len(flight_times),flight_times)
    
    delayed_times = np.array(press_times[1:]) - np.array(release_times[:-1])
    print('\n')
    print('delayed times: ',len(delayed_times),delayed_times)
    
    new_data = np.concatenate([flight_times,delayed_times])
    print('\n')
    print('fight/delayed times: ',len(new_data),new_data.shape,new_data)
    reshaped_data = new_data.reshape(-1,1)
    print('\n')
    print('reshaped fight/delayed times: ',len(reshaped_data),reshaped_data.shape,reshaped_data)
    
    model = create_cnn_model(input_shape=reshaped_data.shape)
    # model_path = train_and_save_model(model,reshaped_data,'GOD IS GOOD')
    
    print('\n')
    # print(f'model path: {model_path}')
    
# save processed featured data start
def save_processed_data(username,dataPath):
    process_dir = os.path.join('featured_data','featured_csvs')
    if not os.path.exists(process_dir):
        os.makedirs(process_dir)
    processed_filepath = os.path.join(process_dir, f'{username}_processed_featured_data.csv')
    processed_filepath2 = os.path.join(process_dir, f'{username}_processed_featured_data2.csv')
    
    hold_times= []
    flight_times= []
    last_release_time = None
    dataFm = pd.read_csv(dataPath)
    
    for i in range(len(dataFm)):
        if dataFm.iloc[i]['event'] == 'keydown':
            release_index = dataFm[(dataFm['key']== dataFm.iloc[i]['key']) & (dataFm['event'] == 'keyup')].index
            print('release index:', release_index)
            if not release_index.empty:
                hold_time = dataFm.loc[release_index[0], 'time'] - dataFm.iloc[i]['time']
                hold_times.append(hold_time)
            else:
                hold_times.append(None)
            if last_release_time is not None:
                flight_time = dataFm.iloc[i]['time'] - last_release_time
                flight_times.append(flight_time)
            else:
                flight_times.append(None)
        elif dataFm.iloc[i]['event'] == 'keyup':
            last_release_time = dataFm.iloc[i]['time']
        
    data = {'hold_time': hold_times, 'flight_time': flight_times}
    print('DATA',data)
    
    dataFm['hold_time'] = pd.Series(hold_times)
    dataFm['flight_time'] = pd.Series(flight_times)
    
    dataFm.to_csv(processed_filepath2, index=False)
    
            
    
# save processed featured data end

# generate features start
def generate_features(username,filepath):
    # Generate features here

    process_dir = os.path.join('processed_data','processed_features')
    if not os.path.exists(process_dir):
        os.makedirs(process_dir)
    processed_filepath = os.path.join(process_dir, f'{username}_processed_featured_data.csv')
    # Load the raw keystroke data
    raw_data = pd.read_csv(filepath)

    # Sort data by timestamp to ensure correct order
    raw_data = raw_data.sort_values(by='time')

    # Initialize lists to store computed features
    hold_times = []
    flight_times = []
    delay_times = []

    # Initialize variables to keep track of previous key events
    prev_keyup_time = None
    prev_keydown_time = None
    prev_key = None

    # Process the raw keystroke data
    for index, row in raw_data.iterrows():
        key = row['key']
        event = row['event']
        timestamp = row['time']
        
        if event == 'keydown':
            # Calculate delay time (time between the previous keydown and current keydown)
            if prev_keydown_time is not None:
                delay_time = timestamp - prev_keydown_time
                delay_times.append({'key': key, 'delay_time': delay_time})
            
            # Update the previous keydown timestamp
            prev_keydown_time = timestamp
        
        elif event == 'keyup':
            # Calculate hold time (time between keydown and keyup of the same key)
            hold_time = timestamp - prev_keydown_time
            hold_times.append({'key': key, 'hold_time': hold_time})
            
            # Calculate flight time (time between the previous keyup and the current keydown)
            if prev_keyup_time is not None:
                flight_time = timestamp - prev_keyup_time
                flight_times.append({'key': key, 'flight_time': flight_time})
            
            # Update the previous keyup timestamp
            prev_keyup_time = timestamp

    # Convert the lists to DataFrames
    hold_times_df = pd.DataFrame(hold_times)
    flight_times_df = pd.DataFrame(flight_times)
    delay_times_df = pd.DataFrame(delay_times)
    print("hold_times: ",hold_times)
    print("\n")
    print("flight_times: ", flight_times)
    print("\n")
    print("delay_times: ",delay_times)
    print("\n")

    # Merge all the DataFrames on the 'key' column
    features_df = pd.merge(hold_times_df, flight_times_df, on='key', how='outer')
    features_df = pd.merge(features_df, delay_times_df, on='key', how='outer')
    
    print("\n")
    print("features_df: ",features_df)

    # Save the features to a CSV file
    features_df.to_csv(processed_filepath, index=False)

    print(f"Keystroke features saved to '{processed_filepath}'")
    """
    Explanation of the Script

    Loading and Sorting:
        The script starts by loading the raw keystroke data from a CSV file and sorting it by timestamp to ensure the events are processed in chronological order.

    Feature Computation:
        Hold Time: The script calculates the time a key is held down by finding the difference between the keydown and keyup events for the same key.
        Flight Time: The time between releasing one key and pressing the next key is computed.
        Delay Time: The time between pressing two consecutive keys is calculated.

    Data Merging:
        The computed features are stored in separate lists, which are then converted to pandas DataFrames.
        These DataFrames are merged on the key column to form a single DataFrame containing all the features.
    """
    return processed_filepath

# generate features end

# create cnn model start
def create_model(input_shape):
    model = Sequential()
    
    # Encoder
    model.add(Conv1D(filters=16, kernel_size=2, activation='relu', input_shape=input_shape, padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Conv1D(filters=8, kernel_size=2, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    
    # Flatten and bottleneck layer
    model.add(Flatten())
    model.add(Dense(8, activation='relu'))  # Bottleneck layer
    
    # Decoder
    model.add(Dense(8 * (input_shape[0] // 4), activation='relu'))
    model.add(Reshape((input_shape[0] // 4, 8)))
    model.add(UpSampling1D(2))
    model.add(Conv1D(filters=8, kernel_size=2, activation='relu', padding='same'))
    model.add(UpSampling1D(2))
    model.add(Conv1D(filters=16, kernel_size=2, activation='relu', padding='same'))
    model.add(Conv1D(filters=1, kernel_size=2, activation='sigmoid', padding='same'))
    
    model.compile(optimizer='adam', loss='mse')
    return model
# create cnn model end


# train and save model start
def load_preprocess_data_train_model(username,filepath):
    # Load the keystroke data
    data = pd.read_csv(filepath)
    key_out = data.drop('key',axis=1)
    key_out = key_out.drop('hold_time',axis=1)
    key_out = key_out.drop('flight_time',axis=1)
    # Use all data as features (no label column)
    X = key_out.values

    # Split data for testing (here using 80% training and 20% testing)
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

    # Standardize
    scaler = StandardScaler()
    X = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

    # Train/test split
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

    # Reshape if necessary for the CNN
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    
    # Create the model
    model = create_model(X_train.shape[1:])

    # Model summary
    model.summary()
    
    # Train the model
    model.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

    model_dir = os.path.join('trained_models','model_files')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, f'{username}_keystroke_auth_model.keras')
    # Save the trained model
    model.save(model_filepath)
    
    # Detect anomalies
    # Load the trained autoencoder model
    autoencoder = tf.keras.models.load_model(model_filepath)

    # Compute reconstruction error on test data
    reconstructions = autoencoder.predict(X_test)
    # reconstructions = autoencoder.predict(X)
    mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
    # mse = np.mean(np.power(X - reconstructions, 2), axis=1)

    # Define a threshold for anomaly detection
    threshold = np.percentile(mse, 95)  # For example, use the 95th percentile as the threshold

    # Predict anomalies
    anomalies = mse > threshold
    
    # if mse > threshold:
    #     print(f"Anomaly detection threshold: {threshold} (imposter)")
    #     print("\n")
    # else:
    #     print(f"Anomaly detection threshold: {threshold} (genuine user)")
    #     print("\n")

    # Output results
    for i, is_anomaly in enumerate(anomalies):
        if is_anomaly:
            print(f"Test sample {i} is an anomaly (likely an impostor).")
            print("\n")
        else:
            print(f"Test sample {i} is normal (likely a genuine user).")
            print("\n")

# train and save model end


# create Multi-Layer Perceptron (MLP) model start
def create_mlp_model(input_shape):
    model = Sequential()
    model.add(Dense(64, input_shape=(input_shape,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# create Multi-Layer Perceptron (MLP) model end


# train and save mlp model start
def preprocess_mlp_data(data):
    X = data.drop('key', axis=1).values  # Features
    y = data.drop('key',axis=1)['delay_time'].values  # Labels
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test
    
def load_preprocess_data_train_mlp_model(username,filepath):
    # Load the keystroke data
    data = pd.read_csv(filepath)
    X_train, X_test, y_train, y_test = preprocess_mlp_data(data)
    
    print("X_train.shape: ",X_train.shape,"X_test.shape: ",X_test.shape)
    
    model = create_mlp_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
    
    # Save the trained model
    model_dir = os.path.join('trained_models','model_files')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, f'{username}_keystroke_auth_mlp_model.keras')
    # Save the trained model
    model.save(model_filepath)
    model.summary
    
    # Detect anomalies
    # Load the trained autoencoder model
    autoencoder = tf.keras.models.load_model(model_filepath)

    # Compute reconstruction error on test data
    reconstructions = autoencoder.predict(X_test)
    # reconstructions = autoencoder.predict(X)
    mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
    # mse = np.mean(np.power(X - reconstructions, 2), axis=1)

    # Define a threshold for anomaly detection
    threshold = np.percentile(mse, 95)  # For example, use the 95th percentile as the threshold

    # Predict anomalies
    anomalies = mse > threshold
    
    # if mse > threshold:
    #     print(f"Anomaly detection threshold: {threshold} (imposter)")
    #     print("\n")
    # else:
    #     print(f"Anomaly detection threshold: {threshold} (genuine user)")
    #     print("\n")

    # Output results
    for i, is_anomaly in enumerate(anomalies):
        if is_anomaly:
            print(f"Test sample {i} is an anomaly (likely an impostor).")
            print("\n")
        else:
            print(f"Test sample {i} is normal (likely a genuine user).")
            print("\n")
    
#train model mlp end





    
    

    

def process_items():
    # data = pd.read_csv("../data/raw/jjulius.csv")
    # print("raw data: ",data)
    featured_path =  generate_features("jjulius","../data/raw/jjulius.csv")
    load_preprocess_data_train_model("jjulius",featured_path)
    # load_preprocess_data_train_mlp_model("jjulius",featured_path)
    # save_processed_data('jjulius',"../data/raw/jjulius.csv")
    # process_data(data)
    # print(data)
    
if __name__ == "__main__":
    process_items()
    


hold_times:  [{'key': 'h', 'hold_time': 115}, {'key': 'e', 'hold_time': 128}, {'key': 'l', 'hold_time': 82}, {'key': 'l', 'hold_time': 126}, {'key': 'o', 'hold_time': 146}, {'key': 'w', 'hold_time': 118}, {'key': ' ', 'hold_time': 152}, {'key': 'w', 'hold_time': 22}, {'key': 'o', 'hold_time': 138}, {'key': 'r', 'hold_time': 133}, {'key': 'l', 'hold_time': 78}, {'key': 'd', 'hold_time': 133}, {'key': ' ', 'hold_time': 127}, {'key': 'h', 'hold_time': 101}, {'key': 'e', 'hold_time': 106}, {'key': 'l', 'hold_time': 85}, {'key': 'l', 'hold_time': 96}, {'key': 'o', 'hold_time': 83}, {'key': ' ', 'hold_time': 114}, {'key': 'u', 'hold_time': 80}, {'key': 'g', 'hold_time': 111}, {'key': 'a', 'hold_time': 15}, {'key': 'n', 'hold_time': 139}, {'key': 'd', 'hold_time': 110}, {'key': 'a', 'hold_time': 170}, {'key': ' ', 'hold_time': 121}, {'key': 'a', 'hold_time': 103}, {'key': 'r', 'hold_time': 118}, {'key': 'e', 'hold_time': 203}, {'key': ' ', 'hold_time': 111}, {'key': 'y', 'hold_time': 70}, {'k

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: nan - val_loss: nan
Epoch 2/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 5/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 6/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 7/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 8/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 9/50
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: nan - val_lo

W0000 00:00:1724898299.317267   51254 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Conv2D" attr { key: "T" value { type: DT_FLOAT } } attr { key: "data_format" value { s: "NHWC" } } attr { key: "dilations" value { list { i: 1 i: 1 i: 1 i: 1 } } } attr { key: "explicit_paddings" value { list { } } } attr { key: "padding" value { s: "SAME" } } attr { key: "strides" value { list { i: 1 i: 1 i: 1 i: 1 } } } attr { key: "use_cudnn_on_gpu" value { b: true } } inputs { dtype: DT_FLOAT shape { dim { size: 32 } dim { size: 1 } dim { } dim { size: 8 } } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 2 } dim { size: 8 } dim { size: 8 } } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2799 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 4194304 memory_size: 268435456 } out

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Test sample 0 is normal (likely a genuine user).


Test sample 1 is normal (likely a genuine user).


Test sample 2 is normal (likely a genuine user).


Test sample 3 is normal (likely a genuine user).


Test sample 4 is normal (likely a genuine user).


Test sample 5 is normal (likely a genuine user).


Test sample 6 is normal (likely a genuine user).


Test sample 7 is normal (likely a genuine user).


Test sample 8 is normal (likely a genuine user).


Test sample 9 is normal (likely a genuine user).


Test sample 10 is normal (likely a genuine user).


Test sample 11 is normal (likely a genuine user).


Test sample 12 is normal (likely a genuine user).


Test sample 13 is normal (likely a genuine user).


Test sample 14 is normal (likely a genuine user).


Test sample 15 is normal (likely a genuine user).


Test sample 16 is normal (likely a genuine user).


Test sample 17 is normal (likely a genuine user).




  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
