In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten,LSTM, Dense, UpSampling1D,Reshape,MaxPooling1D,Conv1D,Dropout,LeakyReLU
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import os


# tran and save model start
def train_and_save_model(model, data, username, epochs=10, batch_size=32):
    # Compile the model with a binary cross-entropy loss function and accuracy metric
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model on the keystroke data
    model.fit(data, data, epochs=epochs, batch_size=batch_size)
    
    # Save the trained model to a file and return the file path
    model_path = f'models/{username}_model.h5'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    
    return model_path

# tran and save model start




# generative function start
def create_generator_model(input_dim):
    # Initialize the generator model
    model = Sequential()
    
    # Add dense layers to the generator
    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(input_dim, activation='tanh'))  # Output layer
    
    return model

def create_discriminator_model(input_dim):
    # Initialize the discriminator model
    model = Sequential()
    
    # Add dense layers to the discriminator
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    
    return model

# generative function end

# Long Short-Term Memory (LSTM) Network start
def create_lstm_model(input_shape):
    # Initialize the LSTM model
    model = Sequential()
    
    # Add LSTM layers
    model.add(LSTM(50, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(50))
    
    # Add the output layer for binary classification
    model.add(Dense(1, activation='sigmoid'))
    
    return model
# Long Short-Term Memory (LSTM) Network end


# Multi-Layer Perceptron (MLP) start

def create_mlp_model(input_dim):
    # Initialize the MLP model
    model = Sequential()
    
    # Add dense layers to the MLP
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    
    return model

# Multi-Layer Perceptron (MLP) end


# random forest layer start
def train_random_forest_model(filepath):
    # Load the keystroke data from the CSV file
    df = pd.read_csv(filepath)
    
    # Separate the features (flight time, delay time) and the target (keystroke type)
    X = df[['hold_time', 'flight_time']].fillna(0)
    y = df['key']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train the Random Forest classifier
    rf_model = RandomForestClassifier(n_estimators=100)
    rf_model.fit(X_train, y_train)
    
    # Evaluate the model on the testing set
    y_pred = rf_model.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    
    return rf_model, rf_accuracy
# random forest layer end

# Convolutional Neural Network (CNN) start
def create_cnn_model(input_shape):
    # Initialize the CNN model
    model = Sequential()
    
    # Add convolutional and pooling layers
    model.add(Conv2D(9, kernel_size=(9), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Flatten the output and add dense layers
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    
    return model

# Convolutional Neural Network (CNN) end


def process_data(data = []):
    rf = np.array(data)
    keystrokes = []
    press_times = []
    release_times = []
    
    for k in rf:
        keystrokes.append(k[0])
        if k[2] == 'keydown':
            press_times.append(k[1])
        elif k[2] == 'keyup':
            release_times.append(k[1])
    
    print('keystrokes: ',len(keystrokes))
    print('\n')
    print('press times: ',len(press_times))
    print('\n')
    # print('released times: ',len(release_times))
    flight_times = np.array(release_times) - np.array(press_times)
    
    # print('released times: ',len(flight_times),flight_times)
    
    delayed_times = np.array(press_times[1:]) - np.array(release_times[:-1])
    print('\n')
    # print('delayed times: ',len(delayed_times),delayed_times)
    
    new_data = np.concatenate([flight_times,delayed_times])
    print('\n')
    # print('fight/delayed times: ',len(new_data),new_data.shape,new_data)
    reshaped_data = new_data.reshape(-1,1)
    print('\n')
    # print('reshaped fight/delayed times: ',len(reshaped_data),reshaped_data.shape,reshaped_data)
    
    model = create_cnn_model(input_shape=reshaped_data.shape)
    # model_path = train_and_save_model(model,reshaped_data,'GOD IS GOOD')
    
    print('\n')
    # print(f'model path: {model_path}')
    
# save processed featured data start
def save_processed_data(username,dataPath):
    process_dir = os.path.join('featured_data','featured_csvs')
    if not os.path.exists(process_dir):
        os.makedirs(process_dir)
    processed_filepath = os.path.join(process_dir, f'{username}_processed_featured_data.csv')
    processed_filepath2 = os.path.join(process_dir, f'{username}_processed_featured_data2.csv')
    
    hold_times= []
    flight_times= []
    last_release_time = None
    dataFm = pd.read_csv(dataPath)
    
    for i in range(len(dataFm)):
        if dataFm.iloc[i]['event'] == 'keydown':
            release_index = dataFm[(dataFm['key']== dataFm.iloc[i]['key']) & (dataFm['event'] == 'keyup')].index
            print('release index:', release_index)
            if not release_index.empty:
                hold_time = dataFm.loc[release_index[0], 'time'] - dataFm.iloc[i]['time']
                hold_times.append(hold_time)
            else:
                hold_times.append(None)
            if last_release_time is not None:
                flight_time = dataFm.iloc[i]['time'] - last_release_time
                flight_times.append(flight_time)
            else:
                flight_times.append(None)
        elif dataFm.iloc[i]['event'] == 'keyup':
            last_release_time = dataFm.iloc[i]['time']
        
    data = {'hold_time': hold_times, 'flight_time': flight_times}
    # print('DATA',data)
    
    dataFm['hold_time'] = pd.Series(hold_times)
    dataFm['flight_time'] = pd.Series(flight_times)
    
    dataFm.to_csv(processed_filepath2, index=False)
    
            
    
# save processed featured data end

# generate features start
def generate_features(username,filepath):
    # Generate features here

    process_dir = os.path.join('processed_data','processed_features')
    if not os.path.exists(process_dir):
        os.makedirs(process_dir)
    processed_filepath = os.path.join(process_dir, f'{username}_processed_featured_data.csv')
    # Load the raw keystroke data
    raw_data = pd.read_csv(filepath)

    # Sort data by timestamp to ensure correct order
    raw_data = raw_data.sort_values(by='time')

    # Initialize lists to store computed features
    hold_times = []
    flight_times = []
    delay_times = []

    # Initialize variables to keep track of previous key events
    prev_keyup_time = None
    prev_keydown_time = None
    prev_key = None

    # Process the raw keystroke data
    for index, row in raw_data.iterrows():
        key = row['key']
        event = row['event']
        timestamp = row['time']
        
        if event == 'keydown':
            # Calculate delay time (time between the previous keydown and current keydown)
            if prev_keydown_time is not None:
                delay_time = timestamp - prev_keydown_time
                delay_times.append({'key': key, 'delay_time': delay_time})
            
            # Update the previous keydown timestamp
            prev_keydown_time = timestamp
        
        elif event == 'keyup':
            # Calculate hold time (time between keydown and keyup of the same key)
            hold_time = timestamp - prev_keydown_time
            hold_times.append({'key': key, 'hold_time': hold_time})
            
            # Calculate flight time (time between the previous keyup and the current keydown)
            if prev_keyup_time is not None:
                flight_time = timestamp - prev_keyup_time
                flight_times.append({'key': key, 'flight_time': flight_time})
            
            # Update the previous keyup timestamp
            prev_keyup_time = timestamp

    # Convert the lists to DataFrames
    hold_times_df = pd.DataFrame(hold_times)
    flight_times_df = pd.DataFrame(flight_times)
    delay_times_df = pd.DataFrame(delay_times)
    # print("hold_times: ",hold_times)
    # print("\n")
    # print("flight_times: ", flight_times)
    # print("\n")

    # print("delay_times: ",delay_times)
    # print("\n")

    # Merge all the DataFrames on the 'key' column
    features_df = pd.merge(hold_times_df, flight_times_df, on='key', how='outer')
    features_df = pd.merge(features_df, delay_times_df, on='key', how='outer')
    
    print("\n")
    print("features_df: \n",features_df)

    # Save the features to a CSV file
    features_df.to_csv(processed_filepath, index=False)

    print(f"Keystroke features saved to '{processed_filepath}'")
    """
    Explanation of the Script

    Loading and Sorting:
        The script starts by loading the raw keystroke data from a CSV file and sorting it by timestamp to ensure the events are processed in chronological order.

    Feature Computation:
        Hold Time: The script calculates the time a key is held down by finding the difference between the keydown and keyup events for the same key.
        Flight Time: The time between releasing one key and pressing the next key is computed.
        Delay Time: The time between pressing two consecutive keys is calculated.

    Data Merging:
        The computed features are stored in separate lists, which are then converted to pandas DataFrames.
        These DataFrames are merged on the key column to form a single DataFrame containing all the features.
    """
    return processed_filepath

# generate features end

# create cnn model start
def create_cnn_model(input_shape):
    model = Sequential()
    
    # Encoder
    model.add(Conv1D(filters=16, kernel_size=2, activation='relu', input_shape=input_shape, padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    model.add(Conv1D(filters=8, kernel_size=2, activation='relu', padding='same'))
    model.add(MaxPooling1D(pool_size=2, padding='same'))
    
    # Flatten and bottleneck layer
    model.add(Flatten())
    model.add(Dense(8, activation='relu'))  # Bottleneck layer
    
    # Decoder
    model.add(Dense(8 * (input_shape[0] // 4), activation='relu'))
    model.add(Reshape((input_shape[0] // 4, 8)))
    model.add(UpSampling1D(2))
    model.add(Conv1D(filters=8, kernel_size=2, activation='relu', padding='same'))
    model.add(UpSampling1D(2))
    model.add(Conv1D(filters=16, kernel_size=2, activation='relu', padding='same'))
    model.add(Conv1D(filters=1, kernel_size=2, activation='sigmoid', padding='same'))
    
    model.compile(optimizer='adam', loss='mse')
    return model
# create cnn model end


# train and save model start
def load_preprocess_data_train_cnn_model(username,filepath):
    # Load the keystroke data
    data = pd.read_csv(filepath)
    key_out = data.drop('key',axis=1)
    key_out = key_out.drop('hold_time',axis=1)
    key_out = key_out.drop('flight_time',axis=1)
    # Use all data as features (no label column)
    X = key_out.values

    # Split data for testing (here using 80% training and 20% testing)
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

    # Standardize
    scaler = StandardScaler()
    X = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

    # Train/test split
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

    # Reshape if necessary for the CNN
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    
    # Create the model
    model = create_cnn_model(X_train.shape[1:])

    # Model summary
    model.summary()
    
    # Train the model
    model.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

    model_dir = os.path.join('trained_models','model_files')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, f'{username}_keystroke_auth_model.keras')
    # Save the trained model
    model.save(model_filepath)
    
    # Detect anomalies
    # Load the trained autoencoder model
    autoencoder = tf.keras.models.load_model(model_filepath)

    # Compute reconstruction error on test data
    reconstructions = autoencoder.predict(X_test)
    # reconstructions = autoencoder.predict(X)
    mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
    # mse = np.mean(np.power(X - reconstructions, 2), axis=1)

    # Define a threshold for anomaly detection
    threshold = np.percentile(mse, 95)  # For example, use the 95th percentile as the threshold

    # Predict anomalies
    anomalies = mse > threshold
    
    print("\n\n anomalies",anomalies)
    print("\n\n reconstructions:-",reconstructions)
    
    results = "fail"
    if len(anomalies)>0:
        if anomalies[0]:
            results = False
        else:
            results = True
    print("\n\n is user authenicate: ", results)
    
    
    # if mse > threshold:
    #     print(f"Anomaly detection threshold: {threshold} (imposter)")
    #     print("\n")
    # else:
    #     print(f"Anomaly detection threshold: {threshold} (genuine user)")
    #     print("\n")

    # Output results
    # for i, is_anomaly in enumerate(anomalies):
    #     if is_anomaly:
    #         print(f"Test sample {i} is an anomaly (likely an impostor).")
    #         print("\n")
    #     else:
    #         print(f"Test sample {i} is normal (likely a genuine user).")
    #         print("\n")

# train and save model end


# create Multi-Layer Perceptron (MLP) model start
def create_mlp_model(input_shape):
    model = Sequential()
    model.add(Dense(64, input_shape=(input_shape,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# create Multi-Layer Perceptron (MLP) model end


# train and save mlp model start
def preprocess_mlp_data(data):
    X = data.drop('key', axis=1).values  # Features
    y = data.drop('key',axis=1)['delay_time'].values  # Labels
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test
    
def load_preprocess_data_train_mlp_model(username,filepath):
    # Load the keystroke data
    data = pd.read_csv(filepath)
    X_train, X_test, y_train, y_test = preprocess_mlp_data(data)
    
    print("X_train.shape: ",X_train.shape,"X_test.shape: ",X_test.shape)
    
    model = create_mlp_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
    
    # Save the trained model
    model_dir = os.path.join('trained_models','model_files')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, f'{username}_keystroke_auth_mlp_model.keras')
    # Save the trained model
    model.save(model_filepath)
    model.summary
    
    # Detect anomalies
    # Load the trained autoencoder model
    autoencoder = tf.keras.models.load_model(model_filepath)

    # Compute reconstruction error on test data
    reconstructions = autoencoder.predict(X_test)
    # reconstructions = autoencoder.predict(X)
    mse = np.mean(np.power(X_test - reconstructions, 2), axis=1)
    # mse = np.mean(np.power(X - reconstructions, 2), axis=1)

    # Define a threshold for anomaly detection
    threshold = np.percentile(mse, 95)  # For example, use the 95th percentile as the threshold

    # Predict anomalies
    anomalies = mse > threshold
    
    # Output results
    print("\n\nreconstructions: ",reconstructions)
    
    
    # if mse > threshold:
    #     print(f"Anomaly detection threshold: {threshold} (imposter)")
    #     print("\n")
    # else:
    #     print(f"Anomaly detection threshold: {threshold} (genuine user)")
    #     print("\n")

    # Output results
    # for i, is_anomaly in enumerate(anomalies):
    #     if is_anomaly:
    #         print(f"Test sample {i} is an anomaly (likely an impostor).")
    #         print("\n")
    #     else:
    #         print(f"Test sample {i} is normal (likely a genuine user).")
    #         print("\n")
            
    print("\n\nanomalies: ", anomalies)
    results = "fail"
    if len(anomalies)>0:
        if anomalies[0]:
            results = False
        else:
            results = True
    print("\n\n is user authenicate: ", results)
    
    
#train model mlp end

# train and create gan model start
def preprocess_gan_data(data):
    X = data.drop('key', axis=1).values  # Features
    y = data.drop('key',axis=1)['delay_time'].values  # Labels
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test

# Build the generator model
def build_generator(latent_dim):
    model = Sequential()
    model.add(Dense(16, input_dim=latent_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(32))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(3, activation='linear'))  # Output: key_hold_time, key_flight_time, key_release_time
    return model

# Build the discriminator model
def build_discriminator(input_shape):
    model = Sequential()
    model.add(Dense(32, input_shape=input_shape))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(16))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))  # Output: probability of being genuine
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the GAN model
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = Sequential()
    model.add(generator)
    model.add(discriminator)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model
# Route for generating synthetic keystroke data
def generate_synthetic_data(featured_path,model_gan_generator_filepath):
    latent_dim = 10
    generator = tf.keras.models.load_model(model_gan_generator_filepath)
    data = pd.read_csv(featured_path)
    X_train, X_test, y_train, y_test = preprocess_gan_data(data)
    print("X_train",X_train.tolist())
    
    latent_points = np.random.normal(0, 1, (len(X_train), latent_dim))
    
    generated_data = generator.predict(latent_points)
    
    # Build and train authentication model
    auth_model = Sequential([
        Dense(64, activation='relu', input_shape=(generated_data.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    
    auth_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    auth_model.fit(generated_data, y_train, epochs=10, batch_size=32, validation_split=0.1)
    
    predictions = auth_model.predict(generated_data)
    authenticate = np.mean(predictions)
    print("prediction", np.mean(predictions))
    
    print("authenticate",bool(authenticate))

def load_preprocess_data_train_gan_model(username,filepath):
    data = pd.read_csv(filepath)
    X_train, X_test, y_train, y_test = preprocess_gan_data(data)
    
    latent_dim = 10  # Dimensionality of the latent space
    generator = build_generator(latent_dim)
    discriminator = build_discriminator((X_train.shape[1],))
    gan = build_gan(generator, discriminator)
    
    # Training parameters
    epochs = 200
    batch_size = 32
    
    for epoch in range(epochs):
         # Train discriminator
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_features = X_train[idx]
        fake_features = generator.predict(np.random.normal(0, 1, (batch_size, latent_dim)))
        d_loss_real = discriminator.train_on_batch(real_features, np.ones((batch_size, 1)))
        d_loss_fake = discriminator.train_on_batch(fake_features, np.zeros((batch_size, 1)))
        
        # Train generator
        g_loss = gan.train_on_batch(np.random.normal(0, 1, (batch_size, latent_dim)), np.ones((batch_size, 1)))
        print(f"{epoch}/{epochs} [D loss: {0.5 * (d_loss_real[0] + d_loss_fake[0])} | D accuracy: {100 * 0.5 * (d_loss_real[1] + d_loss_fake[1])}%] [G loss: {g_loss}]")
        # # Train discriminator
        # real_samples = X_train[np.random.randint(0, X_train.shape[0], batch_size)]
        # real_labels = np.ones((batch_size, 1))
        
        # latent_points = np.random.normal(0, 1, (batch_size, latent_dim))
        # fake_samples = generator.predict(latent_points)
        # fake_labels = np.zeros((batch_size, 1))
        
        # discriminator_loss_real = discriminator.train_on_batch(real_samples, real_labels)
        # discriminator_loss_fake = discriminator.train_on_batch(fake_samples, fake_labels)
        
        # # Train generator (via GAN model)
        # gan_loss = gan.train_on_batch(latent_points, real_labels)
        
        # if (epoch + 1) % 200 == 0:
            
        #     print(f"Epoch {epoch + 1}/{epochs},")
        #     print(f"Discriminator Loss: {discriminator_loss_real[0]:.4f},")
        #     print(f"GAN Loss:",gan_loss)
    
    # Save the models
    model_dir = os.path.join("trained_models","gen_model_files")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_gan_generator_filepath = os.path.join(model_dir,f'{username}_keystroke_gan_generator.keras')
    model_gan_discriminator_filepath = os.path.join(model_dir,f'{username}_keystroke_gan_discriminator.keras')
    generator.save(model_gan_generator_filepath)
    discriminator.save(model_gan_discriminator_filepath)
    generator.summary
    discriminator.summary
    generate_synthetic_data(filepath,model_gan_generator_filepath)
    
    
    
    
# train and create gan model end

# train and create Long Short-Term Memory (LSTM) Network model start
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(32))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def process_lstm_data(data):
    X = data.drop('key', axis=1).values  # Features
    y = data.drop('key',axis=1)['delay_time'].values  # Labels
    
    # standarize feautures
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Reshape input data for LSTM [samples, time steps, features]
    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
    
    return X_train, X_test, y_train, y_test,X_scaled

def lstm_prediction_users_auth(username,filepath):
    # Load the saved model
    data = pd.read_csv(filepath)
    key_features =  data.drop('key',axis=1).values  # Labels
    model_dir = os.path.join("trained_models","lstm_model_files")
    lstm_model_filepath = os.path.join(model_dir,f'{username}_keystroke_lstm_model.keras')
    lstm_model = tf.keras.models.load_model(lstm_model_filepath)
    
    # Load the scaler
    lstm_scaler_path = os.path.join(model_dir,f'{username}_scaler_numpy.npy')
    lstm_scaler_mean = np.load(lstm_scaler_path, allow_pickle=True)
    lstm_scaler_var = np.load(lstm_scaler_path.replace(".npy","_var.npy"), allow_pickle=True)
    if not isinstance(lstm_scaler_mean, np.ndarray) or not isinstance(lstm_scaler_var, np.ndarray):
        print("Failed to load scalar parameters")
        return
    print("lstm_scaler_mean:",lstm_scaler_mean,"lstm_scaler_var:",lstm_scaler_var)
    scaler = StandardScaler()
    scaler.mean_ = lstm_scaler_mean
    scaler.var_ = lstm_scaler_var
    scaler.scale_ = np.sqrt(lstm_scaler_var)
    
    scaler.n_samples_seen_ = 0
    key_features = scaler.fit_transform(key_features)
    
    key_features = np.reshape(key_features, (key_features.shape[0], 1, key_features.shape[1]))  # Reshape for LSTM
    
    predictions = lstm_model.predict(key_features)
    authenticated = (predictions[0][0] > 0.5)
    
    # Process new data
    #...
    
    # Apply scaler
    #...
    
    # Predict the delay time
    #...
    
    # Return the prediction
    
    print("lstm prediction: ",np.mean(predictions),"authenticated",authenticated)
    
    
def load_preprocess_data_train_lstm_model(username,filepath):
    data = pd.read_csv(filepath)
    
    X_train, X_test, y_train,y_test,X_scaled = process_lstm_data(data)
    lstm_model = build_lstm_model((1, X_train.shape[2]))
    lstm_model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))
    # Save the models
    model_dir = os.path.join("trained_models","lstm_model_files")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    lstm_model_filepath = os.path.join(model_dir,f'{username}_keystroke_lstm_model.keras')
    lstm_model.save(lstm_model_filepath)
    
    # Save the model and scaler for later use
    print("Scaled Mean",np.mean(X_scaled),"X_scaled Var",np.var(X_scaled))
    lstm_scaler_path = os.path.join(model_dir,f'{username}_scaler_numpy.npy')
    np.save(lstm_scaler_path,np.mean(X_scaled))
    np.save(lstm_scaler_path.replace(".npy","_var.npy"),np.var(X_scaled))
    
    lstm_model.summary
    
    lstm_prediction_users_auth(username,filepath)
# train and create Long Short-Term Memory (LSTM) Network model end


# train and create random forest machine learning model start

def process_rfml_model(data):
    # Separate features and target
    X = data.drop('key', axis=1)
    y = data['key']
    # Convert data to numeric, handle any non-numeric data
    X = X.apply(pd.to_numeric, errors='coerce')
    X = X.fillna(0)  # Handle any NaN values by filling with 0
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test,scaler

def rfml_predictions_auth(username,filepath):
    model_dir = os.path.join("trained_models","random_forest_model_files")
    rf_model_filepath = os.path.join(model_dir,f'{username}_keystroke_random_forest_model.pkl')
    rf_scaler_filepath = os.path.join(model_dir,f'{username}_scaler_random_forest_model.pkl')
    rfml_load_model = joblib.load(rf_model_filepath)
    rfml_load_scaler = joblib.load(rf_scaler_filepath)
    #load json
    data = pd.read_csv(filepath)
    print("data input:\n",data,"\n\n")
    key_values = np.array([data.drop("key",axis=1).values])
    
    # key_values = key_values.reshape(1,-1,len(key_values))
    # key_feature_data = np.array([data['hold_time'],data['flight_time'],data['delay_time']])
    print("key_feature_data input:\n",key_values,"\n\n key_feature_data shape:\n",key_values.shape,"\n\n")
    # key_features = key_feature_data
   
    # print("key_features shape: \n", key_features.shape,"\n",key_features,"\n")
    
    key_features = rfml_load_scaler.transform(key_values)
    
    # predictions = rfml_load_model.predict(key_features)
    # authenticate = np.mean(predictions)
    
    # print("authenticate :",authenticate)
    
    
def load_preprocess_data_train_random_forest_model(username,filepath):
    
    # Save the models
    model_dir = os.path.join("trained_models","random_forest_model_files")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    rf_model_filepath = os.path.join(model_dir,f'{username}_keystroke_random_forest_model.pkl')
    rf_scaler_filepath = os.path.join(model_dir,f'{username}_scaler_random_forest_model.pkl')
    data = pd.read_csv(filepath)
    
    features = np.array(data['flight_time'],data['delay_time'])
    features = features.reshape(1,-1)
    print("Feature:\n",features,"\n\n","Feature shape:\n",features.shape,"\n\n")
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    rf_model = RandomForestClassifier()
    rf_model.fit(features,np.array([1]))
    
    joblib.dump(rf_model, rf_model_filepath)
    joblib.dump(scaler, rf_scaler_filepath)
    
    
    
    # auth
    features_auth = np.array(data['flight_time'],data['delay_time'])
    features_auth = features.reshape(1,-1)
    
    rfml_load_model = joblib.load(rf_model_filepath)
    rfml_load_scaler = joblib.load(rf_scaler_filepath)
    
    feature_values = rfml_load_scaler.transform(features_auth)
    
    predictions = rfml_load_model.predict(feature_values)
    authenticate = np.mean(predictions)
    print("predictions", predictions,"\n\n","auth",bool(authenticate))
    
    
    
    
    # data = pd.read_csv(filepath)
    # X_train, X_test, y_train, y_test,scaler = process_rfml_model(data)
    
    # rf_model = RandomForestClassifier(n_estimators=100,random_state=42)
    # rf_model.fit(X_train, y_train)
    
    # joblib.dump(rf_model, rf_model_filepath)
    # joblib.dump(scaler, rf_scaler_filepath)
    
    # # rf_model.summary 
    
    # print("Random Forest Model Trained")
    
    # rfml_predictions_auth(username,filepath)
    
# train and create random forest machine learning model model end





    
    

    

def process_items():
    # data = pd.read_csv("../data/raw/jjulius.csv")
    # print("raw data: ",data)
    featured_path =  generate_features("jjulius","../data/raw/jjulius.csv")
    # load_preprocess_data_train_cnn_model("jjulius",featured_path)
    # load_preprocess_data_train_mlp_model("jjulius",featured_path)
    # load_preprocess_data_train_gan_model("jjulius",featured_path)
    # load_preprocess_data_train_lstm_model("jjulius",featured_path)
    # load_preprocess_data_train_random_forest_model("jjulius",featured_path)
    
    # save_processed_data('jjulius',"../data/raw/jjulius.csv")
    # process_data(data)
    # print(data)
    
if __name__ == "__main__":
    process_items()
    




features_df: 
      key  hold_time  flight_time  delay_time
0               152         1867        1833
1               152         1867         285
2               152         1867         448
3               152         1867         296
4               152         1867         203
...   ..        ...          ...         ...
3767   y        132          767         748
3768   y        132          767         522
3769   y        132          468         421
3770   y        132          468         748
3771   y        132          468         522

[3772 rows x 4 columns]
Keystroke features saved to 'processed_data/processed_features/jjulius_processed_featured_data.csv'


# generate features from raw keystores

In [17]:
user_name = "frank"
featured_path =  generate_features("jjulius",f"../data/raw/{user_name}.csv")



features_df: 
       key  hold_time  flight_time  delay_time
0                121          667         637
1                121          667         475
2                121          667         471
3                121          667         455
4                121          667         486
...    ..        ...          ...         ...
84341   y        196          372         298
84342   y        196          372         188
84343   y        196          372          12
84344   y        196          372         301
84345   y        196          372         172

[84346 rows x 4 columns]
Keystroke features saved to 'processed_data/processed_features/jjulius_processed_featured_data.csv'


# load and train CNN neural model

In [18]:
load_preprocess_data_train_cnn_model(user_name,featured_path)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: nan - val_loss: nan
Epoch 2/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: nan - val_loss: nan
Epoch 5/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 6/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: nan - val_loss: nan
Epoch 7/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 8/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: nan - val_loss: nan
Epoch 9/50
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

W0000 00:00:1725192819.958896  178914 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Conv2D" attr { key: "T" value { type: DT_FLOAT } } attr { key: "data_format" value { s: "NHWC" } } attr { key: "dilations" value { list { i: 1 i: 1 i: 1 i: 1 } } } attr { key: "explicit_paddings" value { list { } } } attr { key: "padding" value { s: "SAME" } } attr { key: "strides" value { list { i: 1 i: 1 i: 1 i: 1 } } } attr { key: "use_cudnn_on_gpu" value { b: true } } inputs { dtype: DT_FLOAT shape { dim { size: 32 } dim { size: 1 } dim { } dim { size: 8 } } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 2 } dim { size: 8 } dim { size: 8 } } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 2799 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 4194304 memory_size: 268435456 } out

[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


 anomalies [[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


 reconstructions:- []


 is user authenicate:  True


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


# load and train MLP neural model

In [19]:
load_preprocess_data_train_mlp_model(user_name,featured_path)

X_train.shape:  (67476, 3) X_test.shape:  (16870, 3)
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -5910246.0000 - val_accuracy: 0.0000e+00 - val_loss: -83190648.0000
Epoch 2/10
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -159383824.0000 - val_accuracy: 0.0000e+00 - val_loss: -500133952.0000
Epoch 3/10
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.0000e+00 - loss: -692536064.0000 - val_accuracy: 0.0000e+00 - val_loss: -1367940608.0000
Epoch 4/10
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -1684558336.0000 - val_accuracy: 0.0000e+00 - val_loss: -2789258496.0000
Epoch 5/10
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -3225893632.0000 - val_accuracy: 0.0000e+00 - val_loss: -4868043776.0000
Epoch 6/10
[1m2109/2109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

# load and train GAN neural model

In [20]:
load_preprocess_data_train_gan_model(user_name,featured_path)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step




0/200 [D loss: 0.6950100064277649 | D accuracy: 47.65625%] [G loss: [array(0.716661, dtype=float32), array(0.716661, dtype=float32), array(0.390625, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
1/200 [D loss: 0.7210729122161865 | D accuracy: 37.63020634651184%] [G loss: [array(0.7334206, dtype=float32), array(0.7334206, dtype=float32), array(0.3359375, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
2/200 [D loss: 0.7278144359588623 | D accuracy: 35.9375%] [G loss: [array(0.73147994, dtype=float32), array(0.73147994, dtype=float32), array(0.34375, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
3/200 [D loss: 0.7326586246490479 | D accuracy: 34.04017686843872%] [G loss: [array(0.73502994, dtype=float32), array(0.73502994, dtype=float32), array(0.328125, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
4/200 [D loss: 0.7352

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1898/1898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.0000e+00 - loss: -5630005.5000 - val_accuracy: 0.0000e+00 - val_loss: -80830144.0000
Epoch 2/10
[1m1898/1898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -157646928.0000 - val_accuracy: 0.0000e+00 - val_loss: -485167520.0000
Epoch 3/10
[1m1898/1898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -670294976.0000 - val_accuracy: 0.0000e+00 - val_loss: -1317811072.0000
Epoch 4/10
[1m1898/1898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -1636290432.0000 - val_accuracy: 0.0000e+00 - val_loss: -2657921792.0000
Epoch 5/10
[1m1898/1898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.0000e+00 - loss: -3131202560.0000 - val_accuracy: 0.0000e+00 - val_loss: -4592186880.0000
Epoch 6/10
[1m1898/1898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

# load and train LSTM neural model

In [21]:
load_preprocess_data_train_lstm_model(user_name,featured_path)

Epoch 1/10


  super().__init__(**kwargs)


[1m4218/4218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - accuracy: 0.0000e+00 - loss: -284820.4062 - val_accuracy: 0.0000e+00 - val_loss: -2093301.7500
Epoch 2/10
[1m4218/4218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.0000e+00 - loss: -3129798.7500 - val_accuracy: 0.0000e+00 - val_loss: -6857196.0000
Epoch 3/10
[1m4218/4218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.0000e+00 - loss: -8500854.0000 - val_accuracy: 0.0000e+00 - val_loss: -13883833.0000
Epoch 4/10
[1m4218/4218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.0000e+00 - loss: -16100454.0000 - val_accuracy: 0.0000e+00 - val_loss: -23140290.0000
Epoch 5/10
[1m4218/4218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.0000e+00 - loss: -25952138.0000 - val_accuracy: 0.0000e+00 - val_loss: -34575928.0000
Epoch 6/10
[1m4218/4218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

# load and train random forest machine learning model

In [22]:
load_preprocess_data_train_random_forest_model(user_name,featured_path)

Feature:
 [[667 667 667 ... 372 372 372]] 

 Feature shape:
 (1, 84346) 


predictions [1] 

 auth True
