In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models

In [5]:
# Load the data
data = pd.read_csv('key_press_data.csv')

In [25]:
data.describe()

Unnamed: 0,User ID,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration
count,290.0,290.0,290.0,290.0,290.0,290.0
mean,1.0,0.226592,0.164131,3.231846,0.131034,32.750477
std,0.0,0.427852,0.422697,1.13167,0.338021,16.163363
min,1.0,0.0,0.0,0.0,0.0,3.412131
25%,1.0,0.108501,0.048078,2.456865,0.0,19.23145
50%,1.0,0.14558,0.088636,3.756631,0.0,31.640384
75%,1.0,0.209677,0.131639,4.13858,0.0,44.045824
max,1.0,5.274579,5.153114,4.393904,1.0,62.827461


In [6]:
data.head()

Unnamed: 0,User ID,Key,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration
0,1,'s',2.124914,2.000771,0.293072,0,3.412131
1,1,'h',0.146307,0.077548,0.562044,0,3.558437
2,1,'u',0.22694,0.119265,0.792523,0,3.785377
3,1,'s',0.151834,0.131639,1.269935,0,3.937211
4,1,'h',0.07749,0.131639,1.245423,0,4.014701


In [7]:
data.tail()

Unnamed: 0,User ID,Key,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration
285,1,'t',0.031979,0.079358,4.303208,1,60.420039
286,1,Key.shift,0.426107,0.290034,4.289507,1,60.846147
287,1,'!',0.484279,0.331366,4.288247,1,61.330426
288,1,Key.shift,0.089403,0.331366,4.282005,1,61.419829
289,1,Key.esc,1.407632,1.288641,4.201984,1,62.827461


In [8]:
# Preprocess the data
def preprocess_data(data, scaler=None, template_columns=None):
    # Convert key column to string
    data['Key'] = data['Key'].astype(str)

    # One-hot encode the 'Key' column
    data = pd.get_dummies(data, columns=['Key'])

    # Align with template columns if provided
    if template_columns is not None:
        for col in template_columns:
            if col not in data.columns:
                data[col] = 0
        data = data[template_columns]
    else:
        template_columns = data.columns.tolist()

    # Fill any NaN values with 0
    data.fillna(0, inplace=True)

    # Normalize numerical columns
    if scaler is None:
        scaler = StandardScaler()
        data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']] = scaler.fit_transform(
            data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']]
        )
    else:
        data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']] = scaler.transform(
            data[['Duration', 'Time Between Keys', 'Typing Speed (KPS)', 'Backspace Count', 'Typing Session Duration']]
        )

    return data, scaler, template_columns

In [9]:
preprocessed_data, scaler, template_columns = preprocess_data(data)

In [10]:
preprocessed_data.head()

Unnamed: 0,User ID,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration,Key_'!',Key_'a',Key_'b',Key_'c',...,Key_'u',Key_'v',Key_'w',Key_'y',Key_'z',Key_Key.backspace,Key_Key.enter,Key_Key.esc,Key_Key.shift,Key_Key.space
0,1,4.444532,4.352564,-2.601336,-0.388322,-1.818252,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,-0.187971,-0.20519,-2.363248,-0.388322,-1.809184,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,0.000815,-0.106326,-2.159233,-0.388322,-1.79512,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,1,-0.17503,-0.077004,-1.73664,-0.388322,-1.78571,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,-0.349091,-0.077004,-1.758337,-0.388322,-1.780907,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
template_columns

['User ID',
 'Duration',
 'Time Between Keys',
 'Typing Speed (KPS)',
 'Backspace Count',
 'Typing Session Duration',
 "Key_'!'",
 "Key_'a'",
 "Key_'b'",
 "Key_'c'",
 "Key_'d'",
 "Key_'e'",
 "Key_'f'",
 "Key_'g'",
 "Key_'h'",
 "Key_'i'",
 "Key_'k'",
 "Key_'l'",
 "Key_'m'",
 "Key_'n'",
 "Key_'o'",
 "Key_'p'",
 "Key_'r'",
 "Key_'s'",
 "Key_'t'",
 "Key_'u'",
 "Key_'v'",
 "Key_'w'",
 "Key_'y'",
 "Key_'z'",
 'Key_Key.backspace',
 'Key_Key.enter',
 'Key_Key.esc',
 'Key_Key.shift',
 'Key_Key.space']

In [12]:
# Ensure all columns are numeric
# assert preprocessed_data.applymap(np.isreal).all().all()

In [13]:
preprocessed_data.head()

Unnamed: 0,User ID,Duration,Time Between Keys,Typing Speed (KPS),Backspace Count,Typing Session Duration,Key_'!',Key_'a',Key_'b',Key_'c',...,Key_'u',Key_'v',Key_'w',Key_'y',Key_'z',Key_Key.backspace,Key_Key.enter,Key_Key.esc,Key_Key.shift,Key_Key.space
0,1,4.444532,4.352564,-2.601336,-0.388322,-1.818252,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,-0.187971,-0.20519,-2.363248,-0.388322,-1.809184,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,0.000815,-0.106326,-2.159233,-0.388322,-1.79512,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,1,-0.17503,-0.077004,-1.73664,-0.388322,-1.78571,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1,-0.349091,-0.077004,-1.758337,-0.388322,-1.780907,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Split the data into training and test sets
X = preprocessed_data.drop(columns=['User ID'])
y = preprocessed_data['User ID']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Ensure there are no infinite values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [16]:
# Convert to numpy arrays
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

In [17]:
# Ensure the data types are float32
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [18]:
# Build the autoencoder model
input_dim = X_train.shape[1]
encoding_dim = 32  # Size of the encoding layer

autoencoder = models.Sequential([
    layers.InputLayer(input_shape=(input_dim,)),
    layers.Dense(encoding_dim, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')

In [19]:
# autoencoder.add(tf.keras.layers.Dense(256, input_shape=(X_train.shape[1],), activation='sigmoid'))

In [20]:
# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x786ffbd72fe0>

In [21]:
# Evaluate the model
reconstructions = autoencoder.predict(X_test)
reconstruction_errors = np.mean(np.square(reconstructions - X_test), axis=1)



In [22]:
# Set a threshold for anomaly detection
threshold = np.mean(reconstruction_errors) + 2 * np.std(reconstruction_errors)

In [33]:
def is_anomaly(new_data, autoencoder, threshold, scaler, template_columns):
    # Preprocess the new data with the same template columns and scaler
    new_data, _, _ = preprocess_data(new_data, scaler, template_columns)

    # Ensure new_data columns match template_columns
    for col in template_columns:
        if col not in new_data.columns:
            new_data[col] = 0

    # Ensure the order of columns matches template_columns
    new_data = new_data[template_columns]

    # print("new_data.head()", new_data.head())
    # print("New data shape before reshape:", new_data.shape)

    # Convert to numpy array and ensure the data type is float32
    new_data = new_data.to_numpy().astype(np.float32)

    # Reshape the new data to match the expected input shape
    new_data = new_data[:, :autoencoder.input_shape[1]]  # Ensure correct number of features
    new_data = new_data.reshape(-1, autoencoder.input_shape[1])  # Reshape to match the model input shape

    # Check shapes after reshape
    # print("New data shape after reshape:", new_data.shape)

    # Predict using the autoencoder
    reconstructions = autoencoder.predict(new_data)
    reconstruction_errors = np.mean(np.square(reconstructions - new_data), axis=1)

    # Check if the reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Count the number of anomalies detected
    num_anomalies = np.sum(anomalies)

    return anomalies, num_anomalies

In [34]:
# Example usage
new_data = pd.read_csv('new_key_press_data.csv')
anomalies, num_anomalies = is_anomaly(new_data, autoencoder, threshold, scaler, template_columns)

total_data_points = new_data.shape[0]
anomaly_percentage = (num_anomalies / total_data_points) * 100

# print(f"Total data points: {total_data_points}")
# print(f"Anomalies detected: {num_anomalies}")
# print(f"Anomaly percentage: {anomaly_percentage:.2f}%")

if anomaly_percentage > 50:
    print("Anomaly detected, your system might have been taken over!")
else:
    print("No anomaly detected, you are safe.")

No anomaly detected, you are safe.


In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models
import os

# Preprocessing function
def preprocess_data(data, scaler=None, template_columns=None):
    if scaler is None:
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
    else:
        data_scaled = scaler.transform(data)

    if template_columns is None:
        template_columns = data.columns

    data = pd.DataFrame(data_scaled, columns=template_columns)

    return data, scaler, template_columns

# Load the data
data = pd.read_csv('key_press_data.csv')

# Method-1
# ascii_values={'Key.enter': 0, "'1'": 1, "'2'": 2, "'3'": 3, "'4'": 4, "'5'": 5, "'6'": 6, "'7'": 7, "'8'": 8, "'9'": 9, "'0'": 10, "'-'": 11, "'='": 12, 'Key.backspace': 13, 'Key.tab': 14, "'q'": 15, "'w'": 16, "'e'": 17, "'r'": 18, "'t'": 19, "'y'": 20, "'u'": 21, "'i'": 22, "'o'": 23, "'p'": 24, "'['": 25, "']'": 26, "'\\\\'": 27, 'Key.caps_lock': 28, "'a'": 29, "'s'": 30, "'d'": 31, "'f'": 32, "'g'": 33, "'h'": 34, "'j'": 35, "'k'": 36, "'l'": 37, "';'": 38, '"\'"': 39, 'Key.shift': 40, "'z'": 41, "'x'": 42, "'c'": 43, "'v'": 44, "'b'": 45, "'n'": 46, "'m'": 47, "','": 48, "'.'": 49, "'/'": 50, 'Key.shift_r': 51, 'Key.ctrl_l': 52, '<255>': 53, 'Key.cmd': 54, 'Key.alt_l': 55, 'Key.space': 56, 'Key.alt_gr': 57, 'Key.ctrl_r': 58, 'Key.left': 59, 'Key.up': 60, 'Key.down': 61, 'Key.right': 62, 'Key.esc': 63, "'!'": 64, "'@'": 65, "'#'": 66, "'$'": 67, "'%'": 68, "'^'": 69, "'&'": 70, "'*'": 71, "'('": 72, "')'": 73, "'_'": 74, "'+'": 75, "'{'": 76, "'}'": 77, "'|'": 78, "':'": 79, '\'"\'': 80, "'<'": 81, "'>'": 82, "'?'": 83, '<188>': 84, '<190>': 85, '<191>': 86, "'\\x0c'": 87, '<186>': 88, '<222>': 89, "'\\x10'": 90, "'\\x1b'": 91, "'\\x1d'": 92, "'\\x1c'": 93, "'A'": 94, "'Q'": 95, "'W'": 96, "'E'": 97, "'R'": 98, "'T'": 99, "'Y'": 100, "'U'": 101, "'I'": 102, "'O'": 103, "'P'": 104, "'L'": 105, "'K'": 106, "'J'": 107, "'H'": 108, "'G'": 109, "'F'": 110, "'D'": 111, "'S'": 112, "'Z'": 113, "'X'": 114, "'C'": 115, "'V'": 116, "'B'": 117, "'N'": 118, "'M'": 119}
# data['Key'] = data['Key'].map(ascii_values)

# Method-2
ascii_values = {}
def keyToInt(data):
  for key in data['Key']:
    if key not in ascii_values:
      ascii_values[key] = len(ascii_values)

keyToInt(data)
data['Key'] = data['Key'].map(ascii_values)


# Now data['Key'] contains integer values corresponding to each key
print(data.head())


preprocessed_data, scaler, template_columns = preprocess_data(data)
preprocessed_data.head()

# Split the data into training and test sets
X = preprocessed_data.drop(columns=['User ID'])
y = preprocessed_data['User ID']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure there are no infinite values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Convert to numpy arrays
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# Ensure the data types are float32
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Build the enhanced autoencoder model
input_dim = X_train.shape[1]
encoding_dim = 32  # Size of the encoding layer

autoencoder = models.Sequential([
    layers.InputLayer(input_shape=(input_dim,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(encoding_dim, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

# Evaluate the model
reconstructions = autoencoder.predict(X_test)
reconstruction_errors = np.mean(np.square(reconstructions - X_test), axis=1)

# Set a threshold for anomaly detection
threshold = np.mean(reconstruction_errors) + 2 * np.std(reconstruction_errors)

def is_anomaly(new_data, autoencoder, threshold, scaler, template_columns):
    # Preprocess the new data with the same template columns and scaler
    new_data, _, _ = preprocess_data(new_data, scaler, template_columns)

    # Ensure new_data columns match template_columns
    for col in template_columns:
        if col not in new_data.columns:
            new_data[col] = 0

    # Ensure the order of columns matches template_columns
    new_data = new_data[template_columns]

    # Convert to numpy array and ensure the data type is float32
    new_data = new_data.to_numpy().astype(np.float32)

    # Reshape the new data to match the expected input shape
    new_data = new_data[:, :autoencoder.input_shape[1]]  # Ensure correct number of features
    new_data = new_data.reshape(-1, autoencoder.input_shape[1])  # Reshape to match the model input shape

    # Predict using the autoencoder
    reconstructions = autoencoder.predict(new_data)
    reconstruction_errors = np.mean(np.square(reconstructions - new_data), axis=1)

    # Check if the reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Count the number of anomalies detected
    num_anomalies = np.sum(anomalies)

    return anomalies, num_anomalies


# Example usage
new_data = pd.read_csv('new_key_press_data.csv')

# #Method-1
# new_data['Key'] = new_data['Key'].map(ascii_values)

#Method-2
keyToInt(new_data)
new_data['Key'] = new_data['Key'].map(ascii_values)

anomalies, num_anomalies = is_anomaly(new_data, autoencoder, threshold, scaler, template_columns)

total_data_points = new_data.shape[0]
anomaly_percentage = (num_anomalies / total_data_points) * 100

print(f"Total data points: {total_data_points}")
print(f"Anomalies detected: {num_anomalies}")
print(f"Anomaly percentage: {anomaly_percentage:.2f}%")
if anomaly_percentage > 50:
    print("Anomaly detected (>50%), your system might have been taken over!")
else:
    print("No anomaly detected (<50%), you are safe.")


   User ID  Key  Duration  Time Between Keys  Typing Speed (KPS)  \
0        1    0  2.124914           2.000771            0.293072   
1        1    1  0.146307           0.077548            0.562044   
2        1    2  0.226940           0.119265            0.792523   
3        1    0  0.151834           0.131639            1.269935   
4        1    1  0.077490           0.131639            1.245423   

   Backspace Count  Typing Session Duration  
0                0                 3.412131  
1                0                 3.558437  
2                0                 3.785377  
3                0                 3.937211  
4                0                 4.014701  
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Preprocessing function
def preprocess_data(data, scaler=None, template_columns=None):
    if scaler is None:
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(data)
    else:
        data_scaled = scaler.transform(data)

    if template_columns is None:
        template_columns = data.columns

    data = pd.DataFrame(data_scaled, columns=template_columns)

    return data, scaler, template_columns

# Load the data
data = pd.read_csv('key_press_data.csv')

# Method-1
# ascii_values={'Key.enter': 0, "'1'": 1, "'2'": 2, "'3'": 3, "'4'": 4, "'5'": 5, "'6'": 6, "'7'": 7, "'8'": 8, "'9'": 9, "'0'": 10, "'-'": 11, "'='": 12, 'Key.backspace': 13, 'Key.tab': 14, "'q'": 15, "'w'": 16, "'e'": 17, "'r'": 18, "'t'": 19, "'y'": 20, "'u'": 21, "'i'": 22, "'o'": 23, "'p'": 24, "'['": 25, "']'": 26, "'\\\\'": 27, 'Key.caps_lock': 28, "'a'": 29, "'s'": 30, "'d'": 31, "'f'": 32, "'g'": 33, "'h'": 34, "'j'": 35, "'k'": 36, "'l'": 37, "';'": 38, '"\'"': 39, 'Key.shift': 40, "'z'": 41, "'x'": 42, "'c'": 43, "'v'": 44, "'b'": 45, "'n'": 46, "'m'": 47, "','": 48, "'.'": 49, "'/'": 50, 'Key.shift_r': 51, 'Key.ctrl_l': 52, '<255>': 53, 'Key.cmd': 54, 'Key.alt_l': 55, 'Key.space': 56, 'Key.alt_gr': 57, 'Key.ctrl_r': 58, 'Key.left': 59, 'Key.up': 60, 'Key.down': 61, 'Key.right': 62, 'Key.esc': 63, "'!'": 64, "'@'": 65, "'#'": 66, "'$'": 67, "'%'": 68, "'^'": 69, "'&'": 70, "'*'": 71, "'('": 72, "')'": 73, "'_'": 74, "'+'": 75, "'{'": 76, "'}'": 77, "'|'": 78, "':'": 79, '\'"\'': 80, "'<'": 81, "'>'": 82, "'?'": 83, '<188>': 84, '<190>': 85, '<191>': 86, "'\\x0c'": 87, '<186>': 88, '<222>': 89, "'\\x10'": 90, "'\\x1b'": 91, "'\\x1d'": 92, "'\\x1c'": 93, "'A'": 94, "'Q'": 95, "'W'": 96, "'E'": 97, "'R'": 98, "'T'": 99, "'Y'": 100, "'U'": 101, "'I'": 102, "'O'": 103, "'P'": 104, "'L'": 105, "'K'": 106, "'J'": 107, "'H'": 108, "'G'": 109, "'F'": 110, "'D'": 111, "'S'": 112, "'Z'": 113, "'X'": 114, "'C'": 115, "'V'": 116, "'B'": 117, "'N'": 118, "'M'": 119}
# data['Key'] = data['Key'].map(ascii_values)

# Method-2
ascii_values = {}
def keyToInt(data):
  for key in data['Key']:
    if key not in ascii_values:
      ascii_values[key] = len(ascii_values)

keyToInt(data)
data['Key'] = data['Key'].map(ascii_values)

# Preprocess the data
preprocessed_data, scaler, template_columns = preprocess_data(data)

# Split the data into training and test sets
X = preprocessed_data.drop(columns=['User ID'])
y = preprocessed_data['User ID']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the autoencoder model with advanced techniques
input_dim = X_train.shape[1]
encoding_dim = 32  # Size of the encoding layer

autoencoder = models.Sequential([
    layers.InputLayer(input_shape=(input_dim,)),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(encoding_dim, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    layers.Dense(input_dim, activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')

# Define callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

# Train the autoencoder
history = autoencoder.fit(X_train, X_train, epochs=100, batch_size=32,
                          validation_data=(X_test, X_test), callbacks=[early_stopping, reduce_lr])

# Evaluate the model
reconstructions = autoencoder.predict(X_test)
reconstruction_errors = np.mean(np.square(reconstructions - X_test), axis=1)

# Set a threshold for anomaly detection
threshold = np.mean(reconstruction_errors) + 2 * np.std(reconstruction_errors)

def is_anomaly(new_data, autoencoder, threshold, scaler, template_columns):
    # Preprocess the new data with the same template columns and scaler
    new_data, _, _ = preprocess_data(new_data, scaler, template_columns)

    # Ensure new_data columns match template_columns
    for col in template_columns:
        if col not in new_data.columns:
            new_data[col] = 0

    # Ensure the order of columns matches template_columns
    new_data = new_data[template_columns]

    # Convert to numpy array and ensure the data type is float32
    new_data = new_data.to_numpy().astype(np.float32)

    # Reshape the new data to match the expected input shape
    new_data = new_data[:, :autoencoder.input_shape[1]]  # Ensure correct number of features
    new_data = new_data.reshape(-1, autoencoder.input_shape[1])  # Reshape to match the model input shape

    # Predict using the autoencoder
    reconstructions = autoencoder.predict(new_data)
    reconstruction_errors = np.mean(np.square(reconstructions - new_data), axis=1)

    # Check if the reconstruction error exceeds the threshold
    anomalies = reconstruction_errors > threshold

    # Count the number of anomalies detected
    num_anomalies = np.sum(anomalies)

    return anomalies, num_anomalies

# Example usage
new_data = pd.read_csv('new_key_press_data.csv')

# Method-1
# new_data['Key'] = new_data['Key'].map(ascii_values)

#Method-2
keyToInt(new_data)
new_data['Key'] = new_data['Key'].map(ascii_values)

anomalies, num_anomalies = is_anomaly(new_data, autoencoder, threshold, scaler, template_columns)

total_data_points = new_data.shape[0]
anomaly_percentage = (num_anomalies / total_data_points) * 100

print(f"Total data points: {total_data_points}")
print(f"Anomalies detected: {num_anomalies}")
print(f"Anomaly percentage: {anomaly_percentage:.2f}%")
if anomaly_percentage > 50:
    print("Anomaly detected (>50%), your system might have been taken over!")
else:
    print("No anomaly detected (<50%), you are safe.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78



Total data points: 77
Anomalies detected: 8
Anomaly percentage: 10.39%
No anomaly detected (<50%), you are safe.
