In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential #type: ignore
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Attention #type: ignore
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau #type: ignore
from tensorflow.keras.regularizers import l2 #type: ignore
from scipy.stats import entropy

In [4]:

def generate_keystroke_data(num_samples, is_human=True):
    data = []
    for _ in range(num_samples):
        sequence = []
        sequence_length = np.random.randint(20, 100)

        keystroke_times = []
        pause_times = []
        key_hold_times = []

        key_distances = []

        errors_made = 0
        corrections_made = 0

        copy_paste_events = 0

        for i in range(sequence_length):
            if is_human:
                keystroke_time = np.random.normal(0.2, 0.05)
                pause_time = np.random.normal(0.5, 0.2)
                key_hold_time = np.random.normal(0.1, 0.03)
                key_distance = np.random.normal(2, 1)
            else:
                keystroke_time = np.random.normal(0.05, 0.01)
                pause_time = np.random.normal(0.1, 0.05)
                key_hold_time = np.random.normal(0.05, 0.01)
                key_distance = np.random.normal(1.5, 0.5)

            keystroke_times.append(keystroke_time)
            pause_times.append(pause_time)
            key_hold_times.append(key_hold_time)
            key_distances.append(key_distance)

            if is_human and np.random.random() < 0.05:
                errors_made += 1
                if np.random.random() < 0.8:
                    corrections_made += 1

            if (not is_human and np.random.random() < 0.1) or (is_human and np.random.random() < 0.02):
                copy_paste_events += 1

        avg_keystroke_time = np.mean(keystroke_times)
        std_keystroke_time = np.std(keystroke_times)
        avg_pause_time = np.mean(pause_times)
        std_pause_time = np.std(pause_times)
        avg_key_hold_time = np.mean(key_hold_times)
        std_key_hold_time = np.std(key_hold_times)

        typing_speed = len(keystroke_times) / sum(pause_times)

        rhythm_consistency = entropy(keystroke_times)

        avg_key_distance = np.mean(key_distances)
        std_key_distance = np.std(key_distances)

        error_rate = errors_made / sequence_length
        correction_rate = corrections_made / max(errors_made, 1)

        copy_paste_frequency = copy_paste_events / sequence_length

        if is_human:
            mouse_speed = np.random.normal(500, 100)
            mouse_acceleration = np.random.normal(200, 50)
            mouse_jerk = np.random.normal(100, 30)
        else:
            mouse_speed = np.random.normal(800, 50)
            mouse_acceleration = np.random.normal(100, 20)
            mouse_jerk = np.random.normal(50, 10)

        features = [
            avg_keystroke_time, std_keystroke_time,
            avg_pause_time, std_pause_time,
            avg_key_hold_time, std_key_hold_time,
            typing_speed, rhythm_consistency,
            avg_key_distance, std_key_distance,
            error_rate, correction_rate,
            copy_paste_frequency,
            mouse_speed, mouse_acceleration, mouse_jerk
        ]

        data.append(features)

    return data

num_human_samples = 15000
num_bot_samples = 15000

human_data = generate_keystroke_data(num_human_samples, is_human=True)
bot_data = generate_keystroke_data(num_bot_samples, is_human=False)

In [5]:
num_human_samples = 15000
num_bot_samples = 15000

human_data = generate_keystroke_data(num_human_samples, is_human=True)
bot_data = generate_keystroke_data(num_bot_samples, is_human=False)

In [6]:
columns = [
    'avg_keystroke_time', 'std_keystroke_time',
    'avg_pause_time', 'std_pause_time',
    'avg_key_hold_time', 'std_key_hold_time',
    'typing_speed', 'rhythm_consistency',
    'avg_key_distance', 'std_key_distance',
    'error_rate', 'correction_rate',
    'copy_paste_frequency',
    'mouse_speed', 'mouse_acceleration', 'mouse_jerk'
]

In [7]:
df_human = pd.DataFrame(human_data, columns=columns)
df_bot = pd.DataFrame(bot_data, columns=columns)

df_human['target'] = 'human'
df_bot['target'] = 'bot'

df = pd.concat([df_human, df_bot], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
def clean_data(df):
    df = df.replace([np.inf, -np.inf], np.nan)

    problematic_columns = df.columns[df.isin([np.inf, -np.inf, np.nan]).any()].tolist()

    print("Columns with NaN or infinite values:")
    for col in problematic_columns:
        nan_count = df[col].isna().sum()
        inf_count = np.isinf(df[col]).sum()
        print(f"{col}: NaN count = {nan_count}, Inf count = {inf_count}")

    for col in problematic_columns:
        median_value = df[col].median()
        df[col] = df[col].replace([np.inf, -np.inf, np.nan], median_value)

    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            lower_bound = df[column].quantile(0.001)
            upper_bound = df[column].quantile(0.999)
            df[column] = df[column].clip(lower_bound, upper_bound)

    return df

In [9]:
df = clean_data(df)
df['target'] = (df['target'] == 'bot').astype(int)

X = df.drop('target', axis=1).values
y = df['target'].values

Columns with NaN or infinite values:
rhythm_consistency: NaN count = 27, Inf count = 0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

In [13]:
def create_rag_model(input_shape):
    inputs = tf.keras.Input(shape=input_shape)

    # First LSTM layer
    lstm_out = LSTM(128, return_sequences=True, kernel_regularizer=l2(0.01))(inputs)
    lstm_out = BatchNormalization()(lstm_out)
    lstm_out = Dropout(0.3)(lstm_out)

    attention_out = Attention()([lstm_out, lstm_out])

    lstm_out = LSTM(64, kernel_regularizer=l2(0.01))(attention_out)
    lstm_out = BatchNormalization()(lstm_out)
    lstm_out = Dropout(0.3)(lstm_out)

    dense_out = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(lstm_out)
    dense_out = BatchNormalization()(dense_out)
    dense_out = Dropout(0.3)(dense_out)

    outputs = Dense(1, activation='sigmoid')(dense_out)

    model = tf.keras.Model(inputs, outputs)
    return model

In [16]:
model = create_rag_model((1, X_train_reshaped.shape[2]))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

In [18]:
history = model.fit(X_train_reshaped, y_train,
                    validation_split=0.2,
                    epochs=50,
                    batch_size=64,
                    callbacks=[early_stopping, reduce_lr])

Epoch 1/50




[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9746 - loss: 1.3920 - val_accuracy: 1.0000 - val_loss: 0.5567 - learning_rate: 0.0010
Epoch 2/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 1.0000 - loss: 0.0790 - val_accuracy: 1.0000 - val_loss: 0.1918 - learning_rate: 0.0010
Epoch 3/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0083 - val_accuracy: 1.0000 - val_loss: 0.0029 - learning_rate: 0.0010
Epoch 4/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 1.0000 - loss: 0.0018 - val_accuracy: 1.0000 - val_loss: 8.5810e-04 - learning_rate: 0.0010
Epoch 5/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 1.0000 - loss: 9.6367e-04 - val_accuracy: 1.0000 - val_loss: 5.9875e-04 - learning_rate: 0.0010
Epoch 6/50
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [19]:
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")



[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 1.0000 - loss: 7.8997e-05
Test Loss: 7.9021614510566e-05, Test Accuracy: 1.0


In [20]:
def predict_bot(new_data):
    if new_data.ndim == 1:
        new_data = new_data.reshape(1, -1)

    new_data_scaled = scaler.transform(new_data)

    new_data_reshaped = new_data_scaled.reshape((new_data_scaled.shape[0], 1, new_data_scaled.shape[1]))

    prediction = model.predict(new_data_reshaped)

    is_bot = prediction > 0.5
    confidence = prediction if is_bot else 1 - prediction

    return is_bot[0][0], confidence[0][0]

real_input = np.array([0.2, 0.05, 0.5, 0.1, 0.1, 0.02, 5.0, 0.8, 2.0, 0.5, 0.01, 0.005, 0.001, 300, 100, 50])
is_bot, confidence = predict_bot(real_input)
print(f"Is bot: {is_bot}, Confidence: {confidence}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 612ms/step
Is bot: False, Confidence: 0.9999710321426392
