In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from scipy.stats import entropy


In [24]:

def generate_keystroke_data(num_samples, is_human=True):
    data = []
    for _ in range(num_samples):
        sequence = []
        sequence_length = np.random.randint(20, 100)  # Longer sequences for more features

        # Basic timing features
        keystroke_times = []
        pause_times = []
        key_hold_times = []

        # Keyboard layout analysis
        key_distances = []

        # Error and correction
        errors_made = 0
        corrections_made = 0

        # Copy-paste
        copy_paste_events = 0

        for i in range(sequence_length):
            if is_human:
                keystroke_time = np.random.normal(0.2, 0.05)
                pause_time = np.random.normal(0.5, 0.2)
                key_hold_time = np.random.normal(0.1, 0.03)
                key_distance = np.random.normal(2, 1)  # Average distance between keys
            else:
                keystroke_time = np.random.normal(0.05, 0.01)
                pause_time = np.random.normal(0.1, 0.05)
                key_hold_time = np.random.normal(0.05, 0.01)
                key_distance = np.random.normal(1.5, 0.5)  # More consistent distances for bots

            keystroke_times.append(keystroke_time)
            pause_times.append(pause_time)
            key_hold_times.append(key_hold_time)
            key_distances.append(key_distance)

            # Simulate errors and corrections (only for humans)
            if is_human and np.random.random() < 0.05:  # 5% chance of error
                errors_made += 1
                if np.random.random() < 0.8:  # 80% chance of correcting the error
                    corrections_made += 1

            # Simulate copy-paste (more likely for bots)
            if (not is_human and np.random.random() < 0.1) or (is_human and np.random.random() < 0.02):
                copy_paste_events += 1

        # Calculate features
        avg_keystroke_time = np.mean(keystroke_times)
        std_keystroke_time = np.std(keystroke_times)
        avg_pause_time = np.mean(pause_times)
        std_pause_time = np.std(pause_times)
        avg_key_hold_time = np.mean(key_hold_times)
        std_key_hold_time = np.std(key_hold_times)

        typing_speed = len(keystroke_times) / sum(pause_times)

        # Keystroke rhythm consistency (lower entropy means more consistent)
        rhythm_consistency = entropy(keystroke_times)

        # Keyboard layout analysis
        avg_key_distance = np.mean(key_distances)
        std_key_distance = np.std(key_distances)

        # Error and correction rates
        error_rate = errors_made / sequence_length
        correction_rate = corrections_made / max(errors_made, 1)

        # Copy-paste frequency
        copy_paste_frequency = copy_paste_events / sequence_length

        # Mouse movement simulation (if bot, more direct movements)
        if is_human:
            mouse_speed = np.random.normal(500, 100)  # pixels per second
            mouse_acceleration = np.random.normal(200, 50)  # pixels per second^2
            mouse_jerk = np.random.normal(100, 30)  # pixels per second^3
        else:
            mouse_speed = np.random.normal(800, 50)
            mouse_acceleration = np.random.normal(100, 20)
            mouse_jerk = np.random.normal(50, 10)

        features = [
            avg_keystroke_time, std_keystroke_time,
            avg_pause_time, std_pause_time,
            avg_key_hold_time, std_key_hold_time,
            typing_speed, rhythm_consistency,
            avg_key_distance, std_key_distance,
            error_rate, correction_rate,
            copy_paste_frequency,
            mouse_speed, mouse_acceleration, mouse_jerk
        ]

        data.append(features)

    return data

# Generate data
num_human_samples = 15000
num_bot_samples = 15000

human_data = generate_keystroke_data(num_human_samples, is_human=True)
bot_data = generate_keystroke_data(num_bot_samples, is_human=False)

In [25]:
columns = [
    'avg_keystroke_time', 'std_keystroke_time',
    'avg_pause_time', 'std_pause_time',
    'avg_key_hold_time', 'std_key_hold_time',
    'typing_speed', 'rhythm_consistency',
    'avg_key_distance', 'std_key_distance',
    'error_rate', 'correction_rate',
    'copy_paste_frequency',
    'mouse_speed', 'mouse_acceleration', 'mouse_jerk'
]

In [26]:
df_human = pd.DataFrame(human_data, columns=columns)
df_bot = pd.DataFrame(bot_data, columns=columns)

df_human['target'] = 'human'
df_bot['target'] = 'bot'

df = pd.concat([df_human, df_bot], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the data

In [27]:
# Save to CSV
df.to_csv('extended_keystroke_data.csv', index=False)

print("Dataset shape:", df.shape)
print("\nFeature summary:")
print(df.describe())

print("\nCorrelation with target:")
df['target'] = (df['target'] == 'bot').astype(int)


Dataset shape: (30000, 17)

Feature summary:
       avg_keystroke_time  std_keystroke_time  avg_pause_time  std_pause_time  \
count        30000.000000        30000.000000    30000.000000    30000.000000   
mean             0.124996            0.029538        0.299961        0.123157   
std              0.075153            0.020016        0.201156        0.075348   
min              0.043099            0.005227        0.066782        0.025705   
25%              0.050019            0.009855        0.099921        0.049252   
50%              0.110366            0.019599        0.256354        0.096097   
75%              0.200054            0.049201        0.499932        0.197085   
max              0.238976            0.078767        0.638511        0.298507   

       avg_key_hold_time  std_key_hold_time  typing_speed  rhythm_consistency  \
count       30000.000000       30000.000000  30000.000000        3.000000e+04   
mean            0.074993           0.019705      6.035243      

In [28]:
df.head()

Unnamed: 0,avg_keystroke_time,std_keystroke_time,avg_pause_time,std_pause_time,avg_key_hold_time,std_key_hold_time,typing_speed,rhythm_consistency,avg_key_distance,std_key_distance,error_rate,correction_rate,copy_paste_frequency,mouse_speed,mouse_acceleration,mouse_jerk,target
0,0.197602,0.057079,0.4618,0.180428,0.10232,0.030056,2.165437,4.162028,1.9411,0.943915,0.0,0.0,0.044776,681.941508,186.748584,62.243487,0
1,0.048166,0.009769,0.107274,0.046998,0.047279,0.012149,9.321928,3.716556,1.38834,0.383072,0.0,0.0,0.047619,749.708532,114.762483,45.637833,1
2,0.189257,0.042804,0.490689,0.222105,0.100165,0.020398,2.037951,3.064403,1.692389,1.112901,0.045455,1.0,0.045455,410.888275,256.24677,148.719917,0
3,0.209773,0.05836,0.504471,0.146404,0.096539,0.027422,1.982273,3.719774,2.106981,1.165748,0.046512,1.0,0.0,674.376308,192.868554,118.740285,0
4,0.047458,0.010797,0.099804,0.045919,0.04941,0.010377,10.01965,4.379329,1.508945,0.557317,0.0,0.0,0.097561,811.862445,95.958185,58.182956,1


In [29]:
def clean_data(df):
    # Replace infinity with NaN
    df = df.replace([np.inf, -np.inf], np.nan)

    # Identify columns with NaN or infinite values
    problematic_columns = df.columns[df.isin([np.inf, -np.inf, np.nan]).any()].tolist()

    print("Columns with NaN or infinite values:")
    for col in problematic_columns:
        nan_count = df[col].isna().sum()
        inf_count = np.isinf(df[col]).sum()
        print(f"{col}: NaN count = {nan_count}, Inf count = {inf_count}")

    # For problematic columns, replace NaN and Inf with median
    for col in problematic_columns:
        median_value = df[col].median()
        df[col] = df[col].replace([np.inf, -np.inf, np.nan], median_value)

    # Clip extremely large values
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:
            lower_bound = df[column].quantile(0.001)
            upper_bound = df[column].quantile(0.999)
            df[column] = df[column].clip(lower_bound, upper_bound)

    return df

In [30]:
df = clean_data(df)

# Convert target to numeric
df['target'] = (df['target'] == 'bot').astype(int)

# Prepare features and target
X = df.drop('target', axis=1).values
y = df['target'].values

Columns with NaN or infinite values:
rhythm_consistency: NaN count = 24, Inf count = 0


In [31]:
print("\nAfter cleaning:")
print("Any inf values in X:", np.any(np.isinf(X)))
print("Any nan values in X:", np.any(np.isnan(X)))

# Print data info
print("\nDataset shape after cleaning:", df.shape)
print("\nFeature summary:")
print(df.describe())


After cleaning:
Any inf values in X: False
Any nan values in X: False

Dataset shape after cleaning: (30000, 17)

Feature summary:
       avg_keystroke_time  std_keystroke_time  avg_pause_time  std_pause_time  \
count        30000.000000        30000.000000    30000.000000    30000.000000   
mean             0.124993            0.029536        0.299948        0.123150   
std              0.075147            0.020011        0.201131        0.075327   
min              0.045316            0.006638        0.076435        0.033356   
25%              0.050019            0.009855        0.099921        0.049252   
50%              0.110366            0.019599        0.256354        0.096097   
75%              0.200054            0.049201        0.499932        0.197085   
max              0.222432            0.065545        0.591849        0.262396   

       avg_key_hold_time  std_key_hold_time  typing_speed  rhythm_consistency  \
count       30000.000000       30000.000000  30000.000000

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
# Reshape data for LSTM input (samples, time steps, features)
X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))


In [34]:
model = Sequential([
    LSTM(128, input_shape=(1, X_train_reshaped.shape[2]), return_sequences=True, kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(64, kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)


In [35]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)


In [36]:
history = model.fit(
    X_train_reshaped, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)


Epoch 1/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.6922 - loss: 1.9832 - val_accuracy: 1.0000 - val_loss: 0.4043 - learning_rate: 0.0010
Epoch 2/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9974 - loss: 0.2275 - val_accuracy: 1.0000 - val_loss: 0.0592 - learning_rate: 0.0010
Epoch 3/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9996 - loss: 0.0423 - val_accuracy: 1.0000 - val_loss: 0.0131 - learning_rate: 0.0010
Epoch 4/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0072 - val_accuracy: 1.0000 - val_loss: 0.0029 - learning_rate: 0.0010
Epoch 5/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0027 - val_accuracy: 1.0000 - val_loss: 0.0014 - learning_rate: 0.0010
Epoch 6/100
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [38]:
# Make predictions
y_pred = model.predict(X_test_reshaped)
y_pred_classes = (y_pred > 0.5).astype(int).flatten()


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [39]:
def predict_bot(new_data):
    # Ensure new_data is a 2D array
    if new_data.ndim == 1:
        new_data = new_data.reshape(1, -1)

    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Reshape for LSTM input
    new_data_reshaped = new_data_scaled.reshape((new_data_scaled.shape[0], 1, new_data_scaled.shape[1]))

    # Make prediction
    prediction = model.predict(new_data_reshaped)

    # Interpret prediction
    is_bot = prediction > 0.5
    confidence = prediction if is_bot else 1 - prediction

    return is_bot[0][0], confidence[0][0]


**PREDICTION**


In [41]:
real_input = np.array([0.2, 0.05, 0.5, 0.1, 0.1, 0.02, 5.0, 0.8, 2.0, 0.5, 0.01, 0.005, 0.001, 300, 100, 50])
is_bot, confidence = predict_bot(real_input)
print(f"Is bot: {is_bot}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Is bot: False
