Training script for credential stuffing attack detection using temporal user login data.
Includes EDA, preprocessing, feature importance, and training of RNN, LSTM, ConvLSTM models.
Validation and evaluation with confusion matrices and metrics table.

## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score, classification_report)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, ConvLSTM2D, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

2025-04-23 10:47:04.291323: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-23 10:47:05.407610: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745405225.725667  769497 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745405225.775347  769497 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745405226.702014  769497 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Paths to datasets
# DATA_DIR = os.path.join("../Dataset")
CLEAN_TRAIN_PATH = 'clean_train.csv'
NOISY_TRAIN_PATH = 'noisy_train.csv'
CLEAN_TEST_PATH = 'clean_test.csv'
NOISY_TEST_PATH = 'noisy_test.csv'


In [3]:
def load_data():
    print("Loading datasets...")
    clean_train = pd.read_csv(CLEAN_TRAIN_PATH)
    noisy_train = pd.read_csv(NOISY_TRAIN_PATH)
    clean_test = pd.read_csv(CLEAN_TEST_PATH)
    noisy_test = pd.read_csv(NOISY_TEST_PATH)
    print("Datasets loaded.")
    return clean_train, noisy_train, clean_test, noisy_test

In [4]:
def perform_eda(df, name):
    print(f"\n--- EDA for {name} ---")
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("Missing values:\n", df.isnull().sum())
    print("Data types:\n", df.dtypes)
    print("Basic statistics:\n", df.describe())
    print("Class distribution:\n", df['is_bot'].value_counts(normalize=True))
    
    # Plot class distribution
    plt.figure(figsize=(6,4))
    sns.countplot(x='is_bot', data=df)
    plt.title(f'Class Distribution in {name}')
    plt.savefig(f'{name}_class_distribution.png')
    plt.close()
    
    # Plot feature distributions for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols.remove('is_bot') if 'is_bot' in numeric_cols else None
    for col in numeric_cols:
        plt.figure(figsize=(6,4))
        sns.histplot(df[col].dropna(), kde=True, bins=30)
        plt.title(f'Distribution of {col} in {name}')
        plt.savefig(f'{name}_{col}_distribution.png')
        plt.close()
    
    # Correlation heatmap
    plt.figure(figsize=(10,8))
    df_corr = df.copy()
    if 'user_id' in df_corr.columns:
        df_corr = df_corr.drop(columns=['user_id'])
    corr = df_corr.corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title(f'Correlation Heatmap for {name}')
    plt.savefig(f'{name}_correlation_heatmap.png')
    plt.close()
    
    print(f"--- EDA for {name} completed ---\n")

In [5]:
def preprocess_data(df):
    # Impute missing values with median
    imputer = SimpleImputer(strategy='median')
    features = df.drop(columns=['bot'])
    features_imputed = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)
    
    # Feature engineering: example - extract hour from timestamp if exists
    if 'timestamp' in features_imputed.columns:
        try:
            features_imputed['timestamp'] = pd.to_datetime(features_imputed['timestamp'])
            features_imputed['hour'] = features_imputed['timestamp'].dt.hour
            features_imputed['dayofweek'] = features_imputed['timestamp'].dt.dayofweek
            features_imputed['is_weekend'] = features_imputed['dayofweek'].isin([5,6]).astype(int)
            features_imputed = features_imputed.drop(columns=['timestamp'])
        except Exception as e:
            print(f"Timestamp feature engineering skipped due to error: {e}")
    
    # Scaling features
    scaler = StandardScaler()
    features_scaled = pd.DataFrame(scaler.fit_transform(features_imputed), columns=features_imputed.columns)
    
    labels = df['bot'].values
    
    return features_scaled, labels, imputer, scaler

In [6]:
def feature_importance_rf(X, y, n_features=10):
    print("\nTraining Random Forest for feature importance...")
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    importances = rf.feature_importances_
    feature_names = X.columns
    feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print("Top features:\n", feat_imp.head(n_features))
    
    # Plot feature importance
    plt.figure(figsize=(8,6))
    sns.barplot(x=feat_imp.head(n_features), y=feat_imp.head(n_features).index)
    plt.title("Top Feature Importances")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.savefig("feature_importance.png")
    plt.close()
    
    top_features = feat_imp.head(n_features).index.tolist()
    return top_features

## Sequence Creation

In [7]:
def create_sequences(X, y, seq_length=10):
    """
    Create sequences of features and labels for RNN/LSTM/ConvLSTM.
    X: pd.DataFrame or np.array of shape (samples, features)
    y: np.array of shape (samples,)
    Returns:
        X_seq: np.array of shape (num_sequences, seq_length, num_features)
        y_seq: np.array of shape (num_sequences,)
    """
    X_values = X.values if isinstance(X, pd.DataFrame) else X
    sequences = []
    labels = []
    for i in range(len(X_values) - seq_length + 1):
        seq = X_values[i:i+seq_length]
        label = y[i+seq_length-1]  # label corresponds to last item in sequence
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

In [8]:
def create_convlstm_sequences(X, y, seq_length=10):
    """
    Prepare data for ConvLSTM2D which expects 5D input: (samples, time_steps, rows, cols, channels)
    We will reshape features into a 2D grid if possible.
    For simplicity, reshape features into (1, num_features, 1) grid.
    """
    X_seq, y_seq = create_sequences(X, y, seq_length)
    # Reshape to (samples, time_steps, rows=1, cols=num_features, channels=1)
    X_seq_reshaped = X_seq.reshape((X_seq.shape[0], X_seq.shape[1], 1, X_seq.shape[2], 1))
    return X_seq_reshaped, y_seq

## Model Building

In [9]:
def build_rnn(input_shape):
    model = Sequential()
    model.add(SimpleRNN(64, activation='relu', input_shape=input_shape, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [10]:
def build_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(64, activation='tanh', input_shape=input_shape, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [11]:
def build_convlstm(input_shape):
    model = Sequential()
    model.add(ConvLSTM2D(filters=32, kernel_size=(1,3), activation='relu', input_shape=input_shape, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Model Evaluation

In [12]:
def evaluate_model(model, X, y, dataset_name, model_name):
    print(f"\nEvaluating {model_name} on {dataset_name} dataset...")
    y_pred_prob = model.predict(X).ravel()
    y_pred = (y_pred_prob >= 0.5).astype(int)
    
    cm = confusion_matrix(y, y_pred)
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y, y_pred_prob)
    
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    
    return {'model': model_name, 'dataset': dataset_name, 'confusion_matrix': cm,
            'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1, 'roc_auc': roc_auc}


## Main function

In [13]:
def main():
    # Load data
    clean_train, noisy_train, clean_test, noisy_test = load_data()
    
    # Perform EDA
    perform_eda(clean_train, "clean_train")
    perform_eda(noisy_train, "noisy_train")
    
    # Preprocess data
    X_clean_train, y_clean_train, imputer_clean, scaler_clean = preprocess_data(clean_train)
    X_noisy_train, y_noisy_train, imputer_noisy, scaler_noisy = preprocess_data(noisy_train)
    
    X_clean_test = clean_test.drop(columns=['bot'])
    y_clean_test = clean_test['bot'].values
    X_clean_test = pd.DataFrame(imputer_clean.transform(X_clean_test), columns=X_clean_test.columns)
    X_clean_test = pd.DataFrame(scaler_clean.transform(X_clean_test), columns=X_clean_test.columns)
    
    X_noisy_test = noisy_test.drop(columns=['bot'])
    y_noisy_test = noisy_test['bot'].values
    X_noisy_test = pd.DataFrame(imputer_noisy.transform(X_noisy_test), columns=X_noisy_test.columns)
    X_noisy_test = pd.DataFrame(scaler_noisy.transform(X_noisy_test), columns=X_noisy_test.columns)
    
    # Feature importance with Random Forest on clean and noisy train sets
    top_features_clean = feature_importance_rf(X_clean_train, y_clean_train, n_features=10)
    top_features_noisy = feature_importance_rf(X_noisy_train, y_noisy_train, n_features=10)
    
    # For modeling, use intersection of top features from both datasets to ensure consistency
    selected_features = list(set(top_features_clean).intersection(set(top_features_noisy)))
    if len(selected_features) < 5:
        # If intersection too small, use union instead
        selected_features = list(set(top_features_clean).union(set(top_features_noisy)))
    print(f"\nSelected features for modeling: {selected_features}")
    
    # Prepare sequences for models
    seq_length = 10
    
    # Clean dataset sequences
    X_clean_train_seq, y_clean_train_seq = create_sequences(X_clean_train[selected_features], y_clean_train, seq_length)
    X_clean_test_seq, y_clean_test_seq = create_sequences(X_clean_test[selected_features], y_clean_test, seq_length)
    
    # Noisy dataset sequences
    X_noisy_train_seq, y_noisy_train_seq = create_sequences(X_noisy_train[selected_features], y_noisy_train, seq_length)
    X_noisy_test_seq, y_noisy_test_seq = create_sequences(X_noisy_test[selected_features], y_noisy_test, seq_length)
    
    # ConvLSTM requires 5D input
    X_clean_train_conv, _ = create_convlstm_sequences(X_clean_train[selected_features], y_clean_train, seq_length)
    X_clean_test_conv, _ = create_convlstm_sequences(X_clean_test[selected_features], y_clean_test, seq_length)
    X_noisy_train_conv, _ = create_convlstm_sequences(X_noisy_train[selected_features], y_noisy_train, seq_length)
    X_noisy_test_conv, _ = create_convlstm_sequences(X_noisy_test[selected_features], y_noisy_test, seq_length)
    
    # Build models
    rnn_model_clean = build_rnn((seq_length, len(selected_features)))
    lstm_model_clean = build_lstm((seq_length, len(selected_features)))
    convlstm_model_clean = build_convlstm((seq_length, 1, len(selected_features), 1))
    
    rnn_model_noisy = build_rnn((seq_length, len(selected_features)))
    lstm_model_noisy = build_lstm((seq_length, len(selected_features)))
    convlstm_model_noisy = build_convlstm((seq_length, 1, len(selected_features), 1))
    
    # Early stopping callback
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    # Train models on clean dataset
    print("\nTraining models on clean dataset...")
    rnn_model_clean.fit(X_clean_train_seq, y_clean_train_seq, epochs=30, batch_size=64,
                        validation_data=(X_clean_test_seq, y_clean_test_seq), callbacks=[early_stop], verbose=2)
    lstm_model_clean.fit(X_clean_train_seq, y_clean_train_seq, epochs=30, batch_size=64,
                        validation_data=(X_clean_test_seq, y_clean_test_seq), callbacks=[early_stop], verbose=2)
    convlstm_model_clean.fit(X_clean_train_conv, y_clean_train_seq, epochs=30, batch_size=64,
                            validation_data=(X_clean_test_conv, y_clean_test_seq), callbacks=[early_stop], verbose=2)
    
    # Train models on noisy dataset
    print("\nTraining models on noisy dataset...")
    rnn_model_noisy.fit(X_noisy_train_seq, y_noisy_train_seq, epochs=30, batch_size=64,
                       validation_data=(X_noisy_test_seq, y_noisy_test_seq), callbacks=[early_stop], verbose=2)
    lstm_model_noisy.fit(X_noisy_train_seq, y_noisy_train_seq, epochs=30, batch_size=64,
                       validation_data=(X_noisy_test_seq, y_noisy_test_seq), callbacks=[early_stop], verbose=2)
    convlstm_model_noisy.fit(X_noisy_train_conv, y_noisy_train_seq, epochs=30, batch_size=64,
                           validation_data=(X_noisy_test_conv, y_noisy_test_seq), callbacks=[early_stop], verbose=2)
    
    # Evaluate models and generate confusion matrices
    results = []
    # Clean dataset evaluations
    results.append(evaluate_model(rnn_model_clean, X_clean_test_seq, y_clean_test_seq, "clean", "RNN"))
    results.append(evaluate_model(lstm_model_clean, X_clean_test_seq, y_clean_test_seq, "clean", "LSTM"))
    results.append(evaluate_model(convlstm_model_clean, X_clean_test_conv, y_clean_test_seq, "clean", "ConvLSTM"))
    
    # Noisy dataset evaluations
    results.append(evaluate_model(rnn_model_noisy, X_noisy_test_seq, y_noisy_test_seq, "noisy", "RNN"))
    results.append(evaluate_model(lstm_model_noisy, X_noisy_test_seq, y_noisy_test_seq, "noisy", "LSTM"))
    results.append(evaluate_model(convlstm_model_noisy, X_noisy_test_conv, y_noisy_test_seq, "noisy", "ConvLSTM"))
    
    # Create results table for noisy dataset
    noisy_results = [r for r in results if r['dataset'] == 'noisy']
    results_df = pd.DataFrame(noisy_results)[['model', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']]
    print("\nNoisy Dataset Results Summary:")
    print(results_df.to_string(index=False))
    
    # Save results table to CSV
    results_df.to_csv("noisy_dataset_results_summary.csv", index=False)

In [14]:
if __name__ == "__main__":
    main()

Loading datasets...
Datasets loaded.

--- EDA for clean_train ---
Shape: (25000, 14)
Columns: ['user_id', 'login_attempts', 'failed_logins', 'unusual_time_access', 'ip_rep_score', 'browser_type', 'new_device_login', 'session_duration_deviation', 'network_packet_size_variance', 'mouse_speed', 'typing_speed', 'day_of_week', 'time_of_day', 'is_bot']
Missing values:
 user_id                         0
login_attempts                  0
failed_logins                   0
unusual_time_access             0
ip_rep_score                    0
browser_type                    0
new_device_login                0
session_duration_deviation      0
network_packet_size_variance    0
mouse_speed                     0
typing_speed                    0
day_of_week                     0
time_of_day                     0
is_bot                          0
dtype: int64
Data types:
 user_id                          object
login_attempts                    int64
failed_logins                     int64
unusual_time

ValueError: could not convert string to float: 'Firefox'

<Figure size 1000x800 with 0 Axes>