In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/csic-2010-web-application-attacks/csic_database.csv


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pickle

# 1. Load and prepare data
file_path = "/kaggle/input/csic-2010-web-application-attacks/csic_database.csv"
df = pd.read_csv(file_path)

df["text"] = (
    df["Method"].fillna("") + " " +
    df["URL"].fillna("") + " " +
    df["content"].fillna("") + " " +
    df["User-Agent"].fillna("")
)

X = df["text"].values
y = df["classification"].values  # 0 = Normal, 1 = Attack

# 2. Split dataset (no test set needed for saving the model)
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Tokenization
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')

# 4. Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# 5. Build Bi-GRU model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(GRU(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(GRU(32)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])

# 6. Train model
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(X_train_pad, y_train, epochs=10, batch_size=64,
          validation_split=0.2, callbacks=[early_stop], verbose=1)

# 7. Save model
model.save("bi_gru_model.h5")
print("Bi-GRU model and tokenizer saved successfully.")

2025-06-27 15:56:37.619577: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751039797.869579      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751039797.936415      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-27 15:56:57.208810: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/10
[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 365ms/step - accuracy: 0.8691 - loss: 0.2703 - val_accuracy: 0.9845 - val_loss: 0.0534
Epoch 2/10
[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 363ms/step - accuracy: 0.9892 - loss: 0.0454 - val_accuracy: 0.9873 - val_loss: 0.0479
Epoch 3/10
[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 377ms/step - accuracy: 0.9929 - loss: 0.0338 - val_accuracy: 0.9885 - val_loss: 0.0481
Epoch 4/10
[1m611/611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 361ms/step - accuracy: 0.9935 - loss: 0.0332 - val_accuracy: 0.9886 - val_loss: 0.0490
Bi-GRU model and tokenizer saved successfully.


In [3]:
# Save model and tokenizer in Kaggle's working directory
model.save("/kaggle/working/bi_gru_model.h5")

with open("/kaggle/working/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [4]:
model.summary()