In [39]:
!pip install river -q --quiet
print("✅ river library installed successfully.")

✅ river library installed successfully.


# Step 1: Setup and Data Upload

In [40]:
# --- 1. SETUP AND IMPORTS ---
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import files
import numpy as np
import warnings

from pathlib import Path

from sklearn.exceptions import ConvergenceWarning

# Silence only scikit-learn's ConvergenceWarning. ConvergenceWarning derives
# from UserWarning, so filtering on UserWarning would also hide every other
# user-level warning (e.g. undefined-metric warnings during evaluation).
warnings.filterwarnings('ignore', category=ConvergenceWarning)
print("✅ Libraries imported.")

# --- 2. FILE UPLOAD ---
train_file = 'KDDTrain+.txt'
test_file = 'KDDTest+.txt'

# Prompt for an upload only when a data file is actually missing, so that
# "Restart & Run All" works both on a fresh runtime and on one that already
# holds the files.
missing_files = [path for path in (train_file, test_file) if not Path(path).exists()]
if missing_files:
    print(f"\n⬆️ Please upload: {', '.join(missing_files)}")
    uploaded = files.upload()
    still_missing = [path for path in missing_files if not Path(path).exists()]
    if still_missing:
        # Fail here with a clear message instead of deep inside pd.read_csv.
        raise FileNotFoundError(
            f"Required data file(s) not found after upload: {', '.join(still_missing)}"
        )

# --- 3. PREPARE ENCODERS AND METADATA ---
# The data files carry no header row, so the column names are supplied here.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack', 'difficulty'
]
categorical_cols = ['protocol_type', 'service', 'flag']
# Everything that is neither categorical nor a label/metadata column is numeric.
excluded_cols = set(categorical_cols) | {'attack', 'difficulty'}
numerical_cols = [col for col in columns if col not in excluded_cols]

# Fit the encoders on the categories of BOTH files: LabelEncoder.transform
# raises on labels it has not seen during fit, so the vocabulary must cover
# the test set as well. Only the three categorical columns are read.
df_train_cat = pd.read_csv(train_file, header=None, names=columns, usecols=categorical_cols)
df_test_cat = pd.read_csv(test_file, header=None, names=columns, usecols=categorical_cols)
combined_cat = pd.concat([df_train_cat, df_test_cat], ignore_index=True)

protocol_encoder = LabelEncoder().fit(combined_cat['protocol_type'])
service_encoder = LabelEncoder().fit(combined_cat['service'])
flag_encoder = LabelEncoder().fit(combined_cat['flag'])

# Binary target: 0 = normal traffic, 1 = attack.
all_classes = np.array([0, 1])
# Default streaming chunk size.
# NOTE: the online-training cell further down reassigns `chunksize` (to 200)
# before it is first used, so this value only applies if that line is removed.
chunksize = 2048

print("\n✅ Common setup complete. You can now run any of the model blocks below.")

✅ Libraries imported.

✅ Common setup complete. You can now run any of the model blocks below.


# Step 2: Preprocessing and Model Initialization

In [74]:
# --- 3. DATA AND MODEL INITIALIZATION ---

# The column metadata (`columns`, `categorical_cols`, `numerical_cols`) and the
# three fitted LabelEncoders (`protocol_encoder`, `service_encoder`,
# `flag_encoder`) are created by the setup cell in Step 1. They are reused here
# rather than re-declared: re-fitting would read both data files a second time
# only to produce identical objects.

# StandardScaler to scale numerical features. It will be fitted incrementally.
scaler = StandardScaler()

# Define the models that support online learning via the `partial_fit` method.
# Re-running this cell replaces them with fresh, untrained estimators.
models = {
    # Stochastic Gradient Descent with log loss is equivalent to Logistic Regression
    "Online Logistic Regression (SGD)": SGDClassifier(
        loss='log_loss', random_state=42, learning_rate='adaptive', eta0=0.01
    ),
    # MLPClassifier is a neural network that can be trained incrementally
    "Multi-layer Perceptron": MLPClassifier(
        hidden_layer_sizes=(500, 50), activation='relu', solver='adam',
        random_state=42, learning_rate_init=0.01
    ),
    # Gaussian Naive Bayes can be updated incrementally
    "Gaussian Naive Bayes": GaussianNB()
}

print("✅ Models and preprocessors initialized.")

✅ Models and preprocessors initialized.


# Step 3: Online Training

In [75]:
# --- 4. ONLINE TRAINING LOOP ---
# Streams the training file chunk by chunk and updates the scaler and every
# model incrementally.
# NOTE: this cell is not idempotent — running it again keeps training the
# already-fitted models. Re-run the model-initialization cell first for a
# clean start.

print("🚀 Starting Online Training...")
# NOTE: this reassigns the `chunksize` (2048) defined in the setup cell; the
# evaluation cell below uses whatever value is set here.
chunksize = 200
all_classes = np.array([0, 1]) # [0 for normal, 1 for attack]

# Columns that are labels/metadata rather than model inputs.
non_feature_cols = ['attack', 'difficulty']

# Read the training data in chunks to simulate a stream
for chunk in pd.read_csv(train_file, header=None, names=columns, chunksize=chunksize):
    # --- Preprocessing the chunk ---
    # 1. Encode categorical features as integer codes.
    # NOTE(review): these codes are fed to the models as unscaled ordinal
    # values; one-hot encoding may suit the linear model better — confirm.
    chunk['protocol_type'] = protocol_encoder.transform(chunk['protocol_type'])
    chunk['service'] = service_encoder.transform(chunk['service'])
    chunk['flag'] = flag_encoder.transform(chunk['flag'])

    # 2. Binary target, vectorised: 0 for 'normal', 1 for any other label.
    y_chunk = (chunk['attack'] != 'normal').astype(int)

    # 3. Keep only the feature columns.
    X_chunk = chunk.drop(columns=non_feature_cols)

    # 4. Update the scaler's running statistics with this chunk, then scale the
    #    chunk using the statistics accumulated so far.
    scaler.partial_fit(X_chunk[numerical_cols])
    X_chunk[numerical_cols] = scaler.transform(X_chunk[numerical_cols])

    # --- Incremental Training ---
    for model in models.values():
        # `classes` is required on the first `partial_fit` call because a
        # single chunk is not guaranteed to contain both labels.
        model.partial_fit(X_chunk, y_chunk, classes=all_classes)

print("✅ Online Training Complete.")

🚀 Starting Online Training...
✅ Online Training Complete.


# Step 4: Online Evaluation

In [76]:
# --- 5. ONLINE EVALUATION ---
# Streams the test file ONCE: each chunk is preprocessed a single time and then
# scored by every model. Preprocessing does not depend on the model, so there
# is no need to re-read and re-transform the file for each one.

print("\n🔬 Starting Online Evaluation on the test set...")

# Per-chunk ground truth and, for every model, its per-chunk predictions.
y_true_parts = []
y_pred_parts = {model_name: [] for model_name in models}

for chunk in pd.read_csv(test_file, header=None, names=columns, chunksize=chunksize):
    # --- Preprocessing the test chunk ---
    # Apply the SAME transformations fitted during training
    chunk['protocol_type'] = protocol_encoder.transform(chunk['protocol_type'])
    chunk['service'] = service_encoder.transform(chunk['service'])
    chunk['flag'] = flag_encoder.transform(chunk['flag'])

    # Binary target, vectorised: 0 for 'normal', 1 for any other label.
    y_true_parts.append((chunk['attack'] != 'normal').astype(int).to_numpy())
    X_test_chunk = chunk.drop(columns=['attack', 'difficulty'])

    # Use the already fitted scaler (transform only — never fit on test data).
    X_test_chunk[numerical_cols] = scaler.transform(X_test_chunk[numerical_cols])

    # --- Prediction ---
    for model_name, model in models.items():
        y_pred_parts[model_name].append(model.predict(X_test_chunk))

all_y_true = np.concatenate(y_true_parts)

# Dictionary to store the final performance metrics
results = {}

for model_name in models:
    all_y_pred = np.concatenate(y_pred_parts[model_name])

    # --- Calculate overall metrics ---
    # 'weighted' averages the per-class scores weighted by class support.
    results[model_name] = {
        "Accuracy": accuracy_score(all_y_true, all_y_pred),
        "Precision": precision_score(all_y_true, all_y_pred, average='weighted'),
        "Recall": recall_score(all_y_true, all_y_pred, average='weighted'),
        "F1-Score": f1_score(all_y_true, all_y_pred, average='weighted')
    }
    print(f"  - Evaluation for {model_name} complete.")

print("✅ Online Evaluation Complete.")

# --- 6. DISPLAY RESULTS ---
print("\n--- 📊 Final Model Performance Comparison (Online Learning) 📊 ---")
# One row per model, best F1 first.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by='F1-Score', ascending=False)
print(results_df)


🔬 Starting Online Evaluation on the test set...
  - Evaluation for Online Logistic Regression (SGD) complete.
  - Evaluation for Multi-layer Perceptron complete.
  - Evaluation for Gaussian Naive Bayes complete.
✅ Online Evaluation Complete.

--- 📊 Final Model Performance Comparison (Online Learning) 📊 ---
                                  Accuracy  Precision    Recall  F1-Score
Multi-layer Perceptron            0.843329   0.871453  0.843329  0.843553
Gaussian Naive Bayes              0.770892   0.809196  0.770892  0.770184
Online Logistic Regression (SGD)  0.703380   0.814152  0.703380  0.690603
