In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the CSV once; keep the raw frame untouched and do all work on a copy.
df_original = pd.read_csv("/kaggle/input/rt-iot2022/RT_IOT2022.csv")
df = df_original.copy(deep=True)

print("Original dataset shape:", df_original.shape)
print("Original Attack Types:", df_original['Attack_type'].unique())

# Total number of missing cells across the whole frame.
print("\nMissing values in dataset:", df.isna().sum().sum())

# Columns stored with dtype object; these are label-encoded further down.
categorical_columns = df.select_dtypes(include='object').columns
print("\nCategorical Columns:", categorical_columns)

# Snapshot the raw string labels so the target encoder is fitted on the
# original values.
attack_type_copy = df['Attack_type'].copy()

# Label-encode every categorical feature column, keeping one fitted encoder
# per column so the integer codes can be decoded later.
label_encoders = {}
for col in categorical_columns:
    if col == 'Attack_type':  # the target is encoded separately below
        continue
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Encode the target with its own encoder.
label_encoders['Attack_type'] = LabelEncoder()
df['Attack_type'] = label_encoders['Attack_type'].fit_transform(attack_type_copy)

# Map each original label to its integer code. LabelEncoder assigns codes by
# position in `classes_`, so enumerating the classes yields the same pairs as
# zipping the two full-length columns, without a Python-level pass over every
# row of the dataset.
attack_type_mapping = {
    label: code
    for code, label in enumerate(label_encoders['Attack_type'].classes_)
}
print("\nAttack Type Mapping:")
print(attack_type_mapping)

# Derive the binary target `Attack_label` from the encoded `Attack_type`
# column: rows equal to the reference ("normal") code get 0, all others get 1.
if 'Attack_type' not in df.columns:
    print("⚠️ 'Attack_type' column is missing. Check dataset format.")
else:
    print("\nUnique Attack Types:", df['Attack_type'].unique())

    if 'Normal' not in label_encoders['Attack_type'].classes_:
        # No explicit 'Normal' class: fall back to the most frequent encoded
        # class as the reference (label 0).
        # NOTE(review): this presumes the most common class is benign
        # traffic - confirm against the dataset's class list.
        print("⚠️ Warning: 'Normal' label is missing from the dataset. Assigning default label.")
        df['Attack_label'] = df['Attack_type'].ne(df['Attack_type'].mode()[0]).astype(int)
    else:
        # Look up the integer code the encoder assigned to 'Normal'.
        normal_label = label_encoders['Attack_type'].transform(['Normal'])[0]
        df['Attack_label'] = df['Attack_type'].ne(normal_label).astype(int)

    print("\nBinary Label Distribution:")
    print(df['Attack_label'].value_counts())

# Count +/-inf cells in the numeric columns.
print("\nInfinite values in dataset:", np.isinf(df.select_dtypes(include=np.number)).sum().sum())

# Function to clean dataset
def clean_dataset(df):
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. Replace +inf / -inf with NaN.
      2. Drop every column whose count of non-missing values is below half
         the number of rows.
      3. Fill the remaining NaN in numeric columns with that column's median.

    Non-numeric columns are kept untouched: the median is computed with
    ``numeric_only=True`` so a stray object column no longer makes the
    median computation raise.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame with the steps above applied.
    """
    # Non-inplace replace returns a new frame, so the caller's data is safe.
    df_clean = df.replace([np.inf, -np.inf], np.nan)

    # `thresh` is the minimum number of non-missing values a column needs
    # in order to be kept.
    min_non_missing = len(df_clean) * 0.5
    df_clean = df_clean.dropna(thresh=min_non_missing, axis=1)

    # Per-column medians of the numeric columns only; fillna aligns them by
    # column name.
    return df_clean.fillna(df_clean.median(numeric_only=True))

# Run the cleaning pass over the encoded frame.
df_processed = clean_dataset(df)

# Report how many missing cells are left after cleaning.
print("Final check for missing values:", df_processed.isna().sum().sum())

# Guard against a single-class target: when every row carries the same
# Attack_label, try to bring in rows of the other class.
if 'Attack_label' in df_processed.columns and df_processed['Attack_label'].nunique() == 1:
    majority_class = df_processed['Attack_label'].iloc[0]
    print(f"\n⚠️ WARNING: Only one class ({majority_class}) found in the dataset!")

    if majority_class == 1:  # All attacks
        print("Attempting to find original dataset with normal samples...")
        try:
            additional_df = pd.read_csv("/kaggle/input/rt-iot2022/Original_RT_IOT2022.csv")

            if 'Attack_type' in additional_df.columns:
                additional_df['Attack_label'] = (additional_df['Attack_type'] != 'Normal').astype(int)

                if additional_df['Attack_label'].nunique() > 1:
                    print("✅ Original dataset contains both classes. Using it instead.")
                    # Up to 1000 rows of each class; the normal pool is
                    # filtered once instead of twice.
                    # NOTE(review): rows taken from additional_df have not
                    # been label-encoded or cleaned like df_processed -
                    # confirm the downstream scaling can handle them.
                    normal_pool = additional_df[additional_df['Attack_label'] == 0]
                    normal_samples = normal_pool.sample(
                        min(1000, len(normal_pool)),
                        random_state=42
                    )
                    attack_samples = df_processed.sample(min(1000, len(df_processed)), random_state=42)
                    df_processed = pd.concat([normal_samples, attack_samples]).reset_index(drop=True)
                    print(f"New dataset shape: {df_processed.shape}")
                    print(f"New class distribution: {df_processed['Attack_label'].value_counts()}")
        except Exception as exc:
            # Best-effort fallback. `Exception` (rather than a bare except)
            # lets KeyboardInterrupt / SystemExit propagate, and the reason is
            # printed so an unexpected failure is not silently hidden.
            print("Could not find original dataset with normal samples.")
            print(f"Reason: {exc!r}")
            print("Creating synthetic normal samples...")

            # Never ask for more rows than exist; sample() raises otherwise.
            attack_samples = df_processed.sample(min(100, len(df_processed)), random_state=42)
            normal_samples = attack_samples.copy()
            numeric_cols = normal_samples.select_dtypes(include=[np.number]).columns

            # Jitter every numeric feature with Gaussian noise whose spread is
            # half of that column's standard deviation within the sample.
            for col in numeric_cols:
                if col != 'Attack_label':
                    std = normal_samples[col].std()
                    normal_samples[col] = normal_samples[col] + np.random.normal(0, std/2, len(normal_samples))

            if 'Attack_type' in normal_samples.columns:
                normal_samples['Attack_type'] = 'Normal'
            normal_samples['Attack_label'] = 0

            df_processed = pd.concat([df_processed, normal_samples]).reset_index(drop=True)

            print(f"New dataset with synthetic samples shape: {df_processed.shape}")
            print(f"New class distribution: {df_processed['Attack_label'].value_counts()}")

# Separate features and target
X = df_processed.drop(columns=["Attack_type", "Attack_label"])
y = df_processed["Attack_label"]  # Using binary labels for classification

# Split into training and test sets (80-20 split) BEFORE scaling, so the
# scaler's mean/variance are estimated from training rows only and no
# test-set statistics leak into the features. The same random_state and row
# count are used as before, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize numerical features using training-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for code that expects the full scaled matrix under this name.
X_scaled = scaler.transform(X)

print("\nDataset preprocessed successfully!")
print("Original dataset shape:", df_original.shape)
print("Processed dataset shape:", df_processed.shape)
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

# For every fitted encoder, record original label -> integer code
label_encoder_mappings = {
    col: dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    for col, encoder in label_encoders.items()
}
print("\nLabel Encoder Mappings saved for future reference")

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Preprocess Data
# NOTE(review): this starts from `df`, not the cleaned `df_processed`;
# dropna() removes NaN rows but not +/-inf values - confirm that is intended.
df = df.dropna()  # Remove NaN values
# Attack_label is derived directly from Attack_type, so leaving Attack_type
# among the features would hand the target to the model. Drop both.
X = df.drop(columns=['Attack_type', 'Attack_label'])
y = df['Attack_label']
# Encode categorical variables
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])
# Train-Test Split first, so neither the scaler nor SMOTE ever sees test rows
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Standardize numerical features with statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # full scaled matrix, kept under its old name
# Balance ONLY the training partition using SMOTE; the test set keeps real,
# un-synthesised samples in their natural class proportions
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled
# Step 2: Build a More Robust Autoencoder Model
# Dense stack 128 -> 64 -> 32 -> 64 -> 128 -> input_dim, assembled layer by
# layer. The final linear layer has the same width as the input.
input_dim = X_train.shape[1]
autoencoder = keras.models.Sequential()
autoencoder.add(keras.layers.Dense(128, activation="relu", input_shape=(input_dim,)))
autoencoder.add(keras.layers.BatchNormalization())
autoencoder.add(keras.layers.Dropout(0.3))
autoencoder.add(keras.layers.Dense(64, activation="relu"))
autoencoder.add(keras.layers.BatchNormalization())
autoencoder.add(keras.layers.Dropout(0.3))
autoencoder.add(keras.layers.Dense(32, activation="relu"))
autoencoder.add(keras.layers.BatchNormalization())
autoencoder.add(keras.layers.Dense(64, activation="relu"))
autoencoder.add(keras.layers.Dense(128, activation="relu"))
autoencoder.add(keras.layers.Dense(input_dim, activation="linear"))  # Reconstruct original input
autoencoder.compile(loss="mse", optimizer="adam")

# Step 3: Train Autoencoder on Normal Data
# Only rows labelled 0 are used; the network is fitted to reproduce its input.
X_train_normal = X_train[y_train == 0]
history = autoencoder.fit(
    x=X_train_normal,
    y=X_train_normal,
    batch_size=64,
    epochs=200,
    validation_split=0.1,
    verbose=1,
)
# Step 4: Anomaly Detection using Reconstruction Error
# Per-sample mean squared reconstruction error on the test set.
reconstructed = autoencoder.predict(X_test)
mse = np.mean(np.square(X_test - reconstructed), axis=1)

# Step 5: Dynamic Threshold Calculation
# The threshold is the 95th percentile of the reconstruction errors of the
# NORMAL TRAINING samples. Deriving it from the test-set errors (as before)
# uses test information and flags a fixed 5% of the test rows no matter
# what they contain.
reconstructed_normal = autoencoder.predict(X_train_normal)
mse_normal = np.mean(np.square(X_train_normal - reconstructed_normal), axis=1)
threshold = np.percentile(mse_normal, 95)
y_pred_autoencoder = (mse > threshold).astype(int)  # 1 for anomaly, 0 for normal

# Step 6: Isolation Forest with Hyperparameter Tuning
# Fitted on the normal training errors, then applied to the test errors.
iso_forest = IsolationForest(n_estimators=500, contamination="auto", random_state=42)
iso_forest.fit(mse_normal.reshape(-1, 1))
y_pred_iso = iso_forest.predict(mse.reshape(-1, 1))
y_pred_iso = np.where(y_pred_iso == -1, 1, 0)  # Convert -1 (anomaly) to 1 (attack)

# Step 7: Combine Both Models (logical OR)
# A sample is classified as an attack if either detector flags it.
y_final_pred = ((y_pred_autoencoder + y_pred_iso) > 0).astype(int)
# Step 8: Model Evaluation
# Accuracy and the confusion matrix use the combined 0/1 predictions;
# ROC-AUC uses the raw reconstruction error as the score.
conf_matrix = confusion_matrix(y_test, y_final_pred)
roc_auc = roc_auc_score(y_test, mse)
accuracy = accuracy_score(y_test, y_final_pred)
print("\n🔹 Accuracy:", accuracy)
print("\n🔹 ROC-AUC Score:", roc_auc)
print("\n🔹 Classification Report (Autoencoder + Isolation Forest):\n", classification_report(y_test, y_final_pred))

# Step 9: Confusion Matrix Visualization
class_names = ["Normal", "Attack"]
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Step 10: Save the Models
# Persist the autoencoder, the fitted scaler and the isolation forest to the
# current working directory.
autoencoder.save("autoencoder_model.h5")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(iso_forest, "isolation_forest.pkl")

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np

# Step 1: Turn the teacher (autoencoder) reconstruction error on the training
# data into per-sample soft labels in (0, 1).
reconstructed_train = autoencoder.predict(X_train)
mse_train = np.mean(np.square(X_train - reconstructed_train), axis=1)

# Min-max scale the errors to [0, 1] ...
mse_low = np.min(mse_train)
mse_high = np.max(mse_train)
mse_scaled = (mse_train - mse_low) / (mse_high - mse_low)

# ... then squash through a sigmoid centred at 0.5 with slope 10, and add a
# trailing axis so the result has shape (n_samples, 1).
teacher_preds_np = (1 / (1 + np.exp(-10 * (mse_scaled - 0.5))))[..., np.newaxis]

# Tensor form of the teacher predictions, consumed sample by sample below.
teacher_preds_tensor = tf.convert_to_tensor(teacher_preds_np, dtype=tf.float32)

# Step 2: Dataset yielding (x, y, teacher_pred) triples, shuffled and batched.
batch_size = 64
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train, teacher_preds_tensor))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

# Step 3: Define the Student Model
# Two hidden layers (64 and 32 units) ending in a single sigmoid unit,
# assembled layer by layer.
student_model = keras.models.Sequential()
student_model.add(keras.layers.Dense(64, activation="relu", input_shape=(input_dim,)))
student_model.add(keras.layers.BatchNormalization())
student_model.add(keras.layers.Dropout(0.2))
student_model.add(keras.layers.Dense(32, activation="relu"))
student_model.add(keras.layers.BatchNormalization())
student_model.add(keras.layers.Dense(1, activation="sigmoid"))  # Binary classification

# Adam with its default settings drives the custom training loop.
optimizer = tf.keras.optimizers.Adam()

# Step 4: Define the distillation loss as a function that works on a batch
def distillation_loss(y_true, y_pred, teacher_preds, temperature=5.0, alpha=0.7):
    """Per-sample distillation loss for a single-output (sigmoid) student.

    The loss is ``alpha * hard + (1 - alpha) * soft`` where

    * ``hard`` is the binary cross-entropy between ``y_true`` and ``y_pred``;
    * ``soft`` is the KL divergence between the temperature-softened teacher
      and student Bernoulli distributions ``[1 - p, p]``.

    The earlier version applied ``tf.nn.softmax`` to tensors whose last axis
    has size 1. A softmax over a single element is always 1.0, so teacher and
    student "distributions" were identical and the KL term was always zero.
    Here each probability is converted to a logit, divided by
    ``temperature`` and passed back through a sigmoid, which is the
    two-class equivalent of a softened softmax.

    Args:
        y_true: ground-truth labels in {0, 1}, shape (batch, 1).
        y_pred: student probabilities in (0, 1), shape (batch, 1).
        teacher_preds: teacher probabilities in (0, 1), shape (batch, 1).
        temperature: softening temperature; larger values flatten both
            distributions.
        alpha: weight of the hard loss; ``1 - alpha`` weights the soft loss.

    Returns:
        Tensor of shape (batch,) holding one loss value per sample.
    """
    eps = 1e-7  # keeps log() and the logit finite at exactly 0 or 1

    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    teacher_preds = tf.cast(teacher_preds, tf.float32)

    # Hard loss: binary crossentropy against the true labels -> (batch,)
    hard_loss = tf.keras.losses.binary_crossentropy(y_true, y_pred)

    def _soften(probs):
        # probability -> logit -> temperature scaling -> probability
        probs = tf.clip_by_value(probs, eps, 1.0 - eps)
        logits = tf.math.log(probs) - tf.math.log(1.0 - probs)
        return tf.clip_by_value(tf.sigmoid(logits / temperature), eps, 1.0 - eps)

    teacher_soft = _soften(teacher_preds)
    student_soft = _soften(y_pred)

    # Soft loss: KL(teacher || student) over the two outcomes of a Bernoulli
    # variable, reduced over the trailing axis -> (batch,)
    kl = (
        teacher_soft * tf.math.log(teacher_soft / student_soft)
        + (1.0 - teacher_soft) * tf.math.log((1.0 - teacher_soft) / (1.0 - student_soft))
    )
    soft_loss = tf.reduce_mean(kl, axis=-1)

    # Combine the losses
    return alpha * hard_loss + (1 - alpha) * soft_loss

# Step 5: Custom training loop
# Each epoch iterates over every batch, applies one optimizer update per
# batch, and prints the mean of the per-batch mean losses.
epochs = 100

for epoch in range(epochs):
    epoch_loss = 0.0
    for step, (x_batch, y_batch, teacher_batch) in enumerate(dataset):
        # Labels get a trailing axis and are cast to float32 before being
        # handed to the loss.
        y_true_batch = tf.cast(tf.expand_dims(y_batch, axis=-1), tf.float32)
        with tf.GradientTape() as tape:
            # Forward pass in training mode
            predictions = student_model(x_batch, training=True)
            # Loss against both the true labels and this batch's teacher outputs
            loss = distillation_loss(y_true_batch, predictions, teacher_batch)
        # Backward pass and weight update
        trainable_vars = student_model.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        optimizer.apply_gradients(zip(gradients, trainable_vars))
        epoch_loss += tf.reduce_mean(loss)
    # step is the zero-based index of the last batch, so step + 1 is the
    # number of batches seen this epoch.
    mean_epoch_loss = epoch_loss.numpy() / (step + 1)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_epoch_loss}")

# Step 6: Evaluate the student model on X_test
# student_preds holds the sigmoid outputs; the binary predictions threshold
# them at 0.5.
student_preds = student_model.predict(X_test)
student_preds_binary = (student_preds > 0.5).astype(int)

accuracy_student = accuracy_score(y_test, student_preds_binary)
# ROC-AUC is a ranking metric, so it is computed from the continuous
# probabilities. Feeding it the thresholded 0/1 predictions collapses the
# curve to a single operating point and under-reports the model's ranking
# quality.
roc_auc_student = roc_auc_score(y_test, student_preds.ravel())

print("\n🔹 Student Model Accuracy:", accuracy_student)
print("\n🔹 Student Model ROC-AUC Score:", roc_auc_student)
print("\n🔹 Classification Report (Student Model):\n", classification_report(y_test, student_preds_binary))

# Optionally, save the student model
student_model.save("student_model.h5")
