In [6]:
import tensorflow as tf
from tensorflow.keras import mixed_precision
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
mixed_precision.set_global_policy('float32')

In [2]:


# Check if GPU is available and configure it
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print("GPU is available:", physical_devices)
    # Configure GPU for optimal performance
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth enabled")
    except:
        print("Memory growth already enabled")
    # Set mixed precision policy for faster GPU computation
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)
    print("Mixed precision policy set to:", policy.name)
else:
    print("No GPU found")

GPU is available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Memory growth enabled
Mixed precision policy set to: mixed_float16


In [3]:


# Load and preprocess the dataset
df = pd.read_csv("/kaggle/input/rt-iot2022real-time-internet-of-things/RT_IOT2022.csv")
df = df.drop(columns=['Unnamed: 0']) if 'Unnamed: 0' in df.columns else df

# Encode categorical variables
label_encoders = {}
categorical_columns = ['proto', 'service', 'Attack_type']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Prepare features and target
X = df.drop(columns=['Attack_type']).values
y = df['Attack_type'].values
num_classes = len(np.unique(y))

# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to proper shape for CNN and convert labels to one-hot
X_train_reshaped = X_train.reshape(-1, X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(-1, X_test.shape[1], 1)
y_train_categorical = tf.keras.utils.to_categorical(y_train, num_classes)
y_test_categorical = tf.keras.utils.to_categorical(y_test, num_classes)

# Calculate class weights instead of using SMOTE (faster)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [4]:


# Create TensorFlow datasets with prefetching for better GPU utilization
BATCH_SIZE = 512  # Larger batch size for GPU
AUTOTUNE = tf.data.AUTOTUNE

train_ds = tf.data.Dataset.from_tensor_slices((X_train_reshaped, y_train_categorical))
train_ds = train_ds.cache().shuffle(10000).batch(BATCH_SIZE).prefetch(AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((X_test_reshaped, y_test_categorical))
val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# TEACHER MODEL: More complex architecture
teacher_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', padding='same', input_shape=(X_train.shape[1], 1)),
    BatchNormalization(momentum=0.9),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    
    Conv1D(128, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(momentum=0.9),
    MaxPooling1D(pool_size=2),
    Dropout(0.4),
    
    Conv1D(256, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(momentum=0.9),
    MaxPooling1D(pool_size=2),
    Dropout(0.4),
    
    Flatten(),
    Dense(128, activation='relu'),
    BatchNormalization(momentum=0.9),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile with proper dtype for mixed precision
teacher_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train teacher with optimized callbacks
teacher_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)
]

print("Training teacher model...")
teacher_history = teacher_model.fit(
    train_ds,
    epochs=15,  # Reduced epochs since we're using larger batches
    validation_data=val_ds,
    callbacks=teacher_callbacks,
    verbose=1
)

# Generate soft targets with teacher model
print("Generating soft targets from teacher model...")
temperature = 4.0
teacher_preds = teacher_model.predict(X_train_reshaped, batch_size=BATCH_SIZE)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training teacher model...
Epoch 1/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 45ms/step - accuracy: 0.6735 - loss: 1.3082 - val_accuracy: 0.9750 - val_loss: 0.0730 - learning_rate: 0.0010
Epoch 2/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9710 - loss: 0.0979 - val_accuracy: 0.9849 - val_loss: 0.0460 - learning_rate: 0.0010
Epoch 3/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9806 - loss: 0.0648 - val_accuracy: 0.9887 - val_loss: 0.0344 - learning_rate: 0.0010
Epoch 4/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9856 - loss: 0.0462 - val_accuracy: 0.9915 - val_loss: 0.0286 - learning_rate: 0.0010
Epoch 5/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9876 - loss: 0.0408 - val_accuracy: 0.9922 - val_loss: 0.0243 - learning_rate: 0.0010
Epoch 6/15
[1m193/193[0m [32m━━━━━━━━━━

In [8]:
# Create custom distillation loss with built-in temperature handling
class DistillationModel(tf.keras.Model):
    def __init__(self, student_model, teacher_predictions, temperature=4.0, alpha=0.1):
        super(DistillationModel, self).__init__()
        self.student_model = student_model
        self.teacher_predictions = teacher_predictions
        self.temperature = temperature
        self.alpha = alpha
        
    def compile(self, optimizer, metrics):
        super(DistillationModel, self).compile(optimizer=optimizer, metrics=metrics)
        self.loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
        
    def train_step(self, data):
        x, y_true = data
        
        with tf.GradientTape() as tape:
            y_pred = self.student_model(x, training=True)
            
            # Hard loss - standard cross-entropy with true labels
            hard_loss = self.loss_fn(y_true, y_pred)
            
            # Soft targets - using pre-computed teacher predictions
            # Get correct indices for the batch
            batch_indices = tf.range(tf.shape(x)[0])
            soft_targets = tf.gather(self.teacher_predictions, batch_indices)
            
            # Apply temperature scaling
            soft_targets = tf.nn.softmax(soft_targets / self.temperature)
            soft_pred = tf.nn.softmax(y_pred / self.temperature)
            
            # Soft loss - KL divergence
            soft_loss = tf.keras.losses.kullback_leibler_divergence(soft_targets, soft_pred)
            soft_loss = soft_loss * (self.temperature ** 2)
            
            # Combine losses
            total_loss = (1 - self.alpha) * hard_loss + self.alpha * soft_loss
        
        # Compute gradients and update weights
        gradients = tape.gradient(total_loss, self.student_model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.student_model.trainable_variables))
        
        # Update metrics
        self.compiled_metrics.update_state(y_true, y_pred)
        results = {m.name: m.result() for m in self.metrics}
        results.update({"loss": total_loss})
        
        return results
    
    def call(self, inputs):
        return self.student_model(inputs)

# Create small, fast student model
student_model = Sequential([
    Conv1D(16, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    
    Conv1D(32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.4),
    
    Flatten(),
    Dense(32, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')
])

# Wrap student model in the distillation model
distillation_model = DistillationModel(
    student_model=student_model,
    teacher_predictions=teacher_preds,
    temperature=temperature,
    alpha=0.3  # Weight for soft targets vs hard targets
)

# Compile distillation model
distillation_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train student model with distillation
print("Training student model with knowledge distillation...")
student_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)
]

student_history = distillation_model.fit(
    train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=student_callbacks,
    verbose=1
)


Training student model with knowledge distillation...
Epoch 1/15
[1m177/193[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 3ms/step - accuracy: 0.7534 - loss: 0.4173

InvalidArgumentError: {{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} required broadcastable shapes [Op:AddV2] name: 

In [None]:


# Evaluate both models
print("Evaluating teacher model...")
teacher_eval = teacher_model.evaluate(val_ds, verbose=1)
print(f"Teacher model - Loss: {teacher_eval[0]:.4f}, Accuracy: {teacher_eval[1]:.4f}")

print("Evaluating student model...")
student_eval = distillation_model.evaluate(val_ds, verbose=1)
print(f"Student model - Loss: {student_eval[0]:.4f}, Accuracy: {student_eval[1]:.4f}")

# Make predictions with student model
print("Generating predictions with student model...")
y_pred = distillation_model.predict(X_test_reshaped, batch_size=BATCH_SIZE)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_categorical, axis=1)

# Calculate metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("\nClassification Report (Student Model):")
print(classification_report(y_test_labels, y_pred_labels))

# Calculate precision, recall, and F1 score
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# Compare model sizes
teacher_params = teacher_model.count_params()
student_params = student_model.count_params()
reduction_percentage = (1 - student_params / teacher_params) * 100

print(f"\nModel Size Comparison:")
print(f"Teacher model parameters: {teacher_params:,}")
print(f"Student model parameters: {student_params:,}")
print(f"Size reduction: {reduction_percentage:.2f}%")
print(f"Inference speed improvement should be approximately {1/(1-reduction_percentage/100):.2f}x faster")

# Save the student model
student_model.save('distilled_iot_model.h5')