In [16]:
# ------------------ STEP 0: Imports, seeds, checks & helpers ------------------
print("STEP 0: Importing libraries and setting up environment...")

import os
# reduce TF logging spam
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")

import math
import random
import gc
import sys
from glob import glob
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import joblib

# sklearn utils
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report

# TensorFlow import
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    tf_version = tf.__version__
    print("TensorFlow version:", tf_version)
except Exception as e:
    tf = None
    print("Warning: TensorFlow import failed. Exception:", e)
    raise e

print("  -> imports done\n")

# ------------------ Seeds and reproducibility ------------------
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
if tf is not None:
    tf.random.set_seed(SEED)

# ------------------ Filesystem & environment checks ------------------
def ensure_dir(d):
    if not os.path.exists(d):
        os.makedirs(d, exist_ok=True)

# ------------------ GPU/TPU setup ------------------
if tf is not None:
    try:
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            for g in gpus:
                tf.config.experimental.set_memory_growth(g, True)
            print(f"GPU devices found: {[g.name for g in gpus]}")
        else:
            print("No GPU devices found (running on CPU).")
    except Exception as e:
        print("Warning checking GPU devices:", e)

print("STEP 0 ready.")

STEP 0: Importing libraries and setting up environment...
TensorFlow version: 2.18.0
  -> imports done

GPU devices found: ['/physical_device:GPU:0', '/physical_device:GPU:1']
STEP 0 ready.


In [None]:
# ------------------ STEP 1: Configuration & Hyperparameters ------------------
print("STEP 1: Setting configuration...")

DATA_PATH = "/kaggle/input/multiclass-dataset" # OR "/kaggle/working/multiclass-dataset"
TRAIN_FILE = os.path.join(DATA_PATH, "final_train_multiclass.parquet")
TEST_FILE = os.path.join(DATA_PATH, "final_test_multiclass.parquet")

# --- Model Architecture ---
D_MODEL = 128            # Embedding dimension for the model
NUM_HEADS = 4            # Number of attention heads (if USE_MHA=True)
USE_MHA = False          # Set to True to use Multi-Head Attention

# --- MODIFIED: Added T3 config ---
COND_DIM_T2 = 16         # Dimension of the conditioning vector from T1 -> T2
HEAD2_HIDDEN = 64        # Hidden layer size for the Task 2 head
COND_DIM_T3 = 16         # NEW: Dimension of the conditioning vector from T2 -> T3
HEAD3_HIDDEN = 64        # NEW: Hidden layer size for the Task 3 head

# --- Training ---
BATCH_SIZE = 256
LEARNING_RATE = 1e-3

# --- Federated Learning ---
NUM_CLIENTS = 6          # Number of clients
NUM_ROUNDS = 10          # Number of federated rounds
EPOCHS_PER_CLIENT_ROUND = 1 # Local epochs each client runs per round

# --- Sanity Checks ---
if not os.path.isdir(DATA_PATH):
    print(f"WARNING: DATA_PATH '{DATA_PATH}' does not exist.")
if D_MODEL % NUM_HEADS != 0:
    print(f"WARNING: D_MODEL ({D_MODEL}) not divisible by NUM_HEADS ({NUM_HEADS}).")

print(f"  DATA_PATH: {DATA_PATH}")
print(f"  NUM_CLIENTS: {NUM_CLIENTS}")
print("STEP 1 ready.")

STEP 1: Setting configuration...
  DATA_PATH: /kaggle/input/multiclass-dataset
  NUM_CLIENTS: 6
STEP 1 ready.


In [18]:
import pandas as pd
import sys
import pprint
import os

# --- Define the *correct* file path here ---
# Use the same DATA_PATH as your main notebook
DATA_PATH = "/kaggle/input/multiclass-dataset" 
file_path = os.path.join(DATA_PATH, "final_test_multiclass.parquet") 

print(f"Attempting to read file from: {file_path}")

try:
    # Use 'pyarrow' as it's common in Kaggle environments
    df = pd.read_parquet(file_path, engine='pyarrow')
    print("Successfully read file.")
except Exception as e:
    print(f"Error: Could not read parquet file at '{file_path}'. {e}")
    print("Please ensure this path is correct and 'pyarrow' or 'fastparquet' is installed.")
    sys.exit(1)

all_columns = df.columns.tolist()

# --- Try to find the label column ---
label_col = None
# This is the most common name in your notebook
possible_label_names = {'label', 'Label', 'labels', 'CLASS', 'class', 'target', 'y'}

if 'label' in all_columns:
    label_col = 'label'
else:
    for col in all_columns:
        if col.lower() in possible_label_names:
            label_col = col
            break

if label_col:
    print(f"Detected '{label_col}' as the label column.")
    # Exclude the label column from the feature list
    feature_list = [col for col in all_columns if col != label_col]
else:
    print("WARNING: Could not auto-detect a label column (e.g., 'label').")
    print("The list below might include it. Please remove it manually.")
    feature_list = all_columns

print(f"\nFound {len(feature_list)} features.")
print("Copy the list block below into your STEP 2 cell:")
print("\n# --- START: Copy this list ---")
pprint.pprint(feature_list)
print("# --- END: Copy this list ---")

Attempting to read file from: /kaggle/input/multiclass-dataset/final_test_multiclass.parquet
Successfully read file.
Detected 'label' as the label column.

Found 45 features.
Copy the list block below into your STEP 2 cell:

# --- START: Copy this list ---
['Header_Length',
 'Protocol Type',
 'Duration',
 'Rate',
 'Srate',
 'Drate',
 'fin_flag_number',
 'syn_flag_number',
 'rst_flag_number',
 'psh_flag_number',
 'ack_flag_number',
 'ece_flag_number',
 'cwr_flag_number',
 'ack_count',
 'syn_count',
 'fin_count',
 'rst_count',
 'HTTP',
 'HTTPS',
 'DNS',
 'Telnet',
 'SMTP',
 'SSH',
 'IRC',
 'TCP',
 'UDP',
 'DHCP',
 'ARP',
 'ICMP',
 'IGMP',
 'IPv',
 'LLC',
 'Tot sum',
 'Min',
 'Max',
 'AVG',
 'Std',
 'Tot size',
 'IAT',
 'Number',
 'Magnitue',
 'Radius',
 'Covariance',
 'Variance',
 'Weight']
# --- END: Copy this list ---


In [19]:
# ------------------ STEP 2: Define Feature List ------------------
print("STEP 2: Defining feature list...")

features_to_keep = [
 'Header_Length',
 'Protocol Type',
 'Duration',
 'Rate',
 'Srate',
 'Drate',
 'fin_flag_number',
 'syn_flag_number',
 'rst_flag_number',
 'psh_flag_number',
 'ack_flag_number',
 'ece_flag_number',
 'cwr_flag_number',
 'ack_count',
 'syn_count',
 'fin_count',
 'rst_count',
 'HTTP',
 'HTTPS',
 'DNS',
 'Telnet',
 'SMTP',
 'SSH',
 'IRC',
 'TCP',
 'UDP',
 'DHCP',
 'ARP',
 'ICMP',
 'IGMP',
 'IPv',
 'LLC',
 'Tot sum',
 'Min',
 'Max',
 'AVG',
 'Std',
 'Tot size',
 'IAT',
 'Number',
 # 'Magnitue',  <-- REMOVED: This is now your TARGET (y) for Task 3
 'Radius',
 'Covariance',
 'Variance',
 'Weight'
]

if not features_to_keep:
     print("WARNING: 'features_to_keep' is empty or was not pasted.")
     print("Please run the script from the previous step and paste the real feature list.")

print(f"  Defined {len(features_to_keep)} features to keep.")
print("STEP 2 ready.")

STEP 2: Defining feature list...
  Defined 44 features to keep.
STEP 2 ready.


In [20]:
# ------------------ STEP 3: Label mapping ------------------
print("STEP 3: Label mapping (T1 Family / T2 Subtype / T3 Magnitude)...")

# --- task1 (family) mapping ---
class_map_task1 = {
    0: 'Benign', 1: 'Spoofing', 2: 'DDoS', 3: 'DoS', 4: 'Malformed', 5: 'Reconnaissance'
}
num_classes_task1 = len(class_map_task1)

# --- task2 (subtype) mapping ---
class_map_task2 = {
    0: 'Benign', 1: 'ARP_Spoofing', 2: 'MQTT-DDoS-Connect', 3: 'MQTT-DDoS-Publish',
    4: 'MQTT-DoS-Connect', 5: 'MQTT-DoS-Publish', 6: 'MQTT-Malformed', 7: 'Recon-OS_Scan',
    8: 'Recon-Ping_Sweep', 9: 'Recon-Port_Scan', 10: 'Recon-VulScan', 11: 'TCP_IP-DDoS-ICMP',
    12: 'TCP_IP-DDoS-SYN', 13: 'TCP_IP-DDoS-TCP', 14: 'TCP_IP-DDoS-UDP', 15: 'TCP_IP-DoS-ICMP',
    16: 'TCP_IP-DoS-SYN', 17: 'TCP_IP-DoS-TCP', 18: 'TCP_IP-DoS-UDP'
}
num_classes_task2 = len(class_map_task2)
BENIGN_IDX = 0 

# --- remap original fine-grained labels (0..18) -> task1 family (0..5) ---
label_remapping = {
    0: 0, 1: 1, 2: 2, 3: 2, 4: 3, 5: 3, 6: 4, 7: 5, 8: 5, 9: 5,
    10: 5, 11: 2, 12: 2, 13: 2, 14: 2, 15: 3, 16: 3, 17: 3, 18: 3
}

# --- Task 3 (Regression) Config ---
# We don't map classes. We define the target column name.
T3_TARGET_COL = 'Magnitue'  # Note: Matches the spelling in your file
print(f"  Task 3 will predict '{T3_TARGET_COL}' (Regression).")

print(f"  num_classes_task1={num_classes_task1}, num_classes_task2={num_classes_task2}, Task3=Regression")
print("STEP 3 ready.")

STEP 3: Label mapping (T1 Family / T2 Subtype / T3 Magnitude)...
  Task 3 will predict 'Magnitue' (Regression).
  num_classes_task1=6, num_classes_task2=19, Task3=Regression
STEP 3 ready.


In [21]:
# ------------------ STEP 4: Loading and preprocessing data ------------------
print("STEP 4: Locating parquet files, loading and preprocessing...")

# Load train data
if os.path.exists(TRAIN_FILE):
    print(f"Loading train parquet: {TRAIN_FILE}")
    train_df = pd.read_parquet(TRAIN_FILE, engine='pyarrow')
else:
    raise FileNotFoundError(f"Train file not found at: {TRAIN_FILE}")

# Load test data
if os.path.exists(TEST_FILE):
    print(f"Loading test parquet: {TEST_FILE}")
    test_df = pd.read_parquet(TEST_FILE, engine='pyarrow')
else:
    raise FileNotFoundError(f"Test file not found at: {TEST_FILE}")

print(f"  Loaded train rows: {len(train_df)}, test rows: {len(test_df)}")

# Validate features
missing_cols = [c for c in features_to_keep if c not in train_df.columns]
if missing_cols:
    print("WARNING: The following requested features are missing in train:", missing_cols)
    features = [c for c in features_to_keep if c in train_df.columns]
    if not features:
        raise ValueError("No features found. Did you update 'features_to_keep' in STEP 2?")
else:
    features = features_to_keep[:]
print(f"  Using {len(features)} features.")

# Detect Label Columns
label_col_t2 = 'label' # Contains 0-18
if label_col_t2 not in train_df.columns:
    raise ValueError(f"Missing '{label_col_t2}' column for classification labels.")

# Check for Regression Target
if T3_TARGET_COL not in train_df.columns:
    raise ValueError(f"Missing '{T3_TARGET_COL}' column for Task 3 regression.")

# --- MAP LABELS ---
def map_labels(df):
    df2 = df.copy()
    df2['orig_label_t2'] = df2[label_col_t2].astype(int)
    
    # Task 1: Map T2 label -> T1 family
    df2['y_task1'] = df2['orig_label_t2'].map(label_remapping).astype(int)
    
    # Task 2: Subtype
    df2['y_task2'] = df2['orig_label_t2'].astype(int)
    
    # Task 3: Regression Target (Magnitude)
    # Fill NaNs in magnitude with 0.0 just in case
    df2['y_task3'] = df2[T3_TARGET_COL].fillna(0.0).astype(float)

    # --- MASKING LOGIC ---
    mask = (df2['y_task1'] == BENIGN_IDX)
    
    # Task 2: Mask Benign as -1 (Loss function will ignore these)
    df2.loc[mask, 'y_task2'] = -1
    
    # Task 3: For Benign, FORCE Magnitude to 0.0 (Do NOT mask)
    # This teaches the model: "Benign traffic has 0 intensity."
    df2.loc[mask, 'y_task3'] = 0.0
    
    return df2

train_df = map_labels(train_df)
test_df  = map_labels(test_df)

# --- SCALING ---
print("  Filling NaNs in features and scaling X...")
train_median = train_df[features].median()
train_X = train_df[features].fillna(train_median).values.astype('float32')
test_X  = test_df[features].fillna(train_median).values.astype('float32')

# Scale Features (X)
scaler = MinMaxScaler().fit(train_X)
train_X = scaler.transform(train_X)
test_X  = scaler.transform(test_X)

# Scale Regression Target (y3)
# It is good practice to scale regression targets to [0,1] or [-1,1]
print(f"  Scaling Task 3 target ('{T3_TARGET_COL}')...")
train_y3_raw = train_df['y_task3'].values.reshape(-1, 1)
test_y3_raw  = test_df['y_task3'].values.reshape(-1, 1)

scaler_y3 = MinMaxScaler().fit(train_y3_raw)
train_y3 = scaler_y3.transform(train_y3_raw).flatten().astype('float32')
test_y3  = scaler_y3.transform(test_y3_raw).flatten().astype('float32')

# Prepare Classification Labels
train_y1 = train_df['y_task1'].values.astype('int32')
train_y2 = train_df['y_task2'].values.astype('int32')
test_y1  = test_df['y_task1'].values.astype('int32')
test_y2  = test_df['y_task2'].values.astype('int32')

# --- CLASS WEIGHTS ---
print("  Computing class weights...")
# Task 1
family_class_weights = compute_class_weight('balanced', classes=np.arange(num_classes_task1), y=train_y1)
family_class_weights_tf = tf.convert_to_tensor(family_class_weights, dtype=tf.float32)

# Task 2
y2_valid = train_y2[train_y2 != -1]
if len(y2_valid) > 0:
    subtype_classes = np.unique(y2_valid)
    subtype_class_weights = compute_class_weight('balanced', classes=subtype_classes, y=y2_valid)
    full_subtype_weights = np.zeros(num_classes_task2, dtype=np.float32)
    full_subtype_weights[subtype_classes] = subtype_class_weights
else:
    full_subtype_weights = np.ones(num_classes_task2, dtype=np.float32)
subtype_class_weights_tf = tf.convert_to_tensor(full_subtype_weights, dtype=tf.float32)

# Task 3 (Regression) -> No class weights needed!

print("  Preprocessing complete.")
print(f"    train_X: {train_X.shape}, y1: {train_y1.shape}, y2: {train_y2.shape}, y3: {train_y3.shape}")
print("STEP 4 done.")

STEP 4: Locating parquet files, loading and preprocessing...
Loading train parquet: /kaggle/input/multiclass-dataset/final_train_multiclass.parquet
Loading test parquet: /kaggle/input/multiclass-dataset/final_test_multiclass.parquet
  Loaded train rows: 7160831, test rows: 1614182
  Using 44 features.
  Filling NaNs in features and scaling X...
  Scaling Task 3 target ('Magnitue')...
  Computing class weights...
  Preprocessing complete.
    train_X: (7160831, 44), y1: (7160831,), y2: (7160831,), y3: (7160831,)
STEP 4 done.


In [22]:
# ------------------ STEP 5: Creating non-IID client partitions ------------------
print("STEP 5: Creating non-IID client partitions (X, y1, y2, y3)...")

def partition_non_iid(X, y1, y2, y3, num_clients, seed=SEED):
    if num_clients < 1:
        raise ValueError("num_clients must be >= 1")
    rng = np.random.RandomState(seed)

    N, feat_dim = X.shape
    # Initialize clients with empty lists for all 4 arrays
    clients = [dict(X=[], y1=[], y2=[], y3=[]) for _ in range(num_clients)]
    
    unique_fams = np.unique(y1)
    family_idx = {int(f): np.where(y1 == f)[0] for f in unique_fams}
    fams = sorted(family_idx.keys())

    print(f"  Partitioning data for {len(fams)} families across {num_clients} clients.")

    for i, fam in enumerate(fams):
        idxs = family_idx[fam].copy()
        if idxs.size == 0: continue
        rng.shuffle(idxs)
        
        primary_client = i % num_clients
        n = len(idxs)

        # Split Logic
        n_primary = max(1, int(0.7 * n)) if num_clients > 1 else n
        primary_idxs = idxs[:n_primary]
        rest_idxs = idxs[n_primary:]
        
        # Helper to append data
        def append_to_client(c_idx, indices):
            clients[c_idx]['X'].append(X[indices])
            clients[c_idx]['y1'].append(y1[indices])
            clients[c_idx]['y2'].append(y2[indices])
            clients[c_idx]['y3'].append(y3[indices]) # <--- ADDED y3

        # Assign Primary
        append_to_client(primary_client, primary_idxs)

        # Distribute Rest
        if num_clients > 1 and rest_idxs.size > 0:
            other_clients = [c for c in range(num_clients) if c != primary_client]
            splits = np.array_split(rest_idxs, len(other_clients))
            for j, r in enumerate(splits):
                if r.size > 0:
                    cidx = other_clients[j]
                    append_to_client(cidx, r)

    # Finalize
    for c in clients:
        if c['X']:
            c['X'] = np.vstack(c['X']).astype(np.float32)
            c['y1'] = np.concatenate(c['y1']).astype(np.int32)
            c['y2'] = np.concatenate(c['y2']).astype(np.int32)
            c['y3'] = np.concatenate(c['y3']).astype(np.float32) # <--- ADDED y3
            
            # Shuffle
            perm = rng.permutation(c['y1'].shape[0])
            c['X'] = c['X'][perm]
            c['y1'] = c['y1'][perm]
            c['y2'] = c['y2'][perm]
            c['y3'] = c['y3'][perm] # <--- ADDED y3
        else:
            c['X'] = np.zeros((0, feat_dim), dtype=np.float32)
            c['y1'] = np.zeros((0,), dtype=np.int32)
            c['y2'] = np.zeros((0,), dtype=np.int32)
            c['y3'] = np.zeros((0,), dtype=np.float32)

    return clients

# Run partitioning
clients = partition_non_iid(train_X, train_y1, train_y2, train_y3, NUM_CLIENTS, seed=SEED)

# Print client stats
for i, c in enumerate(clients):
    n = c['y1'].shape[0]
    if n > 0:
        counts = np.bincount(c['y1'], minlength=num_classes_task1)
        # Calculate avg magnitude for this client
        avg_mag = np.mean(c['y3'])
    else:
        counts = np.zeros(num_classes_task1, dtype=int)
        avg_mag = 0.0
    print(f"  Client {i}: samples={n}, T1 dist={counts.tolist()}, Avg Mag={avg_mag:.4f}")

print("STEP 5 done.")

STEP 5: Creating non-IID client partitions (X, y1, y2, y3)...
  Partitioning data for 6 families across 6 clients.
  Client 0: samples=553000, T1 dist=[134912, 963, 298832, 111761, 308, 6224], Avg Mag=0.1412
  Client 1: samples=439920, T1 dist=[11564, 11232, 298831, 111761, 308, 6224], Avg Mag=0.1891
  Client 2: samples=3617182, T1 dist=[11564, 963, 3486362, 111761, 308, 6224], Avg Mag=0.1835
  Client 3: samples=1621763, T1 dist=[11564, 963, 298831, 1303874, 308, 6223], Avg Mag=0.1880
  Client 4: samples=432932, T1 dist=[11564, 963, 298831, 111761, 3590, 6223], Avg Mag=0.1847
  Client 5: samples=496034, T1 dist=[11564, 963, 298831, 111760, 308, 72608], Avg Mag=0.1861
STEP 5 done.


In [23]:
# ------------------ STEP 6: Model building (Keras) ------------------
print("STEP 6: Building the joint model (FIXED: Sigmoid for 0-1 Regression)...")

tf.keras.backend.clear_session()

def build_joint_model_light(input_dim: int,
                            d_model: int = D_MODEL,
                            cond_dim_t2: int = COND_DIM_T2,
                            head2_hidden: int = HEAD2_HIDDEN,
                            cond_dim_t3: int = COND_DIM_T3,
                            head3_hidden: int = HEAD3_HIDDEN,
                            num_families: int = num_classes_task1,
                            num_subtypes: int = num_classes_task2) -> tf.keras.Model:
    """Lightweight model with 3 heads."""
    inp = layers.Input(shape=(input_dim,), name='features')
    
    # Shared Backbone
    x = layers.Dense(d_model, activation=tf.nn.gelu, kernel_initializer='glorot_uniform')(inp)
    r = layers.Dense(d_model, activation=tf.nn.gelu, kernel_initializer='glorot_uniform')(x)
    r = layers.Dense(d_model, activation=None, kernel_initializer='glorot_uniform')(r)
    x = layers.Add()([x, r])
    x = layers.LayerNormalization()(x)
    shared = layers.Dense(d_model, activation=tf.nn.gelu, kernel_initializer='glorot_uniform')(x)
    shared = layers.LayerNormalization(name='shared_embedding')(shared)

    # --- Task 1 Head (Family) ---
    logits1 = layers.Dense(num_families, name='logits_family')(shared)
    
    # --- Task 2 Head (Subtype) ---
    cond1 = layers.Dense(cond_dim_t2, activation='linear')(logits1)
    h2_in = layers.Concatenate(axis=-1)([shared, cond1])
    h2 = layers.Dense(head2_hidden, activation='relu', kernel_initializer='glorot_uniform')(h2_in)
    logits2 = layers.Dense(num_subtypes, name='logits_subtype')(h2)

    # --- Task 3 Head (Magnitude) ---
    cond2 = layers.Dense(cond_dim_t3, activation='linear')(logits2)
    h3_in = layers.Concatenate(axis=-1)([shared, cond2])
    h3 = layers.Dense(head3_hidden, activation='relu', kernel_initializer='glorot_uniform')(h3_in)
    
    # <<< FIX: Use 'sigmoid' to force output between 0.0 and 1.0 >>>
    output3 = layers.Dense(1, activation='sigmoid', name='output_magnitude')(h3)

    return tf.keras.Model(inputs=inp, outputs=[logits1, logits2, output3], name='joint_multitask_model_light')


def build_joint_model_with_mha(input_dim: int,
                               d_model: int = D_MODEL,
                               cond_dim_t2: int = COND_DIM_T2,
                               head2_hidden: int = HEAD2_HIDDEN,
                               cond_dim_t3: int = COND_DIM_T3,
                               head3_hidden: int = HEAD3_HIDDEN,
                               num_families: int = num_classes_task1,
                               num_subtypes: int = num_classes_task2,
                               num_heads: int = NUM_HEADS) -> tf.keras.Model:
    """MHA Model with 3 heads."""
    if d_model % num_heads != 0:
        raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads}).")
    inp = layers.Input(shape=(input_dim,), name='features')
    
    # Shared Backbone
    x = layers.Dense(d_model, activation=tf.nn.gelu, kernel_initializer='glorot_uniform')(inp)
    x_seq = layers.Reshape((1, d_model))(x)
    attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads, dropout=0.1)(x_seq, x_seq)
    attn = layers.Reshape((d_model,))(attn)
    x = layers.Add()([x, attn])
    x = layers.LayerNormalization()(x)
    shared = layers.Dense(d_model, activation=tf.nn.gelu, kernel_initializer='glorot_uniform')(x)
    shared = layers.LayerNormalization(name='shared_embedding')(shared)

    # --- Task 1 Head ---
    logits1 = layers.Dense(num_families, name='logits_family')(shared)

    # --- Task 2 Head ---
    cond1 = layers.Dense(cond_dim_t2, activation='linear')(logits1)
    h2_in = layers.Concatenate(axis=-1)([shared, cond1])
    h2 = layers.Dense(head2_hidden, activation='relu', kernel_initializer='glorot_uniform')(h2_in)
    logits2 = layers.Dense(num_subtypes, name='logits_subtype')(h2)

    # --- Task 3 Head ---
    cond2 = layers.Dense(cond_dim_t3, activation='linear')(logits2)
    h3_in = layers.Concatenate(axis=-1)([shared, cond2])
    h3 = layers.Dense(head3_hidden, activation='relu', kernel_initializer='glorot_uniform')(h3_in)
    
    # <<< FIX: Use 'sigmoid' to force output between 0.0 and 1.0 >>>
    output3 = layers.Dense(1, activation='sigmoid', name='output_magnitude')(h3)

    return tf.keras.Model(inputs=inp, outputs=[logits1, logits2, output3], name='joint_multitask_model_mha')

# ------------------- Build -------------------
input_dim = int(train_X.shape[1])
if USE_MHA:
    print("Building MHA model...")
    model = build_joint_model_with_mha(input_dim)
else:
    print("Building light model...")
    model = build_joint_model_light(input_dim)

model.summary()
print("STEP 6 done.")

STEP 6: Building the joint model (FIXED: Sigmoid for 0-1 Regression)...
Building light model...


STEP 6 done.


In [24]:
# ------------------ STEP 7: Loss functions & trainer helpers ------------------
print("STEP 7: Define losses (Weighted for Sigmoid Regression)...")

# --- Task 1 Weights & Loss ---
family_class_weights = compute_class_weight('balanced', classes=np.arange(num_classes_task1), y=train_y1)
family_class_weights_tf = tf.convert_to_tensor(family_class_weights, dtype=tf.float32)
loss_fn_task1 = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

def compute_task1_loss(logits, y_true):
    # per-sample CE (shape (B,))
    per_sample = loss_fn_task1(y_true, logits)
    # gather per-sample class weights
    weights = tf.gather(family_class_weights_tf, tf.cast(y_true, tf.int32))
    # Apply weights
    weighted = per_sample * weights
    return tf.reduce_mean(weighted)

# --- Task 2 Weights & Loss (Masked) ---
# `subtype_class_weights_tf` was created in STEP 4
loss_fn_task2 = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

def compute_task2_loss(logits, y_true):
    """
    Computes weighted, masked loss for Task 2.
    Ignores samples where y_true == -1.
    """
    # Create a boolean mask for valid (non -1) labels
    mask_bool = tf.not_equal(y_true, -1)
    mask = tf.cast(mask_bool, tf.float32)

    # Create a "safe" y_true to index weights, replacing -1 with 0
    safe_y = tf.where(mask_bool, y_true, tf.zeros_like(y_true))

    # Calculate per-sample loss
    per_sample_loss = loss_fn_task2(safe_y, logits)  # (B,)
    
    # Get per-sample weights
    per_sample_weights = tf.gather(subtype_class_weights_tf, safe_y)

    # Apply both weights and mask
    weighted_masked_loss = per_sample_loss * per_sample_weights * mask

    # Normalize by the number of *valid* samples in the batch
    denom = tf.reduce_sum(mask) + 1e-8
    loss = tf.reduce_sum(weighted_masked_loss) / denom
    n_valid = tf.reduce_sum(mask)

    return loss, n_valid

# --- Task 3 Loss (Regression) ---
# We use Mean Squared Error (MSE). Since output is Sigmoid (0-1), this works well.
loss_fn_task3 = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)

def compute_task3_loss(preds, y_true):
    # Ensure y_true is float and shaped correctly (B, 1)
    y_true = tf.cast(y_true, tf.float32)
    y_true = tf.reshape(y_true, (-1, 1))
    return loss_fn_task3(y_true, preds)

# --- Optimizer ---
opt = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# --- TF.Function for training ---
@tf.function
def train_step(x_batch, y1_batch, y2_batch, y3_batch, model_to_train, optimizer):
    with tf.GradientTape() as tape:
        l1_logits, l2_logits, out3 = model_to_train(x_batch, training=True)
        
        # Calculate losses
        loss1 = compute_task1_loss(l1_logits, y1_batch)
        loss2, n_valid = compute_task2_loss(l2_logits, y2_batch)
        loss3 = compute_task3_loss(out3, y3_batch)
        
        # Total loss
        # We multiply loss3 by 10.0 to give it more importance, 
        # since MSE on 0-1 values is naturally very small compared to CrossEntropy.
        loss = loss1 + loss2 + (10.0 * loss3)
        
    grads = tape.gradient(loss, model_to_train.trainable_variables)
    optimizer.apply_gradients(zip(grads, model_to_train.trainable_variables))
    return loss1, loss2, loss3, n_valid

# --- Model weight helpers ---
def get_model_weights(model):
    return model.get_weights()

def set_model_weights(model, weights):
    model.set_weights(weights)

print("STEP 7 ready.")

STEP 7: Define losses (Weighted for Sigmoid Regression)...
STEP 7 ready.


In [25]:
# ------------------ STEP 8: Federated simulation (FedAvg) ------------------
print("STEP 8: Federated simulation - FedAvg across clients...")

# --- Evaluation & Checkpoint helper ---
WRITE_PATH = "/kaggle/working/fl_model_output"
best_f1 = -1.0
save_path = os.path.join(WRITE_PATH, "best_global_model.keras")
ensure_dir(os.path.dirname(save_path))

def evaluate_global_model(model, val_X, val_y1, val_y2, val_y3):
    batch = 512
    preds1 = []
    preds2 = []
    preds3 = [] # New for T3
    
    for i in range(0, val_X.shape[0], batch):
        xb = val_X[i:i+batch]
        # Model now returns 3 outputs
        l1, l2, l3 = model(xb, training=False)
        
        p1 = np.argmax(l1.numpy(), axis=1)
        p2 = np.argmax(l2.numpy(), axis=1)
        p3 = l3.numpy() # Regression output (continuous)
        
        preds1.append(p1)
        preds2.append(p2)
        preds3.append(p3)
        
    preds1 = np.concatenate(preds1)
    preds2 = np.concatenate(preds2)
    preds3 = np.concatenate(preds3).flatten() # Flatten (N,1) -> (N,)
    
    # Task 1 Acc
    fam_acc = accuracy_score(val_y1, preds1)
    
    # Task 2 Masked F1
    mask = (val_y2 != -1)
    if mask.sum() > 0:
        sub_f1_macro = f1_score(val_y2[mask], preds2[mask], average='macro', zero_division=0)
    else:
        sub_f1_macro = 0.0

    # Task 3 MSE (Mean Squared Error)
    # We compare predicted magnitude vs true magnitude
    mse_mag = np.mean((val_y3 - preds3) ** 2)
        
    return fam_acc, sub_f1_macro, mse_mag

def checkpoint_model(model, current_f1, best_f1, save_path):
    saved = False
    # We still checkpoint based on T2 F1 as it's the hardest classification task
    if current_f1 > best_f1:
        print(f"  Improved T2 macro-F1: {best_f1:.4f} -> {current_f1:.4f}. Saving model...")
        model.save(save_path, include_optimizer=False) 
        best_f1 = current_f1
        saved = True
    return best_f1, saved

# --- Prepare TF datasets per client ---
client_datasets = []
client_sizes = []

for i, c in enumerate(clients):
    # UNPACK y3 HERE
    Xc, y1c, y2c, y3c = c['X'], c['y1'], c['y2'], c['y3']
    n = Xc.shape[0]
    client_sizes.append(max(1, n))
    
    if n == 0:
        client_datasets.append(None)
        print(f"Client {i}: empty dataset (will be skipped)")
        continue
    
    # ADD y3 TO DATASET
    ds = tf.data.Dataset.from_tensor_slices((Xc, y1c, y2c, y3c))
    ds = ds.shuffle(buffer_size=min(10000, n), seed=SEED)
    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    client_datasets.append(ds)
    print(f"Client {i}: dataset prepared, samples={n}")

# --- Federated training loop ---
print("\nStarting Federated Training (T1+T2+T3)...")
global_weights = get_model_weights(model)
best_f1 = -1.0 

for rnd in range(NUM_ROUNDS):
    print(f"\n=== Fed Round {rnd+1}/{NUM_ROUNDS} ===")
    client_weights_list = []
    client_effective_sizes = []

    # --- Local Client Training ---
    for i, ds_c in enumerate(client_datasets):
        if ds_c is None or client_sizes[i] == 0:
            continue
            
        # Set model to global state
        set_model_weights(model, global_weights)
        
        # Client-side optimizer
        client_opt = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
        
        # Init optimizer vars
        zero_grads = [tf.zeros_like(v) for v in model.trainable_variables]
        client_opt.apply_gradients(zip(zero_grads, model.trainable_variables))

        print(f"  Client {i} training...")
        for epoch in range(EPOCHS_PER_CLIENT_ROUND):
            # UNPACK y3 HERE
            for xb, y1b, y2b, y3b in ds_c:
                # PASS y3b TO TRAIN_STEP
                l1_val, l2_val, l3_val, n_valid = train_step(xb, y1b, y2b, y3b, model, client_opt)
            
        # Collect updated weights
        client_w = get_model_weights(model)
        client_weights_list.append(client_w)
        client_effective_sizes.append(float(client_sizes[i]))

    # --- Federated Averaging (Aggregation) ---
    if not client_weights_list:
        print("No client updates collected this round.")
        continue

    total_size = float(sum(client_effective_sizes))
    new_global = []
    
    for weights_tuple in zip(*client_weights_list):
        averaged = np.zeros_like(weights_tuple[0], dtype=np.float64)
        for cw, size in zip(weights_tuple, client_effective_sizes):
            averaged += (cw.astype(np.float64) * (size / total_size))
        new_global.append(averaged.astype(np.float32))

    global_weights = new_global
    set_model_weights(model, global_weights)
    print("  Aggregation complete.")

    # --- Evaluation & Checkpoint ---
    # PASS test_y3 HERE
    fam_acc, sub_f1_macro, mag_mse = evaluate_global_model(model, test_X, test_y1, test_y2, test_y3)
    
    print(f"  Eval Round {rnd+1}: T1 Acc={fam_acc:.4f} | T2 F1={sub_f1_macro:.4f} | T3 MSE={mag_mse:.5f}")
    
    best_f1, saved = checkpoint_model(model, sub_f1_macro, best_f1, save_path)

print("\nSTEP 8 (Federated Training) done.")

STEP 8: Federated simulation - FedAvg across clients...
Client 0: dataset prepared, samples=553000
Client 1: dataset prepared, samples=439920
Client 2: dataset prepared, samples=3617182
Client 3: dataset prepared, samples=1621763
Client 4: dataset prepared, samples=432932
Client 5: dataset prepared, samples=496034

Starting Federated Training (T1+T2+T3)...

=== Fed Round 1/10 ===
  Client 0 training...
  Client 1 training...
  Client 2 training...
  Client 3 training...
  Client 4 training...
  Client 5 training...
  Aggregation complete.
  Eval Round 1: T1 Acc=0.7511 | T2 F1=0.4976 | T3 MSE=0.00348
  Improved T2 macro-F1: -1.0000 -> 0.4976. Saving model...

=== Fed Round 2/10 ===
  Client 0 training...
  Client 1 training...
  Client 2 training...
  Client 3 training...
  Client 4 training...
  Client 5 training...
  Aggregation complete.
  Eval Round 2: T1 Acc=0.7563 | T2 F1=0.4527 | T3 MSE=0.00083

=== Fed Round 3/10 ===
  Client 0 training...
  Client 1 training...
  Client 2 train

In [26]:
# ------------------ STEP 9: Evaluate global model on test set ------------------
print("STEP 9: Evaluate final global model on held-out test set...")

# <<< FIX: Load from the new .keras file path >>>
WRITE_PATH = "/kaggle/working/fl_model_output" 
best_model_path = os.path.join(WRITE_PATH, "best_global_model.keras")

if os.path.exists(best_model_path):
    print(f"Loading best saved model from checkpoint: {best_model_path}")
    # We must provide custom_objects if we used custom loss, but here we used standard 
    # keras losses in the compile/train loop, so load_model usually works fine 
    # if the architecture is standard.
    model = tf.keras.models.load_model(best_model_path)
else:
    print("No checkpointed model found, evaluating final round model.")

# Use model.predict for efficiency
print("  Running predictions...")
pred_results = model.predict(test_X, batch_size=BATCH_SIZE)

# Unpack 3 outputs
l1_all = np.asarray(pred_results[0])
l2_all = np.asarray(pred_results[1])
l3_all = np.asarray(pred_results[2]).flatten() # Task 3 (Regression)

# Predicted labels (argmax for classification)
y1_pred = np.argmax(l1_all, axis=-1).astype(np.int32)
y2_pred = np.argmax(l2_all, axis=-1).astype(np.int32)

# True labels
y1_true = np.asarray(test_y1).astype(np.int32)
y2_true = np.asarray(test_y2).astype(np.int32)
y3_true = np.asarray(test_y3).astype(np.float32) # Scaled true values

# --- Task1: family accuracy & report ---
print("\n" + "="*30)
print("  Task 1 - Family Classification Report")
print("="*30)
fam_acc = accuracy_score(y1_true, y1_pred)
print(f"  Task1 (family) accuracy on test: {fam_acc:.4f}\n")
task1_target_names = [class_map_task1.get(i, str(i)) for i in range(num_classes_task1)]
print(classification_report(y1_true, y1_pred, target_names=task1_target_names, zero_division=0))

# --- Task2: masked evaluation ---
print("\n" + "="*30)
print("  Task 2 - Subtype Classification Report (Masked)")
print("="*30)
mask = (y2_true != -1)
n_mask = int(np.sum(mask))
if n_mask > 0:
    y2_true_masked = y2_true[mask]
    y2_pred_masked = y2_pred[mask]
    
    sub_f1_macro = f1_score(y2_true_masked, y2_pred_masked, average='macro', zero_division=0)
    print(f"  Task2 (subtype) macro-F1 (masked, {n_mask} rows): {sub_f1_macro:.4f}\n")
    
    present_labels = sorted(np.unique(y2_true_masked))
    target_names = [class_map_task2.get(int(lbl), str(int(lbl))) for lbl in present_labels]
    print(classification_report(y2_true_masked, y2_pred_masked, labels=present_labels, target_names=target_names, zero_division=0))
else:
    print("  No valid subtype rows in test (all benign or masked).")

# --- Task3: Regression evaluation ---
print("\n" + "="*30)
print("  Task 3 - Magnitude Regression Report")
print("="*30)

# 1. Scaled metrics (MSE)
mse_scaled = np.mean((y3_true - l3_all) ** 2)
print(f"  Scaled MSE: {mse_scaled:.5f} (Target range 0.0-1.0)")

# 2. Real-world metrics (MAE on original scale)
# We use the 'scaler_y3' defined in STEP 4 to reverse the normalization
if 'scaler_y3' in globals():
    # Inverse transform
    y3_true_real = scaler_y3.inverse_transform(y3_true.reshape(-1, 1)).flatten()
    y3_pred_real = scaler_y3.inverse_transform(l3_all.reshape(-1, 1)).flatten()
    
    # Calculate MAE
    mae_real = np.mean(np.abs(y3_true_real - y3_pred_real))
    print(f"  Mean Absolute Error (Original Scale): {mae_real:.4f}")
    
    print("\n  Sample Predictions (Real Scale):")
    print("    True  |  Pred  |  Diff")
    print("    ------------------------")
    # Show 5 random samples
    sample_idxs = np.random.choice(len(y3_true_real), 5, replace=False)
    for idx in sample_idxs:
        t_val = y3_true_real[idx]
        p_val = y3_pred_real[idx]
        print(f"    {t_val:5.2f} | {p_val:5.2f} | {abs(t_val-p_val):5.2f}")
else:
    print("  WARNING: 'scaler_y3' not found. Cannot calculate real-world MAE.")

print("\nSTEP 9 done.")

STEP 9: Evaluate final global model on held-out test set...
Loading best saved model from checkpoint: /kaggle/working/fl_model_output/best_global_model.keras
  Running predictions...
[1m6306/6306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step

  Task 1 - Family Classification Report
  Task1 (family) accuracy on test: 0.7241

                precision    recall  f1-score   support

        Benign       0.97      0.79      0.87     37607
      Spoofing       0.16      0.82      0.27      1744
          DDoS       0.77      0.86      0.82   1117096
           DoS       0.49      0.35      0.41    428312
     Malformed       0.33      0.81      0.47      1747
Reconnaissance       0.97      0.92      0.94     27676

      accuracy                           0.72   1614182
     macro avg       0.62      0.76      0.63   1614182
  weighted avg       0.71      0.72      0.71   1614182


  Task 2 - Subtype Classification Report (Masked)
  Task2 (subtype) macro-F1 (masked, 157657

In [27]:
# ------------------ STEP 10: Save model & scaler ------------------
print("STEP 10: Save final model & scalers...")

WRITE_PATH = "/kaggle/working/fl_model_output" 
save_dir = os.path.join(WRITE_PATH, "multitask_saved_final")
ensure_dir(save_dir)

# 1. Save Model
model_save_path = os.path.join(save_dir, "joint_multitask_model.keras")
model.save(model_save_path, include_optimizer=False)
print(f"Saved model to {model_save_path}")

# 2. Save Feature Scaler (X)
scaler_save_path = os.path.join(save_dir, "scaler_minmax.joblib")
joblib.dump(scaler, scaler_save_path)
print(f"Saved feature scaler to {scaler_save_path}")

# 3. Save Task 3 Target Scaler (y3)
# CRITICAL: You need this to interpret the magnitude predictions later
scaler_y3_save_path = os.path.join(save_dir, "scaler_y3_magnitude.joblib")
if 'scaler_y3' in globals():
    joblib.dump(scaler_y3, scaler_y3_save_path)
    print(f"Saved T3 magnitude scaler to {scaler_y3_save_path}")
else:
    print("WARNING: scaler_y3 not found in globals(), could not save.")

print("STEP 10 done.\n")
print("ALL STEPS finished.")

STEP 10: Save final model & scalers...
Saved model to /kaggle/working/fl_model_output/multitask_saved_final/joint_multitask_model.keras
Saved feature scaler to /kaggle/working/fl_model_output/multitask_saved_final/scaler_minmax.joblib
Saved T3 magnitude scaler to /kaggle/working/fl_model_output/multitask_saved_final/scaler_y3_magnitude.joblib
STEP 10 done.

ALL STEPS finished.
