In [35]:
import pandas as pd

import os

os.environ["OMP_NUM_THREADS"] = "2"
os.environ["OPENBLAS_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["VECLIB_MAXIMUM_THREADS"] = "2"
os.environ["NUMEXPR_NUM_THREADS"] = "2"



import pandas as pd

# Load and combine datasets (as before)
datasets = {
    'benign': pd.read_csv('labeled_dataset_benign.csv'),
    'dns': pd.read_csv('labeled_dataset_DNS_Spoofing.csv'),
    'mitm': pd.read_csv('labeled_dataset_MITM_ArpSpoofing.csv')
}
combined_df = pd.concat([datasets['benign'], datasets['dns'], datasets['mitm']], ignore_index=True)
combined_df = combined_df.drop('dst_mac', axis=1)

print(combined_df.shape)

  'dns': pd.read_csv('labeled_dataset_DNS_Spoofing.csv'),
  'mitm': pd.read_csv('labeled_dataset_MITM_ArpSpoofing.csv')


(883525, 135)


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

def is_hex_mac(mac):
    if pd.isna(mac) or mac == 'unknown':
        return False
    pattern = r'^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$'
    return bool(re.match(pattern, str(mac)))


# Remove rows where src_mac is a real MAC address
combined_df = combined_df[~combined_df['src_mac'].apply(is_hex_mac)]

# Drop duplicates
print(f"Number of duplicate rows: {combined_df.duplicated().sum()}")
combined_df = combined_df.drop_duplicates()
print(f"New shape after removing duplicates: {combined_df.shape}")

# Identify categorical columns
categorical_cols = [col for col in combined_df.columns if combined_df[col].nunique() < 10 or combined_df[col].dtype == 'object']
print(f"Categorical features: {categorical_cols}")

for col in categorical_cols:
    combined_df[col] = combined_df[col].astype(str).fillna('unknown')

numerical_cols = combined_df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    combined_df[col] = combined_df[col].fillna(combined_df[col].median())

# --- Step 2: Split target labels and features ---
X = combined_df.drop(['Label', 'src_mac'], axis=1, errors='ignore')
y_attack = combined_df['Label']
y_device = combined_df['src_mac']

# Encode device labels
le_device = LabelEncoder()
y_device_encoded = le_device.fit_transform(y_device)

from sklearn.preprocessing import MultiLabelBinarizer

combined_df['All_Labels'] = combined_df.apply(
    lambda row: [row['Label'], row['src_mac']], axis=1
)

# 2. Initialize and fit the binarizer
mlb = MultiLabelBinarizer()
Y_multilabel = mlb.fit_transform(combined_df['All_Labels'])

multilabel_class_names = mlb.classes_


# Keep only numeric columns
X_numeric = X.select_dtypes(include=['number'])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)

# Apply PCA to retain 95% variance
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)

pca_feature_names = [f'PC{i+1}' for i in range(X_reduced.shape[1])]

X_reduced_df = pd.DataFrame(X_reduced, columns=pca_feature_names, index=X.index)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_reduced_df,       
    Y_multilabel,       
    test_size=0.2, 
    random_state=42
)


Number of duplicate rows: 0
New shape after removing duplicates: (432555, 135)
Categorical features: ['src_mac', 'src_ip', 'dst_ip', 'port_class_dst', 'l4_tcp', 'l4_udp', 'ttl', 'handshake_version', 'handshake_ciphersuites', 'tls_server', 'http_request_method', 'http_host', 'http_response_code', 'user_agent', 'dns_server', 'dns_query_type', 'dns_len_ans', 'device_mac', 'eth_src_oui', 'eth_dst_oui', 'highest_layer', 'http_uri', 'http_content_len', 'http_content_type', 'icmp_type', 'icmp_checksum_status', 'icmp_data_size', 'Label']


  combined_df['All_Labels'] = combined_df.apply(


In [None]:
from sklearn.multioutput import MultiOutputClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, jaccard_score, hamming_loss, classification_report

# --- 1. Define the Base Estimator (LightGBM) ---
print("--- Training Multi-Label LightGBM ---")
# Apply regularization and weighted loss to the base model
lgb_base = lgb.LGBMClassifier(
    n_estimators=100, 
    random_state=42, 
    n_jobs=2,
    max_depth=8,              # Regularization: Limit tree depth
    reg_alpha=0.2,            # Regularization: L1
    class_weight='balanced'   
)

# --- 2. Wrap it with MultiOutputClassifier ---
lgb_multioutput = MultiOutputClassifier(lgb_base, n_jobs=-1)

# --- 3. Fit the Model ---
lgb_multioutput.fit(X_train, Y_train)

# --- 4. Prediction ---
Y_pred_lgbm = lgb_multioutput.predict(X_test)

--- Training Multi-Label LightGBM ---


In [38]:
def evaluate_multilabel_model(Y_test, Y_pred, class_names):
    """Calculates and prints key multi-label metrics."""
    print("\n======== Evaluation: LightGBM Multi-Output ========")
    
    # 1. Subset Accuracy (STRICTEST: Requires *ALL* labels to match exactly)
    subset_acc = accuracy_score(Y_test, Y_pred)
    print(f"Subset Accuracy (Exact Match): {subset_acc:.4f}")

    print(classification_report(Y_test, Y_pred, 
                                target_names=class_names, 
                                ))

# Run the evaluation
evaluate_multilabel_model(Y_test, Y_pred_lgbm, multilabel_class_names)


Subset Accuracy (Exact Match): 0.6805
                                            precision    recall  f1-score   support

                       AMCREST WiFi Camera       0.30      0.97      0.46       473
                     AeoTec Smart Home Hub       0.33      0.97      0.49      1014
                         Amazon Echo Dot 1       0.25      0.96      0.40      1503
                         Amazon Echo Dot 2       0.79      0.99      0.88      5075
                          Amazon Echo Show       0.73      0.97      0.83      5430
                          Amazon Echo Spot       0.55      0.97      0.70      3409
                        Amazon Echo Studio       0.99      0.98      0.99     32421
                               Amazon Plug       0.46      0.91      0.61       109
                         Arlo Base Station       0.55      0.97      0.70      1812
                      Arlo Q Indoor Camera       0.72      0.98      0.83      2125
                        Atomi Coffee

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, jaccard_score, hamming_loss, classification_report

# --- 1. Define the XGBoost Base Estimator ---
print("--- Training Multi-Label XGBoost ---")

N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Apply regularization (L1/L2)
xgb_base = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False, 
    eval_metric='logloss',  # Use binary logloss for each output
    n_jobs=2,
    # === Regularization Parameters ===
    max_depth=8,
    reg_alpha=0.2,  # L1 regularization
    reg_lambda=1.5  # L2 regularization
    # =================================
)

# --- 2. Wrap it with MultiOutputClassifier ---
xgb_multioutput = MultiOutputClassifier(xgb_base, n_jobs=-1)

# --- 3. Fit the Model ---
print("Fitting the Multi-Output XGBoost model...")
xgb_multioutput.fit(X_train, Y_train)

# --- 4. Prediction ---
Y_pred_xgb = xgb_multioutput.predict(X_test)

--- Training Multi-Label XGBoost ---


Fitting the Multi-Output XGBoost model...


In [40]:


def evaluate_multilabel_model(Y_test, Y_pred, class_names):
    """Calculates and prints key multi-label metrics."""
    print("\n======== Evaluation: XGBoost Multi-Output (Weighted) ========")
    
    subset_acc = accuracy_score(Y_test, Y_pred)
    print(f"Subset Accuracy (Exact Match): {subset_acc:.4f}")

    
    print("\n--- Micro-Averaged Metrics ---")
    print(classification_report(Y_test, Y_pred, 
                                target_names=class_names, 
                                ))


evaluate_multilabel_model(Y_test, Y_pred_xgb, multilabel_class_names)


Subset Accuracy (Exact Match): 0.8605

--- Micro-Averaged Metrics ---
                                            precision    recall  f1-score   support

                       AMCREST WiFi Camera       0.97      0.78      0.86       473
                     AeoTec Smart Home Hub       0.89      0.72      0.80      1014
                         Amazon Echo Dot 1       0.98      0.67      0.79      1503
                         Amazon Echo Dot 2       0.97      0.97      0.97      5075
                          Amazon Echo Show       0.98      0.92      0.95      5430
                          Amazon Echo Spot       0.96      0.89      0.93      3409
                        Amazon Echo Studio       1.00      0.99      0.99     32421
                               Amazon Plug       0.96      0.74      0.84       109
                         Arlo Base Station       0.99      0.89      0.94      1812
                      Arlo Q Indoor Camera       0.99      0.95      0.97      2125
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, jaccard_score


rf_basic = RandomForestClassifier(
    n_estimators=100,           
    random_state=42,
    n_jobs=-1,                  
)
rf_multioutput = MultiOutputClassifier(rf_basic, n_jobs=-1)

print("Fitting the Basic Multi-Label Random Forest model on all training data...")

rf_multioutput.fit(X_train, Y_train)

Y_pred_rf = rf_multioutput.predict(X_test)

subset_acc = accuracy_score(Y_test, Y_pred_rf)

jaccard = jaccard_score(Y_test, Y_pred_rf, average='samples', zero_division=0)

print("\n" + "="*50)
print("FINAL BASIC RANDOM FOREST TEST SET RESULTS")
print("="*50)
print(f"Subset Accuracy (Exact Match): {subset_acc:.4f}")
print(f"Jaccard Score (Label Similarity): {jaccard:.4f}")

Fitting the Basic Multi-Label Random Forest model on all training data...


KeyboardInterrupt: 