# 1. Data Collection

### Import Necessary Libraries

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score, accuracy_score
from os import cpu_count
from math import floor
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
# import shap
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
# shap.initjs()

### Load Dataset

In [None]:
# Load the dataset CSV into `data`. The relative path assumes the kernel's working
# directory is this notebook's directory.
data=pd.read_csv("../dataset/NF_TON_IoT_V2/NF-ToN-IoT-v2.csv")

# 2. Data Preprocessing

### Summary of Statistics

In [None]:
# Preview the first rows to sanity-check that the CSV parsed into the expected columns.
data.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,59.166.0.5,1305,149.171.126.8,21,6,1.0,9,1,193,3,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,59.166.0.5,1305,149.171.126.8,21,6,1.0,261,5,469,7,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,59.166.0.5,1305,149.171.126.8,21,6,1.0,481,9,750,11,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,59.166.0.5,1305,149.171.126.8,21,6,1.0,701,13,1054,15,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,59.166.0.5,1305,149.171.126.8,21,6,1.0,1031,19,1474,21,...,14480,13032,64256,251,0,0,0,230.0,0,Benign


In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

IPV4_SRC_ADDR                   object
L4_SRC_PORT                      int64
IPV4_DST_ADDR                   object
L4_DST_PORT                      int64
PROTOCOL                         int64
L7_PROTO                       float64
IN_BYTES                         int64
IN_PKTS                          int64
OUT_BYTES                        int64
OUT_PKTS                         int64
TCP_FLAGS                        int64
CLIENT_TCP_FLAGS                 int64
SERVER_TCP_FLAGS                 int64
FLOW_DURATION_MILLISECONDS       int64
DURATION_IN                      int64
DURATION_OUT                     int64
MIN_TTL                          int64
MAX_TTL                          int64
LONGEST_FLOW_PKT                 int64
SHORTEST_FLOW_PKT                int64
MIN_IP_PKT_LEN                   int64
MAX_IP_PKT_LEN                   int64
SRC_TO_DST_SECOND_BYTES        float64
DST_TO_SRC_SECOND_BYTES        float64
RETRANSMITTED_IN_BYTES           int64
RETRANSMITTED_IN_PKTS    

In [None]:
# Count rows per value of the `Label` column to gauge class balance.
data.Label.value_counts()

Label
0    2295222
1      95053
Name: count, dtype: int64

In [None]:
# Count rows per category of the `Attack` column.
data.Attack.value_counts()

Attack
Benign            2295222
Exploits            31551
Fuzzers             22310
Generic             16560
Reconnaissance      12779
DoS                  5794
Analysis             2299
Backdoor             2169
Shellcode            1427
Worms                 164
Name: count, dtype: int64

In [None]:
# Drop the port columns (treated here as metadata rather than features).
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because the columns are already gone from `data`.
data = data.drop(columns=['L4_SRC_PORT', 'L4_DST_PORT'], errors='ignore')

In [None]:
# 5% train / 95% test: draw a reproducible 5% sample (without replacement) for training.
training_set = data.sample(frac=0.05, replace=False,random_state=42)
# Every row whose index label was not drawn into the training sample becomes test data.
testing_set = data.drop(index=training_set.index)

In [None]:
# Class distribution of the sampled training set.
# NOTE(review): the saved output below lists DDoS/Theft, which do not appear in the
# full-data `data.Attack.value_counts()` output earlier in this notebook — it looks
# stale; re-run to confirm.
training_set.Attack.value_counts()

DDoS              713308
DoS               682874
Reconnaissance    118292
Benign              6425
Theft                105
Name: Attack, dtype: int64

In [None]:
# NOTE(review): this first assignment is dead — it is overwritten on the next line.
attacks=training_set.Attack.unique()
# Hard-coded class list. NOTE(review): these names differ from the categories shown in
# the full-data `data.Attack.value_counts()` output (Exploits, Fuzzers, ...) — confirm
# which dataset this list is meant for.
attacks=['Benign','Reconnaissance', 'DDoS', 'DoS', 'Theft']

In [None]:
# Group features whose pairwise correlation exceeds 0.9.
# Restrict to numeric columns first: since pandas 2.0 `DataFrame.corr()` no longer
# drops non-numeric columns silently and fails on object columns (the address and
# `Attack` string columns).
corr = training_set.select_dtypes(include='number').corr()

# For every feature, list the features (itself included) correlated above the threshold.
above_threshold = (corr > 0.9).to_numpy()
corr_features = {
    name: corr.columns[above_threshold[i]].tolist()
    for i, name in enumerate(corr.columns)
}

corr_list = []
for key, value in corr_features.items():
    # Skip features that an earlier group already contains, and features that are
    # correlated with nothing but themselves.
    already_grouped = any(key in group for group in corr_list)
    if not already_grouped and len(value) > 1:
        corr_list.append(value)

In [None]:
# Display the groups of mutually correlated features collected in `corr_list`.
corr_list

[['PROTOCOL', 'L7_PROTO'],
 ['IN_BYTES',
  'IN_PKTS',
  'NUM_PKTS_512_TO_1024_BYTES',
  'NUM_PKTS_1024_TO_1514_BYTES'],
 ['OUT_BYTES', 'OUT_PKTS', 'NUM_PKTS_1024_TO_1514_BYTES'],
 ['TCP_FLAGS', 'SERVER_TCP_FLAGS', 'MIN_IP_PKT_LEN'],
 ['MIN_TTL', 'MAX_TTL'],
 ['LONGEST_FLOW_PKT', 'MAX_IP_PKT_LEN'],
 ['RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS'],
 ['RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS'],
 ['ICMP_TYPE', 'ICMP_IPV4_TYPE']]

In [None]:
#correction because NUM_PKTS_1024_TO_1514_BYTES appears twice
corr_list[2]=corr_list[2][:-1]
corr_list

[['PROTOCOL', 'L7_PROTO'],
 ['IN_BYTES',
  'IN_PKTS',
  'NUM_PKTS_512_TO_1024_BYTES',
  'NUM_PKTS_1024_TO_1514_BYTES'],
 ['OUT_BYTES', 'OUT_PKTS'],
 ['TCP_FLAGS', 'SERVER_TCP_FLAGS', 'MIN_IP_PKT_LEN'],
 ['MIN_TTL', 'MAX_TTL'],
 ['LONGEST_FLOW_PKT', 'MAX_IP_PKT_LEN'],
 ['RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS'],
 ['RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS'],
 ['ICMP_TYPE', 'ICMP_IPV4_TYPE']]

In [None]:
pip install torch-geometric




In [None]:
pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.loader import DataLoader
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)


AttributeError: partially initialized module 'torch_geometric' has no attribute 'typing' (most likely due to a circular import)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.loader import DataLoader
import networkx as nx
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# 1. Data Loading and Preprocessing
def load_and_preprocess_data(file_path):
    """Read a parquet file and prepare its columns for graph construction.

    Steps, in order: replace both port columns with their absolute values,
    fill missing values in the selected numerical features with 0,
    label-encode ``Attack`` into a new ``Attack_encoded`` column, and
    standardize the numerical features with ``StandardScaler``.

    Args:
        file_path: Path of the parquet file to load.

    Returns:
        tuple: ``(data, numerical_features, le_attack)`` — the processed
        DataFrame, the list of standardized column names, and the
        ``LabelEncoder`` fitted on ``Attack``.
    """
    frame = pd.read_parquet(file_path)

    # Negative port values are mapped to their absolute value.
    for port_column in ('L4_SRC_PORT', 'L4_DST_PORT'):
        frame[port_column] = frame[port_column].abs()

    # Columns used as node features downstream.
    numerical_features = [
        'L4_SRC_PORT',
        'L4_DST_PORT',
        'IN_BYTES',
        'IN_PKTS',
        'OUT_BYTES',
        'OUT_PKTS',
        'TCP_WIN_MAX_IN',
        'TCP_WIN_MAX_OUT',
    ]

    # Missing feature values become 0.
    filled = frame[numerical_features].fillna(0)
    frame[numerical_features] = filled

    # Integer-encode the attack category; the encoder is returned so callers can
    # map codes back to class names.
    le_attack = LabelEncoder()
    attack_codes = le_attack.fit_transform(frame['Attack'])
    frame['Attack_encoded'] = attack_codes

    # Standardize the features; the scaler is fitted on the full frame.
    scaler = StandardScaler()
    standardized = scaler.fit_transform(frame[numerical_features])
    frame[numerical_features] = standardized

    return frame, numerical_features, le_attack

# 2. Exploratory Data Analysis
def perform_eda(data, numerical_features):
    """Print summary statistics and save two exploratory plots.

    Prints the frame's shape, the total count of missing cells, and the value
    counts of the ``Label`` and ``Attack`` columns. Writes
    ``correlation_heatmap.png`` (correlations among ``numerical_features``) and
    ``attack_distribution.png`` (row count per ``Attack`` value) to the current
    working directory.

    Args:
        data: DataFrame containing ``Label``, ``Attack`` and the feature columns.
        numerical_features: Column names to include in the correlation heatmap.
    """
    print("\nEDA Summary:")

    shape = data.shape
    print("\nDataset Shape:", shape)

    total_missing = data.isnull().sum().sum()
    print("\nMissing Values:\n", total_missing)

    label_counts = data['Label'].value_counts()
    print("\nLabel Distribution:\n", label_counts)

    attack_counts = data['Attack'].value_counts()
    print("\nAttack Type Distribution:\n", attack_counts)

    # Figure 1: pairwise correlation among the selected features.
    feature_corr = data[numerical_features].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(feature_corr, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation Heatmap')
    plt.savefig('correlation_heatmap.png')
    plt.close()  # release the figure instead of rendering it inline

    # Figure 2: number of rows per attack type.
    plt.figure(figsize=(10, 6))
    sns.countplot(x='Attack', data=data)
    plt.xticks(rotation=45)
    plt.title('Attack Type Distribution')
    plt.savefig('attack_distribution.png')
    plt.close()

# 3. Graph Construction
def create_graph_data(data, numerical_features, target_col='Label'):
    """Build a PyG ``Data`` graph with one node per row of ``data``.

    Two rows are connected (in both directions) when they share the same
    ``L4_SRC_PORT`` value or the same ``L4_DST_PORT`` value. Edges are emitted
    in ascending ``(i, j)`` order with ``i < j``, each followed by its reverse.

    Note: the pairwise comparison is quadratic in the number of rows, and the
    number of edges can be quadratic too; subsample large frames before calling.

    Args:
        data: DataFrame holding the port columns, the feature columns and
            ``target_col``.
        numerical_features: Column names used as node features.
        target_col: Name of the integer label column (default ``'Label'``).

    Returns:
        A ``Data`` object with ``x`` (float features), ``edge_index`` (long,
        shape ``(2, num_edges)``), ``y`` (long labels), plus ``train_mask`` and
        ``test_mask``. Despite their names the two masks are tensors of row
        positions (an 80/20 split stratified on ``target_col``), not boolean
        masks.
    """
    num_nodes = len(data)

    # Pull the port columns out once; per-cell `data.iloc[i][col]` lookups inside
    # the pair loop are orders of magnitude slower than array indexing.
    src_ports = data['L4_SRC_PORT'].to_numpy()
    dst_ports = data['L4_DST_PORT'].to_numpy()

    edge_pairs = []
    for i in range(num_nodes - 1):
        # Compare row i against all later rows in one vectorized step.
        matches = (src_ports[i + 1:] == src_ports[i]) | (dst_ports[i + 1:] == dst_ports[i])
        for j in (np.nonzero(matches)[0] + (i + 1)).tolist():
            edge_pairs.append([i, j])
            edge_pairs.append([j, i])

    # Reshape with an explicit row count so an empty edge list still produces a
    # (2, 0) tensor; transposing the bare empty tensor would give shape (0,).
    edge_index = (
        torch.tensor(edge_pairs, dtype=torch.long)
        .reshape(len(edge_pairs), 2)
        .t()
        .contiguous()
    )

    # Node features
    x = torch.tensor(data[numerical_features].values, dtype=torch.float)

    # Labels
    y = torch.tensor(data[target_col].values, dtype=torch.long)

    # Create PyG data object
    graph_data = Data(x=x, edge_index=edge_index, y=y)

    # Stratified 80/20 split over row positions.
    train_idx, test_idx = train_test_split(
        range(num_nodes), test_size=0.2, random_state=42, stratify=data[target_col]
    )
    graph_data.train_mask = torch.tensor(train_idx, dtype=torch.long)
    graph_data.test_mask = torch.tensor(test_idx, dtype=torch.long)

    return graph_data

# 4. Attention-based GNN Model
class AttentionGNN(nn.Module):
    """Node classifier: two GAT layers followed by a two-layer linear head.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Per-head width of each GAT layer and width of the hidden
            linear layer.
        output_dim: Number of classes.
        heads: Number of attention heads in each GAT layer (default 4).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, heads=4):
        super().__init__()
        # Each GAT layer is followed by a layer expecting hidden_dim * heads inputs.
        attn_width = hidden_dim * heads
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads)
        self.conv2 = GATConv(attn_width, hidden_dim, heads=heads)
        self.fc1 = nn.Linear(attn_width, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, data):
        """Return per-node log-probabilities of shape ``(num_nodes, output_dim)``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        edge_index = data.edge_index
        hidden = data.x

        # Attention layers: conv -> ELU -> dropout, applied twice.
        for attention_layer in (self.conv1, self.conv2):
            hidden = self.dropout(F.elu(attention_layer(hidden, edge_index)))

        # Classification head.
        hidden = self.dropout(F.elu(self.fc1(hidden)))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

# 5. Training Function
def train_model(model, data, optimizer, criterion, epochs=100):
    """Run full-batch training and return the loss recorded at every epoch.

    Each epoch performs one forward pass over the whole graph, computes the
    loss on the entries selected by ``data.train_mask`` only, and takes one
    optimizer step. The loss is printed every 10th epoch, starting at epoch 0.

    Args:
        model: Module called as ``model(data)`` returning per-node outputs.
        data: Object exposing ``y`` and ``train_mask`` (plus whatever ``model`` reads).
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable taking ``(outputs, targets)``.
        epochs: Number of optimization steps (default 100).

    Returns:
        list[float]: Training loss per epoch, in order.
    """
    model.train()
    history = []

    for epoch in range(epochs):
        optimizer.zero_grad()

        outputs = model(data)
        train_outputs = outputs[data.train_mask]
        train_targets = data.y[data.train_mask]
        loss = criterion(train_outputs, train_targets)

        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        history.append(loss_value)

        if not epoch % 10:
            print(f'Epoch {epoch}, Loss: {loss_value:.4f}')

    return history

# 6. Evaluation Function
def evaluate_model(model, data, le_attack=None):
    """Evaluate ``model`` on the test split, print metrics and save a confusion matrix.

    Prints a classification report, writes ``confusion_matrix.png`` to the
    current working directory and, when ``data.y`` holds exactly two distinct
    values, prints the ROC AUC computed from the class-1 probability.

    Args:
        model: Trained module called as ``model(data)``; expected to return
            one row of class scores per node.
        data: Graph object exposing ``y`` and ``test_mask``.
        le_attack: Optional fitted ``LabelEncoder``; when given, its
            ``classes_`` supply the class names. Otherwise the names
            ``['Benign', 'Malicious']`` are used.
    """
    model.eval()
    with torch.no_grad():
        # One forward pass, reused for the hard predictions and the ROC scores
        # (the model is in eval mode, so a second pass would return the same values).
        out = model(data)
        pred = out.argmax(dim=1)

        # Get true and predicted labels; .cpu() makes this work for tensors on any device.
        y_true = data.y[data.test_mask].cpu().numpy()
        y_pred = pred[data.test_mask].cpu().numpy()

        if le_attack is not None:
            target_names = le_attack.classes_
        else:
            target_names = ['Benign', 'Malicious']
        # Pin the label set explicitly: without `labels`, classification_report
        # raises when a class is absent from the test slice, because the number
        # of observed classes no longer matches len(target_names).
        labels = list(range(len(target_names)))

        # Classification report
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))

        # Confusion matrix over the same fixed label set, so rows and columns
        # always line up with target_names.
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.savefig('confusion_matrix.png')
        plt.close()

        # ROC AUC for binary classification
        if torch.unique(data.y).numel() == 2:
            probs = torch.softmax(out, dim=1)[:, 1]
            roc_auc = roc_auc_score(y_true, probs[data.test_mask].cpu().numpy())
            print(f"\nROC AUC Score: {roc_auc:.4f}")

# Main Execution
if __name__ == "__main__":
    # End-to-end run: load -> EDA -> binary model -> multiclass model -> loss plot.
    # All results are kept as module-level names so later cells can inspect them.
    file_path = "../dataset/NF-BoT-IoT-V2.parquet"
    data, numerical_features, le_attack = load_and_preprocess_data(file_path)

    perform_eda(data, numerical_features)

    # Both models share the same input width and hidden size.
    n_features = len(numerical_features)

    # --- Binary classification: benign vs. malicious (`Label`) ---
    print("\n=== Binary Classification ===")
    binary_graph = create_graph_data(data, numerical_features, 'Label')
    binary_model = AttentionGNN(input_dim=n_features, hidden_dim=64, output_dim=2)
    optimizer = torch.optim.Adam(binary_model.parameters(), lr=0.01)
    # NLLLoss pairs with the log_softmax output of AttentionGNN.
    criterion = nn.NLLLoss()
    binary_losses = train_model(binary_model, binary_graph, optimizer, criterion)
    evaluate_model(binary_model, binary_graph)

    # --- Multiclass classification: attack category (`Attack_encoded`) ---
    print("\n=== Multiclass Classification ===")
    multiclass_graph = create_graph_data(data, numerical_features, 'Attack_encoded')
    n_classes = len(le_attack.classes_)
    multiclass_model = AttentionGNN(input_dim=n_features, hidden_dim=64, output_dim=n_classes)
    optimizer = torch.optim.Adam(multiclass_model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    multiclass_losses = train_model(multiclass_model, multiclass_graph, optimizer, criterion)
    evaluate_model(multiclass_model, multiclass_graph, le_attack)

    # --- Training loss curves for both runs on one figure ---
    plt.figure(figsize=(10, 6))
    for curve, curve_label in (
        (binary_losses, 'Binary Classification'),
        (multiclass_losses, 'Multiclass Classification'),
    ):
        plt.plot(curve, label=curve_label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss Curves')
    plt.legend()
    plt.savefig('training_loss.png')
    plt.close()

ModuleNotFoundError: No module named 'torch_geometric'