In [None]:
import pandas as pd
import numpy as np
import torch
import scipy.sparse as sp
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)  # 第一层图卷积
        self.gc2 = GraphConvolution(nhid, nclass)  # 第二层图卷积
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))  # 图卷积 + ReLU
        x = F.dropout(x, self.dropout, training=self.training)  # 防止过拟合
        x = self.gc2(x, adj)  # 输出层
        return F.log_softmax(x, dim=1)

class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / np.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

dataset = pd.read_csv("UGR_sample_5M_balanced.csv")

In [16]:
dataset.head()

Unnamed: 0,Date time,Duration,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flag,Forwarding status,ToS,Packets,Bytes,Label
0,2016-07-27 13:43:29,0.0,143.72.8.137,42.219.158.161,53,43192,UDP,.A....,0,0,1,214,background
1,2016-07-27 13:43:29,0.0,42.219.154.119,143.72.8.137,60185,53,UDP,.A....,0,0,1,72,background
2,2016-07-27 13:43:30,0.0,42.219.154.107,143.72.8.137,48598,53,UDP,.A....,0,0,1,77,background
3,2016-07-27 13:43:30,0.0,42.219.154.98,143.72.8.137,51465,53,UDP,.A....,0,0,1,63,background
4,2016-07-27 13:43:30,0.0,43.164.49.177,42.219.155.26,80,37934,TCP,.A...F,0,0,1,52,background


In [17]:
X = dataset.drop(columns=['Label'])
y = dataset['Label']

train_data = pd.DataFrame(X, columns=X.columns)
train_data['Label'] = y

# 查看数据集类别分布
print("数据集类别分布：")
print(train_data['Label'].value_counts())

数据集类别分布：
Label
background      4931787
dos               27419
blacklist         13770
scan44            13025
nerisbotnet        6234
anomaly-spam       4961
scan11             2804
Name: count, dtype: int64


In [28]:
major_data = dataset[dataset['Label'] == "background"]
major_data = major_data.sample(n=10000, random_state=42)
minor_data = dataset[dataset['Label'] != "background"]
minor_data = minor_data.sample(n=10000, random_state=42)
data = pd.concat([major_data, minor_data], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# print(data.head())
print(data['Label'].value_counts())
# print(data['Source IP'].value_counts)

Label
background      5000
dos             2015
blacklist       1045
scan44           957
nerisbotnet      428
anomaly-spam     355
scan11           200
Name: count, dtype: int64


In [29]:
def build_graph(data):
    """
    构建图结构，包括节点特征矩阵和邻接矩阵
    """
    # 提取特征
    features_columns = ['Duration', 'Source Port', 'Destination Port', 'Packets', 'Bytes']
    features = data[features_columns]

    # 转换 IP 地址为数值
    data['Source IP'] = data['Source IP'].apply(lambda x: int(''.join(x.split('.'))))
    data['Destination IP'] = data['Destination IP'].apply(lambda x: int(''.join(x.split('.'))))

    # 归一化特征
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    # 使用哈希表加速构图
    ip_to_indices = {}
    for idx, ip in enumerate(data['Source IP']):
        if ip not in ip_to_indices:
            ip_to_indices[ip] = []
        ip_to_indices[ip].append(idx)

    adj_list = []
    for idx, dst_ip in enumerate(data['Destination IP']):
        if dst_ip in ip_to_indices:
            for neighbor_idx in ip_to_indices[dst_ip]:
                adj_list.append((idx, neighbor_idx))

    # 构造邻接矩阵
    rows, cols = zip(*adj_list)
    adj = sp.coo_matrix(
        (np.ones(len(rows)), (rows, cols)),
        shape=(len(data), len(data)),
        dtype=np.float32
    )

    return torch.FloatTensor(features), sparse_mx_to_torch_sparse_tensor(adj)


# 稀疏矩阵转 PyTorch 稀疏张量
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

features, adj = build_graph(data)


In [30]:
data['Label'] = data['Label'].apply(lambda x: 0 if x == 'background' else 1)
labels = data['Label']
labels = torch.LongTensor(labels.values)

# 将数据移动到 GPU
features = features.to(device)
adj = adj.to(device)

In [31]:
print(data['Label'].value_counts())

Label
1    5000
0    5000
Name: count, dtype: int64


In [32]:
# 划分训练集、验证集和测试集
idx_train, idx_temp, labels_train, labels_temp = train_test_split(
    range(len(labels)), labels, test_size=0.4, stratify=labels, random_state=42
)
idx_val, idx_test, labels_val, labels_test = train_test_split(
    idx_temp, labels_temp, test_size=0.5, stratify=labels_temp, random_state=42
)

labels = labels.to(device)

In [33]:
print(labels[idx_test].unique())
print(labels[idx_train].unique())
print(labels[idx_val].unique())

tensor([0, 1], device='cuda:0')
tensor([0, 1], device='cuda:0')
tensor([0, 1], device='cuda:0')


In [34]:
# idx_train = torch.LongTensor(idx_train)
# idx_val = torch.LongTensor(idx_val)
# idx_test = torch.LongTensor(idx_test)

idx_train = torch.LongTensor(idx_train).to(device)
idx_val = torch.LongTensor(idx_val).to(device)
idx_test = torch.LongTensor(idx_test).to(device)

In [35]:
def train_gcn(adj, features, labels, idx_train, idx_val, nfeat, nhid, nclass, epochs=200, lr=0.01, weight_decay=5e-4, dropout=0.5):
    # 初始化模型和优化器
    # model = GCN(nfeat=nfeat, nhid=nhid, nclass=nclass, dropout=dropout)
    model = GCN(nfeat=nfeat, nhid=nhid, nclass=nclass, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # 训练模型
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(features, adj)  # 前向传播
        loss_train = F.nll_loss(output[idx_train], labels[idx_train])  # 计算损失
        loss_train.backward()
        optimizer.step()

        # 验证集评估
        model.eval()
        output = model(features, adj)
        loss_val = F.nll_loss(output[idx_val], labels[idx_val])
        acc_val = accuracy(output[idx_val], labels[idx_val])
        print(f"Epoch {epoch+1}: Train Loss = {loss_train.item():.4f}, Val Loss = {loss_val.item():.4f}, Val Accuracy = {acc_val:.4f}")
    return model

def accuracy(output, labels):
    """计算分类准确率"""
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

def evaluate_gcn(model, adj, features, labels, idx_test):
    from tabulate import tabulate
    """评估模型"""
    model.eval()
    output = model(features, adj)
    preds = output[idx_test].max(1)[1].type_as(labels)
    
    # 获取分类报告
    report = classification_report(labels[idx_test].cpu().numpy(), preds.cpu().numpy(), output_dict=True)
    
    # 格式化并打印分类报告
    print("Test Classification Report:")
    print(tabulate(
        [[key] + [f"{value:.2f}" for value in metrics.values()] for key, metrics in report.items() if isinstance(metrics, dict)],
        headers=["Class", "Precision", "Recall", "F1-Score", "Support"],
        tablefmt="grid"
    ))

    # 打印 accuracy, macro avg, weighted avg
    accuracy = report['accuracy']
    print(f"Accuracy: {accuracy:.4f}")


In [36]:
model = train_gcn(adj, features, labels, idx_train, idx_val, nfeat=features.shape[1], nhid=16, nclass=2)

Epoch 1: Train Loss = 312.8475, Val Loss = 218.7255, Val Accuracy = 0.7295
Epoch 2: Train Loss = 272.3918, Val Loss = 181.9314, Val Accuracy = 0.7845
Epoch 3: Train Loss = 226.2786, Val Loss = 145.8314, Val Accuracy = 0.7845
Epoch 4: Train Loss = 180.8818, Val Loss = 110.6456, Val Accuracy = 0.7695
Epoch 5: Train Loss = 136.5224, Val Loss = 336.0614, Val Accuracy = 0.5905
Epoch 6: Train Loss = 347.6242, Val Loss = 81.1807, Val Accuracy = 0.7955
Epoch 7: Train Loss = 100.0411, Val Loss = 76.2767, Val Accuracy = 0.8020
Epoch 8: Train Loss = 94.1534, Val Loss = 69.1644, Val Accuracy = 0.8025
Epoch 9: Train Loss = 85.4900, Val Loss = 60.0418, Val Accuracy = 0.8045
Epoch 10: Train Loss = 74.3001, Val Loss = 49.2769, Val Accuracy = 0.8060
Epoch 11: Train Loss = 61.0431, Val Loss = 37.0403, Val Accuracy = 0.8080
Epoch 12: Train Loss = 45.9407, Val Loss = 23.3707, Val Accuracy = 0.8180
Epoch 13: Train Loss = 29.0655, Val Loss = 8.5797, Val Accuracy = 0.8145
Epoch 14: Train Loss = 10.7645, Val 

In [37]:
evaluate_gcn(model, adj, features, labels, idx_test)

Test Classification Report:
+--------------+-------------+----------+------------+-----------+
| Class        |   Precision |   Recall |   F1-Score |   Support |
| 0            |        0.77 |     0.96 |       0.85 |      1000 |
+--------------+-------------+----------+------------+-----------+
| 1            |        0.94 |     0.71 |       0.81 |      1000 |
+--------------+-------------+----------+------------+-----------+
| macro avg    |        0.85 |     0.83 |       0.83 |      2000 |
+--------------+-------------+----------+------------+-----------+
| weighted avg |        0.85 |     0.83 |       0.83 |      2000 |
+--------------+-------------+----------+------------+-----------+
Accuracy: 0.8320
