In [8]:
import pandas as pd
import numpy as np
import torch
import scipy.sparse as sp
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)  # 第一层图卷积
        self.gc2 = GraphConvolution(nhid, nclass)  # 第二层图卷积
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))  # 图卷积 + ReLU
        x = F.dropout(x, self.dropout, training=self.training)  # 防止过拟合
        x = self.gc2(x, adj)  # 输出层
        return F.log_softmax(x, dim=1)  # Log-Softmax 用于多分类任务

class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / np.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input, adj):
        support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

dataset = pd.read_csv("processed_data.csv")

In [9]:
features_columns = ['Duration', 'Source Port', 'Destination Port', 'Packets', 'Bytes','Label']
features = dataset[features_columns]
print(features)

         Duration  Source Port  Destination Port  Packets  Bytes       Label
0             0.0           53             43192        1    214  background
1             0.0        60185                53        1     72  background
2             0.0        48598                53        1     77  background
3             0.0        51465                53        1     63  background
4             0.0           80             37934        1     52  background
...           ...          ...               ...      ...    ...         ...
4999995       0.0        43924             53413        1    151  background
4999996       0.0        54339             53413        1    151  background
4999997       0.0        43355             53413        1    151  background
4999998       0.0        25985                53        1     75  background
4999999       0.0           53             58327        1     93  background

[5000000 rows x 6 columns]


In [10]:
X = dataset.drop(columns=['Label'])
y = dataset['Label']

train_data = pd.DataFrame(X, columns=X.columns)
train_data['Label'] = y

# 查看数据集类别分布
print("数据集类别分布：")
print(train_data['Label'].value_counts())

数据集类别分布：
Label
background      4931787
dos               27419
blacklist         13770
scan44            13025
nerisbotnet        6234
anomaly-spam       4961
scan11             2804
Name: count, dtype: int64


In [11]:
data_0 = dataset[dataset['Label'] == "background"].sample(n=20000, random_state=42)
data_1 = dataset[dataset['Label'] == "dos"].sample(n=5000, random_state=42)
data_2 = dataset[dataset['Label'] == "blacklist"].sample(n=5000, random_state=42)
data_3 = dataset[dataset['Label'] == "scan44"].sample(n=5000, random_state=42)
data_4 = dataset[dataset['Label'] == "nerisbotnet"].sample(n=5000, random_state=42)
data_5 = dataset[dataset['Label'] == "anomaly-spam"].sample(n=3000, random_state=42)
data_6 = dataset[dataset['Label'] == "scan11"].sample(n=2000, random_state=42)
data = pd.concat([data_0, data_1, data_2, data_3, data_4, data_5, data_6], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# print(data.head())
print('数据集类别分布')
print(data['Label'].value_counts())
# print(data['Source IP'].value_counts)

数据集类别分布
Label
background      20000
nerisbotnet      5000
blacklist        5000
dos              5000
scan44           5000
anomaly-spam     3000
scan11           2000
Name: count, dtype: int64


In [12]:
def build_graph(data):
    """
    构建图结构，包括节点特征矩阵和邻接矩阵
    """
    # 提取特征
    features_columns = ['Duration', 'Source Port', 'Destination Port', 'Packets', 'Bytes']
    features = data[features_columns]

    # 转换 IP 地址为数值
    data['Source IP'] = data['Source IP'].apply(lambda x: int(''.join(x.split('.'))))
    data['Destination IP'] = data['Destination IP'].apply(lambda x: int(''.join(x.split('.'))))

    # 归一化特征
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    # 使用哈希表加速构图
    ip_to_indices = {}
    for idx, ip in enumerate(data['Source IP']):
        if ip not in ip_to_indices:
            ip_to_indices[ip] = []
        ip_to_indices[ip].append(idx)

    adj_list = []
    for idx, dst_ip in enumerate(data['Destination IP']):
        if dst_ip in ip_to_indices:
            for neighbor_idx in ip_to_indices[dst_ip]:
                adj_list.append((idx, neighbor_idx))

    # 构造邻接矩阵
    rows, cols = zip(*adj_list)
    adj = sp.coo_matrix(
        (np.ones(len(rows)), (rows, cols)),
        shape=(len(data), len(data)),
        dtype=np.float32
    )

    return torch.FloatTensor(features), sparse_mx_to_torch_sparse_tensor(adj)


# 稀疏矩阵转 PyTorch 稀疏张量
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

features, adj = build_graph(data)


KeyboardInterrupt: 

In [64]:
# 使用 LabelEncoder 对标签进行编码
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

# 标签现在是整数形式的编码
labels = torch.LongTensor(data['Label'].values)

# 将数据移动到 GPU
features = features.to(device)
adj = adj.to(device)

In [65]:
print(data['Label'].value_counts())

Label
6    200
1    200
2    200
3    200
4    200
0    200
5    200
Name: count, dtype: int64


In [66]:
# 划分训练集、验证集和测试集
idx_train, idx_temp, labels_train, labels_temp = train_test_split(
    range(len(labels)), labels, test_size=0.4, stratify=labels, random_state=42
)
idx_val, idx_test, labels_val, labels_test = train_test_split(
    idx_temp, labels_temp, test_size=0.5, stratify=labels_temp, random_state=42
)

labels = labels.to(device)

In [57]:
print(labels[idx_test].unique())
print(labels[idx_train].unique())
print(labels[idx_val].unique())

tensor([0, 1, 2, 3, 4, 5, 6], device='cuda:0')
tensor([0, 1, 2, 3, 4, 5, 6], device='cuda:0')
tensor([0, 1, 2, 3, 4, 5, 6], device='cuda:0')


In [67]:
# idx_train = torch.LongTensor(idx_train)
# idx_val = torch.LongTensor(idx_val)
# idx_test = torch.LongTensor(idx_test)

idx_train = torch.LongTensor(idx_train).to(device)
idx_val = torch.LongTensor(idx_val).to(device)
idx_test = torch.LongTensor(idx_test).to(device)

In [68]:
def train_gcn(adj, features, labels, idx_train, idx_val, nfeat, nhid, nclass, epochs=200, lr=0.01, weight_decay=5e-4, dropout=0.5):
    # 初始化模型和优化器
    # model = GCN(nfeat=nfeat, nhid=nhid, nclass=nclass, dropout=dropout)
    model = GCN(nfeat=nfeat, nhid=nhid, nclass=nclass, dropout=dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # 训练模型
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(features, adj)  # 前向传播
        loss_train = criterion(output[idx_train], labels[idx_train])  # 计算损失
        loss_train.backward()
        optimizer.step()

        # 验证集评估
        model.eval()
        output = model(features, adj)
        loss_val = criterion(output[idx_val], labels[idx_val])
        acc_val = accuracy(output[idx_val], labels[idx_val])
        print(f"Epoch {epoch+1}: Train Loss = {loss_train.item():.4f}, Val Loss = {loss_val.item():.4f}, Val Accuracy = {acc_val:.4f}")
    return model

def accuracy(output, labels):
    """计算分类准确率"""
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

def evaluate_gcn(model, adj, features, labels, idx_test):
    """评估模型"""
    from tabulate import tabulate
    """评估模型"""
    model.eval()
    output = model(features, adj)
    preds = output[idx_test].max(1)[1].type_as(labels)
    
    # 获取分类报告
    report = classification_report(labels[idx_test].cpu().numpy(), preds.cpu().numpy(), output_dict=True)
    
    # 格式化并打印分类报告
    print("Test Classification Report:")
    print(tabulate(
        [[key] + [f"{value:.2f}" for value in metrics.values()] for key, metrics in report.items() if isinstance(metrics, dict)],
        headers=["Class", "Precision", "Recall", "F1-Score", "Support"],
        tablefmt="grid"
    ))

    # 打印 accuracy, macro avg, weighted avg
    accuracy = report['accuracy']
    print(f"Accuracy: {accuracy:.4f}")
    


In [69]:
model = train_gcn(adj, features, labels, idx_train, idx_val, nfeat=features.shape[1], nhid=16, nclass=7)

Epoch 1: Train Loss = 127.3781, Val Loss = 115.0087, Val Accuracy = 0.3536
Epoch 2: Train Loss = 107.1866, Val Loss = 96.0868, Val Accuracy = 0.3536
Epoch 3: Train Loss = 87.8084, Val Loss = 75.9729, Val Accuracy = 0.4071
Epoch 4: Train Loss = 68.0462, Val Loss = 64.4046, Val Accuracy = 0.3964
Epoch 5: Train Loss = 56.3322, Val Loss = 56.4479, Val Accuracy = 0.3429
Epoch 6: Train Loss = 47.6187, Val Loss = 47.8621, Val Accuracy = 0.2286
Epoch 7: Train Loss = 40.3102, Val Loss = 44.3751, Val Accuracy = 0.1464
Epoch 8: Train Loss = 38.7277, Val Loss = 36.4127, Val Accuracy = 0.1464
Epoch 9: Train Loss = 32.2485, Val Loss = 34.0831, Val Accuracy = 0.3250
Epoch 10: Train Loss = 31.2485, Val Loss = 29.5552, Val Accuracy = 0.3643
Epoch 11: Train Loss = 27.7100, Val Loss = 22.9180, Val Accuracy = 0.3214
Epoch 12: Train Loss = 21.6726, Val Loss = 22.8997, Val Accuracy = 0.2929
Epoch 13: Train Loss = 21.3615, Val Loss = 21.0674, Val Accuracy = 0.2000
Epoch 14: Train Loss = 20.4484, Val Loss = 2

In [70]:
evaluate_gcn(model, adj, features, labels, idx_test)

Test Classification Report:
+--------------+-------------+----------+------------+-----------+
| Class        |   Precision |   Recall |   F1-Score |   Support |
| 0            |        1    |     0.95 |       0.97 |        40 |
+--------------+-------------+----------+------------+-----------+
| 1            |        0.45 |     0.85 |       0.59 |        40 |
+--------------+-------------+----------+------------+-----------+
| 2            |        0.37 |     0.28 |       0.31 |        40 |
+--------------+-------------+----------+------------+-----------+
| 3            |        0.38 |     0.07 |       0.12 |        40 |
+--------------+-------------+----------+------------+-----------+
| 4            |        0.81 |     0.33 |       0.46 |        40 |
+--------------+-------------+----------+------------+-----------+
| 5            |        0.67 |     1    |       0.8  |        40 |
+--------------+-------------+----------+------------+-----------+
| 6            |        0.52 |    