In [2]:
import pandas as pd
import numpy as np
import torch
import scipy.sparse as sp
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

path = 'UGR_sample_5M.csv'
data = pd.read_csv(path)
# 对数据集进行抽样，减少数据量
data = data.sample(n=5000, random_state=42)

features_columns = ['Duration', 'Source Port', 'Destination Port', 'Packets', 'Bytes']
features = data[features_columns]

# 转换 IP 地址为数值
data['Source IP'] = data['Source IP'].apply(lambda x: int(''.join(x.split('.'))))
data['Destination IP'] = data['Destination IP'].apply(lambda x: int(''.join(x.split('.'))))

# 归一化特征
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

src_ips = data['Source IP']
dst_ips = data['Destination IP']
adj_list = []


In [5]:
src_ips = src_ips.astype(str).apply(lambda x: int(''.join(x.split('.'))))
dst_ips = dst_ips.astype(str).apply(lambda x: int(''.join(x.split('.'))))
if src_ips.isnull().any() or dst_ips.isnull().any():
    raise ValueError("Source IPs or Destination IPs contain null values!")


In [10]:
ip_to_indices = {}
for idx, ip in enumerate(src_ips):
    if ip not in ip_to_indices:
        ip_to_indices[ip] = []
    ip_to_indices[ip].append(idx)

adj_list = []
for idx, dst_ip in enumerate(dst_ips):
    if dst_ip in ip_to_indices:
        for neighbor_idx in ip_to_indices[dst_ip]:
            adj_list.append((idx, neighbor_idx))

# print(adj_list[:10, :10])
print(len(adj_list))

280760


In [16]:
rows, cols = zip(*adj_list)
adj = sp.coo_matrix(
        (np.ones(len(rows)), (rows, cols)),
        shape=(len(data), len(data)),
        dtype=np.float32
    )

In [None]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)


In [28]:
lables = data['Label'].replace(['background'], 0).replace(['anomaly-spam'], 1).replace(['blacklist'], 2).replace(['dos'], 3).replace(['nerisbotnet'], 4).replace(['scan44'], 5).replace(['scan11'], 6)

In [30]:
print(data['Label'].dtype)
print(data['Label'].unique())

int64
[0 5 3 2 1 4]


In [31]:
labels = torch.LongTensor(lables.values)

In [37]:
print(data['Label'].value_counts())

Label
0    4924
3      31
5      20
2      13
1       6
4       6
Name: count, dtype: int64


In [34]:
 # 划分训练集、验证集和测试集
idx_train, idx_temp, labels_train, labels_temp = train_test_split(
    range(len(labels)), labels, test_size=0.4, stratify=labels, random_state=42
)
idx_val, idx_test, labels_val, labels_test = train_test_split(
    idx_temp, labels_temp, test_size=0.5, stratify=labels_temp, random_state=42
)

In [35]:
idx_train = torch.LongTensor(idx_train)
idx_val = torch.LongTensor(idx_val)
idx_test = torch.LongTensor(idx_test)

In [32]:
import torch.nn as nn
import torch.nn.functional as F

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)  # 第一层图卷积
        self.gc2 = GraphConvolution(nhid, nclass)  # 第二层图卷积
        self.dropout = dropout

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))  # 图卷积 + ReLU
        x = F.dropout(x, self.dropout, training=self.training)  # Dropout 防止过拟合
        x = self.gc2(x, adj)  # 输出层
        return F.log_softmax(x, dim=1)  # Log-Softmax 用于多分类任务

class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

In [33]:
def accuracy(output, labels):
    """计算分类准确率"""
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

def evaluate_gcn(model, adj, features, labels, idx_test):
    """评估模型"""
    model.eval()
    output = model(features, adj)
    preds = output[idx_test].max(1)[1].type_as(labels)
    print("Test Classification Report:")
    print(classification_report(labels[idx_test].cpu().numpy(), preds.cpu().numpy()))


def train_gcn(adj, features, labels, idx_train, idx_val, nfeat, nhid, nclass, epochs=200, lr=0.01, weight_decay=5e-4, dropout=0.5):
    # 初始化模型和优化器
    model = GCN(nfeat=nfeat, nhid=nhid, nclass=nclass, dropout=dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # 训练模型
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(features, adj)  # 前向传播
        loss_train = F.nll_loss(output[idx_train], labels[idx_train])  # 计算损失
        loss_train.backward()
        optimizer.step()

        # 验证集评估
        model.eval()
        output = model(features, adj)
        loss_val = F.nll_loss(output[idx_val], labels[idx_val])
        acc_val = accuracy(output[idx_val], labels[idx_val])
        print(f"Epoch {epoch+1}: Train Loss = {loss_train.item():.4f}, Val Loss = {loss_val.item():.4f}, Val Accuracy = {acc_val:.4f}")
    return model

In [38]:
model = train_gcn(adj, features, labels, idx_train, idx_val, nfeat=features.shape[1], nhid=16, nclass=3)


AttributeError: 'GraphConvolution' object has no attribute 'reset_parameters'