In [1]:
!conda env list

# conda environments:
#
base                     D:\Anaconda3
AntiFraud                D:\Anaconda3\envs\AntiFraud
AntiFraud2            *  D:\Anaconda3\envs\AntiFraud2
Basket                   D:\Anaconda3\envs\Basket
JadeWeb                  D:\Anaconda3\envs\JadeWeb
ML                       D:\Anaconda3\envs\ML
streamlit3               D:\Anaconda3\envs\streamlit3
AntiFraud2               d:\Anaconda3\envs\AntiFraud2
ML                       d:\Anaconda3\envs\ML



In [2]:
import torch
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.neighbors import kneighbors_graph
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import dgl
from models.GCN import GCN
# import xgboost as xgb
from models.RNN import RNN
from models.SVM import SVM
from models.DecisionTree import DecisionTreeID3, DecisionTreeCART
from models.LogisticRegression import LogisticRegression
from models.RandomForest import train_random_forest
from models.CNN import SimpleCNN
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build a graph from a k-nearest-neighbour search over the samples
def create_graph_from_data(features, n_neighbors=15):
    """Build a DGL graph linking every sample to its k nearest neighbours.

    Args:
        features: 2-D sample-by-feature data. Either a ``torch.Tensor`` or
            anything that both ``kneighbors_graph`` and ``torch.tensor`` accept.
        n_neighbors: Number of neighbours linked from each sample; the sample
            itself is excluded (``include_self=False``).

    Returns:
        A ``dgl.DGLGraph`` with one node per sample and an edge ``i -> j`` for
        every non-zero entry ``(i, j)`` of the k-NN connectivity matrix. Two
        node fields are attached: ``'feat'`` (a float32 copy of ``features``)
        and ``'norm'`` (reciprocal of the node in-degree, with in-degree 0
        treated as 1).
    """
    if isinstance(features, torch.Tensor):
        # scikit-learn operates on NumPy data; hand it a detached CPU view.
        knn_input = features.detach().cpu().numpy()
        # clone() keeps the graph's features independent of the caller's tensor
        # and avoids the copy-construct warning raised by torch.tensor(tensor).
        node_feats = features.detach().clone().to(dtype=torch.float32)
    else:
        knn_input = features
        node_feats = torch.tensor(features, dtype=torch.float32)

    adjacency = kneighbors_graph(knn_input, n_neighbors=n_neighbors, mode='connectivity', include_self=False)

    # Stay sparse: a dense copy needs O(n^2) memory just to be scanned for
    # non-zeros. Sorting the column indices first makes nonzero() list the
    # edges in row-major order, i.e. the order a dense scan would produce.
    adjacency = adjacency.tocsr()
    adjacency.sort_indices()
    rows, cols = adjacency.nonzero()
    src = torch.tensor(rows, dtype=torch.int64)
    dst = torch.tensor(cols, dtype=torch.int64)

    # Pass the node count explicitly instead of inferring it from the largest
    # node id that happens to appear in an edge.
    g = dgl.graph((src, dst), num_nodes=adjacency.shape[0])
    g.ndata['feat'] = node_feats

    # The k-NN graph is directed, so a sample that is nobody's neighbour has
    # in-degree 0. Clamp so its normaliser is 1.0 rather than inf.
    in_degrees = g.in_degrees().float().clamp(min=1.0)
    g.ndata['norm'] = 1.0 / in_degrees

    return g

In [4]:
# Load the mushroom dataset; dtype=str reads every column as strings.
mushroom_data = pd.read_csv(
    'data/mushrooms.csv',
    dtype=str,
)

In [5]:
# Separate the label column ('class') from the remaining feature columns.
inputs = mushroom_data.drop(columns=['class'])
target = mushroom_data['class']

In [6]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.2,
    random_state=24,
    stratify=target,
)

In [7]:
# Integer-encode the data. Each encoder is fitted on the training split only
# and then applied unchanged to the test split.

# Feature columns
enc_i = OrdinalEncoder()
x_train_transf = enc_i.fit_transform(X_train)
x_test_transf = enc_i.transform(X_test)

# Class label
enc_t = LabelEncoder()
y_train_transf = enc_t.fit_transform(y_train)
y_test_transf = enc_t.transform(y_test)

In [8]:
# Number of target classes = count of distinct encoded training labels.
num_classes = np.unique(y_train_transf).size

In [9]:
# Tensors for the graph model. Features are converted to float32 and then each
# sample (row) is rescaled to unit L2 norm; this is per-row normalisation, not
# per-feature standardisation.
X_train_tensor = F.normalize(
    torch.tensor(x_train_transf, dtype=torch.float32),
    p=2,
    dim=1,
)
X_test_tensor = F.normalize(
    torch.tensor(x_test_transf, dtype=torch.float32),
    p=2,
    dim=1,
)

# Training labels as int64, the dtype the cross-entropy loss is given later.
y_train_tensor = torch.tensor(y_train_transf, dtype=torch.long)

In [10]:
# Standardise the encoded features. The scaler is fitted on the training split
# only and the same fitted scaler transforms both splits.
scaler = StandardScaler()
scaler.fit(x_train_transf)
x_train_transf_scaler = scaler.transform(x_train_transf)
x_test_transf_scaler = scaler.transform(x_test_transf)

In [11]:
# Fit the project's LogisticRegression on the standardised training features.
log_reg_model = LogisticRegression(
    learning_rate=0.01,
    n_iter=1000,
    verbose=True,
    early_stopping=True,
)
log_reg_model.fit(x_train_transf_scaler, y_train_transf)

# Score the held-out split.
log_reg_predictions = log_reg_model.predict(x_test_transf_scaler)
accuracy_log_reg = accuracy_score(y_test_transf, log_reg_predictions)

# Score the training split too, so the two figures can be compared.
train_predictions = log_reg_model.predict(x_train_transf_scaler)
train_accuracy = accuracy_score(y_train_transf, train_predictions)

# Report both accuracies, training first.
print(f'Training Accuracy (Logistic Regression): {train_accuracy * 100:.2f}%')
print(f'Test Accuracy (Logistic Regression): {accuracy_log_reg * 100:.2f}%')

Iteration 0: Loss = 0.6931
Iteration 100: Loss = 0.4415
Iteration 200: Loss = 0.3561
Iteration 300: Loss = 0.3149
Iteration 400: Loss = 0.2906
Iteration 500: Loss = 0.2745
Early stopping at iteration 597, Loss = 0.2633
Training Accuracy (Logistic Regression): 90.71%
Test Accuracy (Logistic Regression): 89.78%


In [12]:
# Custom SVM baseline -- currently disabled, kept for reference.
# This experiment used to be wrapped in a bare triple-quoted string. Such a
# string is still evaluated as an expression, so the notebook echoed the whole
# text back as the cell's result. Plain comments produce no output.
#
# svm_model = SVM(kernel='rbf', learning_rate=0.001, n_iters=10, C=1.0)
# svm_model.train(x_train_transf_scaler, y_train_transf)
#
# # Predict on the held-out split
# svm_predictions = svm_model.predict(x_test_transf_scaler)
#
# # Test accuracy
# accuracy_svm = accuracy_score(y_test_transf, svm_predictions)
# print(f'Test Accuracy (SVM): {accuracy_svm * 100:.2f}%')

In [13]:
# ID3 decision tree on the ordinal-encoded (unscaled) features.
# max_depth is the knob to adjust if a different tree size is wanted.
id3_model = DecisionTreeID3(max_depth=5)
id3_model.fit(x_train_transf, y_train_transf)

# Predict the held-out split and report its accuracy.
id3_predictions = id3_model.predict(x_test_transf)
id3_accuracy = accuracy_score(y_test_transf, id3_predictions)
print(f'Test Accuracy (ID3): {id3_accuracy * 100:.2f}%')

Test Accuracy (ID3): 97.66%


In [14]:
# CART decision tree on the ordinal-encoded (unscaled) features.
# max_depth is the knob to adjust if a different tree size is wanted.
cart_model = DecisionTreeCART(max_depth=5)
cart_model.fit(x_train_transf, y_train_transf)

# Predict the held-out split and report its accuracy.
cart_predictions = cart_model.predict(x_test_transf)
cart_accuracy = accuracy_score(y_test_transf, cart_predictions)
print(f'Test Accuracy (CART): {cart_accuracy * 100:.2f}%')

Test Accuracy (CART): 97.54%


In [15]:
# Seed torch's RNG so that re-running this cell (or Restart & Run All) gives
# the same weight initialisation and the same random draws during training.
# The value matches the random_state used for the train/test split.
torch.manual_seed(24)

# k-NN graph over the training tensor.
G_train = create_graph_from_data(X_train_tensor, n_neighbors=15)

# Hyper-parameters
in_feats = X_train_tensor.shape[1]
h_feats = 256
num_classes = len(np.unique(y_train_transf))
dropout = 0.5

# Model
model = GCN(in_feats=in_feats, hidden_feats=h_feats, out_feats=num_classes, dropout_rate=dropout)

# AdamW optimiser; StepLR halves the learning rate every 50 scheduler steps.
optimizer = optim.AdamW(model.parameters(), lr=0.005, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

loss_fn = torch.nn.CrossEntropyLoss()

# Full-graph training: one forward/backward pass over all training nodes per epoch.
num_epochs = 100
train_losses = []
train_accuracies = []

for epoch in range(num_epochs):
    model.train()

    # Forward pass and loss
    logits = model(G_train, G_train.ndata['feat'])
    loss = loss_fn(logits, y_train_tensor)

    # Backward pass, parameter update, then learning-rate schedule
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Read the scalar loss once and reuse it for the history and the log line.
    loss_value = loss.item()
    train_losses.append(loss_value)

    # Accuracy uses the logits of the training-mode pass above (any dropout
    # inside the model is active for them). detach() keeps this bookkeeping
    # out of autograd.
    train_preds = logits.detach().argmax(dim=1)
    train_accuracy = accuracy_score(y_train_tensor.cpu(), train_preds.cpu())
    train_accuracies.append(train_accuracy)

    # Log every 10th epoch
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss_value}, Train Accuracy: {train_accuracy:.4f}')

# Evaluation: a separate k-NN graph is built from the test tensor alone.
model.eval()
with torch.no_grad():
    G_test = create_graph_from_data(X_test_tensor, n_neighbors=15)
    logits_test = model(G_test, G_test.ndata['feat'])
    predictions = torch.argmax(logits_test, dim=1)

    # Test accuracy
    accuracy = accuracy_score(y_test_transf, predictions.numpy())
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

  del sys.path[0]


Epoch 0, Loss: 0.7545259594917297, Train Accuracy: 0.4411
Epoch 10, Loss: 0.47636887431144714, Train Accuracy: 0.7964
Epoch 20, Loss: 0.3649664521217346, Train Accuracy: 0.8561
Epoch 30, Loss: 0.2502155601978302, Train Accuracy: 0.8855
Epoch 40, Loss: 0.22881688177585602, Train Accuracy: 0.9084
Epoch 50, Loss: 0.18591415882110596, Train Accuracy: 0.9280
Epoch 60, Loss: 0.176894873380661, Train Accuracy: 0.9352
Epoch 70, Loss: 0.14234943687915802, Train Accuracy: 0.9425
Epoch 80, Loss: 0.11782790720462799, Train Accuracy: 0.9638
Epoch 90, Loss: 0.09628640115261078, Train Accuracy: 0.9706
Test Accuracy: 96.25%


  del sys.path[0]


In [16]:
# Random forest on the standardised features. The helper receives both splits;
# whatever it returns is kept as rf_model.
rf_model = train_random_forest(
    x_train_transf_scaler,
    y_train_transf,
    x_test_transf_scaler,
    y_test_transf,
)

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95       842
           1       0.93      0.97      0.95       783

    accuracy                           0.95      1625
   macro avg       0.95      0.95      0.95      1625
weighted avg       0.95      0.95      0.95      1625

Confusion Matrix:
 [[787  55]
 [ 24 759]]
AUC Score: 0.952013996960348
AP Score: 0.9186213587592897


In [17]:
# CNN on the standardised features.
# The model is told each sample has shape (n_features, 1).
input_shape = (x_train_transf_scaler.shape[1], 1)
cnn_model = SimpleCNN(input_shape=input_shape)

# Train. validation_split=0.2 is forwarded to the project wrapper; presumably
# it holds back 20% of the training rows for validation (confirm in models.CNN).
cnn_model.train(
    x_train_transf_scaler,
    y_train_transf,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)

# Evaluate on the held-out split.
cnn_model.evaluate(x_test_transf_scaler, y_test_transf)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       842
           1       1.00      1.00      1.00       783

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

AUC: 1.0000
Average Precision (AP): 1.0000


In [None]:
# XGBoost baseline on the ordinal-encoded (unscaled) features.
# NOTE(review): `xgb` is not bound by the import cell (its
# `import xgboost as xgb` line is commented out), so this cell raises
# NameError until that import is restored.
dtrain = xgb.DMatrix(x_train_transf, label=y_train_transf)
dtest = xgb.DMatrix(x_test_transf, label=y_test_transf)

# Booster hyper-parameters
params = {
    'objective': 'binary:logistic',  # binary classification
    'eval_metric': 'logloss',
    'max_depth': 6,                  # maximum tree depth
    'eta': 0.1,                      # learning rate
    'subsample': 0.8,                # row subsampling ratio
    'colsample_bytree': 0.8,         # column subsampling ratio per tree
    'n_jobs': 4
}

# Train for a fixed number of boosting rounds.
num_round = 1000
bst = xgb.train(params, dtrain, num_round)

# The booster's predictions are thresholded at 0.5 to obtain 0/1 labels.
y_pred_xgb = (bst.predict(dtest) > 0.5).astype(int)

# Test accuracy
accuracy_xgb = accuracy_score(y_test_transf, y_pred_xgb)
print(f'Test Accuracy (XGBoost): {accuracy_xgb * 100:.2f}%')

In [None]:
# RNN baseline on the standardised features.
# Arrays are converted with .tolist() before being handed to the project RNN.
rnn_model = RNN(
    input_size=x_train_transf_scaler.shape[1],
    hidden_size=256,
    output_size=num_classes,
)
rnn_model.train(
    x_train_transf_scaler.tolist(),
    y_train_transf.tolist(),
    epochs=100,
    learning_rate=1e-3,
)

# Predict the held-out split and report its accuracy.
rnn_predictions = rnn_model.predict(x_test_transf_scaler.tolist())
accuracy_rnn = accuracy_score(y_test_transf, rnn_predictions)
print(f'Test Accuracy (RNN): {accuracy_rnn * 100:.2f}%')