In [3]:

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json
from tqdm import tqdm


# ===== 1. Load the BERT teacher model =====
# Device selection: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the fine-tuned classifier and its tokenizer from the same directory.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
model_path = "C:/Users/ASUS/BERT/saved_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Move the model to the selected device and switch it to evaluation mode.
# `model.eval()` is the cell's last expression, so the notebook echoes the module repr.
model.to(device)
model.eval()



  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
import pandas as pd
import re
from nltk.corpus import stopwords
from string import punctuation
# Read the raw data set (must provide the 'text' and 'generated' columns used below).

data = pd.read_csv('C:/Users/ASUS/BERT/AI_Human.csv')

# Balanced subsample: 5000 rows with generated == 1 and 5000 rows with generated == 0.
# random_state=42 keeps the draw reproducible.
ai_samples = data[data['generated'] == 1]
human_samples = data[data['generated'] == 0]
data = pd.concat([ai_samples.sample(n=5000, random_state=42), human_samples.sample(n=5000, random_state=42)])
# Shuffle the concatenated frame (frac=1 samples every row) and rebuild a 0..N-1 index.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Text-cleaning helpers
def remove_punc(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.
    """
    kept_chars = filter(lambda ch: ch not in punctuation, text)
    return ''.join(kept_chars)

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load the NLTK English stop-word list once and cache it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stop(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace, each word is compared case-insensitively
    (via ``word.lower()``) against the NLTK English stop-word list, and the
    surviving words are re-joined with single spaces.

    The stop-word set is built once and reused: this function is applied to
    every row of a DataFrame, and re-reading the corpus per call is
    loop-invariant work.
    """
    stops = _english_stopwords()
    return " ".join(word for word in text.split() if word.lower() not in stops)

# Text cleaning: lower-case, strip URLs and HTML tags, then drop punctuation and stop words.
data['cleaned'] = data['text'].str.lower()
# Remove only the URL token itself (up to the next whitespace). A greedy `.*` after the
# scheme would also delete all text that follows the URL on the same line.
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'https?://\S+', '', x))
# Remove HTML-like tags (non-greedy, so each <...> pair is removed separately).
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'<.*?>', '', x))
data['cleaned'] = data['cleaned'].apply(remove_punc)
data['cleaned'] = data['cleaned'].apply(remove_stop)

# Keep only the model inputs and expose the target under the name 'label'.
# rename() returns a new frame, so nothing is mutated in place on a column subset.
data = data[['cleaned', 'generated']].rename(columns={'generated': 'label'})
data

Unnamed: 0,cleaned,label
0,dear principal hearing quite lot subject commu...,0.0
1,dear state senator writing express opinion ele...,1.0
2,high school students constantly bombarded info...,1.0
3,hi im 6th garden think zoos ane nearly cool iv...,1.0
4,sure jars attempt writing essay average 8tj gr...,1.0
...,...,...
9995,good actions helpful ways good altered led goo...,0.0
9996,article unmaking face mars explains face mars ...,0.0
9997,driving extremely dangerous anyone else especi...,0.0
9998,hey know people say kindness goes long way yea...,1.0


In [8]:
data_enhanced = pd.read_csv('C:/Users/ASUS/BERT/data_Enhance/enhanceded_ai_human.csv')
# Text cleaning: lower-case, strip URLs and HTML tags, then drop punctuation and stop words.
data_enhanced['cleaned'] = data_enhanced['text'].str.lower()
# Remove only the URL token itself (up to the next whitespace). A greedy `.*` after the
# scheme would also delete all text that follows the URL on the same line.
data_enhanced['cleaned'] = data_enhanced['cleaned'].apply(lambda x: re.sub(r'https?://\S+', '', x))
# Remove HTML-like tags (non-greedy, so each <...> pair is removed separately).
data_enhanced['cleaned'] = data_enhanced['cleaned'].apply(lambda x: re.sub(r'<.*?>', '', x))
data_enhanced['cleaned'] = data_enhanced['cleaned'].apply(remove_punc)
data_enhanced['cleaned'] = data_enhanced['cleaned'].apply(remove_stop)

# NOTE(review): the first displayed rows start with rewrite boilerplate
# ("certainly reworded version passage ..."); presumably a prompt preamble left in the
# augmented texts. Confirm whether it should be stripped before training.
data_enhanced = data_enhanced[['cleaned', 'label']]
data_enhanced

Unnamed: 0,cleaned,label
0,here’s reworded passage repetitive phrases pat...,1
1,certainly reworded version passage repetitive ...,1
2,certainly reworded version passage repetitive ...,1
3,here’s reworded passage repetitive phrases pat...,1
4,certainly here’s reworded version repetitive p...,1
...,...,...
3995,curiosity nature humans curiosity gotten us pl...,0
3996,dear principal area decision thinking making c...,0
3997,every told adults always around teenagers righ...,0
3998,people complain car arent allowed complete cer...,0


## 使用BERT为原始数据和增强数据生成 soft label（教师模型 logits）

In [7]:
import torch
from tqdm import tqdm
import json
import os

def generate_soft_labels(df, model, tokenizer, device, max_len=256):
    """Run the teacher model over every row of ``df`` and collect its logits.

    Args:
        df: DataFrame providing a 'cleaned' (text) and a 'label' column.
        model: teacher classifier; called as ``model(**inputs)`` and expected to
            return an object with a ``.logits`` tensor. The caller is expected
            to have put it in evaluation mode already.
        tokenizer: callable producing the model inputs for a single text.
        device: device the tokenized inputs are moved to before the forward pass.
        max_len: truncation length passed to the tokenizer.

    Returns:
        A list with one dict per row, with keys "text", "label" (int) and
        "teacher_logits" (list of floats, one per class).
    """
    distilled_data = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = row['cleaned']
        label = int(row['label'])

        # Tokenize a single text; the result is a batch of size one.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)

        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)

        # Drop only the batch dimension. A bare squeeze() would also collapse the
        # class dimension of a single-logit head and yield a scalar instead of a list.
        logits = outputs.logits.squeeze(0).cpu().tolist()

        # Store the teacher output next to the hard label.
        distilled_data.append({
            "text": text,
            "label": label,
            "teacher_logits": logits
        })

    return distilled_data

# Create the output directory (no error if it already exists).
os.makedirs("distilled_data", exist_ok=True)

# Distil the original data: one teacher forward pass per row, then dump to JSON.
# The file is written before the second run starts, so it survives an interruption there.
distilled_original = generate_soft_labels(data, model, tokenizer, device)
with open("distilled_data/distill_original.json", "w", encoding='utf-8') as f:
    json.dump(distilled_original, f, ensure_ascii=False, indent=2)

# Distil the augmented data the same way.
distilled_augmented = generate_soft_labels(data_enhanced, model, tokenizer, device)
with open("distilled_data/distill_augmented.json", "w", encoding='utf-8') as f:
    json.dump(distilled_augmented, f, ensure_ascii=False, indent=2)


  7%|█████▌                                                                        | 709/10000 [00:15<03:17, 46.97it/s]

KeyboardInterrupt



## 加载数据

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
import numpy as np

class DistilledDataset(Dataset):
    """Dataset over a JSON file of teacher-labelled records.

    The file must contain a list of objects with the keys "text", "label" and
    "teacher_logits". Each item is tokenized on access and returned as the tuple
    ``(input_ids, attention_mask, soft_labels, hard_label)``.
    """

    def __init__(self, json_file, tokenizer, max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
        with open(json_file, 'r', encoding='utf-8') as handle:
            self.data = json.load(handle)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        # Pad/truncate every text to max_len so items can be stacked into batches.
        encoding = self.tokenizer(
            record['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return (
            encoding['input_ids'].squeeze(),
            encoding['attention_mask'].squeeze(),
            torch.tensor(record['teacher_logits'], dtype=torch.float32),
            torch.tensor(record['label'], dtype=torch.long),
        )



## 定义 LSTM 学生模型

In [10]:
class LSTMStudent(nn.Module):
    """Single-layer LSTM classifier used as the distillation student.

    Token ids are embedded, fed through a batch-first LSTM, and the last
    layer's final hidden state is projected to ``num_classes`` logits.

    NOTE(review): no mask or sequence lengths are passed to the LSTM, so every
    position of ``input_ids`` (padding included) contributes to the final state.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=2):
        super().__init__()
        # Attribute names double as state_dict keys for saved checkpoints.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids):
        """Return one row of class logits per sequence in ``input_ids``."""
        token_vectors = self.embedding(input_ids)
        _, state = self.lstm(token_vectors)
        final_hidden = state[0][-1]  # h_n of the last LSTM layer
        return self.classifier(final_hidden)


## 训练函数（带蒸馏 loss）

In [11]:
def distillation_loss(student_logits, teacher_logits, true_labels, T=2.0, alpha=0.7):
    """Blend a temperature-softened KL term with hard-label cross-entropy.

    Args:
        student_logits: student outputs; softmax is taken over dim 1.
        teacher_logits: teacher outputs with the same layout as the student's.
        true_labels: class indices for the cross-entropy term.
        T: temperature dividing both sets of logits before the softmax.
        alpha: weight of the soft (KL) term; the hard term gets ``1 - alpha``.

    Returns:
        ``alpha * soft_loss + (1 - alpha) * hard_loss`` as a scalar tensor.
    """
    softened_student = nn.functional.log_softmax(student_logits / T, dim=1)
    softened_teacher = nn.functional.softmax(teacher_logits / T, dim=1)

    # KLDivLoss expects log-probabilities as input and probabilities as target;
    # the T*T factor rescales the softened term.
    kl_term = nn.KLDivLoss(reduction='batchmean')(softened_student, softened_teacher)
    soft_loss = kl_term * (T * T)

    hard_loss = nn.CrossEntropyLoss()(student_logits, true_labels)

    return alpha * soft_loss + (1 - alpha) * hard_loss

def train_lstm_model(model, dataloader, optimizer, device, epochs=5):
    """Train ``model`` with the distillation loss and print the mean loss per epoch.

    Each batch from ``dataloader`` must unpack into
    ``(input_ids, attention_mask, teacher_logits, labels)``; the attention mask
    is not used. The model is moved to ``device`` and put in training mode
    before the first epoch. Nothing is returned.
    """
    model.to(device)
    model.train()
    for epoch in range(epochs):
        batch_losses = []
        for batch in dataloader:
            input_ids, _, teacher_logits, labels = batch
            input_ids = input_ids.to(device)
            teacher_logits = teacher_logits.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = distillation_loss(model(input_ids), teacher_logits, labels)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        total_loss = sum(batch_losses)
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")


## 加载数据并训练两个模型

In [12]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("C:/Users/ASUS/BERT/saved_model")

# Load the distilled data sets written by the soft-label generation step.
dataset_orig = DistilledDataset("distilled_data/distill_original.json", tokenizer)
dataset_aug = DistilledDataset("distilled_data/distill_augmented.json", tokenizer)


from torch.utils.data import random_split

# Hold out 10% of the original-data set as the test split.
test_ratio = 0.1
total_size = len(dataset_orig)
test_size = int(total_size * test_ratio)
train_size = total_size - test_size

# Seed the split so the same records land in the test set on every run. Without a fixed
# generator the partition changes per kernel session, and checkpoints trained in an
# earlier session could then be evaluated on records they were trained on.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset_orig, [train_size, test_size], generator=split_generator
)
# From here on `dataset_orig` refers to the training portion only.
dataset_orig = train_dataset

In [13]:


# Batch both training sets; `dataset_orig` is the training split at this point.
dataloader_orig = DataLoader(dataset_orig, batch_size=32, shuffle=True)
dataloader_aug = DataLoader(dataset_aug, batch_size=32, shuffle=True)

# Vocabulary size (taken from the tokenizer so the embedding covers every token id).
vocab_size = tokenizer.vocab_size

# Initialise the two student models: one per training set.
lstm_orig = LSTMStudent(vocab_size)
lstm_aug = LSTMStudent(vocab_size)

# One Adam optimizer per student, same learning rate.
optimizer_orig = torch.optim.Adam(lstm_orig.parameters(), lr=2e-4)
optimizer_aug = torch.optim.Adam(lstm_aug.parameters(), lr=2e-4)

# Train each student on its own data (default number of epochs of train_lstm_model).
train_lstm_model(lstm_orig, dataloader_orig, optimizer_orig, device)
train_lstm_model(lstm_aug, dataloader_aug, optimizer_aug, device)


Epoch 1, Loss: 1.7726769734781684
Epoch 2, Loss: 1.7275419400093404
Epoch 3, Loss: 1.4880417908759827
Epoch 4, Loss: 0.9800037436240109
Epoch 5, Loss: 1.3112991045341424
Epoch 1, Loss: 1.5974644343058269
Epoch 2, Loss: 1.0241202890078227
Epoch 3, Loss: 1.02368297568957
Epoch 4, Loss: 0.7204754469792048
Epoch 5, Loss: 0.6120695053736369


In [14]:
# Persist only the learned parameters (state_dict) of each student, relative to the
# current working directory.
torch.save(lstm_orig.state_dict(), "lstm_original.pt")
torch.save(lstm_aug.state_dict(), "lstm_augmented.pt")

## 测试并加权融合两个模型

In [23]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# ========== Model definition ==========
class LSTMStudent(nn.Module):
    """Single-layer LSTM classifier used as the distillation student.

    Token ids are embedded, fed through a batch-first LSTM, and the last
    layer's final hidden state is projected to ``num_classes`` logits.

    NOTE(review): no mask or sequence lengths are passed to the LSTM, so every
    position of ``input_ids`` (padding included) contributes to the final state.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=2):
        super().__init__()
        # Attribute names double as state_dict keys for saved checkpoints.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids):
        """Return one row of class logits per sequence in ``input_ids``."""
        token_vectors = self.embedding(input_ids)
        _, state = self.lstm(token_vectors)
        final_hidden = state[0][-1]  # h_n of the last LSTM layer
        return self.classifier(final_hidden)

# ========== Model initialisation and checkpoint loading ==========
vocab_size = 30522  # must match the vocabulary size used at training time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build both students on the target device, then load their saved parameters.
lstm_orig = LSTMStudent(vocab_size).to(device)
lstm_aug = LSTMStudent(vocab_size).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state_dict holds; it avoids executing arbitrary pickled code from the checkpoint file.
lstm_orig.load_state_dict(torch.load("lstm_original.pt", map_location=device, weights_only=True))
lstm_aug.load_state_dict(torch.load("lstm_augmented.pt", map_location=device, weights_only=True))

# Switch both models to evaluation mode.
lstm_orig.eval()
lstm_aug.eval()

# ========== Prediction helper ==========
def predict_lstm(model, dataloader, device):
    """Return the softmax class probabilities of ``model`` for a whole loader.

    Each batch must unpack into four items, of which only the first
    (``input_ids``) is used. The model is switched to evaluation mode, gradients
    are disabled, and the per-batch probabilities are moved to the CPU and
    concatenated along dim 0.
    """
    model.eval()
    with torch.no_grad():
        batch_probs = [
            torch.softmax(model(input_ids.to(device)), dim=1).cpu()
            for input_ids, _mask, _soft, _hard in dataloader
        ]
    return torch.cat(batch_probs, dim=0)

# ========== Run prediction ==========
# `test_dataset` must already exist (it is created by the train/test split cell).
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Class-probability matrices predicted by each of the two students.
probs_orig = predict_lstm(lstm_orig, test_loader, device)
probs_aug = predict_lstm(lstm_aug, test_loader, device)

# ========== Weighted fusion ==========
# NOTE(review): the original header mentioned weights 0.3 / 0.7, but the code below
# averages the two models with equal weights 0.5 / 0.5.
final_probs = 0.5* probs_orig + 0.5 * probs_aug
final_preds = torch.argmax(final_probs, dim=1)

# Print the predicted class indices.
print(final_preds)
# Collect the hard labels by iterating the dataset (this tokenizes every item again).
true_labels = torch.tensor([label.item() for _, _, _, label in test_dataset])
# Compute the accuracy of the fused predictions.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(true_labels, final_preds)
print(f"Test Accuracy: {accuracy:.4f}")


  lstm_orig.load_state_dict(torch.load("lstm_original.pt", map_location=device))
  lstm_aug.load_state_dict(torch.load("lstm_augmented.pt", map_location=device))


tensor([1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
        1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,

## 攻击


In [16]:
from torch.utils.data import Dataset
import pandas as pd
import torch

class CSVDataset(Dataset):
    """Dataset over a CSV file with 'text' and 'label' columns.

    Items are returned as ``(input_ids, 0, 0, label)``; the two zeros are
    placeholders so the tuple has the same four-slot layout that
    ``predict_lstm`` unpacks.
    """

    def __init__(self, csv_path, tokenizer, max_length=128):
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text, label = str(row['text']), int(row['label'])

        # Pad/truncate to a fixed length so items can be stacked into batches.
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return encoding['input_ids'].squeeze(), 0, 0, torch.tensor(label)
# Attack evaluation set; judging by the file name, AI-written text rewritten in a human
# style (confirm). Uses the default max_length=128 and the tokenizer loaded earlier.
test_dataset_attack = CSVDataset("C:/Users/ASUS/BERT/attack/ai_rewritten_in_human_style.csv", tokenizer)


In [24]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# ========== Model definition ==========
class LSTMStudent(nn.Module):
    """Single-layer LSTM classifier used as the distillation student.

    Token ids are embedded, fed through a batch-first LSTM, and the last
    layer's final hidden state is projected to ``num_classes`` logits.

    NOTE(review): no mask or sequence lengths are passed to the LSTM, so every
    position of ``input_ids`` (padding included) contributes to the final state.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=2):
        super().__init__()
        # Attribute names double as state_dict keys for saved checkpoints.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids):
        """Return one row of class logits per sequence in ``input_ids``."""
        token_vectors = self.embedding(input_ids)
        _, state = self.lstm(token_vectors)
        final_hidden = state[0][-1]  # h_n of the last LSTM layer
        return self.classifier(final_hidden)

# ========== Model initialisation and checkpoint loading ==========
vocab_size = 30522  # must match the vocabulary size used at training time
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build both students on the target device, then load their saved parameters.
lstm_orig = LSTMStudent(vocab_size).to(device)
lstm_aug = LSTMStudent(vocab_size).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state_dict holds; it avoids executing arbitrary pickled code from the checkpoint file.
lstm_orig.load_state_dict(torch.load("lstm_original.pt", map_location=device, weights_only=True))
lstm_aug.load_state_dict(torch.load("lstm_augmented.pt", map_location=device, weights_only=True))

# Switch both models to evaluation mode.
lstm_orig.eval()
lstm_aug.eval()

# ========== Prediction helper ==========
def predict_lstm(model, dataloader, device):
    """Return the softmax class probabilities of ``model`` for a whole loader.

    Each batch must unpack into four items, of which only the first
    (``input_ids``) is used. The model is switched to evaluation mode, gradients
    are disabled, and the per-batch probabilities are moved to the CPU and
    concatenated along dim 0.
    """
    model.eval()
    with torch.no_grad():
        batch_probs = [
            torch.softmax(model(input_ids.to(device)), dim=1).cpu()
            for input_ids, _mask, _soft, _hard in dataloader
        ]
    return torch.cat(batch_probs, dim=0)

# ========== Run prediction ==========
# `test_dataset_attack` must already exist (it is built in the CSVDataset cell).
test_loader = DataLoader(test_dataset_attack, batch_size=32, shuffle=False)

# Class-probability matrices predicted by each of the two students.
probs_orig = predict_lstm(lstm_orig, test_loader, device)
probs_aug = predict_lstm(lstm_aug, test_loader, device)

# ========== Weighted fusion ==========
# NOTE(review): the original header mentioned weights 0.3 / 0.7, but the code below
# averages the two models with equal weights 0.5 / 0.5.
final_probs = 0.5* probs_orig + 0.5 * probs_aug
final_preds = torch.argmax(final_probs, dim=1)

# Print the predicted class indices.
print(final_preds)
# Collect the hard labels by iterating the dataset (this tokenizes every item again).
true_labels = torch.tensor([label.item() for _, _, _, label in test_dataset_attack])
# Compute the accuracy of the fused predictions.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(true_labels, final_preds)
print(f"Test Accuracy: {accuracy:.4f}")


  lstm_orig.load_state_dict(torch.load("lstm_original.pt", map_location=device))
  lstm_aug.load_state_dict(torch.load("lstm_augmented.pt", map_location=device))


tensor([1, 1, 1,  ..., 1, 1, 1])
Test Accuracy: 0.9740
