In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
import torch
from sklearn.model_selection import train_test_split


# 数据读取
data = pd.read_csv('./AI_Human.csv')

# 数据采样与清洗
ai_samples = data[data['generated'] == 1]
human_samples = data[data['generated'] == 0]
data = pd.concat([ai_samples.sample(n=5000, random_state=42), human_samples.sample(n=5000, random_state=42)])
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# 清洗函数
def remove_punc(text):
    return ''.join([char for char in text if char not in punctuation])

def remove_stop(text):
    stops = set(stopwords.words('english'))
    return " ".join([word for word in text.split() if word.lower() not in stops])

# 文本清洗
data['cleaned'] = data['text'].str.lower()
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE))
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'<.*?>', '', x))
data['cleaned'] = data['cleaned'].apply(remove_punc)
data['cleaned'] = data['cleaned'].apply(remove_stop)

In [2]:
data = data[['cleaned', 'generated']]
data.rename(columns={'generated': 'label'}, inplace=True)
data.head()

Unnamed: 0,cleaned,label
0,dear principal hearing quite lot subject commu...,0.0
1,dear state senator writing express opinion ele...,1.0
2,high school students constantly bombarded info...,1.0
3,hi im 6th garden think zoos ane nearly cool iv...,1.0
4,sure jars attempt writing essay average 8tj gr...,1.0


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned'].tolist(),
    data['label'].tolist(),
    test_size=0.3,
    random_state=42
)

# 初始化 BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('./bert-base-uncased')
max_length = 256
batch_size = 16

# 自定义 Dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # 填充到 max_length
            truncation=True,       # 截断到 max_length
            return_attention_mask=True,
            return_tensors='pt'    # 返回 PyTorch 张量
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),  # 去掉 batch 维度
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# 批次合并函数
def collate_fn(batch):
    # 用 pad_sequence 处理 input_ids 和 attention_mask，确保批次内序列对齐
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=0)
    attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True, padding_value=0)
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# 构建数据集和 DataLoader
train_dataset = TextDataset(X_train, y_train, tokenizer, max_length)
test_dataset = TextDataset(X_test, y_test, tokenizer, max_length)

# 只保留一个 DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)




In [4]:
# 初始化 BERT 模型
model = BertForSequenceClassification.from_pretrained('./bert-base-uncased', num_labels=2)

# 配置优化器和设备
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# 训练模型
model.train()
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    epoch_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # 前向传播
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # outputs 是一个元组，我们需要从中获取损失和 logits
        loss = outputs[0]  # 获取第一个元素，即损失
        logits = outputs[1]  # 获取第二个元素，即 logits

        loss.backward()  # 反向传播
        optimizer.step()  # 更新参数

        epoch_loss += loss.item()  # 累加损失

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader)}")





Epoch 1/3


Training: 100%|██████████████████████████████████████████████████████████████████████| 438/438 [45:25<00:00,  6.22s/it]


Epoch 1 Loss: 0.1903910584211962
Epoch 2/3


Training: 100%|██████████████████████████████████████████████████████████████████████| 438/438 [44:47<00:00,  6.14s/it]


Epoch 2 Loss: 0.042395465088195965
Epoch 3/3


Training: 100%|██████████████████████████████████████████████████████████████████████| 438/438 [44:46<00:00,  6.13s/it]


Epoch 3 Loss: 0.023080260041818346


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 188/188 [00:57<00:00,  3.29it/s]


Accuracy: 0.9807
Classification Report:
               precision    recall  f1-score   support

Human-written       0.99      0.97      0.98      1461
   AI-written       0.98      0.99      0.98      1539

     accuracy                           0.98      3000
    macro avg       0.98      0.98      0.98      3000
 weighted avg       0.98      0.98      0.98      3000



In [5]:
# 测试模型
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # 测试阶段不需要传入 labels
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]  # 获取 logits

        # 预测标签
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# 评估模型
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions, target_names=["Human-written", "AI-written"])

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Evaluating: 100%|████████████████████████████████████████████████████████████████████| 188/188 [00:57<00:00,  3.28it/s]

Accuracy: 0.9807
Classification Report:
               precision    recall  f1-score   support

Human-written       0.99      0.97      0.98      1461
   AI-written       0.98      0.99      0.98      1539

     accuracy                           0.98      3000
    macro avg       0.98      0.98      0.98      3000
 weighted avg       0.98      0.98      0.98      3000






In [10]:
# 保存模型
model.save_pretrained('./saved_model')

# 保存 tokenizer
tokenizer.save_pretrained('./saved_model')


('./saved_model\\vocab.txt',
 './saved_model\\special_tokens_map.json',
 './saved_model\\added_tokens.json')

In [1]:
from transformers import BertForSequenceClassification, BertTokenizer

# 加载模型
model = BertForSequenceClassification.from_pretrained('./saved_model')

# 加载 tokenizer
tokenizer = BertTokenizer.from_pretrained('./saved_model')


  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


In [1]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# 设备设置
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 加载模型和 tokenizer
model_path = "./saved_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# 将模型移动到设备并设置为评估模式
model.to(device)
model.eval()

# 预处理文本函数
def preprocess_text(text, tokenizer, max_length=256):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"  # 返回 PyTorch 张量
    )
    return encoding['input_ids'].to(device), encoding['attention_mask'].to(device)

# 推理函数
def predict(text):
    input_ids, attention_mask = preprocess_text(text, tokenizer)
    
    with torch.no_grad():  # 关闭梯度计算
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).item()  # 取最大概率的类别
        
    return "AI 生成" if prediction == 1 else "人类撰写"

# 测试推理
if __name__ == "__main__":
    test_texts = [
        "The advancements in AI have led to remarkable breakthroughs in various fields.",
        "This article discusses the impact of social media on human behavior."
    ]

    for text in test_texts:
        result = predict(text)
        print(f"输入文本: {text}\n预测结果: {result}\n")


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


输入文本: The advancements in AI have led to remarkable breakthroughs in various fields.
预测结果: AI 生成

输入文本: This article discusses the impact of social media on human behavior.
预测结果: AI 生成

