In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter
import torch
from sklearn.model_selection import train_test_split


# Load the raw corpus; the 'generated' column is the class flag used below
# (rows equal to 1 go to ai_samples, rows equal to 0 go to human_samples).
data = pd.read_csv('./AI_Human.csv')

# Split by class, then draw 5000 rows from each class with a fixed seed.
is_ai = data['generated'] == 1
is_human = data['generated'] == 0
ai_samples = data[is_ai]
human_samples = data[is_human]
balanced_parts = [
    ai_samples.sample(n=5000, random_state=42),
    human_samples.sample(n=5000, random_state=42),
]

# Stack both halves, shuffle the rows deterministically and renumber the index.
data = (
    pd.concat(balanced_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Text-cleaning helpers
def remove_punc(text):
    """Return ``text`` with every character found in ``string.punctuation`` dropped.

    All other characters, whitespace included, are kept in their original order.
    """
    kept_chars = filter(lambda ch: ch not in punctuation, text)
    return ''.join(kept_chars)

# Cache for the NLTK English stop-word set; filled on first use by remove_stop.
_ENGLISH_STOPS = None


def remove_stop(text, stops=None):
    """Drop stop words from ``text`` and re-join the remaining words with single spaces.

    Args:
        text: String to filter; it is split on whitespace.
        stops: Optional collection of lower-case words to remove. When omitted,
            the NLTK English stop-word list is used.

    Returns:
        The words of ``text`` whose lower-cased form is not in ``stops``,
        joined by a single space.
    """
    global _ENGLISH_STOPS
    if stops is None:
        # Building the set is loop-invariant work: this function is applied
        # once per row, so load the corpus a single time and reuse it.
        if _ENGLISH_STOPS is None:
            _ENGLISH_STOPS = frozenset(stopwords.words('english'))
        stops = _ENGLISH_STOPS
    return " ".join(word for word in text.split() if word.lower() not in stops)

# Text cleaning: lower-case, strip URLs and HTML tags, then drop punctuation and stop words.
data['cleaned'] = data['text'].str.lower()
# Strip just the URL token (`\S+`); a greedy `.*` here would also delete all
# text that follows the URL on the same line.
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'https?://\S+', '', x))
# Non-greedy match so each <...> tag is removed on its own.
data['cleaned'] = data['cleaned'].apply(lambda x: re.sub(r'<.*?>', '', x))
data['cleaned'] = data['cleaned'].apply(remove_punc)
data['cleaned'] = data['cleaned'].apply(remove_stop)

# Keep only the cleaned text and the class flag, exposing the flag as 'label'.
# rename() returns a new frame, so the column subset is never mutated in place.
data = data[['cleaned', 'generated']].rename(columns={'generated': 'label'})

  from pandas.core import (


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from data_processing import data
from transformers import RobertaTokenizer

# Local vocabulary and merges files handed to the RoBERTa tokenizer below.
# NOTE(review): both paths begin with a space character — confirm the directory really is named that way.
vocab_file = ' roberta-base/vocab.json'
merges_file = ' roberta-base/merges.txt'

# Sequence length and batch size
max_length = 256
batch_size = 16

# Hold out 30% of the rows for testing, using a fixed seed
all_texts = data['cleaned'].tolist()
all_labels = data['label'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    all_texts, all_labels, test_size=0.3, random_state=42
)

# Build the RoBERTa tokenizer from the local vocabulary and merges files
tokenizer = RobertaTokenizer(vocab_file, merges_file)

# Custom Dataset
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Each item is a dict holding ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the texts, their labels, the tokenizer and the target sequence length."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors together with its label."""
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',        # pad up to max_length
            truncation=True,             # cut down to max_length
            return_attention_mask=True,  # mask: 0 marks padding, 1 marks real tokens
            return_tensors='pt',         # ask for PyTorch tensors
        )
        encoded = self.tokenizer.encode_plus(self.texts[idx], **encode_options)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            # squeeze(0) removes the leading batch dimension
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': label,
        }

# Batch collation
def collate_fn(batch):
    """Merge a list of sample dicts into one dict of batched tensors.

    ``input_ids`` and ``attention_mask`` are right-padded with 0 to the longest
    sequence in the batch (batch dimension first); ``labels`` becomes a 1-D
    ``torch.long`` tensor.
    """
    def _padded(field):
        # Collect one field from every sample and pad to a common length.
        sequences = [sample[field] for sample in batch]
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    merged = {field: _padded(field) for field in ('input_ids', 'attention_mask')}
    merged['labels'] = torch.tensor([sample['labels'] for sample in batch], dtype=torch.long)
    return merged


# Wrap the train / test splits in datasets
train_dataset = TextDataset(X_train, y_train, tokenizer, max_length)
test_dataset = TextDataset(X_test, y_test, tokenizer, max_length)

# One loader per split; only the training loader shuffles its samples
shared_loader_args = {'batch_size': batch_size, 'collate_fn': collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_args)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from data_processing import data
from tokenizer_roberta import train_loader,test_loader


# Initialise the RoBERTa sequence classifier with two output labels.
# NOTE(review): the checkpoint name starts with a space; the run log shows it
# resolving under that exact name — confirm this is intentional.
model = RobertaForSequenceClassification.from_pretrained(' roberta-base', num_labels=2)

# Optimiser and target device (GPU when available, otherwise CPU)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Fine-tune the model
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    # Enter training mode at the start of every epoch so the loop stays correct
    # even if an evaluation pass is later inserted between epochs.
    model.train()
    epoch_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; labels are supplied, so the first element of the
        # output is the loss. The logits are not needed during training.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]

        loss.backward()   # back-propagate
        optimizer.step()  # update the parameters

        epoch_loss += loss.item()  # accumulate the per-batch loss

    # Mean loss over the batches of this epoch
    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader)}")

# Evaluate on the test loader
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # No labels are passed here, so the first element of the output is the logits.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().tolist())
        # Labels are only compared against predictions on the host, so they
        # are read straight from the batch without a trip through the device.
        true_labels.extend(batch['labels'].tolist())

# Report per-class metrics and overall accuracy
print("Classification Report:\n", classification_report(true_labels, predictions))
print("Accuracy: ", accuracy_score(true_labels, predictions))



  return torch.load(checkpoint_file, map_location=map_location)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at  roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training: 100%|██████████████████████████████████████████████████████████████████████| 438/438 [32:51<00:00,  4.50s/it]


Epoch 1 Loss: 0.1828142536174693
Epoch 2/3


Training: 100%|██████████████████████████████████████████████████████████████████████| 438/438 [32:49<00:00,  4.50s/it]


Epoch 2 Loss: 0.05431840938333099
Epoch 3/3


Training: 100%|██████████████████████████████████████████████████████████████████████| 438/438 [32:49<00:00,  4.50s/it]


Epoch 3 Loss: 0.021057236635938954


Testing: 100%|███████████████████████████████████████████████████████████████████████| 188/188 [00:51<00:00,  3.66it/s]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1461
           1       0.97      0.99      0.98      1539

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000

Accuracy:  0.98





In [4]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
save_dir = "./roberta_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./roberta_model\\tokenizer_config.json',
 './roberta_model\\special_tokens_map.json',
 './roberta_model\\vocab.json',
 './roberta_model\\merges.txt',
 './roberta_model\\added_tokens.json')