In [1]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import random

# 设置PyTorch随机种子
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# 设置Python的随机种子
random.seed(seed)

# 设置NumPy的随机种子
np.random.seed(seed)

# 加载数据
train_df = pd.read_table('input/train.txt', header=None)
dev_df = pd.read_table('input/dev.txt', header=None)

# 设置模型和tokenizer
model_name = "model/XLM-RoBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6, ignore_mismatched_sizes=True)

# 添加Dropout层
dropout_rate = 0.2
model.classifier.dropout = nn.Dropout(p=dropout_rate)

# 修改模型的输出层
num_labels = 6  # 新的类别数
model.classifier.out_proj = nn.Linear(in_features=768, out_features=num_labels, bias=True)

# 数据预处理
def preprocess_data(df, tokenizer, max_length=64):
    texts = df[0].tolist()
    labels = df[1].tolist()
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    labels = torch.tensor(labels)
    return inputs, labels

train_inputs, train_labels = preprocess_data(train_df, tokenizer)
dev_inputs, dev_labels = preprocess_data(dev_df, tokenizer)

# 创建数据加载器
train_batch_size = 16  # 训练批次大小
eval_batch_size = 16  # 评估批次大小
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
dev_dataset = TensorDataset(dev_inputs['input_ids'], dev_inputs['attention_mask'], dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=eval_batch_size, shuffle=False)

# 定义损失函数
loss_fn = nn.CrossEntropyLoss()

# 定义优化器和学习率调度器
learning_rate = 5e-05
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=len(train_loader) * 1, num_training_steps=len(train_loader) * 10)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        # 计算损失
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # 累加损失值
        total_loss += loss.item()

        # 计算准确率
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()
        total_predictions += len(labels)
    
    # 计算平均损失
    average_loss = total_loss / len(train_loader)

    # 输出训练集的损失和准确率
    print(f"Epoch {epoch + 1}:", end='\t')
    print(f"Training Loss: {average_loss:.4f}", end='\t')
    train_accuracy = correct_predictions / total_predictions
    print(f"Training Accuracy: {train_accuracy:.4f}", end='\t')

    # 在每个训练周期结束后评估模型并输出验证集的准确率
    model.eval()
    dev_preds = []
    dev_true = []

    with torch.no_grad():
        for batch in dev_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            dev_preds.extend(preds.cpu().numpy())
            dev_true.extend(labels.cpu().numpy())

    dev_accuracy = accuracy_score(dev_true, dev_preds)
    print(f"Validation Accuracy: {dev_accuracy:.4f}")

# 最后输出模型的评估结果
print("Training finished. Final Evaluation on Dev Set:")
dev_accuracy = accuracy_score(dev_true, dev_preds)
print(f"Validation Accuracy: {dev_accuracy:.4f}")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at model/XLM-RoBERTa and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([20, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([20]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:	Training Loss: 1.1501	Training Accuracy: 0.5692	Validation Accuracy: 0.7760
Epoch 2:	Training Loss: 0.6508	Training Accuracy: 0.7868	Validation Accuracy: 0.8180
Epoch 3:	Training Loss: 0.4576	Training Accuracy: 0.8622	Validation Accuracy: 0.8390
Epoch 4:	Training Loss: 0.3843	Training Accuracy: 0.8763	Validation Accuracy: 0.8560
Epoch 5:	Training Loss: 0.3122	Training Accuracy: 0.8992	Validation Accuracy: 0.8550
Epoch 6:	Training Loss: 0.2628	Training Accuracy: 0.9132	Validation Accuracy: 0.8550
Epoch 7:	Training Loss: 0.2016	Training Accuracy: 0.9370	Validation Accuracy: 0.8580
Epoch 8:	Training Loss: 0.1543	Training Accuracy: 0.9512	Validation Accuracy: 0.8560
Epoch 9:	Training Loss: 0.1137	Training Accuracy: 0.9672	Validation Accuracy: 0.8570
Epoch 10:	Training Loss: 0.0892	Training Accuracy: 0.9718	Validation Accuracy: 0.8560
Epoch 11:	Training Loss: 0.0736	Training Accuracy: 0.9792	Validation Accuracy: 0.8560
Epoch 12:	Training Loss: 0.0766	Training Accuracy: 0.9780	Valid