In [1]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
import numpy as np
import random

# 设置PyTorch随机种子
seed = 1314
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# 设置Python的随机种子
random.seed(seed)

# 设置NumPy的随机种子
np.random.seed(seed)

# 加载数据
train_df = pd.read_table('input/train.txt', header=None)
dev_df = pd.read_table('input/dev.txt', header=None)

# 设置模型和tokenizer
model_name = "model/ernie-1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# # 冻结ernie模型参数
# for param in model.parameters():
#     param.requires_grad_(False)

# # 解冻部分ernie模型参数
# for param in model.encoder.layer[-2:].parameters():
#     param.requires_grad_(False)

# for param in model.encoder.layer[:2].parameters():
#     param.requires_grad_(False)

# 添加Dropout层
dropout_rate = 0.1
model.dropout = nn.Dropout(p=dropout_rate)

# 数据预处理
def preprocess_data(df, tokenizer, max_length=128):
    texts = df[0].tolist()
    labels = df[1].tolist()
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    labels = torch.tensor(labels)
    return inputs, labels

train_inputs, train_labels = preprocess_data(train_df, tokenizer)
dev_inputs, dev_labels = preprocess_data(dev_df, tokenizer)

# 创建数据加载器
batch_size = 16
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataset = TensorDataset(dev_inputs['input_ids'], dev_inputs['attention_mask'], dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

# 定义优化器和学习率调度器
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=len(train_loader) * 1, num_training_steps=len(train_loader) * 10)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0]  # 使用[CLS]的表示进行分类
        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # 计算损失
        total_loss += loss.item()

        # 计算准确率
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()
        total_predictions += len(labels)

    # 计算平均损失和准确率
    average_loss = total_loss / len(train_loader)
    train_accuracy = correct_predictions / total_predictions

    # 输出训练集的损失和准确率
    print(f"Epoch {epoch + 1}:", end='\t')
    print(f"Training Loss: {average_loss:.4f}", end='\t')
    print(f"Training Accuracy: {train_accuracy:.4f}", end='\t')

    # 在每个训练周期结束后评估模型并输出验证集的准确率
    model.eval()
    dev_preds = []
    dev_true = []

    with torch.no_grad():
        for batch in dev_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.last_hidden_state[:, 0]
            preds = torch.argmax(logits, dim=1)
            dev_preds.extend(preds.cpu().numpy())
            dev_true.extend(labels.cpu().numpy())

    dev_accuracy = accuracy_score(dev_true, dev_preds)
    print(f"Validation Accuracy: {dev_accuracy:.4f}")

# 最后输出模型的评估结果
print("Training finished. Final Evaluation on Dev Set:")
dev_accuracy = accuracy_score(dev_true, dev_preds)
print(f"Validation Accuracy: {dev_accuracy:.4f}")

Epoch 1:	Training Loss: 1.9971	Training Accuracy: 0.5933	Validation Accuracy: 0.8140
Epoch 2:	Training Loss: 0.5674	Training Accuracy: 0.8458	Validation Accuracy: 0.8770
Epoch 3:	Training Loss: 0.3947	Training Accuracy: 0.8923	Validation Accuracy: 0.8520
Epoch 4:	Training Loss: 0.3006	Training Accuracy: 0.9193	Validation Accuracy: 0.8640
Epoch 5:	Training Loss: 0.2226	Training Accuracy: 0.9415	Validation Accuracy: 0.8580
Epoch 6:	Training Loss: 0.1675	Training Accuracy: 0.9567	Validation Accuracy: 0.8750
Epoch 7:	Training Loss: 0.1154	Training Accuracy: 0.9720	Validation Accuracy: 0.8760
Epoch 8:	Training Loss: 0.0936	Training Accuracy: 0.9783	Validation Accuracy: 0.8730
Epoch 9:	Training Loss: 0.0682	Training Accuracy: 0.9865	Validation Accuracy: 0.8730
Epoch 10:	Training Loss: 0.0637	Training Accuracy: 0.9873	Validation Accuracy: 0.8710
Epoch 11:	Training Loss: 0.0528	Training Accuracy: 0.9897	Validation Accuracy: 0.8710
Epoch 12:	Training Loss: 0.0448	Training Accuracy: 0.9907	Valid

In [2]:
test_df = pd.read_table('input/pred.txt', header=None)

test_df[1] = 0

test_inputs, test_labels = preprocess_data(test_df, tokenizer)

batch_size = 16
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
test_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.last_hidden_state[:, 0]
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        
test_df[1] = test_preds

In [3]:
# 获取测试数据的第一列
text = test_df[0]
label = test_df[1]

# 指定要保存的文件名
output_file = f"output/res{dev_accuracy}.txt"

# 打开文件并将数据写入
with open(output_file, "w", encoding="utf-8") as file:
    for text,label in zip(text,label):
        file.write(str(text) + "\t")
        file.write(str(label) + "\n")