In [2]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load the tokenizer and the sequence-classification model (6 output labels)
# from the local checkpoint directories, then place the model on `device`.
tokenizer = BertTokenizer.from_pretrained("model/roberta/tokenizer")
model = BertForSequenceClassification.from_pretrained(
    "model/roberta/model",
    num_labels=6,
)
model = model.to(device)

# # Freeze all BERT model parameters
# for param in model.parameters():
#     param.requires_grad_(False)

# # Unfreeze part of the BERT model parameters (last three encoder layers)
# for param in model.bert.encoder.layer[-3:].parameters():
#     param.requires_grad_(True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at model/roberta/model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Data preprocessing
MAX_LEN = 64  # every example is padded / truncated to this many tokens


def preprocess_data(df, tokenizer, max_len, remove_digits=False, target_device=None):
    """Tokenize a (text, label) frame into tensors.

    The text is taken from the first column and the label from the second
    column *by position*, so the result does not depend on how the columns
    happen to be labelled.

    Args:
        df: DataFrame whose first column holds the text and whose second
            column holds the class label (convertible to ``torch.long``).
        tokenizer: Callable tokenizer; it is invoked once on the full list of
            texts with ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'`` and must return a mapping containing
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Sequence length passed to the tokenizer as ``max_length``.
        remove_digits: When True, strip every digit character from each text
            before tokenizing. Off by default.
        target_device: Device the returned tensors are moved to. When None,
            the notebook-level ``device`` variable is used.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of tensors on
        ``target_device``. For a frame with no rows, empty tensors of shape
        ``(0, max_len)``, ``(0, max_len)`` and ``(0,)`` are returned.
    """
    if target_device is None:
        # Fall back to the module-level device chosen when the notebook started.
        target_device = device

    # Positional access: works for header=None frames and for named columns.
    texts = df.iloc[:, 0].tolist()
    raw_labels = df.iloc[:, 1].tolist()

    if remove_digits:
        texts = [re.sub(r'\d', '', text) for text in texts]

    labels = torch.tensor(raw_labels, dtype=torch.long).to(target_device)

    if not texts:
        # Nothing to tokenize; return correctly shaped empty tensors instead
        # of handing the tokenizer an empty batch.
        input_ids = torch.empty((0, max_len), dtype=torch.long).to(target_device)
        attention_masks = torch.empty((0, max_len), dtype=torch.long).to(target_device)
        return input_ids, attention_masks, labels

    # One batched call replaces the per-row encode_plus loop.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids'].to(target_device)
    attention_masks = encoded['attention_mask'].to(target_device)

    return input_ids, attention_masks, labels


In [4]:
# Read the training and validation splits (tab-separated files, no header row).
train_df = pd.read_table('input/train.txt', header=None)
dev_df = pd.read_table('input/dev.txt', header=None)

# Mini-batch sizes for the two loaders.
train_batch_size = 32
dev_batch_size = 64

# Training split: tokenize, wrap in a TensorDataset, serve shuffled batches.
train_input_ids, train_attention_masks, train_labels = preprocess_data(
    train_df, tokenizer, MAX_LEN
)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)

# Validation split: same pipeline, batches are served in dataset order.
dev_input_ids, dev_attention_masks, dev_labels = preprocess_data(
    dev_df, tokenizer, MAX_LEN
)
dev_dataset = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)
dev_dataloader = DataLoader(dev_dataset, batch_size=dev_batch_size)

In [5]:
# Optimizer and loss function.
# NOTE(review): loss_fn is not referenced below — the training loss is taken
# from outputs.loss, which the model returns when labels are passed.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Train the model, reporting the mean training loss and the accuracy on
# dev_dataloader after every epoch.
num_epochs = 20

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for input_ids, attention_mask, labels in train_dataloader:
        optimizer.zero_grad()

        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}",end='\t')

    # ---- validation pass (no gradients needed) ----
    model.eval()
    val_preds, val_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in dev_dataloader:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # Predicted class = index of the largest logit in each row.
            val_preds += outputs.logits.argmax(dim=1).tolist()
            val_labels += labels.tolist()

    # NOTE(review): the printed label says "Test_Accuracy", but the figure is
    # computed on dev_dataloader.
    correct_predictions = [int(p == t) for p, t in zip(val_preds, val_labels)]
    accuracy = sum(correct_predictions) / len(correct_predictions)
    print(f'Test_Accuracy: {accuracy * 100:.2f}%')

Epoch 1/20, Average Loss: 0.5569	Test_Accuracy: 86.40%
Epoch 2/20, Average Loss: 0.2852	Test_Accuracy: 85.70%
Epoch 3/20, Average Loss: 0.1848	Test_Accuracy: 86.70%
Epoch 4/20, Average Loss: 0.1272	Test_Accuracy: 85.30%
Epoch 5/20, Average Loss: 0.0868	Test_Accuracy: 85.30%
Epoch 6/20, Average Loss: 0.0588	Test_Accuracy: 86.50%
Epoch 7/20, Average Loss: 0.0525	Test_Accuracy: 86.10%
Epoch 8/20, Average Loss: 0.0463	Test_Accuracy: 85.40%
Epoch 9/20, Average Loss: 0.0431	Test_Accuracy: 85.50%
Epoch 10/20, Average Loss: 0.0461	Test_Accuracy: 85.80%
Epoch 11/20, Average Loss: 0.0274	Test_Accuracy: 87.10%
Epoch 12/20, Average Loss: 0.0264	Test_Accuracy: 86.20%
Epoch 13/20, Average Loss: 0.0232	Test_Accuracy: 85.40%
Epoch 14/20, Average Loss: 0.0216	Test_Accuracy: 86.60%


KeyboardInterrupt: 

In [None]:
# NOTE(review): the "test" frame is read from input/dev.txt, the same path
# used for the validation split — confirm this is intended.
test_df = pd.read_table('input/dev.txt',header=None)

# Tokenize the texts; the label tensor returned by preprocess_data is discarded.
test_input_ids, test_attention_masks, _ = preprocess_data(test_df, tokenizer, MAX_LEN)

# Wrap the tensors in a DataLoader that serves batches in dataset order.
test_batch_size = 64
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size)

# Switch the model to evaluation mode.
model.eval()

# Predicted class index for every row, in dataset order.
predictions = []

# Run inference batch by batch without tracking gradients.
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        outputs = model(
            input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        # Predicted class = index of the largest logit in each row.
        predictions += outputs.logits.argmax(dim=1).tolist()

In [None]:
# Ground-truth labels: the column labelled 1 of the frame.
true_labels = test_df[1]

# Accuracy = fraction of positions where the prediction equals the label.
correct_predictions = [int(p == t) for p, t in zip(predictions, true_labels)]
accuracy = sum(correct_predictions) / len(correct_predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')