In [None]:
import csv

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# 1. Load the datasets
# The files are read as header-less, tab-separated text. Quote handling is
# switched off (csv.QUOTE_NONE): with pandas' default quoting, a text field
# that opens with an unmatched double quote makes the parser swallow the
# following tabs/newlines into that one field, silently merging rows and
# dropping examples.
# NOTE(review): assumes the TSV files are raw tab-separated text that does
# not rely on CSV-style quoting -- confirm against the data files.
train_data = pd.read_csv('train.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
valid_data = pd.read_csv('valid.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv('test.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)

# Use the third column as the text and the second column as the label
train_texts = train_data.iloc[:, 2].values
train_labels = train_data.iloc[:, 1].values

valid_texts = valid_data.iloc[:, 2].values
valid_labels = valid_data.iloc[:, 1].values

test_texts = test_data.iloc[:, 2].values
test_labels = test_data.iloc[:, 1].values

# Label encoding: convert the label values to integer ids.
# The encoder is fitted on the training labels only and then re-used for the
# validation and test splits so all three share one label-to-id mapping.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
valid_labels = label_encoder.transform(valid_labels)
test_labels = label_encoder.transform(test_labels)

# 2. Data preprocessing
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable, sized collection of texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The dataset size is driven by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict.

        Keys: ``'text'`` (the stringified text), ``'input_ids'`` and
        ``'attention_mask'`` (tokenizer outputs flattened to 1-D) and
        ``'labels'`` (the label as a ``torch.long`` tensor).
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Ask the tokenizer for padded/truncated PyTorch tensors plus an
        # attention mask; token type ids are not requested.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {'text': raw_text}
        for field in ('input_ids', 'attention_mask'):
            # Flatten so each sample is 1-D and the DataLoader can stack them.
            sample[field] = torch.flatten(encoded[field])
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset preparation
MAX_LEN = 128
BATCH_SIZE = 16

# Build the three splits the same way: (texts, labels) -> TextDataset.
train_dataset, valid_dataset, test_dataset = (
    TextDataset(split_texts, split_labels, tokenizer, MAX_LEN)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (valid_texts, valid_labels),
        (test_texts, test_labels),
    )
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# 3. Define the model, optimizer and learning-rate scheduler
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One output unit per distinct label id found in the training split.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(set(train_labels)),
).to(device)

# NOTE(review): transformers' own AdamW is reported as deprecated upstream in
# favour of torch.optim.AdamW -- confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The schedule length assumes 3 epochs; keep it in sync with the EPOCHS
# constant used by the training loop further down.
total_steps = 3 * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Loss function
loss_fn = torch.nn.CrossEntropyLoss().to(device)

# Training function
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, epoch, log_every=10):
    """Run one optimisation pass over ``data_loader`` and report loss/accuracy.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            the returned object must expose ``.logits``.
        data_loader: Sized iterable of batches. Each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
            NOTE(review): the epoch average assumes this is a per-batch mean
            (e.g. ``CrossEntropyLoss`` with its default reduction) -- confirm
            if another reduction is used.
        optimizer: Optimizer; zeroed and stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler; stepped once per batch, after the
            optimizer.
        epoch: Zero-based epoch index, used only in the progress message.
        log_every: Print progress every ``log_every`` steps and on the final
            step. A falsy value (``0``/``None``) disables progress output.

    Returns:
        ``(avg_loss, avg_acc)``: the loss averaged over examples and the
        fraction of correctly predicted examples for this pass.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no examples.
    """
    model = model.train()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    last_step = len(data_loader) - 1

    for step, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        # Weight each batch's mean loss by its size so a smaller final batch
        # does not count as much as a full one in the epoch average.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        preds = logits.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
        total_examples += batch_size

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print the current progress
        if log_every and (step % log_every == 0 or step == last_step):
            print(f'Epoch [{epoch+1}], Step [{step+1}/{len(data_loader)}], Loss: {loss.item():.4f}, Accuracy: {(total_correct/total_examples):.4f}')

    avg_loss = total_loss / total_examples
    avg_acc = total_correct / total_examples
    return avg_loss, avg_acc

# Evaluation function
def eval_model(model, data_loader, loss_fn, device, log_every=10):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            the returned object must expose ``.logits``.
        data_loader: Sized iterable of batches. Each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
            NOTE(review): the average assumes this is a per-batch mean (e.g.
            ``CrossEntropyLoss`` with its default reduction) -- confirm if
            another reduction is used.
        device: Device the batch tensors are moved to.
        log_every: Print progress every ``log_every`` steps and on the final
            step. A falsy value (``0``/``None``) disables progress output.

    Returns:
        ``(avg_loss, avg_acc)``: the loss averaged over examples and the
        fraction of correctly predicted examples.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no examples.
    """
    model = model.eval()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    last_step = len(data_loader) - 1

    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            # Weight each batch's mean loss by its size so a smaller final
            # batch does not count as much as a full one in the average.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            preds = logits.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            total_examples += batch_size

            # Print the evaluation progress
            if log_every and (step % log_every == 0 or step == last_step):
                print(f'Validation Step [{step+1}/{len(data_loader)}], Loss: {loss.item():.4f}, Accuracy: {(total_correct/total_examples):.4f}')

    avg_loss = total_loss / total_examples
    avg_acc = total_correct / total_examples
    return avg_loss, avg_acc

# 4. Train and validate
EPOCHS = 3
for epoch in range(EPOCHS):
    print(f'{"="*20} Epoch {epoch + 1}/{EPOCHS} {"="*20}')

    # One optimisation pass over the training split.
    train_loss, train_acc = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        epoch=epoch,
    )
    print(f'Epoch {epoch + 1} Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}')

    # Score the freshly updated model on the validation split.
    val_loss, val_acc = eval_model(
        model=model,
        data_loader=valid_loader,
        loss_fn=loss_fn,
        device=device,
    )
    print(f'Epoch {epoch + 1} Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}')

# 5. Test the model
# The held-out test split is scored once, after all epochs have finished.
print(f'{"="*20} Testing {"="*20}')
test_loss, test_acc = eval_model(
    model=model,
    data_loader=test_loader,
    loss_fn=loss_fn,
    device=device,
)
print(f'Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1], Step [1/640], Loss: 1.8006, Accuracy: 0.1875
Epoch [1], Step [11/640], Loss: 1.7393, Accuracy: 0.1932
Epoch [1], Step [21/640], Loss: 1.6193, Accuracy: 0.2173
Epoch [1], Step [31/640], Loss: 1.7899, Accuracy: 0.2198
Epoch [1], Step [41/640], Loss: 1.7695, Accuracy: 0.2256
Epoch [1], Step [51/640], Loss: 1.6703, Accuracy: 0.2255
