In [1]:
import torch
import json
import pandas as pd
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
import os
from tqdm import tqdm

def read_json_data(file_path):
    """Load a JSON object of records into a DataFrame, one row per top-level key.

    Args:
        file_path: Path to a JSON file whose top level is an object mapping
            each key to a record (itself an object of column -> value).

    Returns:
        pandas.DataFrame whose index holds the top-level keys and whose
        columns are the fields of the records.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding (e.g. cp1252 on Windows).
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    return pd.DataFrame.from_dict(raw_data, orient='index')

# Load the evidence file and the three claim files; each frame is indexed by
# the top-level JSON keys.
evd = pd.read_json('../data/evidence.json', orient='index')
train = read_json_data('../data/train-claims.json')
dev = read_json_data('../data/dev-claims.json')
test = read_json_data('../data/test-claims-unlabelled.json')

# Merge the training and development claims into a single labelled pool.
train = pd.concat([train, dev])

# Preprocessing: encode the string labels as integer class ids.
label_map = {"SUPPORTS": 0, "REFUTES": 1, "DISPUTED": 2, "NOT_ENOUGH_INFO": 3}
train["claim_label"] = train["claim_label"].map(label_map)

# Hold out 10% of the pool for validation. The seed is fixed so that the split
# (and therefore the reported validation loss) is reproducible across kernel
# restarts.
train_data, val_data = train_test_split(train, test_size=0.1, random_state=42)
# Rebuild a 0..n-1 index by reassignment rather than in-place mutation of the
# frames returned by the split.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Custom dataset
class ClaimDataset(Dataset):
    """Map-style dataset that tokenizes one claim per item.

    Args:
        data: DataFrame with a ``claim_text`` column and a ``claim_label``
            column holding integer class ids. Rows are addressed by position,
            so the frame's index labels do not matter.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", max_length=..., padding="max_length", truncation=True)``
            that returns a mapping with ``input_ids`` and ``attention_mask``
            tensors carrying a leading batch dimension of size 1.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the claim at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            removed) and ``labels``, a long tensor of shape ``(1,)``.

        Raises:
            ValueError / TypeError: if the label cannot be converted to an
                int (for example a NaN left behind by an unmapped label).
        """
        # Positional lookup: works even when the frame was not re-indexed to
        # 0..n-1 (label-based .loc would raise KeyError in that case).
        row = self.data.iloc[idx]
        text = row["claim_text"]
        label = row["claim_label"]
        encoded_inputs = self.tokenizer(text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        input_ids = encoded_inputs["input_ids"].squeeze(0)
        attention_mask = encoded_inputs["attention_mask"].squeeze(0)
        # Pin the dtype to long and fail loudly on a non-integer label instead
        # of silently producing a float label tensor.
        labels = torch.tensor([int(label)], dtype=torch.long)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


# Set up the tokenizer and the sequence-classification model; the classifier
# head gets one output per entry in label_map.
tokenizer = AlbertTokenizer.from_pretrained(
    'albert-base-v2',
)
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_map),
)

# Create the data loaders (done in the next cell).

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [2]:
train_dataset = ClaimDataset(train_data, tokenizer)
val_dataset = ClaimDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
# Validation only averages the loss over all batches, so shuffling adds
# nothing; keep the order fixed for deterministic evaluation.
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use the PyTorch AdamW: the transformers implementation is deprecated in
# favour of it. weight_decay=0.0 matches the default of the transformers
# optimizer that was used here before (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
num_epochs = 1000
# NOTE(review): total_steps spans all num_epochs epochs, so if training stops
# early the linear schedule built from it will have decayed the learning rate
# only slightly — confirm this is intended.
total_steps = len(train_loader) * num_epochs


# Training and evaluation helpers
def train_epoch(model, train_loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes a ``loss`` attribute.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors. Any
            iterable supporting ``len()`` works, not only a ``DataLoader``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses as a Python float.

    Raises:
        ZeroDivisionError: if ``train_loader`` has length 0.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    return total_loss / len(train_loader)

def eval_epoch(model, val_loader, device):
    """Compute the mean loss over ``val_loader`` without updating the model.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes a ``loss`` attribute.
        val_loader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses as a Python float.

    Raises:
        ZeroDivisionError: if ``val_loader`` has length 0.
    """
    # Depends only on its arguments; it must not reach for the module-level
    # train_loader, which may not exist when this function is used on its own.
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(val_loader)

# Run training and validation: build the linear learning-rate schedule
# (no warm-up steps) spanning total_steps optimizer updates.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
import matplotlib.pyplot as plt

def train_with_early_stopping(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs, patience, checkpoint_path="best_model.pt"):
    """Train with early stopping on validation loss, then plot the loss curves.

    Each epoch runs ``train_epoch`` followed by ``eval_epoch``. Whenever the
    validation loss improves on the best value seen so far, the model's
    ``state_dict`` is written to ``checkpoint_path``. Training stops once the
    validation loss has failed to improve for ``patience`` consecutive epochs,
    or after ``num_epochs`` epochs.

    Args:
        model: Model to train; updated in place.
        train_loader: Batches passed to ``train_epoch``.
        val_loader: Batches passed to ``eval_epoch``.
        optimizer: Optimizer passed to ``train_epoch``.
        scheduler: LR scheduler passed to ``train_epoch``.
        device: Device used for training, and as ``map_location`` when the
            best checkpoint is reloaded.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated.
        checkpoint_path: File the best weights are saved to.

    Returns:
        ``(train_losses, val_losses)``: per-epoch loss lists for the epochs
        that actually ran.
    """
    best_val_loss = float("inf")
    train_losses = []
    val_losses = []
    no_improvement_count = 0
    # True once this call has written a checkpoint, so that a stale file left
    # by an earlier run is never loaded.
    checkpoint_written = False

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        val_loss = eval_epoch(model, val_loader, device)
        print(f"Epoch {epoch+1}/{num_epochs}: Train loss = {train_loss:.4f}, Val loss = {val_loss:.4f}")

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improvement_count = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
            print("Best model saved.")
        else:
            no_improvement_count += 1
            print(f"No improvement for {no_improvement_count} epochs.")

        if no_improvement_count >= patience:
            print("Early stopping triggered.")
            break

    # When the loop ends the in-memory weights belong to the last epoch, which
    # after early stopping is `patience` epochs past the best one. Restore the
    # best checkpoint so the model handed back is the one that was selected.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Plot the loss curves
    plt.plot(train_losses, label="Train Loss")
    plt.plot(val_losses, label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.title("Loss Curves")
    plt.show()

    return train_losses, val_losses

# Stop after this many consecutive epochs without a validation-loss improvement.
patience = 5
train_with_early_stopping(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    num_epochs=num_epochs,
    patience=patience,
)


SyntaxError: invalid syntax (953925660.py, line 11)