In [1]:
import os
os.environ['NLTK_DATA'] = 'nltk_data'
from peft import LoraConfig, get_peft_model, TaskType
import torch
torch.cuda.empty_cache()
from transformers import *
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# import tensorflow as tf
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from torch.nn.utils.rnn import pad_sequence
import sentencepiece
# from keras.preprocessing.sequence import pad_sequences




In [2]:
# 验证实际可见的GPU设备
print("\n===== 实际使用的GPU设备 =====")
print(f"可见GPU数量: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    # device = torch.device("cuda")
    device = torch.device("cuda:0")
    print('There is/are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible. Somehow this isn't working!
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 创建保存模型和预测的文件夹
os.makedirs('./saved_preds', exist_ok=True)
os.makedirs('./saved_models', exist_ok=True)


===== 实际使用的GPU设备 =====
可见GPU数量: 1
GPU 0: NVIDIA GeForce RTX 4090
There is/are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4090


定义集成模型：

In [3]:
# 定义本地模型路径
LOCAL_MODEL_PATHS = {
    'bert-large-uncased': 'bert-large-uncased',
    'roberta-large': 'roberta-large',
    'xlnet-large-cased': 'xlnet-large-cased'
}

# 启用多个模型配置
MODELS = [
    (BertForSequenceClassification, BertTokenizer, 'bert-large-uncased'),
    (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-large'),
    (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-large-cased'),
]

In [5]:
test_corpus = pd.read_csv("subtask1_test.csv", encoding='utf-8')
master_corpus = pd.read_csv("subtask1_train_with_tense.csv", encoding = 'utf-8')

In [8]:
# Multi-Task Learning with LoRA

# 数据集类保持不变
class MultiTaskClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, train_corpus, tokenizer, max_len=128, is_test=False):
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        
        # 计算主任务的类别权重
        label_counts = train_corpus['gold_label'].value_counts()
        total = len(train_corpus)
        self.main_task_weights = torch.tensor([total / label_counts[i] for i in range(len(label_counts))], dtype=torch.float32).to(device)
        
        # 只在训练集上计算辅助任务的权重
        if not is_test and 'tense' in corpus.columns:
            # 计算辅助任务（时态）的类别权重
            tense_counts = train_corpus['tense'].value_counts()
            self.num_tenses = len(tense_counts)
            self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)
            
            # 创建时态到索引的映射
            self.tense_to_idx = {tense: idx for idx, tense in enumerate(sorted(train_corpus['tense'].unique()))}
        else:
            # 对于测试集，我们需要从训练集获取时态信息
            if 'tense' in train_corpus.columns:
                tense_counts = train_corpus['tense'].value_counts()
                self.num_tenses = len(tense_counts)
                self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)
                self.tense_to_idx = {tense: idx for idx, tense in enumerate(sorted(train_corpus['tense'].unique()))}
            else:
                self.num_tenses = 0
                self.tense_weights = None
                self.tense_to_idx = {}
        
    def __len__(self):
        return len(self.corpus)
    
    def __getitem__(self, idx):
        row = self.corpus.iloc[idx]
        text = row['sentence']
        main_label = row['gold_label']
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        result = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'main_label': torch.tensor(main_label, dtype=torch.long),
        }
        
        # 只在训练时添加时态标签
        if not self.is_test and 'tense' in row.index:
            tense_label = self.tense_to_idx[row['tense']]
            result['tense_label'] = torch.tensor(tense_label, dtype=torch.long)
        else:
            # 对于测试集，使用一个占位符
            result['tense_label'] = torch.tensor(0, dtype=torch.long)
            
        return result

# 创建一个支持LoRA的多任务模型包装器
class MultiTaskLoRAModel(nn.Module):
    def __init__(self, base_model, num_main_labels=2, num_tense_labels=None):
        super(MultiTaskLoRAModel, self).__init__()
        self.base_model = base_model
        self.num_main_labels = num_main_labels
        self.num_tense_labels = num_tense_labels
        
        # 获取隐藏层大小
        hidden_size = base_model.config.hidden_size
        
        # 主任务分类头
        self.main_classifier = nn.Linear(hidden_size, num_main_labels)
        
        # 辅助任务（时态）分类头
        self.tense_classifier = nn.Linear(hidden_size, num_tense_labels)
        
        # Dropout层
        dropout = getattr(base_model.config, 'hidden_dropout_prob', None)
        if dropout is None:
            dropout = getattr(base_model.config, 'dropout', 0.1)
        self.dropout = nn.Dropout(dropout)
        
        # 确定基础模型的类型
        self.model_type = base_model.config.model_type
        
    def forward(self, input_ids, attention_mask=None):
        # 获取基础模型的名称
        if hasattr(self.base_model, 'base_model'):
            # 如果是PEFT模型，获取实际的基础模型
            actual_base_model = self.base_model.base_model
        else:
            actual_base_model = self.base_model
            
        # 根据模型类型使用不同的方法获取表示
        if self.model_type == 'bert':
            # 直接调用bert层
            if hasattr(actual_base_model, 'bert'):
                outputs = actual_base_model.bert(input_ids=input_ids, attention_mask=attention_mask)
                pooled_output = outputs.pooler_output
            else:
                # 如果是PEFT包装的模型
                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # 从隐藏状态中获取
                if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                    pooled_output = outputs.hidden_states[-1][:, 0]
                elif hasattr(outputs, 'last_hidden_state'):
                    pooled_output = outputs.last_hidden_state[:, 0]
                else:
                    # 使用logits前的表示
                    pooled_output = self.base_model.get_input_embeddings()(input_ids).mean(dim=1)
                    
        elif self.model_type == 'roberta':
            # 对于RoBERTa，需要特殊处理
            if hasattr(actual_base_model, 'roberta'):
                # 直接调用roberta层
                outputs = actual_base_model.roberta(input_ids=input_ids, attention_mask=attention_mask)
                pooled_output = outputs.last_hidden_state[:, 0]
            else:
                # 如果是PEFT包装的模型，使用不同的方法
                # 临时设置输出隐藏状态
                self.base_model.config.output_hidden_states = True
                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
                
                # 获取隐藏状态
                if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                    # 使用最后一层的隐藏状态
                    last_hidden_state = outputs.hidden_states[-1]
                    pooled_output = last_hidden_state[:, 0]
                else:
                    # 如果没有hidden_states，尝试从基础模型获取
                    # 这里我们需要手动提取特征
                    base_model_output = actual_base_model.roberta(input_ids=input_ids, attention_mask=attention_mask)
                    pooled_output = base_model_output.last_hidden_state[:, 0]
                    
        elif self.model_type == 'xlnet':
            if hasattr(actual_base_model, 'transformer'):
                outputs = actual_base_model.transformer(input_ids=input_ids, attention_mask=attention_mask)
                sequence_output = outputs.last_hidden_state
            else:
                self.base_model.config.output_hidden_states = True
                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
                if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                    sequence_output = outputs.hidden_states[-1]
                else:
                    sequence_output = outputs.last_hidden_state
                    
            # 获取每个序列的最后一个有效token
            if attention_mask is not None:
                seq_lengths = attention_mask.sum(dim=1) - 1
                batch_size = input_ids.size(0)
                pooled_output = sequence_output[range(batch_size), seq_lengths]
            else:
                pooled_output = sequence_output[:, -1]
                
        else:
            # 其他模型的通用处理
            self.base_model.config.output_hidden_states = True
            outputs = self.base_model(
                input_ids=input_ids, 
                attention_mask=attention_mask
            )
            
            if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                pooled_output = outputs.pooler_output
            elif hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                pooled_output = outputs.hidden_states[-1][:, 0]
            elif hasattr(outputs, 'last_hidden_state'):
                pooled_output = outputs.last_hidden_state[:, 0]
            else:
                raise ValueError(f"Cannot extract pooled output from model type: {self.model_type}")
        
        # 应用dropout
        pooled_output = self.dropout(pooled_output)
        
        # 主任务输出
        main_logits = self.main_classifier(pooled_output)
        
        # 辅助任务输出
        tense_logits = self.tense_classifier(pooled_output)
        
        return {
            'main_logits': main_logits,
            'tense_logits': tense_logits
        }
    
# 创建一个函数来查找模型中的所有模块名称
def find_all_linear_names(model):
    """
    找出模型中所有的线性层名称
    """
    cls = torch.nn.Linear
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[-1])
    
    # 移除一些不应该被替换的层
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    if 'classifier' in lora_module_names:
        lora_module_names.remove('classifier')
    if 'qa_outputs' in lora_module_names:
        lora_module_names.remove('qa_outputs')
    
    return list(lora_module_names)

# 修改主训练循环中的LoRA配置部分
for model_class, tokenizer_class, pretrained_weights in MODELS:
    print(f"\n\n=== Training Model with LoRA: {pretrained_weights} ===\n")
    
    # Loading the data
    train_corpus, test_corpus = master_corpus, test_corpus
    
    # 从本地加载预训练模型
    local_path = LOCAL_MODEL_PATHS[pretrained_weights]
    tokenizer = tokenizer_class.from_pretrained(local_path)
    base_model = model_class.from_pretrained(
        local_path,
        num_labels=2,
        output_hidden_states=True,
        output_attentions=False
    )
    
    # 在应用LoRA之前，确保模型配置正确
    base_model.config.output_hidden_states = True
    
    # 根据模型类型配置LoRA
    if base_model.config.model_type == 'xlnet':
        # 对于XLNet，我们需要找到正确的模块名称
        print("Finding XLNet modules...")
        linear_modules = find_all_linear_names(base_model)
        print(f"Found linear modules: {linear_modules}")
        
        # XLNet特定的配置
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=linear_modules,  # 使用找到的所有线性层
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
        )
    elif base_model.config.model_type == 'bert':
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["query", "key", "value", "dense"],
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
        )
    elif base_model.config.model_type == 'roberta':
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["query", "key", "value", "dense"],
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
        )
    else:
        # 通用配置 - 自动查找所有线性层
        linear_modules = find_all_linear_names(base_model)
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=linear_modules,
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
        )
    
    # 应用LoRA到基础模型
    base_model = get_peft_model(base_model, lora_config)
    
    # 打印可训练参数信息
    base_model.print_trainable_parameters()
    
    # 创建多任务数据集
    train_dataset = MultiTaskClassificationDataset(train_corpus, train_corpus, tokenizer, max_len=128, is_test=False)
    test_dataset = MultiTaskClassificationDataset(test_corpus, train_corpus, tokenizer, max_len=128, is_test=True)
    
    # 创建多任务模型（使用LoRA增强的基础模型）
    model = MultiTaskLoRAModel(base_model, num_main_labels=2, num_tense_labels=train_dataset.num_tenses)
    model.to(device)
    
    # 创建数据加载器
    def collate_fn(batch):
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        main_labels = torch.stack([item['main_label'] for item in batch])
        tense_labels = torch.stack([item['tense_label'] for item in batch])
        return input_ids, attention_mask, main_labels, tense_labels
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
    
    # 定义损失函数
    main_criterion = nn.CrossEntropyLoss(weight=train_dataset.main_task_weights)
    tense_criterion = nn.CrossEntropyLoss(weight=train_dataset.tense_weights)
    
    # 定义多任务损失权重
    main_task_weight = 0.7
    tense_task_weight = 0.3
    
    epochs = 10
    
    # 优化器 - 使用更高的学习率因为LoRA参数较少
    optimizer = AdamW(model.parameters(), lr=3e-4, eps=1e-8)
    
    # 学习率调度器
    total_train_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=int(0.1 * total_train_steps),  # 10% warmup
        num_training_steps=total_train_steps
    )
    
    # 训练循环
    best_f1 = 0
    for epoch in range(epochs):   
        model.train()
        
        train_main_preds = []
        train_main_labels = []
        train_tense_preds = []
        train_tense_labels = []
        total_loss = 0
        
        for i, (input_ids, attention_mask, main_labels, tense_labels) in enumerate(train_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            main_labels = main_labels.to(device)
            tense_labels = tense_labels.to(device)
            
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(input_ids, attention_mask)
            
            # 计算多任务损失
            main_loss = main_criterion(outputs['main_logits'], main_labels)
            tense_loss = tense_criterion(outputs['tense_logits'], tense_labels)
            
            # 组合损失
            loss = main_task_weight * main_loss + tense_task_weight * tense_loss
            
            # 反向传播
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            # 记录预测结果
            main_preds = torch.argmax(outputs['main_logits'], dim=1).cpu().numpy()
            tense_preds = torch.argmax(outputs['tense_logits'], dim=1).cpu().numpy()
            
            train_main_preds.extend(main_preds)
            train_main_labels.extend(main_labels.cpu().numpy())
            train_tense_preds.extend(tense_preds)
            train_tense_labels.extend(tense_labels.cpu().numpy())
            
            total_loss += loss.item()
        
        # 计算主任务指标
        main_acc = accuracy_score(train_main_labels, train_main_preds)
        main_prec = precision_score(train_main_labels, train_main_preds)
        main_rec = recall_score(train_main_labels, train_main_preds)
        main_f1 = f1_score(train_main_labels, train_main_preds)
        
        # 计算辅助任务指标
        tense_acc = accuracy_score(train_tense_labels, train_tense_preds)
        
        print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")
        print(f"Main Task - Accuracy: {main_acc:.4f}, Precision: {main_prec:.4f}, Recall: {main_rec:.4f}, F1: {main_f1:.4f}")
        print(f"Tense Task - Accuracy: {tense_acc:.4f}")
        
        # 验证阶段
        model.eval()
        val_main_preds = []
        val_main_labels = []
        
        with torch.no_grad():
            for input_ids, attention_mask, main_labels, _ in test_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                main_labels = main_labels.to(device)
                
                outputs = model(input_ids, attention_mask)
                main_preds = torch.argmax(outputs['main_logits'], dim=1).cpu().numpy()
                
                val_main_preds.extend(main_preds)
                val_main_labels.extend(main_labels.cpu().numpy())
        
        val_f1 = f1_score(val_main_labels, val_main_preds)
        
        # 保存最佳模型
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_state = model.state_dict()
            print(f"New best F1 score: {best_f1:.4f}")
    
    # 加载最佳模型进行最终测试
    model.load_state_dict(best_model_state)
    
    # 测试阶段 - 只关注主任务
    model.eval()
    test_main_preds = []
    test_main_labels = []
    
    with torch.no_grad():
        for input_ids, attention_mask, main_labels, _ in test_loader:  # 忽略时态标签
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            main_labels = main_labels.to(device)
            
            outputs = model(input_ids, attention_mask)
            
            # 只获取主任务的预测
            main_preds = torch.argmax(outputs['main_logits'], dim=1).cpu().numpy()
            
            test_main_preds.extend(main_preds)
            test_main_labels.extend(main_labels.cpu().numpy())
    
    # 计算测试集指标 - 只计算主任务
    main_acc = accuracy_score(test_main_labels, test_main_preds)
    main_prec = precision_score(test_main_labels, test_main_preds)
    main_rec = recall_score(test_main_labels, test_main_preds)
    main_f1 = f1_score(test_main_labels, test_main_preds)
    
    print(f"\nFinal Test Results with LoRA:")
    print(f"Main Task - Accuracy: {main_acc:.4f}, Precision: {main_prec:.4f}, Recall: {main_rec:.4f}, F1: {main_f1:.4f}")
    
    # 保存模型和预测结果
    model_name = pretrained_weights.replace("-", "_")
    
    # 保存完整模型（包括LoRA权重）
    torch.save(model.state_dict(), f'./saved_models/{model_name}_multitask_lora.pt')
    
    # 也可以只保存LoRA权重（更小的文件）
    model.base_model.save_pretrained(f'./saved_models/{model_name}_lora_weights')
    
    # 保存预测结果
    np.save(f'./saved_preds/{model_name}_main_preds_lora.npy', test_main_preds)
    
    print(f"Saved LoRA model and predictions for {pretrained_weights}")
    print(f"Best validation F1 score: {best_f1:.4f}")

# 如果需要加载保存的LoRA模型，可以使用以下代码：
def load_lora_model(model_class, tokenizer_class, pretrained_weights, num_tense_labels):
    """
    加载保存的LoRA模型
    """
    # 加载基础模型
    local_path = LOCAL_MODEL_PATHS[pretrained_weights]
    base_model = model_class.from_pretrained(
        local_path,
        num_labels=2,
        output_hidden_states=True,
        output_attentions=False
    )
    
    # 重新配置LoRA
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["query", "key", "value", "dense"],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
    
    # 根据模型类型调整target_modules
    if base_model.config.model_type == 'bert':
        lora_config.target_modules = ["query", "key", "value", "dense"]
    elif base_model.config.model_type == 'roberta':
        lora_config.target_modules = ["query", "key", "value", "dense"]
    elif base_model.config.model_type == 'xlnet':
        lora_config.target_modules = ["q", "k", "v", "o", "ff.0", "ff.2"]
    
    # 应用LoRA
    base_model = get_peft_model(base_model, lora_config)
    
    # 创建多任务模型
    model = MultiTaskLoRAModel(base_model, num_main_labels=2, num_tense_labels=num_tense_labels)
    
    # 加载保存的权重
    model_name = pretrained_weights.replace("-", "_")
    model.load_state_dict(torch.load(f'./saved_models/{model_name}_multitask_lora.pt'))
    
    return model

# 额外的LoRA配置选项示例
def get_advanced_lora_config(model_type):
    """
    获取更高级的LoRA配置
    """
    if model_type == 'bert':
        return LoraConfig(
            r=16,  # 增加秩以提高性能
            lora_alpha=32,
            target_modules=["query", "key", "value", "dense", "output.dense"],  # 包含更多层
            lora_dropout=0.05,
            bias="lora_only",  # 只在LoRA层使用偏置
            task_type=TaskType.SEQ_CLS,
            modules_to_save=["classifier"],  # 保存分类器层
        )
    elif model_type == 'roberta':
        return LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["query", "key", "value", "dense", "output.dense"],
            lora_dropout=0.05,
            bias="lora_only",
            task_type=TaskType.SEQ_CLS,
            modules_to_save=["classifier"],
        )
    elif model_type == 'xlnet':
        return LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q", "k", "v", "o", "ff.0", "ff.2", "layer_norm"],
            lora_dropout=0.05,
            bias="lora_only",
            task_type=TaskType.SEQ_CLS,
            modules_to_save=["classifier"],
        )
    else:
        # 默认配置
        return LoraConfig(
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_CLS,
        )

# 用于评估LoRA效率的函数
def print_lora_statistics(model):
    """
    打印LoRA模型的统计信息
    """
    total_params = 0
    trainable_params = 0
    
    for name, param in model.named_parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Trainable percentage: {100 * trainable_params / total_params:.2f}%")
    
    # 计算内存使用
    param_memory = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_memory = sum(b.numel() * b.element_size() for b in model.buffers())
    total_memory = param_memory + buffer_memory
    
    print(f"Model memory usage: {total_memory / 1024**2:.2f} MB")
    print(f"Trainable parameters memory: {trainable_params * 4 / 1024**2:.2f} MB")  # 假设float32




loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file bert-large-uncased/config.json
Model config BertConfig {
  "_name_or_path": "bert-large-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file bert-large-uncased/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM



=== Training Model with LoRA: bert-large-uncased ===



Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly

trainable params: 3,557,378 || all params: 338,701,316 || trainable%: 1.0503


  self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)
  self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)


Epoch 1 - Loss: 0.4379
Main Task - Accuracy: 0.8954, Precision: 0.7199, Recall: 0.8008, F1: 0.7582
Tense Task - Accuracy: 0.3188
New best F1 score: 0.8573
Epoch 2 - Loss: 0.1829
Main Task - Accuracy: 0.9655, Precision: 0.8964, Recall: 0.9401, F1: 0.9177
Tense Task - Accuracy: 0.5028
New best F1 score: 0.8650
Epoch 3 - Loss: 0.1322
Main Task - Accuracy: 0.9810, Precision: 0.9402, Recall: 0.9687, F1: 0.9543
Tense Task - Accuracy: 0.5619
Epoch 4 - Loss: 0.0940
Main Task - Accuracy: 0.9899, Precision: 0.9685, Recall: 0.9828, F1: 0.9756
Tense Task - Accuracy: 0.6123
Epoch 5 - Loss: 0.0667
Main Task - Accuracy: 0.9956, Precision: 0.9869, Recall: 0.9916, F1: 0.9893
Tense Task - Accuracy: 0.6526
Epoch 6 - Loss: 0.0488
Main Task - Accuracy: 0.9975, Precision: 0.9916, Recall: 0.9960, F1: 0.9938
Tense Task - Accuracy: 0.6896
Epoch 7 - Loss: 0.0363
Main Task - Accuracy: 0.9988, Precision: 0.9953, Recall: 0.9987, F1: 0.9970
Tense Task - Accuracy: 0.7226
New best F1 score: 0.8705
Epoch 8 - Loss: 0.0

loading configuration file bert-large-uncased/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file roberta-large/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
  

Saved LoRA model and predictions for bert-large-uncased
Best validation F1 score: 0.8710


=== Training Model with LoRA: roberta-large ===



Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should 

trainable params: 4,606,978 || all params: 359,985,156 || trainable%: 1.2798


  self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)
  self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)


Epoch 1 - Loss: 0.3739
Main Task - Accuracy: 0.8956, Precision: 0.7206, Recall: 0.8008, F1: 0.7586
Tense Task - Accuracy: 0.3439
New best F1 score: 0.8586
Epoch 2 - Loss: 0.1738
Main Task - Accuracy: 0.9682, Precision: 0.9057, Recall: 0.9431, F1: 0.9240
Tense Task - Accuracy: 0.5293
New best F1 score: 0.8643
Epoch 3 - Loss: 0.1238
Main Task - Accuracy: 0.9816, Precision: 0.9416, Recall: 0.9704, F1: 0.9558
Tense Task - Accuracy: 0.5775
New best F1 score: 0.8885
Epoch 4 - Loss: 0.0933
Main Task - Accuracy: 0.9884, Precision: 0.9627, Recall: 0.9815, F1: 0.9720
Tense Task - Accuracy: 0.6153
Epoch 5 - Loss: 0.0722
Main Task - Accuracy: 0.9947, Precision: 0.9820, Recall: 0.9923, F1: 0.9871
Tense Task - Accuracy: 0.6409
New best F1 score: 0.8959
Epoch 6 - Loss: 0.0559
Main Task - Accuracy: 0.9962, Precision: 0.9867, Recall: 0.9950, F1: 0.9908
Tense Task - Accuracy: 0.6778
Epoch 7 - Loss: 0.0433
Main Task - Accuracy: 0.9980, Precision: 0.9930, Recall: 0.9973, F1: 0.9951
Tense Task - Accuracy: 

loading configuration file roberta-large/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file spiece.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file xlnet-large-cased/config.json
Model config XLNetConfig {
  "_name_or_path": "xlnet-large-cased",
  "architectures": [
    "XL

Saved LoRA model and predictions for roberta-large
Best validation F1 score: 0.8995


=== Training Model with LoRA: xlnet-large-cased ===



Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Finding XLNet modules...
Found linear modules: ['logits_proj', 'layer_2', 'layer_1', 'summary']
trainable params: 1,990,672 || all params: 363,311,122 || trainable%: 0.5479




Epoch 1 - Loss: 0.6347
Main Task - Accuracy: 0.8268, Precision: 0.5584, Recall: 0.7355, F1: 0.6348
Tense Task - Accuracy: 0.2951
New best F1 score: 0.8282
Epoch 2 - Loss: 0.1993
Main Task - Accuracy: 0.9636, Precision: 0.8952, Recall: 0.9314, F1: 0.9129
Tense Task - Accuracy: 0.4864
New best F1 score: 0.8810
Epoch 3 - Loss: 0.1458
Main Task - Accuracy: 0.9771, Precision: 0.9342, Recall: 0.9552, F1: 0.9446
Tense Task - Accuracy: 0.5541
New best F1 score: 0.8850
Epoch 4 - Loss: 0.1094
Main Task - Accuracy: 0.9864, Precision: 0.9608, Recall: 0.9731, F1: 0.9669
Tense Task - Accuracy: 0.5955
New best F1 score: 0.8926
Epoch 5 - Loss: 0.0826
Main Task - Accuracy: 0.9913, Precision: 0.9731, Recall: 0.9849, F1: 0.9789
Tense Task - Accuracy: 0.6245
Epoch 6 - Loss: 0.0646
Main Task - Accuracy: 0.9951, Precision: 0.9827, Recall: 0.9936, F1: 0.9881
Tense Task - Accuracy: 0.6593
Epoch 7 - Loss: 0.0526
Main Task - Accuracy: 0.9972, Precision: 0.9913, Recall: 0.9953, F1: 0.9933
Tense Task - Accuracy: 

loading configuration file xlnet-large-cased/config.json
Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 16,
  "n_layer": 24,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.41.1",
  "untie_r": true,
  "use_mems_eval": true,
  "use_mems_train": false,
  "vocab_size": 32000
}



Saved LoRA model and predictions for xlnet-large-cased
Best validation F1 score: 0.8978


In [14]:
# 加载所有模型的预测结果
bert_large = np.load("./saved_preds/bert_large_uncased_preds.npy")
roberta_large = np.load("./saved_preds/roberta_large_preds.npy")
xlnet_large = np.load("./saved_preds/xlnet_large_cased_preds.npy")

# 投票集成
final_pred = bert_large + roberta_large + xlnet_large
preds = (final_pred >= 2).astype(int)  # 至少两个模型认为是反事实才预测为 1

# 打印F1
# 加载测试集的标签
test_labels = pd.read_csv("data_aug/test.csv")["gold_label"].values

bert_precision = precision_score(test_labels, bert_large)
bert_recall = recall_score(test_labels, bert_large)
bert_f1 = f1_score(test_labels, bert_large)

roberta_precision = precision_score(test_labels, roberta_large)
roberta_recall = recall_score(test_labels, roberta_large)
roberta_f1 = f1_score(test_labels, roberta_large)

xlnet_precision = precision_score(test_labels, xlnet_large)
xlnet_recall = recall_score(test_labels, xlnet_large)
xlnet_f1 = f1_score(test_labels, xlnet_large)

# 计算最终模型 F1 分数
f1_ensemble = f1_score(test_labels, preds)
precision_ensemble = precision_score(test_labels, preds)
recall_ensemble = recall_score(test_labels, preds)

# 打印每个模型的 F1 分数

print(f"Precision for BERT Large: {bert_precision:.4f}")
print(f"Recall for BERT Large: {bert_recall:.4f}")
print(f"F1 Score for BERT Large: {bert_f1:.4f}\n")

print(f"Precision for RoBERTa Large: {roberta_precision:.4f}")
print(f"Recall for RoBERTa Large: {roberta_recall:.4f}")
print(f"F1 Score for RoBERTa Large: {roberta_f1:.4f}\n")

print(f"Precision for XLNet Large: {xlnet_precision:.4f}")
print(f"Recall for XLNet Large: {xlnet_recall:.4f}")
print(f"F1 Score for XLNet Large: {xlnet_f1:.4f}\n")

print(f"Precision for Ensemble Model: {precision_ensemble:.4f}")
print(f"Recall for Ensemble Model: {recall_ensemble:.4f}")
print(f"F1_ensemble Score on Test Set: {f1_ensemble:.4f}")

# 保存最终集成结果
# np.save("final_ensemble_preds.npy", preds)
# print("✅ 集成预测完成，结果已保存至 final_ensemble_preds.npy")

Precision for BERT Large: 0.8838
Recall for BERT Large: 0.8550
F1 Score for BERT Large: 0.8691

Precision for RoBERTa Large: 0.9194
Recall for RoBERTa Large: 0.8659
F1 Score for RoBERTa Large: 0.8918

Precision for XLNet Large: 0.9044
Recall for XLNet Large: 0.8591
F1 Score for XLNet Large: 0.8812

Precision for Ensemble Model: 0.9143
Recall for Ensemble Model: 0.8672
F1_ensemble Score on Test Set: 0.8901
