In [1]:
import os
os.environ['NLTK_DATA'] = 'nltk_data'

import torch
from transformers import *
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# import tensorflow as tf
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from torch.nn.utils.rnn import pad_sequence
import sentencepiece
# from keras.preprocessing.sequence import pad_sequences



In [2]:
# 验证实际可见的GPU设备
print("\n===== 实际使用的GPU设备 =====")
print(f"可见GPU数量: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    # device = torch.device("cuda")
    device = torch.device("cuda:0")
    print('There is/are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible. Somehow this isn't working!
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 创建保存模型和预测的文件夹
os.makedirs('./saved_preds', exist_ok=True)
os.makedirs('./saved_models', exist_ok=True)


===== 实际使用的GPU设备 =====
可见GPU数量: 1
GPU 0: NVIDIA GeForce RTX 4090
There is/are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4090


定义集成模型：

In [3]:
# 定义本地模型路径
LOCAL_MODEL_PATHS = {
    'bert-large-uncased': 'bert-large-uncased',
    'roberta-large': 'roberta-large',
    'xlnet-large-cased': 'xlnet-large-cased'
}

# 启用多个模型配置
MODELS = [
    (BertForSequenceClassification, BertTokenizer, 'bert-large-uncased'),
    (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-large'),
    (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-large-cased'),
]

In [10]:
test_corpus = pd.read_csv("subtask1_test.csv", encoding='utf-8')
master_corpus = pd.read_csv("subtask1_train_with_tense.csv", encoding = 'utf-8')

In [16]:
# Multi-Task Learning

# 首先，创建一个多任务的数据集类
class MultiTaskClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, corpus, train_corpus, tokenizer, max_len=128, is_test=False):
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        
        # 计算主任务的类别权重
        label_counts = train_corpus['gold_label'].value_counts()
        total = len(train_corpus)
        self.main_task_weights = torch.tensor([total / label_counts[i] for i in range(len(label_counts))], dtype=torch.float32).to(device)
        
        # 只在训练集上计算辅助任务的权重
        if not is_test and 'tense' in corpus.columns:
            # 计算辅助任务（时态）的类别权重
            tense_counts = train_corpus['tense'].value_counts()
            self.num_tenses = len(tense_counts)
            self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)
            
            # 创建时态到索引的映射
            self.tense_to_idx = {tense: idx for idx, tense in enumerate(sorted(train_corpus['tense'].unique()))}
        else:
            # 对于测试集，我们需要从训练集获取时态信息
            if 'tense' in train_corpus.columns:
                tense_counts = train_corpus['tense'].value_counts()
                self.num_tenses = len(tense_counts)
                self.tense_weights = torch.tensor([total / tense_counts[i] for i in range(len(tense_counts))], dtype=torch.float32).to(device)
                self.tense_to_idx = {tense: idx for idx, tense in enumerate(sorted(train_corpus['tense'].unique()))}
            else:
                self.num_tenses = 0
                self.tense_weights = None
                self.tense_to_idx = {}
        
    def __len__(self):
        return len(self.corpus)
    
    def __getitem__(self, idx):
        row = self.corpus.iloc[idx]
        text = row['sentence']
        main_label = row['gold_label']
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        result = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'main_label': torch.tensor(main_label, dtype=torch.long),
        }
        
        # 只在训练时添加时态标签
        if not self.is_test and 'tense' in row.index:
            tense_label = self.tense_to_idx[row['tense']]
            result['tense_label'] = torch.tensor(tense_label, dtype=torch.long)
        else:
            # 对于测试集，使用一个占位符
            result['tense_label'] = torch.tensor(0, dtype=torch.long)
            
        return result

# 创建一个多任务模型包装器
class MultiTaskModel(nn.Module):
    def __init__(self, base_model, num_main_labels=2, num_tense_labels=None):
        super(MultiTaskModel, self).__init__()
        self.base_model = base_model
        self.num_main_labels = num_main_labels
        self.num_tense_labels = num_tense_labels
        
        # 获取隐藏层大小
        hidden_size = base_model.config.hidden_size
        
        # 主任务分类头
        self.main_classifier = nn.Linear(hidden_size, num_main_labels)
        
        # 辅助任务（时态）分类头
        self.tense_classifier = nn.Linear(hidden_size, num_tense_labels)
        
        # Dropout层
        dropout = getattr(base_model.config, 'hidden_dropout_prob', None)
        if dropout is None:
            dropout = getattr(base_model.config, 'dropout', 0.1)  # XLNet 使用的是 'dropout'
        self.dropout = nn.Dropout(dropout)
        
        # 确定基础模型的类型
        self.model_type = base_model.config.model_type
        
    def forward(self, input_ids, attention_mask=None):
        # 根据模型类型使用不同的方法获取表示
        if self.model_type == 'bert':
            # BERT模型
            outputs = self.base_model.bert(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.pooler_output
            
        elif self.model_type == 'roberta':
            # RoBERTa 没有 pooler_output，使用 [CLS] token 的表示
            outputs = self.base_model.roberta(input_ids=input_ids, attention_mask=attention_mask)
            sequence_output = outputs.last_hidden_state
            pooled_output = sequence_output[:, 0]  # 取第一个 token ([CLS])
            
        elif self.model_type == 'xlnet':
            # XLNet模型
            outputs = self.base_model.transformer(input_ids=input_ids, attention_mask=attention_mask)
            # XLNet没有pooler_output，使用最后一个token的表示
            # 对于XLNet，最后一个token通常是最好的句子表示
            sequence_output = outputs.last_hidden_state
            # 获取每个序列的最后一个有效token
            if attention_mask is not None:
                # 找到每个序列的最后一个非padding位置
                seq_lengths = attention_mask.sum(dim=1) - 1
                batch_size = input_ids.size(0)
                pooled_output = sequence_output[range(batch_size), seq_lengths]
            else:
                # 如果没有attention_mask，使用最后一个位置
                pooled_output = sequence_output[:, -1]
                
        else:
            # 其他模型的通用处理
            outputs = self.base_model(
                input_ids=input_ids, 
                attention_mask=attention_mask,
                output_hidden_states=True,
                return_dict=True
            )
            
            if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                pooled_output = outputs.pooler_output
            else:
                # 使用[CLS] token（第一个token）
                last_hidden_state = outputs.hidden_states[-1]
                pooled_output = last_hidden_state[:, 0]
        
        # 应用dropout
        pooled_output = self.dropout(pooled_output)
        
        # 主任务输出
        main_logits = self.main_classifier(pooled_output)
        
        # 辅助任务输出
        tense_logits = self.tense_classifier(pooled_output)
        
        return {
            'main_logits': main_logits,
            'tense_logits': tense_logits
        }


# 主训练循环
for model_class, tokenizer_class, pretrained_weights in MODELS:
    print(f"\n\n=== Training Model: {pretrained_weights} ===\n")
    
    # Loading the data
    train_corpus, test_corpus = master_corpus, test_corpus
    
    # 从本地加载预训练模型
    local_path = LOCAL_MODEL_PATHS[pretrained_weights]
    tokenizer = tokenizer_class.from_pretrained(local_path)
    base_model = model_class.from_pretrained(
        local_path,
        num_labels=2,
        output_hidden_states=False,
        output_attentions=False
    )
    
    # 创建多任务数据集
    train_dataset = MultiTaskClassificationDataset(train_corpus, train_corpus, tokenizer, max_len=128, is_test=False)
    test_dataset = MultiTaskClassificationDataset(test_corpus, train_corpus, tokenizer, max_len=128, is_test=True)
    
    # 创建多任务模型
    model = MultiTaskModel(base_model, num_main_labels=2, num_tense_labels=train_dataset.num_tenses)
    model.to(device)
    
    # 创建数据加载器
    def collate_fn(batch):
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        main_labels = torch.stack([item['main_label'] for item in batch])
        tense_labels = torch.stack([item['tense_label'] for item in batch])
        return input_ids, attention_mask, main_labels, tense_labels
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
    
    # 定义损失函数
    main_criterion = nn.CrossEntropyLoss(weight=train_dataset.main_task_weights)
    tense_criterion = nn.CrossEntropyLoss(weight=train_dataset.tense_weights)
    
    # 定义多任务损失权重（可以调整）
    main_task_weight = 0.7
    tense_task_weight = 0.3
    
    epochs = 10
    
    # 优化器
    optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
    
    # 学习率调度器
    total_train_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0,
        num_training_steps=total_train_steps
    )
    
    # 训练循环
    for epoch in range(epochs):   
        model.train()
        
        train_main_preds = []
        train_main_labels = []
        train_tense_preds = []
        train_tense_labels = []
        total_loss = 0
        
        for i, (input_ids, attention_mask, main_labels, tense_labels) in enumerate(train_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            main_labels = main_labels.to(device)
            tense_labels = tense_labels.to(device)
            
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(input_ids, attention_mask)
            
            # 计算多任务损失
            main_loss = main_criterion(outputs['main_logits'], main_labels)
            tense_loss = tense_criterion(outputs['tense_logits'], tense_labels)
            
            # 组合损失
            loss = main_task_weight * main_loss + tense_task_weight * tense_loss
            
            # 反向传播
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            # 记录预测结果
            main_preds = torch.argmax(outputs['main_logits'], dim=1).cpu().numpy()
            tense_preds = torch.argmax(outputs['tense_logits'], dim=1).cpu().numpy()
            
            train_main_preds.extend(main_preds)
            train_main_labels.extend(main_labels.cpu().numpy())
            train_tense_preds.extend(tense_preds)
            train_tense_labels.extend(tense_labels.cpu().numpy())
            
            total_loss += loss.item()
        
        # 计算主任务指标
        main_acc = accuracy_score(train_main_labels, train_main_preds)
        main_prec = precision_score(train_main_labels, train_main_preds)
        main_rec = recall_score(train_main_labels, train_main_preds)
        main_f1 = f1_score(train_main_labels, train_main_preds)
        
        # 计算辅助任务指标
        tense_acc = accuracy_score(train_tense_labels, train_tense_preds)
        
        print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")
        print(f"Main Task - Accuracy: {main_acc:.4f}, Precision: {main_prec:.4f}, Recall: {main_rec:.4f}, F1: {main_f1:.4f}")
        print(f"Tense Task - Accuracy: {tense_acc:.4f}")
    
    # 测试阶段 - 只关注主任务
    model.eval()
    test_main_preds = []
    test_main_labels = []
    
    with torch.no_grad():
        for input_ids, attention_mask, main_labels, _ in test_loader:  # 忽略时态标签
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            main_labels = main_labels.to(device)
            
            outputs = model(input_ids, attention_mask)
            
            # 只获取主任务的预测
            main_preds = torch.argmax(outputs['main_logits'], dim=1).cpu().numpy()
            
            test_main_preds.extend(main_preds)
            test_main_labels.extend(main_labels.cpu().numpy())
    
    # 计算测试集指标 - 只计算主任务
    main_acc = accuracy_score(test_main_labels, test_main_preds)
    main_prec = precision_score(test_main_labels, test_main_preds)
    main_rec = recall_score(test_main_labels, test_main_preds)
    main_f1 = f1_score(test_main_labels, test_main_preds)
    
    print(f"\nFinal Test Results:")
    print(f"Main Task - Accuracy: {main_acc:.4f}, Precision: {main_prec:.4f}, Recall: {main_rec:.4f}, F1: {main_f1:.4f}")
    
    # 保存模型和预测结果
    model_name = pretrained_weights.replace("-", "_")
    torch.save(model.state_dict(), f'./saved_models/{model_name}_multitask.pt')
    np.save(f'./saved_preds/{model_name}_main_preds.npy', test_main_preds)
    print(f"Saved model and predictions for {pretrained_weights}")


loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file bert-large-uncased/config.json
Model config BertConfig {
  "_name_or_path": "bert-large-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.40.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



loading configuration file bert-large-uncased/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.40.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file bert-large-uncased/model.safetensors




=== Training Model: bert-large-uncased ===



Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly

Epoch 1 - Loss: 0.2924
Main Task - Accuracy: 0.9266, Precision: 0.7732, Recall: 0.9075, F1: 0.8350
Tense Task - Accuracy: 0.3683
Epoch 2 - Loss: 0.1317
Main Task - Accuracy: 0.9842, Precision: 0.9537, Recall: 0.9701, F1: 0.9618
Tense Task - Accuracy: 0.5534
Epoch 3 - Loss: 0.0919
Main Task - Accuracy: 0.9929, Precision: 0.9802, Recall: 0.9852, F1: 0.9827
Tense Task - Accuracy: 0.6212
Epoch 4 - Loss: 0.0633
Main Task - Accuracy: 0.9968, Precision: 0.9916, Recall: 0.9929, F1: 0.9923
Tense Task - Accuracy: 0.6748
Epoch 5 - Loss: 0.0421
Main Task - Accuracy: 0.9984, Precision: 0.9960, Recall: 0.9963, F1: 0.9961
Tense Task - Accuracy: 0.7144
Epoch 6 - Loss: 0.0297
Main Task - Accuracy: 0.9992, Precision: 0.9980, Recall: 0.9983, F1: 0.9981
Tense Task - Accuracy: 0.7445
Epoch 7 - Loss: 0.0222
Main Task - Accuracy: 0.9993, Precision: 0.9980, Recall: 0.9987, F1: 0.9983
Tense Task - Accuracy: 0.7686
Epoch 8 - Loss: 0.0168
Main Task - Accuracy: 0.9998, Precision: 0.9997, Recall: 0.9993, F1: 0.999

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file roberta-large/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.40.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading configuration file roberta-large/config.json
Model config RobertaConfig {
  "architect

Saved model and predictions for bert-large-uncased


=== Training Model: roberta-large ===



Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should 

Epoch 1 - Loss: 0.2548
Main Task - Accuracy: 0.9357, Precision: 0.8060, Recall: 0.9031, F1: 0.8518
Tense Task - Accuracy: 0.4273
Epoch 2 - Loss: 0.1372
Main Task - Accuracy: 0.9809, Precision: 0.9419, Recall: 0.9660, F1: 0.9538
Tense Task - Accuracy: 0.5701
Epoch 3 - Loss: 0.0975
Main Task - Accuracy: 0.9904, Precision: 0.9717, Recall: 0.9818, F1: 0.9767
Tense Task - Accuracy: 0.6184
Epoch 4 - Loss: 0.0646
Main Task - Accuracy: 0.9959, Precision: 0.9857, Recall: 0.9943, F1: 0.9899
Tense Task - Accuracy: 0.6661
Epoch 5 - Loss: 0.0502
Main Task - Accuracy: 0.9981, Precision: 0.9936, Recall: 0.9970, F1: 0.9953
Tense Task - Accuracy: 0.7057
Epoch 6 - Loss: 0.0345
Main Task - Accuracy: 0.9993, Precision: 0.9973, Recall: 0.9993, F1: 0.9983
Tense Task - Accuracy: 0.7436
Epoch 7 - Loss: 0.0253
Main Task - Accuracy: 0.9994, Precision: 0.9976, Recall: 0.9993, F1: 0.9985
Tense Task - Accuracy: 0.7660
Epoch 8 - Loss: 0.0219
Main Task - Accuracy: 0.9998, Precision: 0.9997, Recall: 0.9993, F1: 0.999

loading file spiece.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file xlnet-large-cased/config.json
Model config XLNetConfig {
  "_name_or_path": "xlnet-large-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 16,
  "n_layer": 24,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  

Saved model and predictions for roberta-large


=== Training Model: xlnet-large-cased ===



Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

Epoch 1 - Loss: 0.2983
Main Task - Accuracy: 0.9162, Precision: 0.7527, Recall: 0.8795, F1: 0.8112
Tense Task - Accuracy: 0.4056
Epoch 2 - Loss: 0.1495
Main Task - Accuracy: 0.9773, Precision: 0.9395, Recall: 0.9502, F1: 0.9448
Tense Task - Accuracy: 0.5588
Epoch 3 - Loss: 0.0935
Main Task - Accuracy: 0.9908, Precision: 0.9702, Recall: 0.9855, F1: 0.9778
Tense Task - Accuracy: 0.6246
Epoch 4 - Loss: 0.0635
Main Task - Accuracy: 0.9957, Precision: 0.9869, Recall: 0.9923, F1: 0.9896
Tense Task - Accuracy: 0.6806
Epoch 5 - Loss: 0.0458
Main Task - Accuracy: 0.9981, Precision: 0.9940, Recall: 0.9966, F1: 0.9953
Tense Task - Accuracy: 0.7241
Epoch 6 - Loss: 0.0318
Main Task - Accuracy: 0.9992, Precision: 0.9973, Recall: 0.9990, F1: 0.9982
Tense Task - Accuracy: 0.7549
Epoch 7 - Loss: 0.0215
Main Task - Accuracy: 0.9992, Precision: 0.9976, Recall: 0.9987, F1: 0.9982
Tense Task - Accuracy: 0.7814
Epoch 8 - Loss: 0.0166
Main Task - Accuracy: 0.9997, Precision: 0.9997, Recall: 0.9987, F1: 0.999

In [None]:
# Multi-Task

# 加载所有模型的预测结果
bert_large = np.load("saved_preds/bert_large_uncased_main_preds.npy")
roberta_large = np.load("saved_preds/roberta_large_main_preds.npy")
xlnet_large = np.load("saved_preds/xlnet_large_cased_main_preds.npy")

# 投票集成
final_pred = bert_large + roberta_large + xlnet_large
preds = (final_pred >= 2).astype(int)  # 至少两个模型认为是反事实才预测为 1

# 打印F1
# 加载测试集的标签
test_labels = pd.read_csv("subtask1_test.csv")["gold_label"].values

bert_precision = precision_score(test_labels, bert_large)
bert_recall = recall_score(test_labels, bert_large)
bert_f1 = f1_score(test_labels, bert_large)

roberta_precision = precision_score(test_labels, roberta_large)
roberta_recall = recall_score(test_labels, roberta_large)
roberta_f1 = f1_score(test_labels, roberta_large)

xlnet_precision = precision_score(test_labels, xlnet_large)
xlnet_recall = recall_score(test_labels, xlnet_large)
xlnet_f1 = f1_score(test_labels, xlnet_large)

# 计算最终模型 F1 分数
f1_ensemble = f1_score(test_labels, preds)
precision_ensemble = precision_score(test_labels, preds)
recall_ensemble = recall_score(test_labels, preds)

# 打印每个模型的 F1 分数

print(f"Precision for BERT Large: {bert_precision:.4f}")
print(f"Recall for BERT Large: {bert_recall:.4f}")
print(f"F1 Score for BERT Large: {bert_f1:.4f}\n")

print(f"Precision for RoBERTa Large: {roberta_precision:.4f}")
print(f"Recall for RoBERTa Large: {roberta_recall:.4f}")
print(f"F1 Score for RoBERTa Large: {roberta_f1:.4f}\n")

print(f"Precision for XLNet Large: {xlnet_precision:.4f}")
print(f"Recall for XLNet Large: {xlnet_recall:.4f}")
print(f"F1 Score for XLNet Large: {xlnet_f1:.4f}\n")

print(f"Precision for Ensemble Model: {precision_ensemble:.4f}")
print(f"Recall for Ensemble Model: {recall_ensemble:.4f}")
print(f"F1_ensemble Score on Test Set: {f1_ensemble:.4f}")

# 保存最终集成结果
# np.save("final_ensemble_preds.npy", preds)
# print("✅ 集成预测完成，结果已保存至 final_ensemble_preds.npy")

Precision for BERT Large: 0.8685
Recall for BERT Large: 0.8591
F1 Score for BERT Large: 0.8638

Precision for RoBERTa Large: 0.9202
Recall for RoBERTa Large: 0.8753
F1 Score for RoBERTa Large: 0.8972

Precision for XLNet Large: 0.9110
Recall for XLNet Large: 0.8740
F1 Score for XLNet Large: 0.8921

Precision for Ensemble Model: 0.9130
Recall for Ensemble Model: 0.8821
F1_ensemble Score on Test Set: 0.8973


#### 在原来代码的基础上，加上第四个模型：Roberta_large+语言学特征（结合nltk）

In [27]:
"""Roberta_large+语言学特征（结合nltk）"""
import nltk
from nltk import word_tokenize
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from transformers import RobertaModel, RobertaTokenizer

nltk.data.path.append("nltk_data")
punkt_path = "nltk_data/tokenizers/punkt" 
nltk.data.path.append(punkt_path)
averaged_perceptron_tagger_path = "nltk_data/taggers/averaged_perceptron_tagger"
nltk.data.path.append(averaged_perceptron_tagger_path)
punkt_tab_path = "nltk_data/tokenizers/punkt_tab"
nltk.data.path.append(punkt_tab_path)
punkt_tab_english_path = "nltk_data/tokenizers/punkt_tab/english"
nltk.data.path.append(punkt_tab_english_path)
averaged_perceptron_tagger_eng_path = "nltk_data/taggers/averaged_perceptron_tagger_eng"
nltk.data.path.append(averaged_perceptron_tagger_eng_path)



In [28]:
# 定义本地模型路径
LOCAL_MODEL_PATHS = {
    # 'roberta-large': '/mnt/disk2/xiaolin.zhang/nlp_finalproject/iitk/huggingface/hub/models--roberta-large/snapshots/722cf37b1afa9454edce342e7895e588b6ff1d59',
    'roberta-large': 'roberta-large',
}

# 启用多个模型配置
MODELS = [
    (RobertaModel, RobertaTokenizer, 'roberta-large'),
]

In [29]:
# 加载预训练模型和 Tokenizer
model_class, tokenizer_class, pretrained_weights = (RobertaModel, RobertaTokenizer, 'roberta-large')
tokenizer = tokenizer_class.from_pretrained(LOCAL_MODEL_PATHS[pretrained_weights])
base_model = model_class.from_pretrained(LOCAL_MODEL_PATHS[pretrained_weights], output_hidden_states=False, output_attentions=False).to(device)

# 数据读取
train_corpus = pd.read_csv("subtask1_train_with_tense.csv", encoding='utf-8')
test_corpus = pd.read_csv("subtask1_test.csv", encoding='utf-8')

# 提取 POS 和 N-gram 特征
def extract_pos_tags(sent):
    return " ".join([pairs[1] for pairs in pos_tag(word_tokenize(sent))])

train_corpus['pos_string'] = train_corpus['sentence'].apply(extract_pos_tags)
test_corpus['pos_string'] = test_corpus['sentence'].apply(extract_pos_tags)

pos_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=1000)
pos_vectorizer.fit(train_corpus['pos_string'])

ngram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=1000)
ngram_vectorizer.fit(train_corpus['sentence'])

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
loading configuration file roberta-large/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.40.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading configuration file roberta-large/config.json
Model config RobertaConfig {
  "architect

In [30]:
# 自定义 Dataset
class MultiTaskClassificationDataset(Dataset):
    def __init__(self, corpus, train_corpus, tokenizer, pos_vectorizer, ngram_vectorizer, max_len=128, is_test=False):
        self.corpus = corpus.reset_index()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pos_vectorizer = pos_vectorizer
        self.ngram_vectorizer = ngram_vectorizer
        self.is_test = is_test
        
        # Tokenize sentences
        self.encodings = [
            self.tokenizer.encode(sent, add_special_tokens=True, max_length=self.max_len, truncation=True, padding='max_length')
            for sent in self.corpus['sentence']
        ]
        
        # Convert to tensors
        self.encodings = [torch.tensor(seq, dtype=torch.long) for seq in self.encodings]
        self.labels = self.corpus['gold_label'].values
        
        # 处理时态标签
        if not is_test and 'tense' in corpus.columns:
            # 创建时态到索引的映射
            self.tense_to_idx = {tense: idx for idx, tense in enumerate(sorted(train_corpus['tense'].unique()))}
            self.num_tenses = len(self.tense_to_idx)
            self.tense_labels = [self.tense_to_idx[tense] for tense in self.corpus['tense']]
        else:
            # 对于测试集，使用训练集的时态映射
            if 'tense' in train_corpus.columns:
                self.tense_to_idx = {tense: idx for idx, tense in enumerate(sorted(train_corpus['tense'].unique()))}
                self.num_tenses = len(self.tense_to_idx)
            else:
                self.tense_to_idx = {}
                self.num_tenses = 0
            # 测试集使用占位符
            self.tense_labels = [0] * len(self.corpus)
    
    def __len__(self):
        return len(self.corpus)
    
    def __getitem__(self, idx):
        input_ids = self.encodings[idx].to(device)
        label = torch.tensor(self.labels[idx]).to(device)
        tense_label = torch.tensor(self.tense_labels[idx]).to(device)
        
        sentence = self.corpus.iloc[idx]['sentence']
        pos_feat = self.pos_vectorizer.transform([self.corpus.iloc[idx]['pos_string']]).toarray().squeeze().astype(np.float32)
        ngram_feat = self.ngram_vectorizer.transform([sentence]).toarray().squeeze().astype(np.float32)
        
        pos_tensor = torch.from_numpy(pos_feat).to(device)
        ngram_tensor = torch.from_numpy(ngram_feat).to(device)
        
        return input_ids, label, tense_label, pos_tensor, ngram_tensor

# 定义多任务模型
class MultiTaskCustomModel(nn.Module):
    def __init__(self, base_model, num_tenses):
        super(MultiTaskCustomModel, self).__init__()
        self.transformer = base_model
        
        # 共享特征提取层
        self.lin1 = nn.Linear(1024 + 1000 + 1000, 512)
        self.lin2 = nn.Linear(512, 64)
        
        # 主任务分类头
        self.main_classifier = nn.Linear(64, 2)
        
        # 辅助任务（时态）分类头
        self.tense_classifier = nn.Linear(64, num_tenses)
        
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, pos_feats, ngram_feats):
        outputs = self.transformer(input_ids)
        pooled_output = outputs[1]  # Shape: (batch_size, hidden_size)
        
        # 结合所有特征
        combined = torch.cat((pooled_output, pos_feats, ngram_feats), dim=-1)
        
        # 共享特征提取
        x = F.relu(self.lin1(combined))
        x = self.dropout(x)
        x = F.relu(self.lin2(x))
        x = self.dropout(x)
        
        # 主任务输出
        main_logits = self.main_classifier(x)
        
        # 辅助任务输出
        tense_logits = self.tense_classifier(x)
        
        return main_logits, tense_logits

# 初始化 Dataset 和 DataLoader
train_dataset = MultiTaskClassificationDataset(train_corpus, train_corpus, tokenizer, pos_vectorizer, ngram_vectorizer, max_len=128, is_test=False)
test_dataset = MultiTaskClassificationDataset(test_corpus, train_corpus, tokenizer, pos_vectorizer, ngram_vectorizer, max_len=128, is_test=True)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 实例化多任务模型
model = MultiTaskCustomModel(base_model, num_tenses=train_dataset.num_tenses).to(device)

# 损失函数和优化器
main_criterion = nn.CrossEntropyLoss()
tense_criterion = nn.CrossEntropyLoss()

# 多任务损失权重
main_task_weight = 0.7
tense_task_weight = 0.3

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

total_train_steps = len(train_loader) * 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_train_steps)

# 训练循环
for epoch in range(10):
    model.train()
    total_loss = 0
    train_main_preds, train_main_labels = [], []
    train_tense_preds, train_tense_labels = [], []
    
    for i, data in enumerate(train_loader):
        input_ids, main_labels, tense_labels, pos_feats, ngram_feats = data
        optimizer.zero_grad()
        
        # 前向传播
        main_outputs, tense_outputs = model(input_ids, pos_feats, ngram_feats)
        
        # 计算多任务损失
        main_loss = main_criterion(main_outputs, main_labels)
        tense_loss = tense_criterion(tense_outputs, tense_labels)
        
        # 组合损失
        loss = main_task_weight * main_loss + tense_task_weight * tense_loss
        
        loss.backward()
        
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        # 记录预测结果
        main_preds = torch.argmax(main_outputs, dim=1).cpu().numpy()
        tense_preds = torch.argmax(tense_outputs, dim=1).cpu().numpy()
        
        train_main_preds.extend(main_preds)
        train_main_labels.extend(main_labels.cpu().numpy())
        train_tense_preds.extend(tense_preds)
        train_tense_labels.extend(tense_labels.cpu().numpy())
        
        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Train Loss: {total_loss / len(train_loader):.4f}")
    
    # 主任务指标
    print(f"Main Task - Train Acc: {accuracy_score(train_main_labels, train_main_preds):.4f}, "
          f"Prec: {precision_score(train_main_labels, train_main_preds):.4f}, "
          f"Recall: {recall_score(train_main_labels, train_main_preds):.4f}, "
          f"F1: {f1_score(train_main_labels, train_main_preds):.4f}")
    
    # 辅助任务指标（只在训练时显示）
    if not test_dataset.is_test:
        print(f"Tense Task - Train Acc: {accuracy_score(train_tense_labels, train_tense_preds):.4f}")

    # 测试阶段 - 只关注主任务
    model.eval()
    test_main_preds, test_main_labels = [], []
    
    with torch.no_grad():
        for data in test_loader:
            input_ids, main_labels, _, pos_feats, ngram_feats = data  # 忽略时态标签
            main_outputs, _ = model(input_ids, pos_feats, ngram_feats)  # 忽略时态输出
            
            preds = torch.argmax(main_outputs, dim=1).cpu().numpy()
            test_main_preds.extend(preds)
            test_main_labels.extend(main_labels.cpu().numpy())

    print(f"Test - Main Task Acc: {accuracy_score(test_main_labels, test_main_preds):.4f}, "
          f"Prec: {precision_score(test_main_labels, test_main_preds):.4f}, "
          f"Recall: {recall_score(test_main_labels, test_main_preds):.4f}, "
          f"F1: {f1_score(test_main_labels, test_main_preds):.4f}")
    print("-" * 80)

# 保存模型和预测结果
torch.save(model.state_dict(), "./saved_models/roberta_large_pos_ngram_multitask.pth")
np.save("./saved_preds/roberta_large_pos_ngram_multitask.npy", test_main_preds)
print("Model and predictions saved!")




Epoch 1 - Train Loss: 0.6931
Main Task - Train Acc: 0.9306, Prec: 0.9204, Recall: 0.7238, F1: 0.8103
Test - Main Task Acc: 0.9721, Prec: 0.8694, Recall: 0.8659, F1: 0.8676
--------------------------------------------------------------------------------
Epoch 2 - Train Loss: 0.4258
Main Task - Train Acc: 0.9742, Prec: 0.9548, Recall: 0.9176, F1: 0.9358
Test - Main Task Acc: 0.9726, Prec: 0.8945, Recall: 0.8388, F1: 0.8657
--------------------------------------------------------------------------------
Epoch 3 - Train Loss: 0.3267
Main Task - Train Acc: 0.9818, Prec: 0.9656, Recall: 0.9448, F1: 0.9551
Test - Main Task Acc: 0.9736, Prec: 0.9196, Recall: 0.8211, F1: 0.8676
--------------------------------------------------------------------------------
Epoch 4 - Train Loss: 0.2631
Main Task - Train Acc: 0.9897, Prec: 0.9816, Recall: 0.9680, F1: 0.9748
Test - Main Task Acc: 0.9737, Prec: 0.9086, Recall: 0.8347, F1: 0.8701
---------------------------------------------------------------------

In [33]:
# 加载所有模型的预测结果
roberta_large_pos_ngram = np.load("saved_preds/roberta_large_pos_ngram.npy")

test_labels = pd.read_csv("subtask1_test.csv")["gold_label"].values

# 打印F1
# 加载测试集的标签

roberta_large_pos_ngram_precision = precision_score(test_labels, roberta_large_pos_ngram)
roberta_large_pos_ngram_recall = recall_score(test_labels, roberta_large_pos_ngram)
roberta_large_pos_ngram_f1 = f1_score(test_labels, roberta_large_pos_ngram)

print(f"Precision for RoBERTa Large + nltk: {roberta_large_pos_ngram_precision:.4f}")
print(f"Recall for RoBERTa Large + nltk: {roberta_large_pos_ngram_recall:.4f}")
print(f"F1 Score for RoBERTa Large + nltk: {roberta_large_pos_ngram_f1:.4f}\n")

Precision for RoBERTa Large + nltk: 0.9148
Recall for RoBERTa Large + nltk: 0.8875
F1 Score for RoBERTa Large + nltk: 0.9010



In [36]:
# 加载所有模型的预测结果
bert_large = np.load("saved_preds/bert_large_uncased_main_preds.npy")
roberta_large = np.load("saved_preds/roberta_large_main_preds.npy")
xlnet_large = np.load("saved_preds/xlnet_large_cased_main_preds.npy")

# 投票集成
final_pred = bert_large + roberta_large + xlnet_large + roberta_large_pos_ngram
preds = (final_pred >= 2).astype(int)  # 至少两个模型认为是反事实才预测为 1

# 计算最终模型 F1 分数
f1_ensemble = f1_score(test_labels, preds)
precision_ensemble = precision_score(test_labels, preds)
recall_ensemble = recall_score(test_labels, preds)

# 打印每个模型的 F1 分数
print(f"Precision for Ensemble Model: {precision_ensemble:.4f}")
print(f"Recall for Ensemble Model: {recall_ensemble:.4f}")
print(f"F1_ensemble Score on Test Set: {f1_ensemble:.4f}")

Precision for Ensemble Model: 0.9024
Recall for Ensemble Model: 0.9024
F1_ensemble Score on Test Set: 0.9024
