In [1]:
from rich import print
from datasets import load_dataset , load_metric
from transformers import AutoTokenizer , BatchEncoding, BartForConditionalGeneration, BartTokenizer
import nltk
import string
import torch
import evaluate
from tqdm import tqdm
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the JSON file into a DatasetDict. As the output below shows, the rows
# end up in a single "train" split with 'headline' and 'body' columns.
dataset = load_dataset("json" , data_files="./data/train.json")
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'body'],
        num_rows: 100000
    })
})

In [4]:
# Hold out 10 rows as a small sample. A fixed seed keeps the selected rows
# identical across kernel restarts, so results based on this split are
# reproducible.
sample_test = dataset["train"].train_test_split(test_size=10, seed=42)
sample_test

DatasetDict({
    train: Dataset({
        features: ['headline', 'body'],
        num_rows: 99990
    })
    test: Dataset({
        features: ['headline', 'body'],
        num_rows: 10
    })
})

In [5]:
# Show the kernel's working directory; the relative data and model paths used
# in this notebook are resolved against it.
%pwd

'/root/code/python/NYCU-Data-Science-2024/HW3'

In [6]:
# Model to load for the tokenizer and the generation model below.
# The commented-out line is an alternative id kept for reference; the active
# value is a relative path to a local checkpoint directory.
# model_id = "facebook/bart-base"
model_id = "model/model/data_science_hw3_model_facebook-bart-large/checkpoint-6300"

In [7]:
# Load the BART tokenizer from `model_id`; the bare name on the last line
# displays its configuration as the cell output.
tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer

BartTokenizer(name_or_path='model/model/data_science_hw3_model_facebook-bart-large/checkpoint-6300', vocab_size=50265, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

In [8]:
# Load the conditional-generation model from `model_id` and place it on
# `device` in a single step; the last line displays the module structure.
model = BartForConditionalGeneration.from_pretrained(model_id).to(device)
model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [9]:
# Check which device the model parameters live on.
# NOTE: this rebinds the notebook-level `device` to the device of the model's
# first parameter.
device = next(model.parameters()).device

# Print the device information (`print` is rich's print, imported in the first cell).
print(f"Model is running on {device}.")

In [10]:
# from torch.utils.data import DataLoader
# max_input_length = 1024
# max_target_length = 64
# # Define the dataset
# class YourDataset(torch.utils.data.Dataset):
#     def __init__(self, dataset):
#         self.dataset = dataset

#     def __len__(self):
#         return len(self.dataset)

#     def __getitem__(self, idx):
#         # Process the dataset item and return the required data
#         test_item = self.dataset[idx]
#         label = tokenizer(test_item["headline"], max_length=max_target_length, truncation=True, padding=True, return_tensors="pt").to(device) 
#         pre_text_input = tokenizer(test_item["body"], max_length=max_input_length, truncation=True, padding=True, return_tensors="pt").to(device)
#         return  label, pre_text_input

In [11]:
# data_loader = DataLoader(YourDataset(dataset["train"]), batch_size=10)
# data_loader

In [12]:
# list(data_loader)[:5]

In [14]:
# Accumulators filled in lock-step, one entry per processed example.
label_tensor_arr = []  # reference headlines (plain strings, despite the name)
pre_tensor_arr = []    # generated token-id tensors, moved to the CPU
max_input_length = 1024
max_target_length = 64  # not used by this loop; kept so the name stays defined

# The model was already moved to `device` in an earlier cell.
# model.to(device)

# NOTE(review): this iterates over the whole "train" split; the 10-row
# `sample_test` split created earlier is not used here -- confirm this is intended.
for test_item in tqdm(dataset["train"], desc="check"):
    # Tokenize the article body (truncated to max_input_length) and move the
    # encoded inputs to the model's device.
    # The headline is no longer tokenized here: the encoded label was never
    # read, and the raw headline string is what gets stored below.
    pre_text_input = tokenizer(
        test_item["body"],
        max_length=max_input_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Generate the prediction; gradients are not needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **pre_text_input,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2,
        )

    # Both lists are appended only after generation succeeded, so they stay
    # the same length even if the loop is interrupted.
    label_tensor_arr.append(test_item["headline"])
    pre_tensor_arr.append(outputs[0].cpu())

# for (str_input, labels, pre_text_inputs) in tqdm(data_loader, desc="check"):
#     # Generate predictions
#     with torch.no_grad():
#         outputs = model.generate(
#             **pre_text_inputs,
#             num_beams=5,
#             early_stopping=True,
#             no_repeat_ngram_size=2,
#         )

#     label_tensor_arr.extend(str_input)
#     pre_tensor_arr.extend(outputs[0].cpu())


check:   0%|          | 437/100000 [02:49<10:43:15,  2.58it/s]


KeyboardInterrupt: 

In [15]:
# Load the evaluation metrics used by `compute_metrics`.
# `rouge_types` is an argument of `metric_rouge.compute(...)`, not of
# `evaluate.load(...)`: passing it at load time did not restrict the reported
# scores (the results still contained rougeLsum), so it is dropped here.
# To report only a subset, pass `rouge_types=[...]` to `compute()` instead.
metric_rouge = evaluate.load("rouge")
metric_bert_score = evaluate.load("bertscore")

In [16]:
# print(label_tensor_arr)

In [17]:
def compute_metrics(eval_pred) -> dict:
    """Score generated sequences against reference texts.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is a
            collection of token-id sequences that ``tokenizer.batch_decode``
            can decode; ``labels`` is an iterable of reference strings
            (they are used as text directly and are not decoded).

    Returns:
        A dict holding every ROUGE score returned by ``metric_rouge`` scaled
        by 100, ``"BERTScore f1 mean"`` (mean BERTScore F1 scaled by 100) and
        ``"gen_len"`` (mean count of non-pad tokens per prediction), with all
        values rounded to 4 decimal places.
    """
    predictions, labels = eval_pred

    def _one_sentence_per_line(text):
        # ROUGE expects the sentences of a text to be separated by newlines.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    # Predictions arrive as token ids and must be decoded first; the labels
    # are already plain strings.
    pred_texts = [
        _one_sentence_per_line(text)
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    ref_texts = [_one_sentence_per_line(text) for text in labels]

    rouge_scores = metric_rouge.compute(
        predictions=pred_texts,
        references=ref_texts,
        use_stemmer=True,
    )
    bert_scores = metric_bert_score.compute(
        predictions=pred_texts,
        references=ref_texts,
        lang="en",
    )

    # Express the ROUGE values as percentages.
    scores = {name: score * 100 for name, score in rouge_scores.items()}

    # BERTScore returns one F1 per example; report their mean as a percentage.
    scores["BERTScore f1 mean"] = np.mean(bert_scores["f1"]) * 100

    # Mean generated length, counting every token that is not the pad token.
    scores["gen_len"] = np.mean(
        [np.count_nonzero(seq != tokenizer.pad_token_id) for seq in predictions]
    )

    return {name: round(score, 4) for name, score in scores.items()}

In [18]:
# Score the generated token sequences against the reference headlines that the
# generation loop collected (predictions first, references second).
ans = compute_metrics((pre_tensor_arr , label_tensor_arr))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
ans

# # Encode the input text
# input_ids = tokenizer(input_text, return_tensors="pt").to(device) # .input_ids
# print(input_ids)
# # Generate text
# outputs = model.generate(
#     **input_ids,
#     num_beams=5,
#     early_stopping=True,
#     no_repeat_ngram_size=2,
# )

# # Decode the generated text
# decoded_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)




{'rouge1': 51.5127,
 'rouge2': 33.7958,
 'rougeL': 47.2369,
 'rougeLsum': 47.4197,
 'BERTScore f1 mean': 90.9663,
 'gen_len': 16.7574}

In [20]:
# print(decoded_outputs)