In [1]:
# !pip install datasets transformers tqdm rouge accelerate

In [1]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DistilBertModel, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    LlamaTokenizerFast,
    LlamaForCausalLM,
    AutoModelForCausalLM,
    AutoConfig,
)
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from datasets import load_dataset, load_dataset, load_metric
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from logging import getLogger
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from rouge import Rouge
from rouge import FilesRouge

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/hufy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
llama_path = "/scratch/chaijy_root/chaijy2/hufy/.cache/huggingface/hub/LLaMA-2-hf"
local_only = True
t = AutoTokenizer.from_pretrained(llama_path)
m = AutoModelForCausalLM.from_pretrained(llama_path, local_files_only=local_only).to("cuda")

Loading checkpoint shards: 100%|██████████| 2/2 [00:31<00:00, 15.94s/it]


In [2]:
llama_path = "/scratch/chaijy_root/chaijy2/hufy/.cache/huggingface/hub/LLaMA-2-hf"
local_only = True

llama_tokenizer = LlamaTokenizerFast.from_pretrained(
    llama_path,
    local_files_only=local_only,
)
llama_config = AutoConfig.from_pretrained(llama_path)
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(llama_config)
empty_model.tie_weights()
bnb_quantization_config = BnbQuantizationConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.bfloat16, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4",
)
llama_model = load_and_quantize_model(
    empty_model, 
    weights_location=llama_path, 
    bnb_quantization_config=bnb_quantization_config, 
    device_map = "auto",
)

In [3]:
# model_name = "t5-small"
# model_name = "allenai/led-base-16384"
summ_model_name = "google/bigbird-pegasus-large-arxiv"
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model_name)
summ_model = AutoModelForSeq2SeqLM.from_pretrained(
    summ_model_name,
).to('cuda')

In [4]:
DEBUG = True

In [29]:
batch_size = 8
llama_tokenizer.padding_side = 'left'
llama_tokenizer.pad_token = llama_tokenizer.unk_token

In [7]:
# dataset = load_dataset("cnn_dailymail", '3.0.0')
# dataset = load_dataset("scientific_papers", "arxiv")
dataset = load_dataset("squad_v2")
train_dataset = dataset['train']
val_dataset = dataset['validation']
# test_dataset = dataset['test']

In [8]:
if DEBUG:
    train_dataset = train_dataset.select(range(250))
    val_dataset = val_dataset.select(range(25))
    # test_dataset = test_dataset.select(range(25))

In [9]:
def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    text = [data["context"] for data in batch]
    inputs = summ_tokenizer(
        text,
        padding=True,
        return_tensors="pt"
    )
    return {
        "text": text,
        **inputs
    }

In [47]:
def generate_batched_prompt(batch):
    text = [data["context"] for data in batch]
    prompts = ["Article:\n" + t + "Summarization in one detailed sentence:\n" for t in text]
    inputs = llama_tokenizer(
        prompts,
        padding=True, 
        return_tensors="pt"
    )
    return {
        "text": text,
        "prompts": prompts,
        **inputs
    }

In [37]:
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=process_data_to_model_inputs)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=process_data_to_model_inputs)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=generate_batched_prompt)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=generate_batched_prompt)

In [46]:
for dp in val_dataloader:
    print("Generating summary...")
    generated = llama_model.generate(
        input_ids=dp["input_ids"].to(llama_model.device), 
        attention_mask=dp["attention_mask"].to(llama_model.device),
        max_new_tokens=64,
    )
    results = llama_tokenizer.batch_decode(generated[:, len(dp["input_ids"][0]):])
    # print("Original:")
    # print(dp["text"][0] + "\n")
    # print("Generated:")
    # print(results[0].split("\n")[0] + "\n")
    summary = [result.split("\n")[0] + "\n" for result in results]
    print(summary)
    break

Generating summary...
['The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n', 'The Normans were Vikings who settled in France and became French.\n']


In [14]:
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


In [15]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [16]:
trainer = Seq2SeqTrainer(
    model=led,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
trainer.train()

You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 