In [30]:
import re

from datasets import load_dataset, load_metric
import evaluate
import nltk
import nltk.data
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import (
    AdamW, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [31]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\milan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Setup and Preprocessing for Model Load

In [32]:
DATASET_NAME = "multi_x_science_sum"
DOC_SEP = " ||||| "
BATCH_SIZE = 16
MAX_LENGTH_ENC = 4096
MAX_LENGTH_DEC = 256

rouge = load_metric("rouge")

dataset = load_dataset(DATASET_NAME)

pat = re.compile("@cite_[0-9]+")

Found cached dataset multi_x_science_sum (C:/Users/milan/.cache/huggingface/datasets/multi_x_science_sum/default/1.1.0/2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729)
100%|██████████| 3/3 [00:00<00:00, 158.28it/s]


In [33]:
def preprocess_dataset(example):

    abstracts = example["abstract"].split("| Abstract: ")[-1]
    related_work = pat.sub("@cite", example["related_work"])
    ref_abstracts = filter(bool, example["ref_abstract"]["abstract"])
    output = {
        "abstracts": f"{abstracts}{DOC_SEP}{DOC_SEP.join(ref_abstracts)}",
        "related_work": related_work
    }
    return output

def preprocess_dataset_batched(example):
    abstracts = [
        abstract.split("| Abstract: ")[-1] + DOC_SEP + DOC_SEP.join([x for x in ref_abstract["abstract"] if x])
        for abstract, ref_abstract in zip(example["abstract"], example["ref_abstract"])
    ]
    related_work = [pat.sub("@cite", rw) for rw in example["related_work"]]
    output = {
        "abstracts": abstracts,
        "related_work": related_work,
    }
    return output

dataset_processed = {}
for split in dataset.keys():
    dataset_processed[split] = dataset[split].map(
        preprocess_dataset_batched,
        remove_columns=dataset[split].column_names,
        batched=True,
        batch_size=BATCH_SIZE,
    )

Loading cached processed dataset at C:\Users\milan\.cache\huggingface\datasets\multi_x_science_sum\default\1.1.0\2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729\cache-36ea612be6b764b4.arrow
Loading cached processed dataset at C:\Users\milan\.cache\huggingface\datasets\multi_x_science_sum\default\1.1.0\2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729\cache-521ec846b4403907.arrow
Loading cached processed dataset at C:\Users\milan\.cache\huggingface\datasets\multi_x_science_sum\default\1.1.0\2876ec0401f8f5c5acf7f4857dbc8d6229a390ab428321ab848f03f14b7f9729\cache-bfe27b0fca3375bf.arrow


In [34]:
dataset_processed

{'train': Dataset({
     features: ['related_work', 'abstracts'],
     num_rows: 30369
 }),
 'test': Dataset({
     features: ['related_work', 'abstracts'],
     num_rows: 5093
 }),
 'validation': Dataset({
     features: ['related_work', 'abstracts'],
     num_rows: 5066
 })}

## Loading the Model and Tokenizer

In [35]:
def get_tokenizer(host_tokenizer: str):
  """return the tokenizer and model for LLM training"""

  return (AutoTokenizer.from_pretrained(host_tokenizer, 
                                        use_cache=False, 
                                        gradient_checkpointing=True), 
          AutoModelForSeq2SeqLM.from_pretrained(host_tokenizer, 
                                                use_cache=False, 
                                                gradient_checkpointing=True))


centrum_tokenizer, centrum_model = get_tokenizer("ratishsp/Centrum")

loading file vocab.json from cache at C:\Users\milan/.cache\huggingface\hub\models--ratishsp--Centrum\snapshots\e9e32bd7ab7f460c1786f42e6e7f4f5697ace02d\vocab.json
loading file merges.txt from cache at C:\Users\milan/.cache\huggingface\hub\models--ratishsp--Centrum\snapshots\e9e32bd7ab7f460c1786f42e6e7f4f5697ace02d\merges.txt
loading file tokenizer.json from cache at C:\Users\milan/.cache\huggingface\hub\models--ratishsp--Centrum\snapshots\e9e32bd7ab7f460c1786f42e6e7f4f5697ace02d\tokenizer.json
loading file added_tokens.json from cache at C:\Users\milan/.cache\huggingface\hub\models--ratishsp--Centrum\snapshots\e9e32bd7ab7f460c1786f42e6e7f4f5697ace02d\added_tokens.json
loading file special_tokens_map.json from cache at C:\Users\milan/.cache\huggingface\hub\models--ratishsp--Centrum\snapshots\e9e32bd7ab7f460c1786f42e6e7f4f5697ace02d\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\milan/.cache\huggingface\hub\models--ratishsp--Centrum\snapshots\e9e32bd7a

In [36]:
print(centrum_tokenizer, centrum_model)

PreTrainedTokenizerFast(name_or_path='ratishsp/Centrum', vocab_size=50265, model_max_len=16384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}) LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50266, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens

In [37]:
def tokenize_dataset_batched(example):
    # Tokenizer input
    input_encoding = centrum_tokenizer(
        example["abstracts"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH_ENC,
        return_tensors="pt",
    )

    # Tokenizer output
    output_encoding = centrum_tokenizer(
        example["related_work"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH_DEC,
        return_tensors="pt",
    )

    # Modify output encoding to ignore padding in loss function
    # torch ignore -100 in loss function computation
    labels = output_encoding["input_ids"].clone()
    labels[labels == centrum_tokenizer.pad_token_id] = -100

    # Global attention with vectorized operations (optimized for GPU)
    input_ids = input_encoding["input_ids"]
    docsep_token_id = centrum_tokenizer.convert_tokens_to_ids(DOC_SEP)
    global_attention_mask = (input_ids == centrum_tokenizer.cls_token_id) | (input_ids == docsep_token_id)

    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "global_attention_mask": global_attention_mask.float(),
        "labels": labels,
    }

centrum_tokenizer.add_tokens(DOC_SEP, special_tokens=True)
centrum_model.resize_token_embeddings(len(centrum_tokenizer))
docsep_token_id = centrum_tokenizer.convert_tokens_to_ids(DOC_SEP)

dataset_tokenized = {}
for split in dataset_processed.keys():
    dataset_tokenized[split] = (
        dataset_processed[split]
        .select(range(len(dataset_processed[split])))
        .map(
            tokenize_dataset_batched,
            remove_columns=dataset_processed[split].column_names,
            batched=True,
            batch_size=BATCH_SIZE,
        )
    )



In [38]:
dataset_tokenized

{'train': Dataset({
     features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
     num_rows: 30369
 }),
 'test': Dataset({
     features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
     num_rows: 5093
 }),
 'validation': Dataset({
     features: ['input_ids', 'attention_mask', 'global_attention_mask', 'labels'],
     num_rows: 5066
 })}

In [39]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = centrum_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = centrum_tokenizer.pad_token_id
    label_str = centrum_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(
        predictions=pred_str, references=label_str, rouge_types=["rouge2"]
    )["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

## Fine-tuning the Model with Trainer

In [40]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    fp16=True,
    num_train_epochs=2,
    learning_rate=2e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=250,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [41]:
trainer = Seq2SeqTrainer(
    model=centrum_model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["validation"],
)

Using cuda_amp half precision backend
 13%|█▎        | 758/5694 [54:56<5:57:49,  4.35s/it]


In [42]:
trainer.train()

***** Running training *****
  Num examples = 30369
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3798
  Number of trainable parameters = 152408832
 13%|█▎        | 500/3798 [1:43:37<2:15:43,  2.47s/it]  

{'loss': 3.4396, 'learning_rate': 0.0001859639233370913, 'epoch': 0.26}


 26%|██▋       | 1000/3798 [2:04:57<1:55:21,  2.47s/it]***** Running Evaluation *****
  Num examples = 5066
  Batch size = 16


{'loss': 3.2059, 'learning_rate': 0.00015777903043968434, 'epoch': 0.53}


                                                       
 26%|██▋       | 1000/3798 [2:07:39<1:55:21,  2.47s/it]Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000\config.json


{'eval_loss': 3.1048591136932373, 'eval_runtime': 162.5592, 'eval_samples_per_second': 31.164, 'eval_steps_per_second': 1.95, 'epoch': 0.53}


Model weights saved in ./checkpoint-1000\pytorch_model.bin
 39%|███▉      | 1500/3798 [2:28:19<1:34:49,  2.48s/it] 

{'loss': 3.1152, 'learning_rate': 0.00012959413754227734, 'epoch': 0.79}


 53%|█████▎    | 2000/3798 [2:48:49<1:13:14,  2.44s/it]***** Running Evaluation *****
  Num examples = 5066
  Batch size = 16


{'loss': 2.9804, 'learning_rate': 0.00010140924464487037, 'epoch': 1.05}


                                                       
 53%|█████▎    | 2000/3798 [2:51:32<1:13:14,  2.44s/it]Saving model checkpoint to ./checkpoint-2000
Configuration saved in ./checkpoint-2000\config.json


{'eval_loss': 2.9712271690368652, 'eval_runtime': 162.9232, 'eval_samples_per_second': 31.094, 'eval_steps_per_second': 1.946, 'epoch': 1.05}


Model weights saved in ./checkpoint-2000\pytorch_model.bin
 66%|██████▌   | 2500/3798 [3:12:08<53:20,  2.47s/it]   

{'loss': 2.7474, 'learning_rate': 7.328072153325818e-05, 'epoch': 1.32}


 79%|███████▉  | 3000/3798 [10:20:27<10:41:14, 48.21s/it] ***** Running Evaluation *****
  Num examples = 5066
  Batch size = 16


{'loss': 2.7161, 'learning_rate': 4.509582863585119e-05, 'epoch': 1.58}


                                                         
 79%|███████▉  | 3000/3798 [10:23:25<10:41:14, 48.21s/it]Saving model checkpoint to ./checkpoint-3000
Configuration saved in ./checkpoint-3000\config.json


{'eval_loss': 2.889575481414795, 'eval_runtime': 178.1377, 'eval_samples_per_second': 28.439, 'eval_steps_per_second': 1.78, 'epoch': 1.58}


Model weights saved in ./checkpoint-3000\pytorch_model.bin
 92%|█████████▏| 3500/3798 [10:46:45<14:01,  2.82s/it]   

{'loss': 2.6719, 'learning_rate': 1.6910935738444194e-05, 'epoch': 1.84}


100%|██████████| 3798/3798 [11:00:36<00:00,  2.04s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 3798/3798 [11:00:36<00:00, 10.44s/it]

{'train_runtime': 39636.919, 'train_samples_per_second': 1.532, 'train_steps_per_second': 0.096, 'train_loss': 2.958127825808814, 'epoch': 2.0}





TrainOutput(global_step=3798, training_loss=2.958127825808814, metrics={'train_runtime': 39636.919, 'train_samples_per_second': 1.532, 'train_steps_per_second': 0.096, 'train_loss': 2.958127825808814, 'epoch': 2.0})

In [43]:
torch.save(centrum_model.state_dict(), "centrum_xsci_test")