In [None]:
!pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu121
!pip install transformers==4.37.0
!pip install datasets==2.21.0
!pip install accelerate==0.21.0
!pip install rouge==1.0.1
!pip install tqdm==4.66.5
!pip install jieba==0.42.1

In [None]:
import os

import jieba
import torch
from datasets import load_dataset
from rouge import Rouge
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel

In [None]:
class LCSTSDataset(torch.utils.data.Dataset):
    """Map-style dataset over raw records that normalises punctuation on access.

    Each record is a dict of fields. When an item is fetched, every string
    field has a fixed set of full-width punctuation marks replaced by their
    half-width counterparts. The stored raw records are never modified.
    """

    def __init__(self, raw_data) -> None:
        """Keep a reference to `raw_data`, an indexable collection of dicts."""
        super().__init__()
        self.data = raw_data
        # To prevent out-of-vocabulary tokens from being transformed into [UNK]
        self.token_replacement = [
            ["：", ":"],
            ["，", ","],
            ["“", '"'],
            ["”", '"'],
            ["？", "?"],
            ["……", "..."],
            ["！", "!"],
        ]

    def _normalize(self, value):
        """Apply every punctuation replacement to `value`; non-strings pass through unchanged."""
        if not isinstance(value, str):
            return value
        for full_width, half_width in self.token_replacement:
            value = value.replace(full_width, half_width)
        return value

    def __getitem__(self, index):
        """Return a normalised copy of the record at `index`."""
        record = self.data[index]
        # Build a new dict instead of assigning into `record`, so the
        # underlying raw data is left exactly as it was loaded.
        return {key: self._normalize(value) for key, value in record.items()}

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

In [None]:
# `pad_token_id`=`tokenizer.eos_token_id`:
# Within a batch, sequences that finish generating early should be filled with <|endoftext|> rather than [PAD] at the end.
# Check more details from the following link.
# https://github.com/huggingface/transformers/blob/b880508440f43f80e35a78ccd2a32f3bde91cb23/src/transformers/generation_utils.py#L1248-L1251

# `max_new_tokens`: limits only the number of newly generated tokens.
# If you set `max_length` instead, Hugging Face also counts the input (prompt) tokens towards the limit!

def do_evaluate(tokenizer, model, validation_loader, rouge_metric, inner_check=False):
    """Generate summaries for the validation batches and score them with ROUGE.

    Args:
        tokenizer: Tokenizer providing `batch_decode` and `eos_token_id`.
        model: Model providing `generate`. It is switched to eval mode for the
            duration of the call and put back into training mode afterwards if
            it was in training mode on entry.
        validation_loader: Iterable of `(ground_truth, inputs)` pairs, where
            `ground_truth["input_ids"]` holds the full reference sequences and
            `inputs` holds the keyword arguments forwarded to `model.generate`.
        rouge_metric: Object exposing `get_scores(predictions, references, avg=True)`.
        inner_check: If True, stop after the first 100 batches (quick check
            used during training).

    Returns:
        Tuple `(score, predictions, references)`. The two lists hold one
        string per example, made of jieba tokens joined by single spaces.
    """
    max_inner_batches = 100  # Batches evaluated when `inner_check` is True
    pbar = tqdm(validation_loader)
    pbar.set_description("Evaluating")

    predictions = []
    references = []
    was_training = model.training
    model.eval()  # Disable dropout while generating
    try:
        for batch_idx, (ground_truth, inputs) in enumerate(pbar):
            generated = model.generate(
                **inputs,
                max_new_tokens=200,  # Maximum number of tokens to generate
                pad_token_id=tokenizer.eos_token_id,
            )
            # Keep what follows the first [SEP], drop the spaces inserted by
            # decoding, and cut everything from the first <|endoftext|> onwards.
            output = [
                s.split("[SEP]")[1].replace(" ", "").split("<|endoftext|>")[0]
                for s in tokenizer.batch_decode(generated)
            ]
            targets = [
                s.split("[SEP]")[1].replace(" ", "").replace("<|endoftext|>", "")
                for s in tokenizer.batch_decode(ground_truth["input_ids"])
            ]
            assert len(output) == len(targets)
            # An empty hypothesis makes the ROUGE scorer fail, so substitute a
            # single space. This is done per example so it also works when a
            # batch holds more than one prediction.
            output = [o if o else " " for o in output]
            # We use jieba to perform word-level evaluations with ROUGE
            predictions.extend(" ".join(jieba.lcut(o)) for o in output)
            references.extend(" ".join(jieba.lcut(t)) for t in targets)
            # During training, we only evaluate the first 100 batches.
            if inner_check and batch_idx + 1 >= max_inner_batches:
                break
    finally:
        if was_training:
            model.train()

    score = rouge_metric.get_scores(predictions, references, avg=True)
    if inner_check:
        print(f"Validation using {len(predictions)} examples: ", score)
    else:
        print(score)

    return score, predictions, references

In [None]:
def collate_fn(batch):
    """Turn a list of raw examples into a training batch and an inference batch.

    Reads the module-level `tokenizer` and `device`.

    Args:
        batch: List of dicts, each with a "text" and a "summary" string.

    Returns:
        Tuple `(complete_text, infer_text)` of dicts of tensors on `device`:
        - `complete_text`: `input_ids`, `attention_mask` and `labels` for the
          sequence "[CLS]text[SEP]summary<|endoftext|>". `labels` equals
          `input_ids` with padded positions set to -100.
        - `infer_text`: `input_ids` and `attention_mask` for the text alone,
          encoded with the tokenizer's default special tokens.
    """
    complete_text = [
        f"[CLS]{example['text']}[SEP]{example['summary']}<|endoftext|>"
        for example in batch
    ]
    # `return_token_type_ids=False`: GPT-2 embeds token_type_ids through the
    # word embedding matrix and adds them to every position, so forwarding the
    # tokenizer's all-zero segment ids would shift all hidden states.
    complete_text = tokenizer.batch_encode_plus(
        complete_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=False,
        return_token_type_ids=False,
    )
    # Set label padding tokens to -100 for loss masking. The attention mask is
    # used instead of comparing with pad_token_id, so real tokens are never
    # masked even if the pad token id coincides with another token id.
    labels = torch.where(
        condition=complete_text.attention_mask.bool(),
        input=complete_text.input_ids,
        other=-100,
    )
    complete_text = {k: v.to(device) for k, v in complete_text.items()}
    complete_text["labels"] = labels.to(device)

    infer_text = [example["text"] for example in batch]
    infer_text = tokenizer.batch_encode_plus(
        infer_text,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_token_type_ids=False,
    )
    infer_text = {k: v.to(device) for k, v in infer_text.items()}
    return complete_text, infer_text

In [None]:
TRAIN_BATCH_SIZE = 32
VAL_BATCH_SIZE = 1 # During evaluation, we don't pad the input.
NUM_EPOCHS = 3
LR = 1e-5
SAVED_DIR = "saved_models"
model_name = "uer/gpt2-chinese-cluecorpussmall"
SEED = 42

# Seed PyTorch's random number generators so that runs are repeatable
# (e.g. the shuffling order of the training loader).
torch.manual_seed(SEED)

# TensorBoard writer
writer = SummaryWriter(f"runs/{SAVED_DIR}_test_bs{VAL_BATCH_SIZE}")

In [None]:
# Pick the compute device; replace cuda:0 with your own device id if needed.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Load the tokenizer with left padding, as used for GPT2.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

In [None]:
# Accessing records through the Hugging Face dataset object can be slow;
# converting each split to a plain Python list with .to_list() makes it faster.

raw_train, raw_val = (
    load_dataset("hugcyp/LCSTS", split=split_name, cache_dir="./cache/").to_list()
    for split_name in ("train", "validation")
)

In [None]:
# Wrap both raw splits in the punctuation-normalising dataset.
train_set, val_set = LCSTSDataset(raw_data=raw_train), LCSTSDataset(raw_data=raw_val)

In [None]:
# Training batches are shuffled; validation batches keep the dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)
val_loader = torch.utils.data.DataLoader(
    dataset=val_set, batch_size=VAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)

In [None]:
# Register <|endoftext|> as the eos token first, then grow the embedding matrix to match.
# `resize_token_embeddings`: increasing the size adds newly initialized vectors at the end.

tokenizer.add_special_tokens({"eos_token": "<|endoftext|>"})
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

In [None]:
# Evaluation metric (ROUGE) and the AdamW optimizer over all model parameters
rouge_metric = Rouge()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)

In [None]:
# torch.save does not create missing parent directories, so make sure the
# checkpoint directory exists before spending an epoch on training.
os.makedirs(SAVED_DIR, exist_ok=True)

step_i = 0
for epoch in range(NUM_EPOCHS):
    # `from_pretrained` returns the model in eval mode; switch to training
    # mode explicitly so that dropout is active while fine-tuning.
    model.train()
    pbar = tqdm(train_loader)
    pbar.set_description(f"Training epoch [{epoch+1}/{NUM_EPOCHS}]")
    for inputs, _ in pbar:
        optimizer.zero_grad()
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()
        loss_value = loss.item()  # Read the scalar once and reuse it
        pbar.set_postfix(loss=loss_value)
        # Log the loss to TensorBoard
        writer.add_scalar("Loss/train", loss_value, step_i)

        if step_i % 1000 == 0 and step_i != 0: # Evaluate every 1000 steps
            model.eval()  # No dropout while generating validation summaries
            score, pres, refs = do_evaluate(
                tokenizer=tokenizer,
                model=model,
                validation_loader=val_loader,
                rouge_metric=rouge_metric,
                inner_check=True,
            )
            model.train()  # Back to training mode for the next steps
            print(f"Rouge scores on step{step_i} of epoch {epoch}:", score)
            print("Predictions:", pres[:5]) # Check the first 5 predictions
            print("References:", refs[:5])  # Check the first 5 references
            writer.add_scalar("Rouge-1/val", score["rouge-1"]["f"], step_i)
            writer.add_scalar("Rouge-2/val", score["rouge-2"]["f"], step_i)

        step_i += 1
    # End of epoch: evaluate on the whole validation set, then save a checkpoint.
    model.eval()
    score, pres, refs = do_evaluate(
        tokenizer=tokenizer,
        model=model,
        validation_loader=val_loader,
        rouge_metric=rouge_metric,
    )
    torch.save(model, f"{SAVED_DIR}/ep{epoch}.ckpt")

tokenizer.save_pretrained(f"{SAVED_DIR}/tokenizer")
# Make sure every logged scalar is written to disk.
writer.flush()