Final Project - Gregory LeMasurier and Mojtaba Talaei Khoei

Making the training file a jupyter notebook for the time being so I can easily debug it.

In [8]:
# Install Dependencies
import sys
!{sys.executable} -m pip install rouge-score nltk sentencepiece



In [9]:
# Common Imports
import os
import random

import transformers
from transformers import PegasusTokenizer, PegasusConfig
from transformers import PegasusForConditionalGeneration

import datasets
from datasets import load_dataset

import torch
from torch.utils.data import DataLoader

import wandb
from packaging import version
from tqdm.auto import tqdm


%load_ext autoreload
%autoreload 2

sys.path.append('../')

from transformer_mt import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Setup logging
import logging

logger = logging.getLogger("Summarization")
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_warning()

In [11]:
# ROUGE Metric
rouge = datasets.load_metric("rouge")

In [12]:
dataset_name = 'cnn_dailymail'
dataset_version = '3.0.0'
wandb_project = "PegasusSummarization"
output_dir = "output_dir/"
device = 'cpu'#'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'google/pegasus-xsum' 
tokenizer_name = 'google/pegasus-cnn_dailymail'
seq_len = 512
batch_size = 8
learning_rate = 5e-5
weight_decay = 0.0
num_train_epochs = 1
lr_scheduler_type = "linear"
num_warmup_steps = 0

# Flag to make 
debug = True

In [13]:
def main():
    logger.info(f"Starting tokenizer training")

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    logger.info(f"Loading dataset")

    #wandb.init(project=wandb_project) #Skipping config for now - will add back later

    os.makedirs(output_dir, exist_ok=True)

    raw_datasets = load_dataset(dataset_name, dataset_version)

    # Make a small dataset for proof of concept
    if debug:
        raw_datasets = utils.sample_small_debug_dataset(raw_datasets)

    ## TOKENIZER
    #tokenizer_path = os.path.join(output_dir, f"tokenizer")
    tokenizer = transformers.PegasusTokenizer.from_pretrained(tokenizer_name)

    ## PRETRAINED MODEL
    #The pegasus model is too large to test on a laptop, so load a small config for now
    #model = PegasusForConditionalGeneration(model_name).to(device)
    config = PegasusConfig(
            encoder_layers=2, 
            decoder_layers=2, 
            encoder_attention_heads=8, 
            decoder_attention_heads=8, 
            decoder_ffn_dim=1024, 
            encoder_ffn_dim=1024,
            max_position_embeddings=seq_len,
            )
    model = PegasusForConditionalGeneration(config).to(device)


    column_names = raw_datasets["train"].column_names

    def tokenize_function(examples):
        inputs = [ex for ex in examples['article']]
        targets = [ex for ex in examples['highlights']]
        model_inputs = tokenizer(inputs, max_length=seq_len, truncation=True)
        model_inputs['labels'] = tokenizer(targets, max_length=seq_len, truncation=True)['input_ids']
        return model_inputs

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=8,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Tokenizing the dataset",
    )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"] if "validaion" in tokenized_datasets else tokenized_datasets["test"]

    for index in random.sample(range(len(train_dataset)), 2):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
        logger.info(f"Decoded input_ids: {tokenizer.decode(train_dataset[index]['input_ids'])}")
        logger.info(f"Decoded labels: {tokenizer.decode(train_dataset[index]['labels'])}")
        logger.info("\n")

    #collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, max_length=seq_len, padding='max_length')
    collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, max_length=seq_len, padding='max_length')

    train_dataloader = DataLoader(
        train_dataset, 
        shuffle=True, 
        collate_fn=collator, 
        batch_size=batch_size
    )
    
    eval_dataloader = DataLoader(
        eval_dataset, 
        collate_fn=collator, 
        batch_size=batch_size
    )

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    
    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = len(train_dataloader)
    max_train_steps = num_train_epochs * num_update_steps_per_epoch

    lr_scheduler = transformers.get_scheduler(
        name=lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=max_train_steps,
    )

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {num_train_epochs}")
    logger.info(f"  Total optimization steps = {max_train_steps}")
    #progress_bar = tqdm(range(max_train_steps))

    # Log a pre-processed training example to make sure the pre-processing does not have bugs in it
    # and we do not input garbage to our model.
    batch = next(iter(train_dataloader))
    #print(len(batch['input_ids']))
    #print(len(batch['labels']))
    #logger.info("Look at the data that we input into the model, check that it looks like what we expect.")
    #for index in random.sample(range(len(batch)), 2):
    #    logger.info(f"Decoded input_ids size: {len(batch['input_ids'][index])}")
    #    logger.info(f"Decoded input_ids: {tokenizer.decode(batch['input_ids'][index])}")
    #    logger.info(f"Decoded labels size: {len(batch['labels'][index])}")
    #    logger.info(f"Decoded labels: {tokenizer.decode(batch['labels'][index])}")
    #    logger.info("\n")

    global_step = 0
    for epoch in range(num_train_epochs):
        model.train()
        for batch in train_dataloader:
            #print("EPOCH: " + str(epoch) + " BATCH: " + str(batch))
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            print(logits)


In [14]:
if __name__ == "__main__" :
    if version.parse(datasets.__version__) < version.parse("1.18.0"):
        raise RuntimeError("This script requires Datasets 1.18.0 or higher. Please update via pip install -U datasets.")
    main()

04/19/2022 09:16:47 - INFO - Summarization - Starting tokenizer training
04/19/2022 09:16:47 - INFO - Summarization - Loading dataset
100%|██████████| 3/3 [00:00<00:00, 605.56it/s]
Tokenizing the dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A


[A[A[A




Tokenizing the dataset #0: 100%|██████████| 1/1 [00:00<00:00, 10.93ba/s]


[A[A
Tokenizing the dataset #2: 100%|██████████| 1/1 [00:00<00:00,  6.97ba/s]




[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A


Tokenizing the dataset #6: 100%|██████████| 1/1 [00:00<00:00,  7.23ba/s]
Tokenizing the dataset #4: 100%|██████████| 1/1 [00:00<00:00,  6.92ba/s]


Tokenizing the dataset #3: 100%|██████████| 1/1 [00:00<00:00,  6.04ba/s]
Tokenizing the dataset #1: 100%|██████████| 1/1 [00:00<00:00,  5.16ba/s]
Tokenizing the dataset #7: 100%|██████████| 1/1 [00:00<00:00, 12.37ba/s]




Tokenizing the dataset #5: 100%|██████████| 1/1 [00:00<00:00,  7.44ba/s]
04/19/2022 09:16:56 - INFO - Summarization - Sample 75 of the training s

IndexError: index out of range in self