In [1]:
# Print the GPU attached to this runtime (model, driver version, memory in use).
!nvidia-smi

Fri May 20 17:02:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers datasets torchinfo rouge_score sacrebleu sacremoses git+https://github.com/google-research/bleurt.git

In [3]:
from google.colab import drive
# Mount Google Drive at /gdrive; DATA_PATH below points inside this mount.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [4]:
import torch
import random

# One seed shared by every random number generator seeded in this notebook,
# so that repeated runs start from the same RNG state.
RANDOM_SEED = 42

# Python's own RNG first, then PyTorch's.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [5]:
# Root folder (on the mounted Drive) for the CSV, the tokenized-dataset cache,
# the fine-tuned checkpoint and the generated outputs.
DATA_PATH = "/gdrive/MyDrive/final-project/post-refactor/data/"
# Device the model and the input tensors are moved to.
DEVICE = "cuda"

# Dataset loading

In [6]:
from datasets import load_from_disk
from transformers import AutoTokenizer
import os
import sys
import pandas as pd
import csv

DATASET_CSV_PATH = os.path.join(DATA_PATH, "data.csv")

# Some fields in the CSV are very long: lift the csv module's per-field size
# limit so the python parsing engine does not reject them.
csv.field_size_limit(sys.maxsize)

# Keep only the two columns used below and drop incomplete rows.
# reset_index(drop=True) restores a contiguous 0..n-1 index. Without it the
# dropped rows leave gaps, so per-row results attached to df later can be
# aligned to the wrong rows, and Dataset.from_pandas carries the gapped index
# along as an extra "__index_level_0__" column.
df = (
    pd.read_csv(DATASET_CSV_PATH, engine="python")[["summary", "claim"]]
    .dropna()
    .reset_index(drop=True)
)

# Where the tokenized dataset is cached between runs.
TOKENIZED_DATASET_PATH = os.path.join(DATA_PATH, "tokenized_bart_dataset")
# Token budgets for the encoder input / decoder target.
# None leaves max_length unset, i.e. the tokenizer's own default is used.
MAX_SUMMARY_LEN = None #@param
MAX_CLAIM_LEN = 500 #@param


# Always loaded: it is needed below for tokenization and later for decoding.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

# Tokenizing the whole dataset is slow, so reuse the copy cached on disk when
# one exists and only tokenize from scratch otherwise.
# NOTE: the cache is keyed on the path alone. After changing MAX_SUMMARY_LEN,
# MAX_CLAIM_LEN or the label logic below, delete TOKENIZED_DATASET_PATH so the
# dataset is rebuilt instead of loading stale data.
if os.path.exists(TOKENIZED_DATASET_PATH):
  dataset = load_from_disk(TOKENIZED_DATASET_PATH)
  print("Dataset loaded")
else:
  from datasets import Dataset
  dataset = Dataset.from_pandas(df)

  # rename the text columns to the argument names the model expects; the
  # tokenize functions below then overwrite the text with token ids
  dataset = dataset.rename_column("summary", "input_ids") \
    .rename_column("claim", "decoder_input_ids")
  # defensive: drop any row whose summary is still missing
  dataset = dataset.filter(lambda r: r["input_ids"] is not None)

  def encoder_tokenize_function(row):
    """
    Tokenize a batch of summaries into input_ids and attention_mask.

    `row` is a batch (dict of column name -> list) because the function is
    mapped with batched=True. Sequences are padded/truncated to
    MAX_SUMMARY_LEN when it is set, otherwise to the tokenizer default.
    """
    kwargs = {
        "padding": "max_length",
        "truncation": True,
    }

    if MAX_SUMMARY_LEN is not None:
      kwargs["max_length"] = MAX_SUMMARY_LEN

    return tokenizer(row["input_ids"], **kwargs)

  # tokenize the summaries
  dataset = dataset.map(encoder_tokenize_function, batched=True)

  def decoder_tokenize_function(row):
    """
    Tokenize a batch of claims into the decoder-side columns
    (decoder_input_ids and decoder_attention_mask).

    Sequences are padded/truncated to MAX_CLAIM_LEN when it is set.
    """
    kwargs = {
        "padding": "max_length",
        "truncation": True,
    }

    if MAX_CLAIM_LEN is not None:
      kwargs["max_length"] = MAX_CLAIM_LEN

    tokenized = tokenizer(row["decoder_input_ids"], **kwargs)

    return {
        "decoder_input_ids": tokenized["input_ids"],
        "decoder_attention_mask": tokenized["attention_mask"]
    }

  # tokenize the claim
  dataset = dataset.map(decoder_tokenize_function, batched=True)

  def compute_labels(row):
    """
    Build labels from decoder_input_ids, replacing padding with -100.

    The function is mapped with batched=True, so row["decoder_input_ids"] is
    a list of token-id sequences and each sequence has to be processed token
    by token. The padding id is read from the tokenizer rather than assumed.
    """
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in row["decoder_input_ids"]
    ]
    return {"labels" : labels}

  dataset = dataset.map(compute_labels, batched=True)

  # export the dataset to disk for future loading
  dataset.save_to_disk(TOKENIZED_DATASET_PATH)
  print("Dataset computed and saved")

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset computed and saved


# Model loading

In [8]:
from transformers import BartForConditionalGeneration
from torchinfo import summary

# Fine-tuned checkpoint to load, stored under DATA_PATH.
FINETUNE_MODEL_PATH = os.path.join(DATA_PATH, "BartModelFineTune/checkpoint-1986/")

# Load the weights first, then move the model to the target device.
model = BartForConditionalGeneration.from_pretrained(FINETUNE_MODEL_PATH)
model = model.to(DEVICE)
# NOTE(review): gradient checkpointing is a training-time memory saving; the
# cells visible here only call generate() - confirm this call is still needed.
model.gradient_checkpointing_enable()

# Last expression of the cell: displays the per-layer parameter counts.
summary(model, dtypes=["torch.IntTensor"])

Layer (type:depth-idx)                                  Param #
BartForConditionalGeneration                            --
├─BartModel: 1-1                                        --
│    └─Embedding: 2-1                                   38,603,520
│    └─BartEncoder: 2-2                                 --
│    │    └─Embedding: 3-1                              (recursive)
│    │    └─BartLearnedPositionalEmbedding: 3-2         787,968
│    │    └─ModuleList: 3-3                             42,527,232
│    │    └─LayerNorm: 3-4                              1,536
│    └─BartDecoder: 2-3                                 --
│    │    └─Embedding: 3-5                              (recursive)
│    │    └─BartLearnedPositionalEmbedding: 3-6         787,968
│    │    └─ModuleList: 3-7                             56,710,656
│    │    └─LayerNorm: 3-8                              1,536
├─Linear: 1-2                                           38,603,520
Total params: 178,023,936
Trainable params: 

# Generation

Several decoding methods can be used to generate text; the main ones are greedy search, beam search, top-k sampling and top-p sampling.

We will generate text using all of those methods first and later we will analyze the result by computing metrics on them.

## Greedy search

In [9]:
from tqdm.notebook import tqdm

# Generate one claim per summary with greedy decoding.
# num_beams=1 and do_sample=False are passed explicitly: generate() otherwise
# falls back to whatever generation defaults the checkpoint's config carries,
# which may not be greedy.
greedy_output = list()
for sample in tqdm(dataset):
  with torch.cuda.amp.autocast():
    input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(DEVICE)
    # the inputs are padded to a fixed length, so tell the model which
    # positions are real tokens
    attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(DEVICE)
    pred = model.generate(
      input_ids,
      attention_mask=attention_mask,
      max_length=MAX_CLAIM_LEN,
      num_beams=1,
      do_sample=False,
    ).to("cpu")
    # keep the raw token ids; sequences of different lengths can be decoded
    # as a plain list, so no padding/stacking is needed
    greedy_output.append(pred.squeeze(0).tolist())

# decode all outputs into text
greedy_output = tokenizer.batch_decode(greedy_output, skip_special_tokens=True)

# Attach by position. index=df.index keeps the rows aligned even when df's
# index is not 0..n-1, and raises if the number of generations differs from
# the number of rows instead of silently misaligning them.
df["greedy_gen"] = pd.Series(greedy_output, index=df.index)

# export df
df.to_csv(os.path.join(DATA_PATH, "bart_out.csv"))

  0%|          | 0/60 [00:00<?, ?it/s]

## Beam search

In [10]:
from tqdm.notebook import tqdm

# Generate with beam search and keep every returned hypothesis:
# beam_outputs[k] collects the (k+1)-th returned sequence of each sample.
NUM_BEAMS = 3
beam_outputs = [list() for _ in range(NUM_BEAMS)]
for sample in tqdm(dataset):
  with torch.cuda.amp.autocast():
    input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(DEVICE)
    # the inputs are padded to a fixed length, so tell the model which
    # positions are real tokens
    attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(DEVICE)
    pred = model.generate(
      input_ids,
      attention_mask=attention_mask,
      max_length=MAX_CLAIM_LEN,  # same budget as the other generation cells
      num_beams=NUM_BEAMS,
      num_return_sequences=NUM_BEAMS,
      early_stopping=True
    ).to("cpu")  # do not keep every prediction on the GPU
    # keep the raw token ids; special tokens are stripped when decoding, so
    # no extra padding/stacking is needed
    for rank, sequence in enumerate(pred):
      beam_outputs[rank].append(sequence.tolist())

# decode all outputs into text
beam1_output, beam2_output, beam3_output = [
  tokenizer.batch_decode(sequences, skip_special_tokens=True)
  for sequences in beam_outputs
]

# Attach by position. index=df.index keeps the rows aligned even when df's
# index is not 0..n-1, and raises on a length mismatch instead of silently
# misaligning rows.
df["beam1_gen"] = pd.Series(beam1_output, index=df.index)
df["beam2_gen"] = pd.Series(beam2_output, index=df.index)
df["beam3_gen"] = pd.Series(beam3_output, index=df.index)

# export df
df.to_csv(os.path.join(DATA_PATH, "bart_out.csv"))

  0%|          | 0/60 [00:00<?, ?it/s]