In [1]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Tue Apr 19 20:25:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Install the notebook's third-party dependencies; BLEURT is installed straight
# from its GitHub repository. `%pip` is used instead of `!pip` so the packages
# are installed into the environment of the running kernel.
# NOTE(review): versions are not pinned — pin them once the working set is known.
%pip install -q transformers datasets torchinfo rouge_score git+https://github.com/google-research/bleurt.git

[K     |████████████████████████████████| 4.0 MB 7.3 MB/s 
[K     |████████████████████████████████| 325 kB 2.9 MB/s 
[K     |████████████████████████████████| 352 kB 50.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 55.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 48.1 MB/s 
[K     |████████████████████████████████| 596 kB 58.5 MB/s 
[K     |████████████████████████████████| 77 kB 6.7 MB/s 
[K     |████████████████████████████████| 895 kB 59.4 MB/s 
[K     |████████████████████████████████| 136 kB 65.3 MB/s 
[K     |████████████████████████████████| 212 kB 65.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 52.2 MB/s 
[K     |████████████████████████████████| 127 kB 67.6 MB/s 
[K     |████████████████████████████████| 271 kB 70.1 MB/s 
[K     |████████████████████████████████| 94 kB 3.0 MB/s 
[K     |████████████████████████████████| 144 kB 68.7 MB/s 
[K     |████████████████████████████████| 462 kB 68.6 MB/s 
[?25h  Building wheel for BLE

In [1]:
from google.colab import drive
# Mount Google Drive at /gdrive so files stored there can be read through regular paths.
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
# Seed handed to the train/test split so the same split is produced on every run.
RANDOM_SEED = 42

In [20]:
# Root folder on the mounted Drive from which the tokenized dataset and the
# fine-tuned checkpoints are loaded.
DATA_PATH = "/gdrive/MyDrive/final-project/post-refactor/data/"
# Device that models and input tensors are moved to.
DEVICE = "cuda"

# Dataset loading

In [3]:
# load already tokenized dataset
from datasets import load_dataset, load_from_disk
import os

# Location of the already-tokenized dataset on the mounted Drive.
TOKENIZED_DATASET_PATH = os.path.join(DATA_PATH, "tokenized_bigbird_dataset")

# Read the dataset from disk and immediately split it into train/test,
# holding out 10% for testing; the fixed seed keeps the split stable.
dataset = load_from_disk(TOKENIZED_DATASET_PATH).train_test_split(
    test_size=0.10,
    seed=RANDOM_SEED,
)

Loading cached split indices for dataset at /gdrive/MyDrive/final-project/post-refactor/data/tokenized_bigbird_dataset/cache-4e4df3dc8633de75.arrow and /gdrive/MyDrive/final-project/post-refactor/data/tokenized_bigbird_dataset/cache-5bc3cdbcb3bdc05b.arrow


# Model loading

In [8]:
from transformers import BigBirdPegasusForConditionalGeneration
from torchinfo import summary

# Folder holding the final fine-tuned weights on the mounted Drive.
FINETUNE_MODEL_PATH = os.path.join(DATA_PATH, "BigBirdModelFineTune/", "final/")

# Load the fine-tuned model with block-sparse attention and move it to DEVICE.
# The sample tensors fed to `generate` below are created on DEVICE, so the
# model has to live there too (as the other model-loading cells already do).
model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    FINETUNE_MODEL_PATH,
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse",
    use_cache=True).to(DEVICE)
# NOTE(review): gradient checkpointing is a training-time memory optimisation;
# presumably carried over from the training notebook — confirm it is still
# wanted in this generation-only notebook.
model.gradient_checkpointing_enable()
# Print the per-layer parameter counts.
summary(model, dtypes=["torch.IntTensor"])

Layer (type:depth-idx)                                                      Param #
BigBirdPegasusForConditionalGeneration                                      --
├─BigBirdPegasusModel: 1-1                                                  --
│    └─Embedding: 2-1                                                       98,409,472
│    └─BigBirdPegasusEncoder: 2-2                                           --
│    │    └─Embedding: 3-1                                                  (recursive)
│    │    └─BigBirdPegasusLearnedPositionalEmbedding: 3-2                   4,194,304
│    │    └─ModuleList: 3-3                                                 201,474,048
│    │    └─LayerNorm: 3-4                                                  2,048
│    └─BigBirdPegasusDecoder: 2-3                                           --
│    │    └─Embedding: 3-5                                                  (recursive)
│    │    └─BigBirdPegasusLearnedPositionalEmbedding: 3-6                   4,194

# Generation

There are several methods that can be used to generate text, mainly greedy search, beam search, top-K sampling and top-p sampling.

We will test those on a random sample first, to get an idea of the capabilities of the model.

In [15]:
from IPython.display import HTML, display
from transformers import AutoTokenizer
import torch

# Tokenizer of the base BigBird-Pegasus checkpoint; used below to decode token ids back into text.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-bigpatent")

def show_patent(summary, claim, generated_claim="", title=""):
  """Build an HTML view of one patent sample for display in the notebook.

  Args:
    summary: Text of the patent summary.
    claim: Text of the reference claim.
    generated_claim: Text produced by the model (empty by default).
    title: Heading rendered above the sample (empty by default).

  Returns:
    An `HTML` display object with the title, summary, claim and generated
    claim laid out in separate sections.
  """
  # Imported locally so the cell stays self-contained.
  from html import escape

  # The texts are arbitrary strings (decoded patents, model output): escape
  # them so characters such as `<` or `&` are shown literally instead of being
  # interpreted as markup. `str()` keeps non-string values working as before.
  title, summary, claim, generated_claim = (
      escape(str(text)) for text in (title, summary, claim, generated_claim))

  markup = f"""
    <div>
      <h1>{ title }</h1>
      <hr/>
      <h2>Summary</h2>
      <p>{ summary }</p>
      <hr/>
      <h2>Claim</h2>
      <p>{ claim }</p>
      <hr/>
      <h2>Generated claim</h2>
      <p>{ generated_claim }</p>
    </div>
  """
  return HTML(markup)

In [16]:
# Index of the test-set example used as the running sample in this notebook.
SAMPLE_INDEX = 4

# Fetch the row first and then pick its columns: indexing the column first
# would materialise every example of the test split just to read one of them.
sample_row = dataset["test"][SAMPLE_INDEX]
sample_summary = sample_row["input_ids"]
sample_claim = sample_row["decoder_input_ids"]

# Add a leading batch dimension of size 1 and move the tensors to DEVICE.
sample_summary_batched = torch.tensor(sample_summary).unsqueeze(0).to(DEVICE)
sample_claim_batched = torch.tensor(sample_claim).unsqueeze(0).to(DEVICE)

# Decode the token ids back to text for display.
text_summary_sample = tokenizer.decode(sample_summary, skip_special_tokens=True)
text_claim_sample = tokenizer.decode(sample_claim, skip_special_tokens=True)
show_patent(text_summary_sample, text_claim_sample, generated_claim="TODO", title="Original sample")

In [42]:
#@title Greedy search

# Options left unspecified in `generate` fall back to the values stored in the
# model's configuration, so greedy search is requested explicitly
# (a single beam, no sampling) instead of relying on those defaults.
greedy_output = model.generate(
    sample_summary_batched,
    max_length=500,
    num_beams=1,
    do_sample=False,
)

# The batch holds a single sequence: decode row 0. `truncation` is an
# encoding-time option, so it is not passed to `decode`.
generated_claim_sample = tokenizer.decode(greedy_output[0],
                                          skip_special_tokens=True)

show_patent(text_summary_sample, text_claim_sample,
            generated_claim=generated_claim_sample,
            title="Greedy search generation")

  * num_indices_to_pick_from


In [43]:
#@title Beam search

# Beam search with 5 beams, returning all 5 candidates; repeated bigrams are
# forbidden and the search stops once enough finished candidates exist.
beam_outputs = model.generate(
    sample_summary_batched,
    max_length=500,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# One row per returned sequence: decode and display each of them.
# `truncation` is an encoding-time option, so it is not passed to `decode`.
for beam, beam_token_ids in enumerate(beam_outputs):
  generated_claim_sample = tokenizer.decode(beam_token_ids,
                                            skip_special_tokens=True)

  display(show_patent(text_summary_sample, text_claim_sample,
                      generated_claim=generated_claim_sample,
                      title=f"Beam {beam}"))

  * num_indices_to_pick_from


# Catastrophic forgetting

It's pretty easy to see how the model is generating useless text.
Fine-tuning is an extremely delicate task and, although we only fine-tuned the model for $1$ epoch on $\approx 600$ samples, we can see that catastrophic forgetting is happening and the model is overfitting to the outputs we provide.

Let's see what the generated text looks like without fine-tuning the model at all.

In [10]:
from transformers import BigBirdPegasusForConditionalGeneration

# Drop the previously loaded model (if there is one) before loading the next.
globals().pop("model", None)

# Same attention settings as for the fine-tuned model, applied to the
# original, non-fine-tuned checkpoint.
pretrained_options = dict(
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse",
    use_cache=True,
)
model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent", **pretrained_options
).to(DEVICE)

In [17]:
#@title Greedy search

# Options left unspecified in `generate` fall back to the values stored in the
# model's configuration, so greedy search is requested explicitly
# (a single beam, no sampling) instead of relying on those defaults.
greedy_output = model.generate(
    sample_summary_batched,
    max_length=500,
    num_beams=1,
    do_sample=False,
)

# The batch holds a single sequence: decode row 0. `truncation` is an
# encoding-time option, so it is not passed to `decode`.
generated_claim_sample = tokenizer.decode(greedy_output[0],
                                          skip_special_tokens=True)

show_patent(text_summary_sample, text_claim_sample,
            generated_claim=generated_claim_sample,
            title="Greedy search generation")

  * num_indices_to_pick_from


In [19]:
#@title Beam search

# Beam search with 5 beams, returning all 5 candidates; repeated bigrams are
# forbidden and the search stops once enough finished candidates exist.
beam_outputs = model.generate(
    sample_summary_batched,
    max_length=500,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True
)

# One row per returned sequence: decode and display each of them.
# `truncation` is an encoding-time option, so it is not passed to `decode`.
for beam, beam_token_ids in enumerate(beam_outputs):
  generated_claim_sample = tokenizer.decode(beam_token_ids,
                                            skip_special_tokens=True)

  display(show_patent(text_summary_sample, text_claim_sample,
                      generated_claim=generated_claim_sample,
                      title=f"Beam {beam}"))

  * num_indices_to_pick_from


The generated text is now much more human-like. However, we can clearly see how the network is mostly summarizing the summary in a shorter text instead of actively trying to extract text that is meaningful for the claim.

We **do** need some finetuning in order to change this behaviour.

# Few samples finetuning

Let's load the checkpoint in which the model has been trained with only $250$ examples and see if it behaves better.

In [21]:
from transformers import BigBirdPegasusForConditionalGeneration

# Drop the previously loaded model (if there is one) before loading the next.
globals().pop("model", None)

# Intermediate fine-tuning checkpoint stored next to the final weights.
checkpoint_dir = os.path.join(DATA_PATH, "BigBirdModelFineTune/", "checkpoint-250/")

model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    checkpoint_dir,
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse",
    use_cache=True,
)
model = model.to(DEVICE)

In [22]:
#@title Greedy search

# Options left unspecified in `generate` fall back to the values stored in the
# model's configuration, so greedy search is requested explicitly
# (a single beam, no sampling) instead of relying on those defaults.
greedy_output = model.generate(
    sample_summary_batched,
    max_length=500,
    num_beams=1,
    do_sample=False,
)

# The batch holds a single sequence: decode row 0. `truncation` is an
# encoding-time option, so it is not passed to `decode`.
generated_claim_sample = tokenizer.decode(greedy_output[0],
                                          skip_special_tokens=True)

show_patent(text_summary_sample, text_claim_sample,
            generated_claim=generated_claim_sample,
            title="Greedy search generation")

  * num_indices_to_pick_from
