In [10]:
!nvidia-smi

Mon Jan 31 19:03:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    28W /  70W |   8624MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers datasets torchinfo

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Dataset tokenization

We will tokenize the whole data by making use of the `datasets` library, which works seamlessly with the huggingface library.

In [3]:
from datasets import load_dataset, load_from_disk
import os

GDRIVE_DATASET_PATH = "/gdrive/MyDrive/university/tokenized_dataset"
SUMMARIES_CLAIMS_CSV_PATH = "/gdrive/MyDrive/university/summaries_claims.csv"

## Memory concerns

Unfortunately finetuning such a huge model requires a lot of memory in GPU. 
Google colab limit is $\approx$ 12 GB which is not enough for the whole dataset.

In [4]:
import sys
import pandas as pd
import csv

csv.field_size_limit(sys.maxsize)
# load into memory for analysis
df = pd.read_csv(SUMMARIES_CLAIMS_CSV_PATH, engine="python")
# some descriptions are NaNs so let's drop them
df = df.dropna()

Let's check how many samples we would be able to use by using a lower amount of data.

In [5]:
SIZES = [512, 1024, 2048, 4096]

summary_tokens = df["summaries"].apply(lambda x: len(x.split(" ")))
claims_tokens = df["claims"].apply(lambda x: len(x.split(" ")))

for size in SIZES:
  ok_summaries = summary_tokens <= size
  ok_claims = claims_tokens <= size  
  print(f"{ok_summaries.sum()} summaries have <= {size} tokens ({ok_summaries.sum() / len(summary_tokens) * 100:2.2f}%)")
  print(f"{ok_claims.sum()} claims have <= {size} tokens ({ok_claims.sum() / len(ok_claims) * 100:2.2f}%)")

11707 summaries have <= 512 tokens (44.97%)
25371 claims have <= 512 tokens (97.45%)
18100 summaries have <= 1024 tokens (69.52%)
25970 claims have <= 1024 tokens (99.75%)
23065 summaries have <= 2048 tokens (88.59%)
26026 claims have <= 2048 tokens (99.97%)
25311 summaries have <= 4096 tokens (97.22%)
26033 claims have <= 4096 tokens (99.99%)


We can safely see that by using 512 as maximum length we obtain most of the claims ($97.45\%$).

For the description, however, we can't really go that low or we would lose most of the sample. While using 4096 tokens, which is the maximum length handled by bigbird, would allow us to use all the data in the dataset we don't have adequate resources for such a job.

We will therefore make use of only those summaries whose length is $\leq 2048$, which accounts for $\approx 88\%$ of all the available data.

In [4]:
#@title Tokenize data

SUMMARY_LEN = 2048 #@param {type: "number"}
CLAIM_LEN = 512 #@param {type: "number"}

# let's check if we can load the dataset from disk first.
# this will save us the burden of loading the tokenizer
# and tokenizing all the data we need
if os.path.exists(GDRIVE_DATASET_PATH):
  dataset = load_from_disk(GDRIVE_DATASET_PATH)
  print("Dataset loaded")
else:
  from transformers import AutoTokenizer
  from datasets import Dataset
  reduced_df = df[(claims_tokens <= CLAIM_LEN) & (summary_tokens <= SUMMARY_LEN)]
  dataset = Dataset.from_pandas(reduced_df)

  tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-bigpatent")

  # first let's rename data in the way the model expect
  dataset = dataset.rename_column("summaries", "input_ids") \
    .rename_column("claims", "decoder_input_ids") \
    .remove_columns("patentnumber") \
    .remove_columns("__index_level_0__")

  # even though we carefully preprocessed data some descriptions are still empty.
  # we will filter them out
  dataset = dataset.filter(lambda r: r["input_ids"] is not None)

  def encoder_tokenize_function(row):
    """
    Tokenize the summary into input_ids and attention_mask
    """
    return tokenizer(row["input_ids"], max_length=SUMMARY_LEN, padding="max_length", truncation=True)

  # tokenize the summaries
  dataset = dataset.map(encoder_tokenize_function, batched=True)

  def decoder_tokenize_function(row):
    """
    Tokenize claim into the expected output from the decoder 
    (decoder_input_ids and decoder_attention_mask)
    """
    tokenized = tokenizer(row["decoder_input_ids"], max_length=CLAIM_LEN, padding="max_length", truncation=True)

    return {
        "decoder_input_ids": tokenized["input_ids"],
        "decoder_attention_mask": tokenized["attention_mask"]
    }

  # tokenize the claim
  dataset = dataset.map(decoder_tokenize_function, batched=True)

  def compute_labels(row):
    """
    Compute labels based on decoder_input_ids where padding token is represented as -100
    """
    labels = row["decoder_input_ids"]
    labels = [-100 if t == 0 else t for t in labels]
    return {"labels" : labels}
  
  dataset = dataset.map(compute_labels, batched=True)

  # export the dataset to disk for future loading
  dataset.save_to_disk(GDRIVE_DATASET_PATH)
  print("Dataset computed and saved")

Dataset loaded


We saw that, with the suggested configuration, we can manage to achieve $\approx 0.55 \ \text{it/s}$.

In [8]:
from datetime import timedelta

def format(seconds):
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    return round(h), round(m), round(s)

time_per_epoch = len(dataset) * 0.55
total_time = time_per_epoch * 3
h, m, s = format(total_time)

print(f"Approximately we would need {h}:{m}:{s} to train the whole dataset for 3 epochs")

Approximately we would need 10:19:38 to train the whole dataset for 3 epochs


# Neural model

We will use Google's BigbirdPegasus (*BP*) model, by making use of huggingface APIs.
[Bigbird](https://arxiv.org/pdf/2007.14062v2.pdf) represents the BERT variant in which attention is computed.


The model is made public with an already pretrained version on a patent dataset, [BIGPATENT](https://arxiv.org/pdf/1906.03741v1.pdf) different from the one we're using.
The dataset consists of 1.3 million records of U.S. patent documents along with human written abstractive summaries.

While general abstractive summarization is a different task than the one we're interested in, it's fairly easy to suppose that an indipendent claim could be interpreted as the summarization of the patent from a particular perspective.

This model, which represents the current SOTA in the abstractive summarization field, is an incredible resource in the problem we're trying to solve.

We will initially try to generate claims without any work on the model.

In [10]:
from transformers import BigBirdPegasusForConditionalGeneration

model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent",
    block_size=16,
    num_random_blocks=2,
    attention_type="block_sparse")

Downloading:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

# Fine-tuning

In [13]:
# clean cuda memory 
import gc
import torch

if "model" in vars(): del model
if "trainer" in vars(): del trainer

torch.cuda.empty_cache()
gc.collect()

52

We are going to use [Huggingface](https://huggingface.co/) library to fine tune 

In [5]:
from transformers import BigBirdPegasusForConditionalGeneration, BigBirdPegasusConfig

model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent",
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse")

model.gradient_checkpointing_enable()

In [8]:
from torchinfo import summary
summary(model, dtypes=["torch.IntTensor"])

Layer (type:depth-idx)                                                      Param #
BigBirdPegasusForConditionalGeneration                                      --
├─BigBirdPegasusModel: 1-1                                                  --
│    └─Embedding: 2-1                                                       98,409,472
│    └─BigBirdPegasusEncoder: 2-2                                           --
│    │    └─Embedding: 3-1                                                  (recursive)
│    │    └─BigBirdPegasusLearnedPositionalEmbedding: 3-2                   4,194,304
│    │    └─ModuleList: 3-3                                                 201,474,048
│    │    └─LayerNorm: 3-4                                                  2,048
│    └─BigBirdPegasusDecoder: 2-3                                           --
│    │    └─Embedding: 3-5                                                  (recursive)
│    │    └─BigBirdPegasusLearnedPositionalEmbedding: 3-6                   4,194

In [6]:
def freeze_params(model):
  for par in model.parameters():
      par.requires_grad = False

def freeze_embeds(model):
  freeze_params(model.model.shared)
  freeze_params(model.model.encoder)

  for d in [model.model.decoder]:
    freeze_params(d.embed_positions)
    freeze_params(d.embed_tokens)

In [7]:
freeze_embeds(model)
summary(model, dtypes=["torch.IntTensor"])

NameError: ignored

In [8]:
from transformers import TrainingArguments, Trainer
from torch.utils.checkpoint import checkpoint 

training_args = TrainingArguments(
    "test_trainer", 
    num_train_epochs=1, 
    per_device_train_batch_size=4,
    fp16=True)

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=dataset
)

Using amp half precision backend


In [9]:
trainer.train()

***** Running training *****
  Num examples = 22532
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5633
  * num_indices_to_pick_from
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


KeyboardInterrupt: ignored