In [1]:
!nvidia-smi

Tue Feb  1 13:09:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers datasets torchinfo rouge_score git+https://github.com/google-research/bleurt.git unbabel-comet

[K     |████████████████████████████████| 3.5 MB 4.9 MB/s 
[K     |████████████████████████████████| 312 kB 14.1 MB/s 
[K     |████████████████████████████████| 596 kB 27.1 MB/s 
[K     |████████████████████████████████| 895 kB 35.9 MB/s 
[K     |████████████████████████████████| 6.8 MB 17.4 MB/s 
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 34.1 MB/s 
[K     |████████████████████████████████| 133 kB 39.2 MB/s 
[K     |████████████████████████████████| 243 kB 31.8 MB/s 
[K     |████████████████████████████████| 271 kB 42.5 MB/s 
[K     |████████████████████████████████| 94 kB 3.3 MB/s 
[K     |████████████████████████████████| 144 kB 43.4 MB/s 
[?25h

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [9]:
RANDOM_SEED = 42

# Dataset tokenization

We will tokenize the whole data by making use of the `datasets` library, which works seamlessly with the huggingface library.

In [4]:
from datasets import load_dataset, load_from_disk
import os

GDRIVE_DATASET_PATH = "/gdrive/MyDrive/university/tokenized_dataset"
SUMMARIES_CLAIMS_CSV_PATH = "/gdrive/MyDrive/university/summaries_claims.csv"

## Memory concerns

Unfortunately finetuning such a huge model requires a lot of memory in GPU. 
Google colab limit is $\approx$ 12 GB which is not enough for the whole dataset.

In [None]:
import sys
import pandas as pd
import csv

csv.field_size_limit(sys.maxsize)
# load into memory for analysis
df = pd.read_csv(SUMMARIES_CLAIMS_CSV_PATH, engine="python")
# some descriptions are NaNs so let's drop them
df = df.dropna()

Let's check how many samples we would be able to use by using a lower amount of data.

In [None]:
SIZES = [512, 1024, 2048, 4096]

summary_tokens = df["summaries"].apply(lambda x: len(x.split(" ")))
claims_tokens = df["claims"].apply(lambda x: len(x.split(" ")))

for size in SIZES:
  ok_summaries = summary_tokens <= size
  ok_claims = claims_tokens <= size  
  print(f"{ok_summaries.sum()} summaries have <= {size} tokens ({ok_summaries.sum() / len(summary_tokens) * 100:2.2f}%)")
  print(f"{ok_claims.sum()} claims have <= {size} tokens ({ok_claims.sum() / len(ok_claims) * 100:2.2f}%)")

11707 summaries have <= 512 tokens (44.97%)
25371 claims have <= 512 tokens (97.45%)
18100 summaries have <= 1024 tokens (69.52%)
25970 claims have <= 1024 tokens (99.75%)
23065 summaries have <= 2048 tokens (88.59%)
26026 claims have <= 2048 tokens (99.97%)
25311 summaries have <= 4096 tokens (97.22%)
26033 claims have <= 4096 tokens (99.99%)


We can safely see that by using 512 as maximum length we obtain most of the claims ($97.45\%$).

For the description, however, we can't really go that low or we would lose most of the sample. While using 4096 tokens, which is the maximum length handled by bigbird, would allow us to use all the data in the dataset we don't have adequate resources for such a job.

We will therefore make use of only those summaries whose length is $\leq 2048$, which accounts for $\approx 88\%$ of all the available data.

In [37]:
#@title Tokenize data
from transformers import AutoTokenizer


SUMMARY_LEN = 2048 #@param {type: "number"}
CLAIM_LEN = 512 #@param {type: "number"}
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-bigpatent")


# let's check if we can load the dataset from disk first.
# this will save us the burden of loading the tokenizer
# and tokenizing all the data we need
if os.path.exists(GDRIVE_DATASET_PATH):
  dataset = load_from_disk(GDRIVE_DATASET_PATH)
  print("Dataset loaded")
else:
  from datasets import Dataset
  reduced_df = df[(claims_tokens <= CLAIM_LEN) & (summary_tokens <= SUMMARY_LEN)]
  dataset = Dataset.from_pandas(reduced_df)

  # first let's rename data in the way the model expect
  dataset = dataset.rename_column("summaries", "input_ids") \
    .rename_column("claims", "decoder_input_ids") \
    .remove_columns("patentnumber") \
    .remove_columns("__index_level_0__")

  # even though we carefully preprocessed data some descriptions are still empty.
  # we will filter them out
  dataset = dataset.filter(lambda r: r["input_ids"] is not None)

  def encoder_tokenize_function(row):
    """
    Tokenize the summary into input_ids and attention_mask
    """
    return tokenizer(row["input_ids"], max_length=SUMMARY_LEN, padding="max_length", truncation=True)

  # tokenize the summaries
  dataset = dataset.map(encoder_tokenize_function, batched=True)

  def decoder_tokenize_function(row):
    """
    Tokenize claim into the expected output from the decoder 
    (decoder_input_ids and decoder_attention_mask)
    """
    tokenized = tokenizer(row["decoder_input_ids"], max_length=CLAIM_LEN, padding="max_length", truncation=True)

    return {
        "decoder_input_ids": tokenized["input_ids"],
        "decoder_attention_mask": tokenized["attention_mask"]
    }

  # tokenize the claim
  dataset = dataset.map(decoder_tokenize_function, batched=True)

  def compute_labels(row):
    """
    Compute labels based on decoder_input_ids where padding token is represented as -100
    """
    labels = row["decoder_input_ids"]
    labels = [-100 if t == 0 else t for t in labels]
    return {"labels" : labels}
  
  dataset = dataset.map(compute_labels, batched=True)

  # export the dataset to disk for future loading
  dataset.save_to_disk(GDRIVE_DATASET_PATH)
  print("Dataset computed and saved")

Dataset loaded


We saw that, with the suggested configuration, we can manage to achieve $\approx 0.55 \ \text{it/s}$.

In [None]:
from datetime import timedelta

def format(seconds):
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    return round(h), round(m), round(s)

time_per_epoch = len(dataset) * 0.55
total_time = time_per_epoch * 3
h, m, s = format(total_time)

print(f"Approximately we would need {h}:{m}:{s} to train the whole dataset for 3 epochs")

Approximately we would need 10:19:38 to train the whole dataset for 3 epochs


# Dataset splits

We will divide the overall dataset into the usual splits: training and testing respectively $90\%$ and $10\%$ of the overall data.

We will further extract $10\%$ from training data and use it as validation during training.

In [39]:
train_test = dataset.train_test_split(test_size=0.15, seed=RANDOM_SEED)
test_valid = train_test["train"].train_test_split(test_size=0.1, seed=RANDOM_SEED)
train_test["train"] = test_valid["train"]
train_test["valid"] = test_valid["test"]
dataset = train_test

# delete from memory unused values
del train_test
del test_valid

Loading cached split indices for dataset at /gdrive/MyDrive/university/tokenized_dataset/cache-b48cb57575d26b9a.arrow and /gdrive/MyDrive/university/tokenized_dataset/cache-b9e15dba1c57d4d3.arrow
Loading cached split indices for dataset at /gdrive/MyDrive/university/tokenized_dataset/cache-a07b120e791de5e0.arrow and /gdrive/MyDrive/university/tokenized_dataset/cache-11437103a5c00f24.arrow


In [40]:
print(f"Train: {len(dataset['train'])} samples")
print(f"Test: {len(dataset['test'])} samples")
print(f"Valid: {len(dataset['valid'])} samples")

Train: 17236 samples
Test: 3380 samples
Valid: 1916 samples


# Neural model

We will use Google's BigbirdPegasus (*BP*) model, by making use of huggingface APIs.
[Bigbird](https://arxiv.org/pdf/2007.14062v2.pdf) represents the BERT variant in which attention is computed.


The model is made public with an already pretrained version on a patent dataset, [BIGPATENT](https://arxiv.org/pdf/1906.03741v1.pdf) different from the one we're using.
The dataset consists of 1.3 million records of U.S. patent documents along with human written abstractive summaries.

While general abstractive summarization is a different task than the one we're interested in, it's fairly easy to suppose that an indipendent claim could be interpreted as the summarization of the patent from a particular perspective.

This model, which represents the current SOTA in the abstractive summarization field, is an incredible resource in the problem we're trying to solve.

We will initially try to generate claims without any work on the model.

In [158]:
from transformers import BigBirdPegasusForConditionalGeneration

model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent",
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse")
model.gradient_checkpointing_enable()

loading configuration file https://huggingface.co/google/bigbird-pegasus-large-bigpatent/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/18466945902df20eeca22d080160ea9121a57c97e4d5293994d85f67fa11e50e.c65855e5554b00a37b55e85d3a9f9dd66ca2c3f276ee79e8daea2165fe581bbf
Model config BigBirdPegasusConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu_new",
  "architectures": [
    "BigBirdPegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_type": "block_sparse",
  "block_size": 16,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,

Huggingface library provide us with several different metrics, in particular we will make use of BLEURT and ROUGE metrics, used to compare the model with other models in literature. 

In [121]:
from datasets import load_metric
import nltk

ROUGE = load_metric('rouge')
BLEURT = load_metric('bleurt', 'bleurt-large-512')

def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [label.strip() for label in labels]

  # rougeLSum expects newline after each sentence
  preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
  labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
      preds = preds[0]

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  # Replace -100 in the labels to actual padding
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post-processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  rouge_score = ROUGE.compute(predictions=preds, references=labels)
  rouge_score = { k: v.mid.fmeasure for k, v in rouge_score.items() }

  bleurt_score = BLEURT.compute(predictions=preds, references=labels)["scores"]
  bleurt_score = {"bleurt": bleurt_score}

  return {**rouge_score, **bleurt_score}

In [154]:
FINETUNE_MODEL_PATH = "/gdrive/MyDrive/university/BigBirdModel/"

In [159]:
from transformers import  Seq2SeqTrainingArguments,  Seq2SeqTrainer
from torch.utils.checkpoint import checkpoint 

# setup training arguments, mainly batch size of 4
# accumulating gradient over 2 consecutive steps and training for 3 epochs.
# Saves the model every 900 steps and evaluates it every 300 steps.
# Only keeps the most recent 2 models.
# Training is made by means of gradient checkpointing and mixed-precision to 
# make the training process faster and lighter.
training_args =  Seq2SeqTrainingArguments(
    output_dir=FINETUNE_MODEL_PATH,
    overwrite_output_dir=True, # used to keep training
    gradient_accumulation_steps=2, # lower memory usage: perform backprop every 2 steps
    num_train_epochs=3, 
    per_device_train_batch_size=4,
    logging_first_step=True,
    logging_strategy="steps",
    logging_steps=300,
    save_strategy="steps",
    save_steps=900,
    save_total_limit=2, # save at most two checkpoints, delete the older ones
    fp16=True, # faster and lighter on memory but possibly less precise on convergence
    predict_with_generate=True)

trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args,
    eval_dataset=dataset["valid"],
    train_dataset=dataset["train"]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


RuntimeError: ignored

In [157]:
out = runner.evaluate(dataset["test"])

***** Running Evaluation *****
  Num examples = 3380
  Batch size = 8
  * num_indices_to_pick_from


RuntimeError: ignored

# Fine-tuning

We are going to use [Huggingface](https://huggingface.co/) library to fine tune 

In [None]:
from transformers import BigBirdPegasusForConditionalGeneration, BigBirdPegasusConfig
from torchinfo import summary


model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent",
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse")

model.gradient_checkpointing_enable()

summary(model, dtypes=["torch.IntTensor"])

As we can see the model have a lot of trainable parameters which would take a lot of time to train.

The model, however, has already been pretrained on a huge patent dataset and we suppose that the encoder latent dimensions alrady represents a good approximation of what patent documents looks like.

We will therefore only finetune the decoder part in order to instruct the network to try and generate claims rather than summarizing the description.
Weights composing embeddings, positional embeddings and encoder will be freezed.

In [None]:
def freeze_params(model):
  """
  Freeze the parameter of the provided subset of the network
  """
  for par in model.parameters():
      par.requires_grad = False

def freeze_embeddings_encoder(model):
  """
  Freeze the embeddings and the encoder part of the network.
  """
  freeze_params(model.model.shared)
  freeze_params(model.model.encoder)

  for d in [model.model.decoder]:
    freeze_params(d.embed_positions)
    freeze_params(d.embed_tokens)

freeze_embeds(model)
summary(model, dtypes=["torch.IntTensor"])

In [None]:
from transformers import  Seq2SeqTrainingArguments,  Seq2SeqTrainer
from torch.utils.checkpoint import checkpoint 

# setup training arguments, mainly batch size of 4
# accumulating gradient over 2 consecutive steps and training for 3 epochs.
# Saves the model every 900 steps and evaluates it every 300 steps.
# Only keeps the most recent 2 models.
# Training is made by means of gradient checkpointing and mixed-precision to 
# make the training process faster and lighter.
training_args =  Seq2SeqTrainingArguments(
    "finetuning_bigbird", 
    output_dir=FINETUNE_MODEL_PATH,
    overwrite_output_dir=True, # used to keep training
    gradient_accumulation_steps=2, # lower memory usage: perform backprop every 2 steps
    num_train_epochs=3, 
    per_device_train_batch_size=4,
    logging_first_step=True,
    logging_strategy="steps",
    logging_steps=300,
    save_strategy="steps",
    save_steps=900,
    save_total_limit=2, # save at most two checkpoints, delete the older ones
    fp16=True, # faster and lighter on memory but possibly less precise on convergence
    gradient_checkpointing=True # slower backpass but lighter on memory,
    predict_with_generate=True)


Using amp half precision backend


In [None]:
def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
      preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  if data_args.ignore_pad_token_for_loss:
      # Replace -100 in the labels as we can't decode them.
      labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post-processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  # Extract a few results from ROUGE
  result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}
  return result


In [None]:
trainer =  Seq2SeqTrainer(
    model=model, 
    args=training_args,
    train_dataset=dataset
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 22532
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5633
  * num_indices_to_pick_from
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


KeyboardInterrupt: ignored