In [1]:
!nvidia-smi

Tue Apr 19 15:07:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# NOTE(review): versions are unpinned, so a fresh run may install different
# releases than the ones this notebook was recorded with; pin them
# (e.g. `package==x.y.z`) once known-good versions are identified.
!pip install -q transformers datasets torchinfo

[K     |████████████████████████████████| 4.0 MB 34.6 MB/s 
[K     |████████████████████████████████| 325 kB 70.2 MB/s 
[K     |████████████████████████████████| 596 kB 68.7 MB/s 
[K     |████████████████████████████████| 895 kB 44.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 64.8 MB/s 
[K     |████████████████████████████████| 77 kB 7.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.5 MB/s 
[K     |████████████████████████████████| 212 kB 8.6 MB/s 
[K     |████████████████████████████████| 136 kB 53.9 MB/s 
[K     |████████████████████████████████| 127 kB 61.7 MB/s 
[K     |████████████████████████████████| 271 kB 11.1 MB/s 
[K     |████████████████████████████████| 144 kB 51.5 MB/s 
[K     |████████████████████████████████| 94 kB 3.1 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires foli

In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
RANDOM_SEED = 42

# Dataset preparation

We will tokenize the data using the `datasets` library, which integrates seamlessly with the Hugging Face `transformers` library.

In [3]:
from datasets import load_dataset, load_from_disk
import os
import sys
import pandas as pd
import csv

# Root folder (on the mounted Google Drive) holding the project data
DATA_PATH = "/gdrive/MyDrive/final-project/post-refactor/data/"
# Zipped CSV containing the patent summaries and claims
DATASET_CSV_PATH = os.path.join(DATA_PATH, "balanced_summaries_claims.zip")

# raise the csv module's per-field size limit to the maximum: the python
# parsing engine used below would otherwise reject very long text fields
csv.field_size_limit(sys.maxsize)
# load into memory for analysis, indexing rows by patent number and
# keeping only the two text columns
df = pd.read_csv(DATASET_CSV_PATH, engine="python").set_index("patentnumber")[["summary", "claim"]]

## Dataset tokenization

Unfortunately, we won't be able to use the whole dataset. That is because BigBird-Pegasus requires a huge amount of memory in order to be finetuned.

To avoid CUDA out-of-memory errors we will only use a subset of the available dataset, keeping only those documents whose claim is shorter than 500 words (lengths are measured by splitting the text on whitespace, not with the model tokenizer).

In [4]:
def _count_words(text):
  """Return the number of whitespace-separated words in `text`."""
  return len(text.split())

# Per-document lengths, measured in whitespace-separated words
summary_len = df["summary"].apply(_count_words)
claim_len = df["claim"].apply(_count_words)

# Exclusive upper bounds on the word counts; None disables the bound
MAX_SUMMARY_LEN = None #@param
MAX_CLAIM_LEN = 500 #@param

# Start from an all-True mask and tighten it once per active bound
subset_clause = summary_len >= 0
for _lengths, _bound in ((summary_len, MAX_SUMMARY_LEN), (claim_len, MAX_CLAIM_LEN)):
  if _bound is not None:
    subset_clause = subset_clause & (_lengths < _bound)

subset_df = df[subset_clause]

In [5]:
#@title Tokenize data
from transformers import AutoTokenizer

# The tokenizer is loaded unconditionally, whether or not a cached
# tokenized dataset exists on disk.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-bigpatent")
TOKENIZED_DATASET_PATH = os.path.join(DATA_PATH, "tokenized_bigbird_dataset")

# let's check if we can load the dataset from disk first.
# this will save us the burden of tokenizing all the data we need.
# NOTE: a cached dataset is reused as-is; it is not checked against the
# current MAX_SUMMARY_LEN / MAX_CLAIM_LEN or the tokenization code below,
# so delete TOKENIZED_DATASET_PATH after changing any of them.
if os.path.exists(TOKENIZED_DATASET_PATH):
  dataset = load_from_disk(TOKENIZED_DATASET_PATH)
  print("Dataset loaded")
else:
  from datasets import Dataset
  dataset = Dataset.from_pandas(subset_df)

  # first let's rename the columns in the way the model expects
  dataset = dataset.rename_column("summary", "input_ids") \
    .rename_column("claim", "decoder_input_ids")
  # even though we carefully preprocessed data some descriptions are still empty.
  # we will filter them out
  dataset = dataset.filter(lambda r: r["input_ids"] is not None)

  def encoder_tokenize_function(row):
    """
    Run the tokenizer over the summary text stored under "input_ids".

    Padding to the maximum length and truncation are always requested;
    an explicit max_length is passed only when MAX_SUMMARY_LEN is set.
    The tokenizer output is returned unchanged.
    """
    tokenizer_options = dict(padding="max_length", truncation=True)
    if MAX_SUMMARY_LEN is not None:
      tokenizer_options["max_length"] = MAX_SUMMARY_LEN

    encoded_summaries = tokenizer(row["input_ids"], **tokenizer_options)
    return encoded_summaries

  # tokenize the summaries
  dataset = dataset.map(encoder_tokenize_function, batched=True)

  def decoder_tokenize_function(row):
    """
    Run the tokenizer over the claim text stored under "decoder_input_ids".

    Padding to the maximum length and truncation are always requested;
    an explicit max_length is passed only when MAX_CLAIM_LEN is set.
    Returns a dict holding the token ids as "decoder_input_ids" and the
    matching mask as "decoder_attention_mask".
    """
    tokenizer_options = dict(padding="max_length", truncation=True)
    if MAX_CLAIM_LEN is not None:
      tokenizer_options["max_length"] = MAX_CLAIM_LEN

    encoded_claims = tokenizer(row["decoder_input_ids"], **tokenizer_options)
    return dict(
        decoder_input_ids=encoded_claims["input_ids"],
        decoder_attention_mask=encoded_claims["attention_mask"],
    )

  # tokenize the claim
  dataset = dataset.map(decoder_tokenize_function, batched=True)

  def compute_labels(row, pad_token_id=0):
    """
    Compute labels based on decoder_input_ids where padding token is represented as -100

    `row["decoder_input_ids"]` may be a batch (a list of token-id sequences,
    which is what `map(..., batched=True)` passes) or a single flat sequence.
    Every token equal to `pad_token_id` (default 0, the id this function has
    always compared against) is replaced by -100; all other ids are kept.

    Returns a dict {"labels": ...} with the same nesting as the input.
    The input sequences are not modified.

    The previous version compared each element of the batch (a whole
    sequence) with 0, which never matches, so no padding was masked.
    A dataset already cached on disk was built with that version and has to
    be regenerated to pick up the corrected labels.

    NOTE(review): the labels are position-aligned with decoder_input_ids
    (no right shift). If both columns are fed to the model unchanged, the
    decoder may be shown the very token it has to predict -- confirm against
    the model's forward() documentation; supplying only `labels` is the
    usual alternative.
    """
    def mask_padding(token_ids):
      # builds a new list, leaving the original sequence untouched
      return [-100 if t == pad_token_id else t for t in token_ids]

    decoder_ids = row["decoder_input_ids"]
    # a batch is recognised by its first element being a sequence itself
    is_batch = len(decoder_ids) > 0 and isinstance(decoder_ids[0], (list, tuple))
    if is_batch:
      labels = [mask_padding(seq) for seq in decoder_ids]
    else:
      labels = mask_padding(decoder_ids)
    return {"labels" : labels}
  
  dataset = dataset.map(compute_labels, batched=True)

  # export the dataset to disk for future loading
  dataset.save_to_disk(TOKENIZED_DATASET_PATH)
  print("Dataset computed and saved")

Dataset loaded


# Dataset splits

We will divide the overall dataset into the usual training and test splits, containing $90\%$ and $10\%$ of the data respectively.

In [6]:
dataset = dataset.train_test_split(test_size=0.10, seed=RANDOM_SEED)

In [7]:
print(f"Train: {len(dataset['train'])} samples")
print(f"Test: {len(dataset['test'])} samples")

Train: 1249 samples
Test: 139 samples


# Neural model

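We will use Google's BigBird-Pegasus (*BP*) model through the Hugging Face APIs.
[BigBird](https://arxiv.org/pdf/2007.14062v2.pdf) is a BERT-style transformer variant in which full self-attention is replaced by a sparse attention pattern (a combination of sliding-window, global and random attention), which makes it practical to process much longer input sequences.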


The model is publicly available in a version already pretrained on a patent dataset, [BIGPATENT](https://arxiv.org/pdf/1906.03741v1.pdf), which is different from the one we're using.
The dataset consists of 1.3 million records of U.S. patent documents along with human written abstractive summaries.

While general abstractive summarization is a different task from the one we're interested in, it is reasonable to assume that an independent claim can be interpreted as a summary of the patent from a particular perspective.

This model, which represents the current SOTA in the abstractive summarization field, is an incredible resource in the problem we're trying to solve.

We will initially try to generate claims without any work on the model.

In [8]:
from transformers import BigBirdPegasusForConditionalGeneration
from torchinfo import summary

# Load the pretrained BigBird-Pegasus checkpoint with an explicit attention
# configuration: block-sparse attention, block size 16, 3 random blocks
model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent",
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse") # required for fp16
# trade extra compute for a lower activation-memory footprint while training
model.gradient_checkpointing_enable()
# print the layer tree with per-layer parameter counts
summary(model, dtypes=["torch.IntTensor"])

Layer (type:depth-idx)                                                      Param #
BigBirdPegasusForConditionalGeneration                                      --
├─BigBirdPegasusModel: 1-1                                                  --
│    └─Embedding: 2-1                                                       98,409,472
│    └─BigBirdPegasusEncoder: 2-2                                           --
│    │    └─Embedding: 3-1                                                  (recursive)
│    │    └─BigBirdPegasusLearnedPositionalEmbedding: 3-2                   4,194,304
│    │    └─ModuleList: 3-3                                                 201,474,048
│    │    └─LayerNorm: 3-4                                                  2,048
│    └─BigBirdPegasusDecoder: 2-3                                           --
│    │    └─Embedding: 3-5                                                  (recursive)
│    │    └─BigBirdPegasusLearnedPositionalEmbedding: 3-6                   4,194

In [9]:
FINETUNE_MODEL_PATH = os.path.join(DATA_PATH, "BigBirdModelFineTune/")

In [10]:
from transformers import  Seq2SeqTrainingArguments,  Seq2SeqTrainer
from torch.utils.checkpoint import checkpoint 
import gc
gc.collect()

# Fine-tuning configuration: one epoch, batch size 2 per device, fp16 and
# gradient checkpointing; a checkpoint is written every 250 steps.
training_args =  Seq2SeqTrainingArguments(
    output_dir=FINETUNE_MODEL_PATH,
    overwrite_output_dir=True, # used to keep training
    gradient_accumulation_steps=1, # 1 = no accumulation: the optimizer steps after every batch
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_first_step=True,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=2, # save at most two checkpoints, delete the older ones
    fp16=True, # faster and lighter on memory but possibly less precise on convergence
    predict_with_generate=True,
    gradient_checkpointing=True)

In [11]:
# Only the training split is handed to the trainer: no evaluation set is
# supplied here, so the held-out test split is not used by this cell.
trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args,
    train_dataset=dataset["train"]
)

# run the fine-tuning loop configured in training_args
trainer.train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `BigBirdPegasusForConditionalGeneration.forward` and have been ignored: patentnumber. If patentnumber are not expected by `BigBirdPegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1249
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 625
  * num_indices_to_pick_from


Step,Training Loss
1,10.8453
100,7.2766
200,2.3287
300,0.4645
400,0.1519
500,0.0746


Saving model checkpoint to /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/checkpoint-250
Configuration saved in /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/checkpoint-250/config.json
Model weights saved in /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/checkpoint-250/pytorch_model.bin
  * num_indices_to_pick_from
Saving model checkpoint to /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/checkpoint-500
Configuration saved in /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/checkpoint-500/config.json
Model weights saved in /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/checkpoint-500/pytorch_model.bin
  * num_indices_to_pick_from


Step,Training Loss
1,10.8453
100,7.2766
200,2.3287
300,0.4645
400,0.1519
500,0.0746
600,0.1088




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=625, training_loss=1.6715751320838927, metrics={'train_runtime': 2706.6577, 'train_samples_per_second': 0.461, 'train_steps_per_second': 0.231, 'total_flos': 1.4429729247461376e+16, 'train_loss': 1.6715751320838927, 'epoch': 1.0})

In [12]:
# Save the fine-tuned weights and config under FINETUNE_MODEL_PATH/final/.
# NOTE(review): the output recorded below reports files written directly in
# FINETUNE_MODEL_PATH (without "final/"), so it seems to predate this line;
# re-run the cell to confirm where the model ends up.
model.save_pretrained(os.path.join(FINETUNE_MODEL_PATH, "final/"))

Configuration saved in /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/config.json
Model weights saved in /gdrive/MyDrive/final-project/post-refactor/data/BigBirdModelFineTune/pytorch_model.bin
