In [1]:
!nvidia-smi

Sat May 14 12:56:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers datasets torchinfo rouge_score sacrebleu

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [4]:
RANDOM_SEED = 42

# Dataset preparation

We will tokenize the whole data by making use of the `datasets` library, which works seamlessly with the huggingface library.

In [5]:
from datasets import load_from_disk
import os
import sys
import pandas as pd
import csv

DATA_PATH = "/gdrive/MyDrive/final-project/post-refactor/data/"
DATASET_CSV_PATH = os.path.join(DATA_PATH, "summaries_claims.zip")

# set maximum csv size to avoid pandas ram issues
csv.field_size_limit(sys.maxsize)
# load into memory for analysis
df = pd.read_csv(DATASET_CSV_PATH, engine="python").set_index("patentnumber")[["summary", "claim"]]
df = df.dropna()

## Dataset tokenization

Unfortunately, we won't be able to use the whole dataset. That is because BigBird-Pegasus requires a huge amount of memory in order to be finetuned.

To avoid incurring into out-of-memory errors from CUDA we will only use a subset of the available dataset, by taking only those documents whose claim length is shorter than 500 tokens.

In [6]:
summary_len = df.summary.apply(lambda r: len(r.split()))
claim_len = df.claim.apply(lambda r: len(r.split()))

MAX_SUMMARY_LEN = 1024 #@param
MAX_CLAIM_LEN = 500 #@param

subset_clause = summary_len >= 0
if MAX_SUMMARY_LEN is not None:
  subset_clause = subset_clause & (summary_len < MAX_SUMMARY_LEN)
if MAX_CLAIM_LEN is not None:
  subset_clause = subset_clause & (claim_len < MAX_CLAIM_LEN)

subset_df = df[subset_clause]

In [7]:
#@title Tokenize data
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
TOKENIZED_DATASET_PATH = os.path.join(DATA_PATH, "tokenized_bart_dataset")

# let's check if we can load the dataset from disk first.
# this will save us the burden of loading the tokenizer
# and tokenizing all the data we need
if os.path.exists(TOKENIZED_DATASET_PATH):
  dataset = load_from_disk(TOKENIZED_DATASET_PATH)
  print("Dataset loaded")
else:
  from datasets import Dataset
  dataset = Dataset.from_pandas(subset_df)

  # first let's rename data in the way the model expect
  dataset = dataset.rename_column("summary", "input_ids") \
    .rename_column("claim", "decoder_input_ids")
  # even though we carefully preprocessed data some descriptions are still empty.
  # we will filter them out
  dataset = dataset.filter(lambda r: r["input_ids"] is not None)

  def encoder_tokenize_function(row):
    """
    Tokenize the summary into input_ids and attention_mask
    """
    kwargs = {
        "padding": "max_length",
        "truncation": True,
    }

    if MAX_SUMMARY_LEN is not None:
      kwargs["max_length"] = MAX_SUMMARY_LEN

    return tokenizer(row["input_ids"], **kwargs)

  # tokenize the summaries
  dataset = dataset.map(encoder_tokenize_function, batched=True)

  def decoder_tokenize_function(row):
    """
    Tokenize claim into the expected output from the decoder 
    (decoder_input_ids and decoder_attention_mask)
    """
    kwargs = {
        "padding": "max_length",
        "truncation": True,
    }

    if MAX_CLAIM_LEN is not None:
      kwargs["max_length"] = MAX_CLAIM_LEN

    tokenized = tokenizer(row["decoder_input_ids"], **kwargs)

    return {
        "decoder_input_ids": tokenized["input_ids"],
        "decoder_attention_mask": tokenized["attention_mask"]
    }

  # tokenize the claim
  dataset = dataset.map(decoder_tokenize_function, batched=True)

  def compute_labels(row):
    """
    Compute labels based on decoder_input_ids where padding token is represented as -100
    """
    labels = row["decoder_input_ids"]
    labels = [-100 if t == 0 else t for t in labels]
    return {"labels" : labels}
  
  dataset = dataset.map(compute_labels, batched=True)

  # export the dataset to disk for future loading
  dataset.save_to_disk(TOKENIZED_DATASET_PATH)
  print("Dataset computed and saved")

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

Dataset computed and saved


# Dataset splits

We will divide the overall dataset into the usual splits: training and testing respectively $99.5\%$ and $0.5\%$ of the overall data.

In [8]:
dataset = dataset.train_test_split(test_size=0.005, seed=RANDOM_SEED)

In [9]:
print(f"Train: {len(dataset['train'])} samples")
print(f"Test: {len(dataset['test'])} samples")

Train: 17661 samples
Test: 89 samples


# Neural model

We will use Google's BigbirdPegasus (*BP*) model, by making use of huggingface APIs.
[Bigbird](https://arxiv.org/pdf/2007.14062v2.pdf) represents the BERT variant in which attention is computed.


The model is made public with an already pretrained version on a patent dataset, [BIGPATENT](https://arxiv.org/pdf/1906.03741v1.pdf) different from the one we're using.
The dataset consists of 1.3 million records of U.S. patent documents along with human written abstractive summaries.

While general abstractive summarization is a different task than the one we're interested in, it's fairly easy to suppose that an indipendent claim could be interpreted as the summarization of the patent from a particular perspective.

This model, which represents the current SOTA in the abstractive summarization field, is an incredible resource in the problem we're trying to solve.

We will initially try to generate claims without any work on the model.

In [10]:
from transformers import BartForConditionalGeneration
from torchinfo import summary

model = BartForConditionalGeneration.from_pretrained("Pyke/bart-finetuned-with-patent")
model.gradient_checkpointing_enable()
summary(model, dtypes=["torch.IntTensor"])

Downloading:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

Layer (type:depth-idx)                                  Param #
BartForConditionalGeneration                            --
├─BartModel: 1-1                                        --
│    └─Embedding: 2-1                                   38,603,520
│    └─BartEncoder: 2-2                                 --
│    │    └─Embedding: 3-1                              (recursive)
│    │    └─BartLearnedPositionalEmbedding: 3-2         787,968
│    │    └─ModuleList: 3-3                             42,527,232
│    │    └─LayerNorm: 3-4                              1,536
│    └─BartDecoder: 2-3                                 --
│    │    └─Embedding: 3-5                              (recursive)
│    │    └─BartLearnedPositionalEmbedding: 3-6         787,968
│    │    └─ModuleList: 3-7                             56,710,656
│    │    └─LayerNorm: 3-8                              1,536
├─Linear: 1-2                                           38,603,520
Total params: 178,023,936
Trainable params: 

In [11]:
from datasets import load_metric
import numpy as np
import nltk

nltk.download('punkt')

ROUGE = load_metric('rouge')
BLEU = load_metric('sacrebleu')

def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [label.strip() for label in labels]

  # rougeLSum expects newline after each sentence
  preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
  labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
      preds = preds[0]

  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  # Replace -100 in the labels to actual padding
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post-processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  rouge_scores = ROUGE.compute(predictions=decoded_preds, 
                               references=[[dl] for dl in decoded_labels])
  rouge_scores = { k: v.mid.fmeasure for k, v in rouge_scores.items() }

  bleu_score = BLEU.compute(predictions=decoded_preds,
                            references=[[dl] for dl in decoded_labels])["score"]
  
  return {"sacrebleu": bleu_score, **rouge_scores}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
FINETUNE_MODEL_PATH = os.path.join(DATA_PATH, "BartModelFineTune/")

In [13]:
from transformers import  Seq2SeqTrainingArguments,  Seq2SeqTrainer, EarlyStoppingCallback
from torch.utils.checkpoint import checkpoint 
import gc
gc.collect()

EPOCHS = 1
BATCH_SIZE = 4
ACCUMULATION_STEPS = 2
# perform saving and evaluation every 25% of the dataset
EVAL_LOG_STEPS = int((len(dataset['train']) / (BATCH_SIZE * ACCUMULATION_STEPS)) * 0.05)
EVAL_ACC_STEPS = 2

print(f"Evaluate and log every {EVAL_LOG_STEPS}")

training_args =  Seq2SeqTrainingArguments(
    output_dir=FINETUNE_MODEL_PATH,
    overwrite_output_dir=True, # used to keep training
    gradient_accumulation_steps=ACCUMULATION_STEPS, # lower memory usage: perform backprop every 2 steps
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    evaluation_strategy="steps",
    eval_steps=EVAL_LOG_STEPS,
    eval_accumulation_steps=EVAL_ACC_STEPS,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_first_step=True,
    logging_strategy="steps",
    logging_steps=EVAL_LOG_STEPS,
    save_strategy="steps",
    save_steps=2 * EVAL_LOG_STEPS,
    save_total_limit=2, # save at most two checkpoints, delete the older ones
    fp16=True, # faster and lighter on memory but possibly less precise on convergence
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="sacrebleu",
    generation_max_length=500,
    weight_decay=0.01,
    greater_is_better=True,
    gradient_checkpointing=True)

Evaluate and log every 331


In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Using amp half precision backend


In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: patentnumber. If patentnumber are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 17661
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 2208


Step,Training Loss,Validation Loss,Sacrebleu,Rouge1,Rouge2,Rougel,Rougelsum
331,2.0154,1.178324,3.38585,0.306299,0.203927,0.259219,0.25997
662,0.9898,1.060759,9.344636,0.368612,0.239915,0.303889,0.303803
993,0.8944,1.016388,10.889214,0.406721,0.265523,0.330283,0.331428
1324,0.8278,0.975204,10.339262,0.415897,0.279918,0.349229,0.348608
1655,0.8291,0.950134,13.274809,0.432357,0.286769,0.356539,0.357188
1986,0.814,0.943129,11.86296,0.427764,0.283999,0.350457,0.350657


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: patentnumber. If patentnumber are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 89
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: patentnumber. If patentnumber are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 89
  Batch size = 4
Saving model checkpoint to /gdrive/MyDrive/final-project/post-refactor/data/BartModelFineTune/checkpoint-662
Configuration saved in /gdrive/MyDrive/final-project/post-refactor/data/BartModelFineTune/checkpoint-662/config.json
Model weights saved in /gdrive/MyDrive/final-project/post-refactor/data/BartModelFineTune/chec

TrainOutput(global_step=2208, training_loss=1.0444016655286152, metrics={'train_runtime': 3218.1979, 'train_samples_per_second': 5.488, 'train_steps_per_second': 0.686, 'total_flos': 1.076855558897664e+16, 'train_loss': 1.0444016655286152, 'epoch': 1.0})

In [16]:
#model.save_pretrained(os.path.join(FINETUNE_MODEL_PATH, "final/"))