In [2]:
!nvidia-smi

Mon Jan 31 14:52:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install -q transformers datasets 

In [4]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Dataset tokenization

We will tokenize the whole data by making use of the `datasets` library, which works seamlessly with the huggingface library.

In [5]:
from datasets import load_dataset, load_from_disk
import os

GDRIVE_DATASET_PATH = "/gdrive/MyDrive/final-project/data/bigbird"
SUMMARIES_CLAIMS_CSV_PATH = "/gdrive/MyDrive/final-project/data/summaries_claims.csv"

## Memory concerns

Unfortunately finetuning such a huge model requires a lot of memory in GPU. 
Google colab limit is $\approx$ 12 GB which is not enough for the whole dataset.

In [6]:
import sys
import pandas as pd
import csv

csv.field_size_limit(sys.maxsize)
# load into memory for analysis
df = pd.read_csv(SUMMARIES_CLAIMS_CSV_PATH, engine="python")
# some descriptions are NaNs so let's drop them
df = df.dropna()

Let's check how many samples we would be able to use by using a lower amount of data.

In [7]:
SIZES = [512, 1024, 2048, 4096]

summary_tokens = df["summaries"].apply(lambda x: len(x.split(" ")))
claims_tokens = df["claims"].apply(lambda x: len(x.split(" ")))

for size in SIZES:
  ok_summaries = summary_tokens <= size
  ok_claims = claims_tokens <= size  
  print(f"{ok_summaries.sum()} summaries have <= {size} tokens ({ok_summaries.sum() / len(summary_tokens) * 100:2.2f}%)")
  print(f"{ok_claims.sum()} claims have <= {size} tokens ({ok_claims.sum() / len(ok_claims) * 100:2.2f}%)")

11707 summaries have <= 512 tokens (44.97%)
25371 claims have <= 512 tokens (97.45%)
18100 summaries have <= 1024 tokens (69.52%)
25970 claims have <= 1024 tokens (99.75%)
23065 summaries have <= 2048 tokens (88.59%)
26026 claims have <= 2048 tokens (99.97%)
25311 summaries have <= 4096 tokens (97.22%)
26033 claims have <= 4096 tokens (99.99%)


We can safely see that by using 512 as maximum length we obtain most of the claims ($97.45\%$).

For the description, however, we can't really go that low or we would lose most of the sample.

In [8]:
SUMMARY_LEN = 2048
CLAIM_LEN = 512

In [19]:
# let's check if we can load the dataset from disk first.
# this will save us the burden of loading the tokenizer
# and tokenizing all the data we need
if os.path.exists(GDRIVE_DATASET_PATH):
  dataset = load_from_disk(GDRIVE_DATASET_PATH)
  print("Dataset loaded")
else:
  from transformers import AutoTokenizer
  from datasets import Dataset
  reduced_df = df[(claims_tokens <= CLAIM_LEN) & (summary_tokens <= SUMMARY_LEN)]
  dataset = Dataset.from_pandas(reduced_df)

  tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-bigpatent")

  # first let's rename data in the way the model expect
  dataset = dataset.rename_column("summaries", "input_ids") \
    .rename_column("claims", "decoder_input_ids") \
    .remove_columns("patentnumber") \
    .remove_columns("__index_level_0__")

  # even though we carefully preprocessed data some descriptions are still empty.
  # we will filter them out
  dataset = dataset.filter(lambda r: r["input_ids"] is not None)

  def encoder_tokenize_function(row):
    """
    Tokenize the summary into input_ids and attention_mask
    """
    return tokenizer(row["input_ids"], max_length=SUMMARY_LEN, padding="max_length", truncation=True)

  # tokenize the summaries
  dataset = dataset.map(encoder_tokenize_function, batched=True)

  def decoder_tokenize_function(row):
    """
    Tokenize claim into the expected output from the decoder 
    (decoder_input_ids and decoder_attention_mask)
    """
    tokenized = tokenizer(row["decoder_input_ids"], max_length=CLAIM_LEN, padding="max_length", truncation=True)
    return {
        "decoder_input_ids": tokenized["input_ids"],
        "decoder_attention_mask": tokenized["attention_mask"],
    }

  # tokenize the claim
  dataset = dataset.map(decoder_tokenize_function, batched=True)

  # export the dataset to disk for future loading
  dataset.save_to_disk(GDRIVE_DATASET_PATH)
  print("Dataset computed and saved")

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

Dataset computed and saved


# Fine-tuning

In [11]:
import gc
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

In [12]:
#del model
#del trainer
torch.cuda.empty_cache()
gc.collect()

710

In [13]:
from transformers import BigBirdPegasusForConditionalGeneration, BigBirdPegasusConfig

model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-bigpatent",
    block_size=16,
    num_random_blocks=3,
    attention_type="block_sparse")

In [14]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

576891904

In [15]:
def freeze_params(model):
  for par in model.parameters():
      par.requires_grad = False

def freeze_embeds(model):
  freeze_params(model.model.shared)
  freeze_params(model.model.encoder)

  for d in [model.model.decoder]:
    freeze_params(d.embed_positions)
    freeze_params(d.embed_tokens)

In [16]:
freeze_embeds(model)
# trainable params
sum(p.numel() for p in model.parameters() if p.requires_grad)

268617728

In [20]:
def compute_labels(row):
  # compute labels for each row
  labels = row["decoder_input_ids"].copy()
  labels[row["decoder_input_ids"] == 0] = -100

  row["labels"] = labels
  return row

dataset = dataset.map(compute_labels)

0ex [00:00, ?ex/s]

In [None]:
dataset["labels"][0]

In [21]:
d = dataset.select(range(1000))

In [22]:
d

Dataset({
    features: ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask', 'labels'],
    num_rows: 1000
})

In [23]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments("test_trainer", num_train_epochs=3, per_device_train_batch_size=1)
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=d
)

In [24]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  * num_indices_to_pick_from


Step,Training Loss
500,5.9147
1000,4.2635
1500,3.8887
2000,3.8465
2500,3.6684
3000,3.6597


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
  * num_indices_to_pick_from
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
  * num_indices_to_pick_from
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
  * num_indices_to_pick_from
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
  * num_indices_to_pick_from
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model w

TrainOutput(global_step=3000, training_loss=4.206904378255208, metrics={'train_runtime': 6543.6455, 'train_samples_per_second': 0.458, 'train_steps_per_second': 0.458, 'total_flos': 1.7329538727936e+16, 'train_loss': 4.206904378255208, 'epoch': 3.0})

In [57]:
del model
del trainer
torch.cuda.empty_cache()
gc.collect()

6781