This script performs a fine tuning of BART to perform summaries.

About the DataSet:

DataSet consists of 295,174 news articles scrapped from a Mexican Newspaper, along with its summary. Summaries were created using `StableBeluga-7B`. I left the LLM running for several days (weeks) in order to get all the summaries. The teacher observations are used for BART fine tunning.

The objective for this is to have a lightweight model that can perform summarization as good as `StableBeluga-7B`, much faster and with much less computing resources.

In [1]:
import json
import pandas as pd
import torch
import numpy as np
import time
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large"  # or another BART variant
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = BartTokenizer.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Load the StableBeluga summaries data set that was pre-processed in 1-make_dataset notebook

with open('datasets/BART_data_set.json', "r") as f:
    train_set = json.load(f)

In [5]:
summs_df = pd.DataFrame.from_dict(train_set)

In [6]:
# split in train and validation
train_df, val_df = train_test_split(summs_df, test_size=0.15)

In [7]:
# save train and validation data
val_df.to_json("datasets/BART_validation_data.json", orient='records')
train_df.to_json("datasets/BART_train_data.json", orient='records')

In [7]:
# Define a custom dataset class
class NewsSummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=1024, padding='max_length'):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len
        self.padding = padding

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # article is the input
        article = self.data.iloc[idx]['article']
        # summary will be the target label
        summary = self.data.iloc[idx]['summary']
        inputs = self.tokenizer(
            article, 
            max_length=self.max_len, 
            truncation=True, 
            padding=self.padding,  # Enable padding
            return_tensors="pt"
        )
        targets = self.tokenizer(
            summary, 
            max_length=self.max_len, 
            truncation=True, 
            padding=self.padding,  # Enable padding
            return_tensors="pt"
        )
        return inputs.input_ids.squeeze(), targets.input_ids.squeeze()

# Prepare the dataset and data loader
train_dataset = NewsSummaryDataset(train_df, tokenizer)
val_dataset = NewsSummaryDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

In [8]:
# Adam optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 2e-5, # args.learning_rate
    eps = 1e-8 # args.adam_epsilon
)

In [9]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0, # Default value in run_glue.py
    num_training_steps = total_steps
)

In [10]:
# Define the number of steps to accumulate gradients
accumulation_steps = 16  # Adjust this according to your needs

# Put the model into training mode. Don't be mislead--the call to 
# `train` just changes the *mode*, it doesn't *perform* the training.
# `dropout` and `batchnorm` layers behave differently during training
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
model.train()
for epoch in range(epochs):  # number of epochs
    # Clear the gradients
    model.zero_grad()
    # empty gpu cache to free memory
    torch.cuda.empty_cache()
    for step, batch in enumerate(train_loader):
        
        articles, summaries = batch
        articles = articles.to(device)
        summaries = summaries.to(device)

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(input_ids=articles, labels=summaries)
        # Update loss
        loss = outputs.loss / accumulation_steps  # Normalize the loss
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        if (step + 1) % accumulation_steps == 0:
            # Clip the norm of the gradients to 1.0 to prevent the "exploding gradients" problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters
            optimizer.step()
            # Update the learning rate.
            scheduler.step() # Uncomment if using a learning rate scheduler
            # Clear the gradients
            model.zero_grad()
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item() * accumulation_steps}", end='\r')  # Un-normalize the loss for reporting
            

    # Perform an optimization step if there are any unprocessed gradients remaining after the last batch
    if len(train_loader) % accumulation_steps != 0:
        # Clip the norm of the gradients to 1.0 to prevent the "exploding gradients" problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step() # Uncomment if using a learning rate scheduler
        # Clear the gradients
        model.zero_grad()

Epoch 2, Step 125439, Loss: 0.037962757050991064

In [11]:
# Save the model
model.save_pretrained("./model_save/bart_summarizer")
tokenizer.save_pretrained("./model_save/bart_summarizer")

('./model_save/bart_summarizer\\tokenizer_config.json',
 './model_save/bart_summarizer\\special_tokens_map.json',
 './model_save/bart_summarizer\\vocab.json',
 './model_save/bart_summarizer\\merges.txt',
 './model_save/bart_summarizer\\added_tokens.json')