# Fine-tuning BART-large on the CNN/DailyMail news dataset

In [1]:
### MODULES ###

import sys,os
import tqdm
import csv
from datetime import datetime 
import numpy as np
import pandas as pd
import json


from datasets import load_dataset

import torch
from torch import cuda
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter


from datasets import Dataset

# Load the ROUGE metric
import evaluate

from transformers import AutoTokenizer, BartForConditionalGeneration


#link : https://github.com/facebookresearch/fairseq/tree/main/examples/bart
#link : https://huggingface.co/docs/transformers/model_doc/bart
#link : https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/model#transformers.PreTrainedModel

In [2]:

# Number of CPUs reported by the OS; used further down as `num_proc` for `dataset.map`.
NUM_PROCS = os.cpu_count()
print("NUM_PROCS = ", NUM_PROCS)

# Report which device PyTorch could use on this machine.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# NOTE(review): the detected device is unconditionally overridden here, so every
# later cell runs on CPU even when CUDA was reported above — confirm this is intended.
device = 'cpu'


NUM_PROCS =  12
cuda


In [3]:
# Parse the JSON run configuration into a plain dict, then echo it for the record.
with open('./config_finetune_bart_large.json', 'r') as config_file:
    config = json.load(config_file)
print(config)
print(type(config))

SEED = config['config_machine']["SEED"]
# The DataLoader worker count is pinned here instead of being read from
# config['config_machine']["NUM_LOADER"]; it depends on the number of threads available.
NUM_LOADER = 4

# Reproducibility: seed NumPy and PyTorch, and request deterministic cuDNN kernels.
np.random.seed(SEED)      # numpy random seed
torch.manual_seed(SEED)   # pytorch random seed
torch.backends.cudnn.deterministic = True

{'config_machine': {'SEED': 42, 'NUM_LOADER': 50}, 'config_model': {'MODEL_TYPE': 'bart', 'MODEL_NAME': 'bart-large', 'MODEL_HUB': 'facebook/bart-large'}, 'config_generate': {'num_beams': 4, 'min_length': 14, 'max_length': 200, 'no_repeat_ngram_size': 3, 'repetition_penalty': 1.5, 'length_penalty': 2, 'early_stopping': True, 'do_sample': True}, 'config_training': {'max_len': 1024, 'TRAIN_BATCH_SIZE': 2, 'VALID_BATCH_SIZE': 4, 'LEARNING_RATE': 0.0001, 'NB_EPOCHS': 15, 'early_stopping_patience': 5, 'reduce_lr_patience': 2, 'reduce_lr_factor': 0.1}}
<class 'dict'>


# Load the CNN/DailyMail dataset

In [4]:
# Load version 3.0.0 of the CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Structure of the full dataset
print(dataset)

## Comment out this subsampling step for the real training run:
# each split is shuffled with the global seed and reduced to a fixed fraction of its rows.
percentage = 0.05

for split_name in dataset:
    shuffled = dataset[split_name].shuffle(seed=SEED)
    n_keep = int(len(dataset[split_name]) * percentage)
    dataset[split_name] = shuffled.select(range(n_keep))

# Structure of the subsampled dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 14355
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 668
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 574
    })
})


# Load the model and tokenizer

In [5]:
### Load model ###
# Hub identifier of the checkpoint to fine-tune, read from the JSON configuration.
MODEL_HUB = config["config_model"]["MODEL_HUB"]

# The tokenizer and the seq2seq model are both loaded from that same identifier.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_HUB,
    clean_up_tokenization_spaces=True,
)
model = BartForConditionalGeneration.from_pretrained(
    MODEL_HUB,
    forced_bos_token_id=0,
)

# Sanity checks: maximum sequence length, then the concrete classes that were resolved.
print(tokenizer.model_max_length)
for loaded in (tokenizer, model):
    print(type(loaded))

1024
<class 'transformers.models.bart.tokenization_bart_fast.BartTokenizerFast'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


In [6]:
def len_distrib(batch):
    """
    Tokenize a batch of examples and record their untruncated token lengths.

    Written for `dataset.map(..., batched=True)`: `batch["article"]` and
    `batch["highlights"]` are read as parallel lists of strings.
    Uses the module-level `tokenizer`.

    Returns:
        dict of per-example lists:
            input_ids, input_mask   : article token ids / attention mask,
                                      truncated to `tokenizer.model_max_length`
            input_len               : article token count before truncation
            target_ids, target_mask : highlight token ids / attention mask,
                                      truncated the same way
            target_len              : highlight token count before truncation
    """
    articles = batch["article"]
    highlights = batch["highlights"]
    max_len = tokenizer.model_max_length

    # Untruncated lengths: one batched tokenizer call per column instead of one
    # call per example inside a Python loop.
    full_article_ids = tokenizer(articles, truncation=False)["input_ids"]
    full_highlight_ids = tokenizer(highlights, truncation=False)["input_ids"]

    # Truncated encodings, which are what the model is actually fed.
    source = tokenizer(articles, truncation=True, max_length=max_len)
    resume = tokenizer(highlights, truncation=True, max_length=max_len)

    return {
        'input_ids': source['input_ids'],
        'input_mask': source['attention_mask'],
        'input_len': [len(ids) for ids in full_article_ids],
        'target_ids': resume['input_ids'],
        'target_mask': resume['attention_mask'],
        'target_len': [len(ids) for ids in full_highlight_ids],
    }


# Apply `len_distrib` to every split in batches of 64 examples, with `num_proc=NUM_PROCS`.
# NOTE: this line does not explicitly save the dataset; the mapped result is only
# rebound to `dataset`.
dataset = dataset.map(len_distrib,num_proc=NUM_PROCS,batched=True,batch_size=64)


In [12]:
## TO CONTINUE

# Define the custom collate function
def collate_fn(batch):
    """
    Collate tokenized examples into padded tensors for teacher-forced training.

    Each item of `batch` must provide `input_ids`, `input_mask`, `target_ids`
    and `target_mask` as sequences of ints (the columns built by `len_distrib`).
    Uses the module-level `tokenizer` for the padding token id.

    The target is split with a one-token offset: the decoder is fed
    `target_ids[:-1]` and the labels are `target_ids[1:]`, so position i of the
    decoder input is trained to predict target token i + 1.

    Returns:
        dict of `torch.long` tensors, each padded to the longest sequence of
        the batch:
            input_ids, attention_mask                 : encoder side
            decoder_input_ids, decoder_attention_mask : decoder side
            labels                                    : padded with -100
    """

    def _pad(key, padding_value, start=None, stop=None):
        # Slice each example's sequence, convert it to a tensor and right-pad the batch.
        sequences = [
            torch.tensor(item[key][start:stop], dtype=torch.long) for item in batch
        ]
        return pad_sequence(sequences, batch_first=True, padding_value=padding_value)

    pad_id = tokenizer.pad_token_id

    # `input_len` / `target_len` are deliberately not forwarded: the training and
    # evaluation loops unpack this dict straight into `model(**batch)`.
    return {
        'input_ids': _pad('input_ids', pad_id),
        'attention_mask': _pad('input_mask', 0),
        'decoder_input_ids': _pad('target_ids', pad_id, stop=-1),
        'decoder_attention_mask': _pad('target_mask', 0, stop=-1),
        # Pad the labels with -100 directly so padding is ignored by the loss.
        # Padding with `pad_id` and then replacing every `pad_id` by -100 would
        # also mask genuine target tokens whose id happens to equal `pad_id`.
        'labels': _pad('target_ids', -100, start=1),
    }


# Loader settings for training; the evaluation settings below are derived from them.
train_params = dict(
    batch_size=config["config_training"]["TRAIN_BATCH_SIZE"],
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=NUM_LOADER,
    pin_memory=True,  # Enables faster GPU transfers
)

# Same settings for evaluation, except for the batch size and no shuffling.
eval_params = dict(
    train_params,
    batch_size=config["config_training"]["VALID_BATCH_SIZE"],
    shuffle=False,
)


# These loaders feed the training and validation stages further down.
train_loader = DataLoader(dataset["train"], **train_params)
eval_loader = DataLoader(dataset["validation"], **eval_params)

# Smoke test: print the first collated training batch only.
for batch in train_loader:
    print(batch)
    break


{'input_ids': tensor([[    0,  4030,  3534,  ...,     1,     1,     1],
        [    0,  1640, 16256,  ...,     5, 23808,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'decoder_input_ids': tensor([[    0,   448,  3853,   661,  7369, 30421,   877,    10,  1212,  4318,
             6,   490,   668,    15,    10,   559,  4243, 26671,   479, 50118,
         36342,     5,   706,   848,    16,     5,  3787,     9,    41,  1475,
            12,   448,  3853,   661, 11941, 17157,     6,   799,   249,  1024,
           479, 50118,  2895,    82,    11,     5, 22369,    58,    31,   666,
            18,  2255,  1148,   537,   479, 50118, 25973,   692,  1554,   119,
         14669,  3657,  5695,     5,  1710,     6, 23297,     7,   465, 17685,
            29,   479],
        [    0,   250,    92,   266,   924,    14, 37442,  2099,   227,   604,
            16,    15,     5,  1430,   479, 50118,  6323,    33,  8113,   119,
          1720,  3240,  

In [8]:

# with torch.no_grad():
#     model.to(device)
#     model.eval()
#     batch = {key: val.to(device) for key, val in batch.items()}
#     output = model.generate(input_ids = batch["input_ids"], attention_mask= batch["attention_mask"], **config['config_generate'])
#     decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


#     print("Generate summary: " , decoded_output,"\n\n")
#     print("Expected summary: " , tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

# Training the model weights and saving checkpoints

In [13]:

def train(model, device, loader, optimizer, epoch, writer):
    """
    Run one training epoch over `loader` and return the mean batch loss.

    Args:
        model: called as `model(**batch)`; the first element of its output is
            used as the loss. It is expected to already be on `device`.
        device: device every tensor of the batch is moved to.
        loader: sized iterable yielding batches as dicts of tensors.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index, used to compute the global logging step.
        writer: object exposing `add_scalar(tag, value, step)`.

    Returns:
        dict: `{"loss": mean of the per-batch losses over the epoch}`.
    """
    num_batches = len(loader)
    total_loss = 0.0

    model.train()

    # `enumerate` hides the loader's length, so pass `total` explicitly to let
    # tqdm display progress towards completion.
    progress = tqdm.tqdm(
        enumerate(loader),
        total=num_batches,
        desc=f'Total iterations: {num_batches}',
        unit=" it",
    )
    for step, batch in progress:
        batch = {key: val.to(device) for key, val in batch.items()}

        optimizer.zero_grad()
        loss = model(**batch)[0]
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Log the loss exactly as the model returned it, as a plain float.
        # Dividing it by the batch size made the curve depend on the batch size
        # and disagree with the epoch-level mean returned below.
        writer.add_scalar("batch_loss/train", batch_loss, epoch * num_batches + step)

    return {"loss": total_loss / num_batches}

def eval(model, device, loader, epoch, writer):
    """
    Evaluate `model` over `loader` without gradients and return the mean batch loss.

    Note: the name shadows the built-in `eval`; it is kept because other cells
    call this function by that name.

    Args:
        model: called as `model(**batch)`; the first element of its output is
            used as the loss. It is expected to already be on `device`.
        device: device every tensor of the batch is moved to.
        loader: sized iterable yielding batches as dicts of tensors.
        epoch: zero-based epoch index, used to compute the global logging step.
        writer: object exposing `add_scalar(tag, value, step)`.

    Returns:
        dict: `{"loss": mean of the per-batch losses over the loader}`.
    """
    num_batches = len(loader)
    total_loss = 0.0

    model.eval()

    with torch.no_grad():
        # `enumerate` hides the loader's length, so pass `total` explicitly to
        # let tqdm display progress towards completion.
        progress = tqdm.tqdm(
            enumerate(loader),
            total=num_batches,
            desc=f'Total iterations: {num_batches}',
            unit=" it",
        )
        for step, batch in progress:
            batch = {key: val.to(device) for key, val in batch.items()}

            batch_loss = model(**batch)[0].item()
            total_loss += batch_loss

            # Validation losses get their own tag: logging them under
            # "batch_loss/train" mixed them into the training curve. The value is
            # logged as returned by the model, without dividing by the batch size.
            writer.add_scalar("batch_loss/eval", batch_loss, epoch * num_batches + step)

    return {"loss": total_loss / num_batches}

In [None]:
# Initialisation
# A timestamp with minute resolution names both the checkpoint and the log directory.
time = datetime.now().strftime("%Y-%m-%d_%Hh%M")

checkpoint_path = f"./models_checkpoints/Bart-large-{time}"
log_dir = f"./logs/Bart-large-{time}"
for run_dir in (checkpoint_path, log_dir):
    os.makedirs(run_dir, exist_ok=True)

writer = SummaryWriter(log_dir=log_dir)

# Training hyper-parameters, all read from the JSON configuration.
training_cfg = config['config_training']
early_stopping_patience = training_cfg["early_stopping_patience"]
reduce_lr_patience = training_cfg["reduce_lr_patience"]
reduce_lr_factor = training_cfg["reduce_lr_factor"]

# Optimizer used to tune the network weights during the training session.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=training_cfg["LEARNING_RATE"],
)

# Scheduler stepped on a quantity to minimise (mode='min').
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=reduce_lr_patience,
    factor=reduce_lr_factor,
)


In [15]:
# Baseline: validation loss of the pretrained model before any fine-tuning.
# The model is moved to `device` first (as the final `main(...)` call does), because
# `eval` only moves the batches, not the model.
# NOTE(review): this logs with epoch index 0, i.e. the same global steps that the
# first epoch of `main` will use for its own evaluation.
eval(model.to(device), device, eval_loader, 0, writer)

Total iterations: 167: 167 it [13:26,  4.83s/ it]


{'loss': 2.0759328146894536}

In [None]:
def main(model, device, train_loader, eval_loader, optimizer, writer, scheduler, checkpoint_path, NB_EPOCHS):
    """
    Fine-tune `model` for up to `NB_EPOCHS` epochs with learning-rate
    scheduling, best-checkpoint saving and early stopping.

    Each epoch runs `train` then `eval`, logs both epoch losses and the current
    learning rate to `writer`, and steps `scheduler` on the validation loss.
    Whenever the validation loss reaches a new minimum, the model and the
    tokenizer are saved to `checkpoint_path`. Training stops once the
    validation loss has failed to improve for `early_stopping_patience`
    consecutive epochs.

    Relies on the module-level `tokenizer` and `early_stopping_patience`.

    Args:
        model: model to fine-tune; must expose `save_pretrained(path)`.
        device: device passed through to `train` and `eval`.
        train_loader: loader passed to `train`.
        eval_loader: loader passed to `eval`.
        optimizer: optimizer passed to `train`; its `param_groups` are read to
            log the learning rate.
        writer: object exposing `add_scalar`, `add_scalars` and `close`.
        scheduler: stepped once per epoch with the validation loss.
        checkpoint_path: directory where the best model and tokenizer are saved.
        NB_EPOCHS: maximum number of epochs.

    Returns:
        None. The writer is closed when the function exits, whether training
        completes, stops early or raises.
    """
    best_loss = float('inf')
    # Stays None if no epoch ever improves on `best_loss` (e.g. NaN losses),
    # so the early-stopping report below can always be printed.
    epoch_best = None
    patience_counter = 0
    current_lr = 0

    eval_loss, train_loss = [], []

    try:
        for epoch in range(NB_EPOCHS):
            outputs_train = train(model, device, train_loader, optimizer, epoch, writer)
            outputs_eval = eval(model, device, eval_loader, epoch, writer)

            train_loss.append(outputs_train["loss"])
            eval_loss.append(outputs_eval["loss"])

            writer.add_scalars("loss", {"train": outputs_train["loss"], "eval": outputs_eval["loss"]}, epoch)

            # Record the current learning rate (read before the scheduler update below).
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
                writer.add_scalar("lr", current_lr, epoch)

            # Let the scheduler lower the learning rate when the validation loss
            # stops decreasing according to its criteria.
            scheduler.step(eval_loss[-1])

            # Early stopping
            if eval_loss[-1] < best_loss:
                best_loss = eval_loss[-1]
                patience_counter = 0
                # Save the best model so far, together with its tokenizer.
                model.save_pretrained(checkpoint_path)
                tokenizer.save_pretrained(checkpoint_path)
                epoch_best = epoch+1
                print(f'Find a better model at the epoch {epoch+1} - Train Loss: {train_loss[-1]:.4f}, eval Loss: {eval_loss[-1]:.4f}')
            else:
                patience_counter += 1
                if patience_counter >= early_stopping_patience:
                    print("Early stopping triggered")
                    print("current_lr : ", current_lr)
                    print("epoch_best : ", epoch_best)
                    break
    finally:
        # Close the writer exactly once, after the last epoch. It used to be
        # closed at the end of every epoch and skipped on the early-stopping break.
        writer.close()

In [None]:
# Launch fine-tuning for the configured number of epochs, with the model moved to `device`.
main(
    model=model.to(device),
    device=device,
    train_loader=train_loader,
    eval_loader=eval_loader,
    optimizer=optimizer,
    writer=writer,
    scheduler=scheduler,
    checkpoint_path=checkpoint_path,
    NB_EPOCHS=config['config_training']["NB_EPOCHS"],
)