# Finetuning Bart large on CNN daily news dataset

In [1]:
### MODULES ###

import sys,os
import tqdm
import csv
import numpy as np
import pandas as pd
import json
import torch

from datasets import load_dataset

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Load the ROUGE metric
import evaluate

from transformers import AutoTokenizer, BartForConditionalGeneration

# Load the ROUGE metric
import evaluate


#link : https://github.com/facebookresearch/fairseq/tree/main/examples/bart

In [None]:

NUM_PROCS = os.cpu_count() 

print("NUM_PROCS = " ,NUM_PROCS)

MODEL_HUB = "facebook/bart-large"

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)


NUM_PROCS =  12
cuda


In [None]:
# Load configuration from JSON
with open('./config_finetune_bart_large.json', 'r') as f:
    config = json.load(f)
    print(config)
    print(type(config))

SEED = config['config_machine']["SEED"]
NUM_LOADER = 4 #config['config_machine']["NUM_LOADER"] #depends of the number of thread 


# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

{'config_machine': {'SEED': 42, 'NUM_LOADER': 50}, 'config_generate': {'NUM_BEAM': 4, 'min_len_resume': 14, 'max_len_resume': 159, 'no_repeat_ngram_size': 3, 'repetition_penalty': 1.5, 'length_penalty': 2, 'early_stopping': True, 'use_cache': False}, 'config_training': {'max_len': 1024, 'BATCH_SIZE': 16}}
<class 'dict'>


# Load dataset CNN daily

In [None]:
# Load CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Check the dataset structure
print(dataset)

## Comment this part for the real training time :

percentage = 0.05

for split in dataset: 
    dataset[split] = dataset[split].shuffle(seed=SEED).select(range(int(len(dataset[split]) * percentage)))

# Check the dataset structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 14355
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 668
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 574
    })
})


# load the model and tokenizer 

In [None]:
### Load model ###

# Load Model and Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_HUB, clean_up_tokenization_spaces=True)
model = BartForConditionalGeneration.from_pretrained(MODEL_HUB)
print(tokenizer.model_max_length)

In [18]:
def len_distrib(batch):

    len_articles = []
    len_highlights = []
    
    for article, highlight in zip(batch["article"], batch["highlights"]):
        len_articles.append(len(tokenizer(article, truncation=False)["input_ids"]))
        len_highlights.append(len(tokenizer(highlight, truncation=False)["input_ids"]))


    source = tokenizer(batch["article"],truncation=True, max_length=tokenizer.model_max_length)
    resume = tokenizer(batch["highlights"],truncation=True, max_length=tokenizer.model_max_length)

    return {
        'input_ids': source['input_ids'], 
        'input_mask': source['attention_mask'],
        'input_len': len_articles,
        'target_ids': resume['input_ids'], 
        'target_mask': resume['attention_mask'],
        'target_len': len_highlights
        }


dataset = dataset.map(len_distrib,num_proc=NUM_PROCS,batched=True,batch_size=64)# Save the Hugging Face dataset


Map (num_proc=12):   0%|          | 0/14355 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1267 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1055 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1283 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1232 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1297 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=12):   0%|          | 0/668 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1138 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1060 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1198 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1103 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1441 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=12):   0%|          | 0/574 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1075 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1060 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1200 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1182 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1868 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

In [35]:

# Define the custom collate function
def collate_fn(batch):
    """
    Custom collate function that add padding for each batch.
    """

    id = [item['id'] for item in batch]

    # Pad the tokenized content
    padded_text_ids = pad_sequence(
        [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch], 
        batch_first=True, 
        padding_value=tokenizer.pad_token_id)
    
    padded_text_mask = pad_sequence(
        [torch.tensor(item['input_mask'], dtype=torch.long) for item in batch], 
        batch_first=True, 
        padding_value=0)

    decoder_input_ids = pad_sequence(
        [torch.tensor(item['target_ids'], dtype=torch.long) for item in batch], 
        batch_first=True, 
        padding_value=tokenizer.pad_token_id)     
    
    decoder_attention_mask = pad_sequence(
        [torch.tensor(item['target_mask'], 
                      dtype=torch.long) for item in batch], 
                      batch_first=True, 
                      padding_value=0)
    
    input_len = [item['input_len'] for item in batch]

    target_len = [item['target_len'] for item in batch]

    highlights = [item['highlights'] for item in batch]

    
    

    return {
        'id':id,
        'input_ids':padded_text_ids,
        'attention_mask':padded_text_mask,
        'decoder_input_ids':decoder_input_ids,
        'target_mask':decoder_attention_mask,
        'input_len': input_len ,
        'target_len': target_len,
        'highlights': highlights
    }


params = {
    'batch_size': BATCH_SIZE,
    'shuffle': False,
    'collate_fn':collate_fn,
    'num_workers': NUM_LOADER,
    'pin_memory': True  #  Enables faster GPU transfers
    }

# This will be used down for training and validation stage for the model.
loader = DataLoader(dataset, **params)

for batch in loader:
    print(batch)
    break


{'id': ['08cf276c9eadb638e0c7fdc83ce0229c8af5d09b', 'a0965f34cb08bd7db5845f8285dc8a9512d3e590'], 'input_ids': tensor([[    0, 18581,  3916,  ...,     1,     1,     1],
        [    0, 18581,  3916,  ...,   185,   480,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'decoder_input_ids': tensor([[    0, 10567,     8,   479, 50118, 37779,  5460,  4350,    58,  2967,
          2863,    49,  1354,     6,  3066, 26743,   479, 50118,  1213,    58,
           303,    23, 31103,   271,   459,  2193,   861,    11, 21690,   479,
         50118, 40333,   224,     5,   130,   962,     9,  4363,  6154, 24260,
           479, 50118,  5873,  4060,   154,   479,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1],
        [    0,  5341,    35,  7662,    64,  1807,    25,  1246,     9,  4601,
             6,   735,   446,  1565,   161, 

In [25]:

rouge = evaluate.load('rouge')
candidates = ["Summarization is cool","I love Machine Learning","Good night"]

references = ["summarization is beneficial and cool","i think i love Machine Learning","Good night everyone!"]
             
results = rouge.compute(predictions=candidates, references=references)
print(results)

{'rouge1': 0.7833333333333332, 'rouge2': 0.5833333333333334, 'rougeL': 0.7833333333333332, 'rougeLsum': 0.7833333333333332}


In [36]:
import csv

with open('./rouge.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["rouge1", "rouge2", "rougeL"]
    writer.writerow(field)

with open('./len.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["id", "input_len", "target_len", "generate_len"]
    writer.writerow(field)

In [37]:


model.eval()
model.to(device)

rouge1_score, rouge2_score , rougeL_score = 0, 0, 0
nb_sample = 0

exclude_ids = torch.tensor([0, 1, 2, 3, 50264]).to(device)

with torch.no_grad():
    
    for _, batch in tqdm.tqdm(enumerate(loader, 0),desc=f'total iter: {len(loader)}', unit=" iter"):
        

        generated_ids = model.generate(
              input_ids = batch["input_ids"].to(device),
              attention_mask = batch["attention_mask"].to(device), 
              max_length=max_len_resume, 
              num_beams=NUM_BEAM,
              repetition_penalty=repetition_penalty, 
              length_penalty=length_penalty, 
              early_stopping=early_stopping
              )   
        #print(generated_ids)

        generated_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        #print(generated_txt)
        #print(type(generated_txt))

        mask = ~torch.isin(generated_ids, exclude_ids) #mask to skip the special tokens 
        generate_len = mask.sum(dim=1)  

        with open('./len.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerows([[batch["id"][i], batch["input_len"][i], batch["target_len"][i], generate_len[i].item()] for i in range(BATCH_SIZE)])

        # Compute ROUGE scores here
        rouge_results = rouge.compute(predictions=generated_txt, references=batch["highlights"])
        
        
        with open('./rouge.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([rouge_results['rouge1'], rouge_results['rouge2'], rouge_results['rougeL']])

        rouge1_score += rouge_results['rouge1'].item()
        rouge2_score += rouge_results['rouge2'].item()
        rougeL_score += rouge_results['rougeL'].item()

        nb_sample+=1

        if nb_sample == 2:
            break
        

with open('./rouge_total.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["Total_rouge1", "Total_rouge2", "Total_rougeL"]
    writer.writerow(field)
    writer.writerow([rouge1_score/nb_sample*100, rouge2_score/nb_sample*100, rougeL_score/nb_sample*100])


total iter: 14356: 1 iter [01:15, 75.20s/ iter]


# Trash code 