# ROUGE evaluation of BART-large-CNN on CNN/DailyMail

In [33]:
import sys,os
import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from datasets import load_from_disk

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Load the ROUGE metric
import evaluate

from transformers import AutoTokenizer

In [34]:

NUM_PROCS = os.cpu_count() 
NUM_LOADER = 8

print("NUM_PROCS = " ,NUM_PROCS)

MODEL_HUB = "facebook/bart-large-cnn"

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'



max_len = 1024


BATCH_SIZE =2

NUM_BEAM = 5
max_len_resume = 200
repetition_penalty=2.0
length_penalty=1.0
early_stopping=True




NUM_PROCS =  12


In [35]:
from transformers import AutoTokenizer, BartForConditionalGeneration

# Load Model and Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_HUB, clean_up_tokenization_spaces=True)
model = BartForConditionalGeneration.from_pretrained(MODEL_HUB)

print(model.config.max_position_embeddings) 


1024


In [36]:
# Load dataset (e.g., CNN/DailyMail)
dataset = load_dataset("cnn_dailymail", "3.0.0", split='train')
# Load the saved dataset
#dataset = load_from_disk('data/cnn_dailymail')

In [37]:
def len_distrib(batch):

    len_articles = []
    len_highlights = []
    
    # Prefix the "summarize: " instruction to each article (can be adjusted depending on your task)
    batch["article"] = ["summarize: " + article for article in batch["article"]]

    for article, highlight in zip(batch["article"], batch["highlights"]):
        len_articles.append(len(tokenizer(article, truncation=False)["input_ids"]))
        len_highlights.append(len(tokenizer(highlight, truncation=False)["input_ids"]))


    source = tokenizer(batch["article"],truncation=True, max_length=max_len)
    resume = tokenizer(batch["highlights"],truncation=True, max_length=max_len)

    return {
        'input_ids': source['input_ids'], 
        'input_mask': source['attention_mask'],
        'input_len': len_articles,
        'target_ids': resume['input_ids'], 
        'target_mask': resume['attention_mask'],
        'target_len': len_highlights
        }



dataset = dataset.map(len_distrib,num_proc=NUM_PROCS,batched=True,batch_size=32)# Save the Hugging Face dataset
dataset.save_to_disk('data/cnn_dailymail')
print("Dataset saved successfully.")


Map (num_proc=12):   0%|          | 0/287113 [00:00<?, ? examples/s]

Saving the dataset (0/9 shards):   0%|          | 0/287113 [00:00<?, ? examples/s]

Dataset saved successfully.


In [38]:

# Define the custom collate function
def collate_fn(batch):
    """
    Custom collate function that add padding for each batch.
    """

    # Pad the tokenized content
    padded_text_ids = pad_sequence(
        [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch], 
        batch_first=True, 
        padding_value=tokenizer.pad_token_id)
    
    padded_text_mask = pad_sequence(
        [torch.tensor(item['input_mask'], dtype=torch.long) for item in batch], 
        batch_first=True, 
        padding_value=0)

    decoder_input_ids = pad_sequence(
        [torch.tensor(item['target_ids'], dtype=torch.long) for item in batch], 
        batch_first=True, 
        padding_value=tokenizer.pad_token_id)     
    
    decoder_attention_mask = pad_sequence(
        [torch.tensor(item['target_mask'], 
                      dtype=torch.long) for item in batch], 
                      batch_first=True, 
                      padding_value=0)
    
    input_len = [item['input_len'] for item in batch]

    target_len = [item['target_len'] for item in batch]
    

    return {
        'input_ids':padded_text_ids,
        'attention_mask':padded_text_mask,
        'decoder_input_ids':decoder_input_ids,
        'target_mask':decoder_attention_mask,
        'input_len': input_len ,
        'target_len': target_len
    }


params = {
    'batch_size': BATCH_SIZE,
    'shuffle': False,
    'collate_fn':collate_fn,
    'num_workers': NUM_LOADER
    }

# This will be used down for training and validation stage for the model.
loader = DataLoader(dataset, **params)

for batch in loader:
    print(batch)
    break


{'input_ids': tensor([[    0, 18581,  3916,  ...,     1,     1,     1],
        [    0, 18581,  3916,  ...,  1441,   479,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'decoder_input_ids': tensor([[    0, 29345, 10997,   999,  3028,  7312, 20152,  1516,   984,   844,
           448, 13016,    25,    37,  4072,   504,   302,   479, 50118, 22138,
          2701,   161,    37,    34,   117,   708,     7,   856,  3961,  1334,
            39,  1055,   409,   479, 50118, 28243, 20152,    18,  1107,    31,
            78,   292, 10997,  3541,    33,    57,   547,    11,  2416,  1391,
           479,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [    0,   448,  1342,  2368,  4812,  8039,    11,  2561,    32, 15740,
            15,     5,    22,  1990, 35095,  1929,   113, 50118, 40145,  5031,
          1063,  1594,  

In [39]:

rouge = evaluate.load('rouge')
candidates = ["Summarization is cool","I love Machine Learning","Good night"]

references = ["Summarization is beneficial and cool","I think i love Machine Learning","Good night everyone!"]
             
results = rouge.compute(predictions=candidates, references=references)
print(results)

{'rouge1': 0.7833333333333332, 'rouge2': 0.5833333333333334, 'rougeL': 0.7833333333333332, 'rougeLsum': 0.7833333333333332}


In [40]:
import csv

with open('./rouge.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["rouge1", "rouge2", "rougeL"]
    writer.writerow(field)

with open('./len.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["input_len","target_len", "generate_len"]
    writer.writerow(field)

In [41]:
import tqdm

model.eval()
model.to(device)

rouge1_score, rouge2_score , rougeL_score = 0, 0, 0
nb_sample = 0

exclude_ids = torch.tensor([0, 1, 2, 3, 50264]).to(device)

with torch.no_grad():
    
    for _, batch in tqdm.tqdm(enumerate(loader, 0),desc=f'total iter: {len(loader)}', unit=" iter"):
        
        y_txt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)

        generated_ids = model.generate(
              input_ids = batch["input_ids"].to(device),
              attention_mask = batch["attention_mask"].to(device), 
              max_length=max_len_resume, 
              num_beams=NUM_BEAM,
              repetition_penalty=repetition_penalty, 
              length_penalty=length_penalty, 
              early_stopping=early_stopping
              )   
        #print(generated_ids)

        generated_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        #print(generated_txt)

        mask = ~torch.isin(generated_ids, exclude_ids)  
        generate_len = mask.sum(dim=1)  

        for i in range(BATCH_SIZE):
            with open('./len.csv', 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([batch["input_len"][i],batch["target_len"][i],generate_len[i].item()])

        # Compute ROUGE scores here
        rouge_results = rouge.compute(predictions=generated_txt, references=y_txt)
        
        
        with open('./rouge.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([rouge_results['rouge1'], rouge_results['rouge2'], rouge_results['rougeL']])

        rouge1_score += rouge_results['rouge1']
        rouge2_score += rouge_results['rouge2']
        rougeL_score += rouge_results['rougeL']

        nb_sample+=1

        if nb_sample == 3:
            break
        

with open('./rouge_total.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["rouge1", "rouge2", "rougeL"]
    writer.writerow(field)
    writer.writerow([rouge1_score/nb_sample*100, rouge2_score/nb_sample*100, rougeL_score/nb_sample*100])


total iter: 143557: 2 iter [00:30, 15.21s/ iter]


In [47]:
filtered_dataset = dataset.filter(lambda example: example["id"] == "42c027e4ff9730fbb3de84c1af0d2c506e41c3e4")
filtered_dataset

Filter:   0%|          | 0/287113 [00:00<?, ? examples/s]

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'input_mask', 'input_len', 'target_ids', 'target_mask', 'target_len'],
    num_rows: 1
})