# Replicating the BARTScore Results for Summarization from "Evaluating Generated Text as Text Generation" by Weizhe Yuan, Graham Neubig, and Pengfei Liu

The objective of this project was to replicate the results from the paper "Evaluating Generated Text as Text Generation" by Weizhe Yuan, Graham Neubig, and Pengfei Liu, specifically for the summarization task. For this purpose, we used the datasets provided on GitHub at https://github.com/neulab/BARTScore/tree/main/SUM, as recommended in the paper.

The project is divided into three parts:

Dataset Analysis: In the first part, we explored the datasets to understand their structure and components.

Custom BART Scorer: In the second part, we implemented a custom (vanilla version) BART scorer from scratch to evaluate the summarization quality.

Evaluation and Comparison: In the final part, we computed evaluation scores using several metrics mentioned in the paper, including ROUGE-1, ROUGE-2, ROUGE-L, BERTScore, MoverScore, and PRISM. We then compared the results obtained from these metrics with the scores from our custom BARTScore implementation and the scores provided in the dataset.



## 1. Dataset Analysis

In [65]:
! pip install rouge_score evaluate torch transformers 
! pip install moverscore pyemd pytorch_pretrained_bert
! pip install bert-score
from scipy.stats import spearmanr
import requests
import pickle
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import evaluate
from nltk.tokenize import sent_tokenize
import random
import pandas as pd
from moverscore import word_mover_score, get_idf_dict
import numpy as np
from bert_score import BERTScorer




### 1.a For the REALSumm dataset

In [57]:
# URLs of the raw dataset pickles, keyed by the local filename each one is saved under
urls = {
    'summ_eval.pkl': 'https://raw.githubusercontent.com/neulab/BARTScore/main/SUM/SummEval/data.pkl',
    # The following cells read 'real_summ.pkl', so it has to be downloaded as well.
    # NOTE(review): path assumed to mirror the SummEval layout of the BARTScore repo — confirm.
    'real_summ.pkl': 'https://raw.githubusercontent.com/neulab/BARTScore/main/SUM/REALSumm/data.pkl',
}

# Download each file
for filename, url in urls.items():
    # requests has no default timeout; set one so a stalled connection cannot hang the cell
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f'Successfully downloaded {filename}')
    else:
        print(f'Failed to download {filename} from {url}')


Successfully downloaded summ_eval.pkl


In [58]:
# Deserialize the dataset pickle from disk, then report the type of the loaded object.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load trusted files.
with open('real_summ.pkl', 'rb') as pickle_file:
    real_summ = pickle.load(pickle_file)
print(type(real_summ))

<class 'dict'>


In [59]:
def print_structure(d, indent=0):
    """
    Print the keys of a (possibly nested) dictionary as an indented bullet list.

    Each key is printed on its own line as "- key", prefixed by two spaces per
    nesting level. Values that are themselves dictionaries are descended into;
    all other values are ignored.

    :param d: The dictionary to traverse
    :param indent: Nesting depth of ``d``; controls the leading indentation
    """
    prefix = "  " * indent
    for name, child in d.items():
        print(f"{prefix}- {name}")
        if not isinstance(child, dict):
            continue  # leaf value: nothing below it to print
        print_structure(child, indent + 1)

In [60]:
# Draw one dataset entry at random and print its nested key layout
dataset_entries = list(real_summ.values())
random_element_real_summ = random.choice(dataset_entries)
print_structure(random_element_real_summ)

- src
- ref_summ
- sys_summs
  - presumm_out_trans_abs.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - two_stage_rl_out.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - unilm_out_v2.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - t5_out_large.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - presumm_out_ext_abs.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - ptr_generator_out_pointer_gen_cov.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - bart_out.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - fast_abs_rl_out_rerank.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - t5_out_11B.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - presumm_out_abs.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - bottom_up_out.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - unilm_out_v1.txt
    - sys_summ
    - scores
      - litepyramid_recall
  - t5_out

## 2. Custom BART Scorer

In [11]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [12]:
# Custom BART scorer built on top of a pretrained BART checkpoint
class CustomBartScorer:
    """Score a target text by its average token log-probability under BART,
    conditioned on a source text."""

    def __init__(self, model_name="facebook/bart-large-cnn", device=device):
        """
        Load the tokenizer and model used for scoring.
        Args:
            model_name (str): Pretrained BART model checkpoint.
            device (str): Device the model and inputs are moved to.
        """
        self.device = device
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        bart = BartForConditionalGeneration.from_pretrained(model_name)
        bart.to(device)
        bart.eval()  # evaluation mode
        self.model = bart

    def _encode(self, text):
        """Tokenize ``text`` (truncated to 1024 tokens) and move the tensors to the scorer's device."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=1024,
            truncation=True,
            padding=True,
        )
        return encoded.to(self.device)

    def compute_log_probs(self, src_text, tgt_text):
        """
        Compute the length-normalized log probability of the target given the source.
        Args:
            src_text (str): Source text
            tgt_text (str): Target text
        Returns:
            log_prob (float): Sum of the target tokens' log probabilities divided
                by the number of non-padding target tokens.
        """
        source = self._encode(src_text)
        target = self._encode(tgt_text)
        label_ids = target["input_ids"]
        keep_mask = target["attention_mask"]

        # The target ids are passed as labels; only the logits of the output are used
        with torch.no_grad():
            logits = self.model(**source, labels=label_ids).logits

        # Turn logits into log probabilities over the last (vocabulary) dimension
        vocab_log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        # Select, at every position, the log probability assigned to the label token
        picked = vocab_log_probs.gather(2, label_ids.unsqueeze(-1)).squeeze(-1)

        # Zero out positions the attention mask marks as padding, then average over the rest
        summed = (picked * keep_mask).sum(dim=1)
        token_count = keep_mask.sum(dim=1)
        averaged = summed / token_count

        # .item() requires a single-element tensor, i.e. one (source, target) pair per call
        return averaged.item()

    def compute_bartscore(self, src, tgt):
        """
        Compute BartScore for a given source and target text.
        Args:
            src (str): Source text.
            tgt (str): Target text.
        Returns:
            score (float): The value returned by ``compute_log_probs(src, tgt)``.
        """
        score = self.compute_log_probs(src, tgt)
        return score


In [13]:
# Reuse the globally selected `device` so the scorer also loads on CPU-only machines
scorer = CustomBartScorer(model_name="facebook/bart-large-cnn", device=device)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

## 3. Evaluation and Comparison


In [15]:
# Load the "rouge" metric through the `evaluate` library; reused for every summary pair below
rouge = evaluate.load("rouge")

In [16]:
# Store ROUGE-1 / ROUGE-2 / ROUGE-L and BartScore results.
# Each list receives one value per (document, system summary) pair, in iteration order,
# so the lists stay aligned with each other.
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
bart_scores = []

# Loop over each element in our dataset
for element in real_summ.values():
    src = element["src"]
    ref_summ = element["ref_summ"]

    # For each system summary of this document, compute ROUGE and BartScore
    for sys_sum_dict in element["sys_summs"].values():
        sys_sum = sys_sum_dict["sys_summ"]

        # ROUGE compares the system summary against the reference summary
        rouge_result = rouge.compute(predictions=[sys_sum], references=[ref_summ])

        # BartScore scores the system summary conditioned on the source document.
        # NOTE(review): the paper also defines reference-based variants; the low
        # BartScore correlation reported below may be due to this choice of
        # direction — confirm which variant should be compared with litepyramid_recall.
        bart_score = scorer.compute_bartscore(src, sys_sum)

        # Append the results
        rouge1_scores.append(rouge_result['rouge1'])
        rouge2_scores.append(rouge_result['rouge2'])
        rougeL_scores.append(rouge_result['rougeL'])
        bart_scores.append(bart_score)

In [29]:
# List the system names available for the randomly sampled entry.
# The sampled entry is stored as `random_element_real_summ`; `random_element` is never defined.
sys_summary_rnd = random_element_real_summ["sys_summs"]
sys_summary_rnd.keys()

dict_keys(['presumm_out_trans_abs.txt', 'two_stage_rl_out.txt', 'unilm_out_v2.txt', 't5_out_large.txt', 'presumm_out_ext_abs.txt', 'ptr_generator_out_pointer_gen_cov.txt', 'bart_out.txt', 'fast_abs_rl_out_rerank.txt', 't5_out_11B.txt', 'presumm_out_abs.txt', 'bottom_up_out.txt', 'unilm_out_v1.txt', 't5_out_base.txt', 'semsim_out.txt', 'neusumm_out.txt', 'pnbert_out_lstm_pn_rl.txt', 'refresh_out.txt', 'heter_graph_out.txt', 'pnbert_out_bert_tf_sl.txt', 'banditsumm_out.txt', 'pnbert_out_bert_lstm_pn_rl.txt', 'pnbert_out_bert_tf_pn.txt', 'matchsumm_out.txt', 'pnbert_out_bert_lstm_pn.txt'])

In [34]:
    # Retrieve and store the score of "litepyramid_recall".
    # As it is the only key inside "scores", it is assumed to correspond to COV in the table.
    # Documents and systems are traversed in the same order as in the other
    # metric loops, so `cov` stays aligned with the metric score lists.
    cov = [
        sys_sum_dict['scores']["litepyramid_recall"]
        for element in real_summ.values()
        for sys_sum_dict in element["sys_summs"].values()
    ]

# MoverScore

In [None]:
# Parallel lists: generated_texts[i] is a system summary and
# reference_texts[i] is the reference summary of the same document
generated_texts, reference_texts = [], []

# Walk the dataset document by document
for record in real_summ.values():
    reference = record['ref_summ']
    system_outputs = [entry['sys_summ'] for entry in record['sys_summs'].values()]

    # Every system summary of a document is paired with that document's reference
    generated_texts.extend(system_outputs)
    reference_texts.extend([reference] * len(system_outputs))


In [69]:

# Calculate the IDF for reference and generated summaries
idf_reference = get_idf_dict(reference_texts)
idf_generated = get_idf_dict(generated_texts)

# Calculate MoverScore (one score per reference/generated pair)
mover_scores = word_mover_score(
    reference_texts,          # List of reference summaries
    generated_texts,          # List of generated summaries
    idf_reference,            # IDF dictionary for reference texts
    idf_generated,            # IDF dictionary for generated texts
    stop_words=[],            # No stop words are removed
    n_gram=1,                 # Unigrams
    remove_subwords=True,     # Whether to remove subwords
    batch_size=8,             # Batch size, adjust to improve calculation speed
    device=device             # Reuse the globally selected device instead of hard-coding 'cuda'
)



# BERTScore

In [None]:
# Initialize BERTScorer under its own name so the CustomBartScorer bound to
# `scorer` is not overwritten if earlier cells are re-run
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

# Calculate BERTScore precision, recall and F1 for every generated/reference pair
P, R, F = bert_scorer.score(generated_texts, reference_texts)

# Spearman correlation between BERTScore F1 and COV
# (`cov` is the only human score collected in this notebook)
corr, p_value = spearmanr(F, cov)

print(f"Spearman correlation of BERTScore: {corr}")
print(f"P-value of BERTScore: {p_value}")

In [74]:
# Automatic metric scores, keyed by the row label used in the result table
metric_scores_by_name = {
    'ROUGE-1': rouge1_scores,
    'ROUGE-2': rouge2_scores,
    'ROUGE-L': rougeL_scores,
    'BartScore': bart_scores,
    'MoverScore': mover_scores,
    'BERTScore': F,
}

# Human evaluation scores, keyed by the column label used in the result table
human_scores_by_name = {'cov': cov}

# Define the metrics and human evaluation scores
metrics = list(metric_scores_by_name)
human_scores = list(human_scores_by_name)

# Nested dictionary {metric: {human_score: Spearman correlation}}.
# It is created under the name the loop and the DataFrame actually use.
correlation_data = {}
# Keep the previously declared name bound to the same dictionary
correlation_real_summ = correlation_data

# Calculate Spearman correlations
for metric, metric_scores in metric_scores_by_name.items():
    correlation_data[metric] = {}
    for human_score, scores in human_scores_by_name.items():
        correlation, _ = spearmanr(metric_scores, scores)
        correlation_data[metric][human_score] = correlation

# Create the DataFrame (one row per metric, one column per human score)
correlation_df = pd.DataFrame.from_dict(correlation_data, orient='index')

# Display the DataFrame
correlation_df

Unnamed: 0,cov
ROUGE-1,0.454249
ROUGE-2,0.467728
ROUGE-L,0.431196
BartScore,0.129514
MoverScore,0.430399
BERTScore,0.441452
