# Imports 

In [442]:
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

from ast import literal_eval
import json

import torch

from rouge_score import rouge_scorer
from torchmetrics.text.bert import BERTScore
# Note: Lines 188 and 196 of torchmetrics.bert were set to `truncation = True` as the previous code threw errors that failed to truncate text
from nltk.translate import bleu_score

from typing import List, Tuple, Union, Optional

# Settings and Configuration 

In [2]:
# Mutes warning concerning tokenizers and forking processes
%set_env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [3]:
RES_BASE      = '../../results/'

BART_ZS_FP    = RES_BASE + 'bart_zshot_gens.csv'
PRIMERA_ZS_FP = RES_BASE + 'primera_zshot_gens.csv'

LR_RES_FP     = RES_BASE + 'lexrank_results.csv'
CMOS_RES_FP   = RES_BASE + 'cmos_results.csv'
TOGL_RES_FP   = RES_BASE + '/togl_decoding/togl_predictions.csv'
COCO_RES_FP   = RES_BASE + 'cocosum_results.csv'
BART_FS_FP    = RES_BASE + 'bart_fshot_gens.csv'
PRIMERA_FS_FP = RES_BASE + 'primera_fshot_gens.csv'

TOGL_ABL_FP   = RES_BASE + '/togl_decoding/togl_decoding_w_%d.csv'

In [15]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = True)

bert_scorer = BERTScore(model_name_or_path = 'roberta-large', 
                        rescale_with_baseline = True, 
                        lang = 'en',
                        verbose = True, 
                        max_length = 510,
                        device = torch.device('cuda:0'))

# Data Reading 

In [61]:
bart_zs_res = pd.read_csv(BART_ZS_FP)
prim_zs_res = pd.read_csv(PRIMERA_ZS_FP)

In [28]:
# Extractive Baselines
lr_res = pd.read_csv(LR_RES_FP)
cmos_res = pd.read_csv(CMOS_RES_FP)

# Abstractive Pre-trained Baselines
bart_zs_res = pd.read_csv(BART_ZS_FP)
bart_fs_res = pd.read_csv(BART_FS_FP)
prim_zs_res = pd.read_csv(PRIMERA_ZS_FP)
prim_fs_res = pd.read_csv(PRIMERA_FS_FP)

# Abstract Contrastive Approaches
coco_res = pd.read_csv(COCO_RES_FP)
togl_res = pd.read_csv(TOGL_RES_FP)

# Togl Weight Ablation
togl_abl = []
for i in range(2, 6):
    togl_abl.append(pd.read_csv(TOGL_ABL_FP % i))

In [29]:
lr_res['title_date'] = lr_res['title'] + '_' + lr_res['date']
lr_res = lr_res[['title_date', 'lexrank_lsum', 'lexrank_rsum', 'left_sum', 'right_sum']]
cmos_res['title_date'] = cmos_res['title'] + '_' + cmos_res['date']
cmos_res = cmos_res[['title_date', 'cmos_lsum', 'cmos_rsum', 'left_sum', 'right_sum']]

In [30]:
togl_res.columns = ['title_date', 'togl_lsum', 'togl_rsum']
togl_res = togl_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')

bart_fs_res = bart_fs_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')
prim_fs_res = prim_fs_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')

for i, togl_res_w in enumerate(togl_abl):
    togl_abl[i].columns = ['title_date', 'togl_lsum', 'togl_rsum']
    togl_abl[i] = togl_abl[i].merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')

# Evaluation Setup

## Functions 

In [446]:
def get_rouge(text: str, ref: str) -> Tuple[float]:
    '''
        Get Rouge-F1 scores given a text and reference
        
        Parameters:
            -text:str
                Prediction text for rouge calculation
            -ref: str
                Reference summary for rouge calculation
                
        Return
            Tuple of the ROUGE-1, ROUGE-2, and ROUGE-L scores
    '''
    
    rouge = scorer.score(text, ref)
    rouge1 = rouge['rouge1'].fmeasure
    rouge2 = rouge['rouge2'].fmeasure
    rougeL = rouge['rougeL'].fmeasure
    
    return rouge1, rouge2, rougeL

def rouge_and_reverse(row: pd.Series, model: str, reverse = True) -> pd.Series:
    '''
        Calculate rouge scores given a row from a results dataset.
        Also swap the left and right text for approaches that are not expected to distinguish
            left and right perspectives.
            
        Parameters:
            -row
                Current row in the dataframe (to be used in apply/progress_apply)
            -model: str
                Name of the model to access predictions in the dataframe
        Return:
            Altered row including two columns with predictions that maximize ROUGE-2 (new_l_text, new_r_text)
            as well as rouge F scores in separate columns in the form [lr]_rouge[12L]
    '''
    
    lsum, rsum   = row[f'left_sum'], row['right_sum']
    ltext, rtext = row[f'{model}_lsum'], row[f'{model}_rsum']
    
    # Rouge Scores
    
    # Calculate scores assuming predictions are aligned
    lrouge_1f, lrouge_2f, lrouge_Lf = get_rouge(ltext, lsum)
    rrouge_1f, rrouge_2f, rrouge_Lf = get_rouge(rtext, rsum)
    
    # Calculate scores with swapped left and right predictions
    if reverse:
        lrouge_1r, lrouge_2r, lrouge_Lr = get_rouge(ltext, rsum)
        rrouge_1r, rrouge_2r, rrouge_Lr = get_rouge(rtext, lsum)
    
    # Instantiate the left and right rouge scores before possibly reversing scores
    lrouge_1, lrouge_2, lrouge_L = lrouge_1f, lrouge_2f, lrouge_Lf
    rrouge_1, rrouge_2, rrouge_L = rrouge_1f, rrouge_2f, rrouge_Lf
    
    # If the reverse scoring is better than the forward scoring, swap left and right texts
    if ((lrouge_2f + rrouge_2f) < (lrouge_2r + rrouge_2r)) and reverse:
        # Swap left and right text
        temp = ltext
        ltext = rtext
        rtext = temp
        
        lrouge_1, lrouge_2, lrouge_L = lrouge_1r, lrouge_2r, lrouge_Lr
        rrouge_1, rrouge_2, rrouge_L = rrouge_1r, rrouge_2r, rrouge_Lr
            
    row['new_l_text'] = ltext
    row['new_r_text'] = rtext
    row['l_rouge1'] = lrouge_1
    row['l_rouge2'] = lrouge_2
    row['l_rougeL'] = lrouge_L
    row['r_rouge1'] = rrouge_1
    row['r_rouge2'] = rrouge_2
    row['r_rougeL'] = rrouge_L
    
    return row

In [447]:
def get_self_bleus(preds1: List[str], preds2: List[str]) -> List[float]:
    '''
        Calculates self-bleu scores for lists of left and right predictions
        
        Parameters:
            -preds1: List[str]
                List of left (or right) predicted summaries
            -preds2: List[str]
                List of right (or left) predicted summaries
        
        Return:
            -List of self-BLEU scores for each pair of texts
    '''
    
    # Iterate over predicted pairs and calculate self-bleu scores
    self_bleus = []
    for pred1, pred2 in tqdm(zip(preds1, preds2)):
        bleu = bleu_score.sentence_bleu(pred1, pred2, weights = (1.0,))
        self_bleus.append(bleu)
        
    return self_bleus

def get_self_bleus_df(df: pd.DataFrame, model: str) -> List[float]:
    '''
        Calculate the self-BLEU scores given a dataframe with predictions
        
        Parameters:
            -df: pd.DataFrame
                Dataframe containing at least the model predictions with columns named ('<MODEL>_lsum', '<MODEL>_rsum')
            -model: str
                Name of the model responsible for predictions
                
        Return:
            List of self-BLEU scores for each pair of texts
    '''
    
    ltexts = df[f'{model}_lsum'].values
    rtexts = df[f'{model}_rsum'].values
    
    self_bleus = get_self_bleus(ltexts, rtexts)
    
    return self_bleus

In [448]:
def get_bert_scores_df(df:pd.DataFrame, model:str) -> pd.DataFrame:
    '''
        Calculate bert scores given a dataframe with model predictions and summaries. Should run rouge_and_reverse
            first to ensure summaries and predictions are aligned
            
        Parameters:
            -df: pd.DataFrame
                Dataframe with predicted summaries (new_[lr]_text) and reference summaries ([left|right]_sum)
            -model: str
                Name of model responsible for predictions
        
        Return:
            Original dataframe with additional columns 'l_bscore' and 'r_bscore' holding BERTScores
    '''
    
    l_bscores = bert_scorer(df['new_l_text'].astype(str).tolist(), df['left_sum'].astype(str).tolist())
    r_bscores = bert_scorer(df['new_r_text'].astype(str).tolist(), df['right_sum'].astype(str).tolist())
    
    df['l_bscore'] = l_bscores['f1']
    df['r_bscore'] = r_bscores['f1']
    
    return df

In [19]:
def get_avg_metrics(df: pd.DataFrame):
    '''
        Calculate average coverage and diversity metrics for a dataframe with row-level metrics
        
        Parameters:
            -df: pd.Dataframe
                Dataframe holding ROUGE scores, self-BLEU, and BERTScores after running above functions
        
        Return:
            Dictionary holding average metrics across ROUGE, self-BLEU, and BERTScores
    '''
    
    l_rouge1 = df['l_rouge1'].mean()
    l_rouge2 = df['l_rouge2'].mean()
    l_rougeL = df['l_rougeL'].mean()
    l_bscore = df['l_bscore'].mean()
    
    r_rouge1 = df['r_rouge1'].mean()
    r_rouge2 = df['r_rouge2'].mean()
    r_rougeL = df['r_rougeL'].mean()
    r_bscore = df['r_bscore'].mean()
    
    self_bleu = df['self_bleu'].mean()
    
    return {'l_rouge1': l_rouge1, 
            'l_rouge2': l_rouge2, 
            'l_rougeL': l_rougeL,
            'l_bscore': l_bscore,
            'r_rouge1': r_rouge1, 
            'r_rouge2': r_rouge2, 
            'r_rougeL': r_rougeL,
            'r_bscore': r_bscore,
            'self_bleu': self_bleu}

def print_metrics(metric_dict: dict):
    '''
        Prints average metrics for dictionary returned by get_avg_metrics
        
        Parameters:
            -metric_dict
                Dictionary holding average metrics
        
        Return: None
    '''
    
    for metric, val in metric_dict.items():
        print(f'     {metric:10s}: {val * 100.:.2f}')

# Evaluation 

##  Experiment 1

In [62]:
# Merge lr_res to bart and primera predictions to get reference summaries
bart_zs_res = bart_zs_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')
prim_zs_res = prim_zs_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')

In [64]:
# Copy prediction columns to left and right so that the same evaluation functions can be used
# Duplicated because the zero shot setting only outputs single predictions
bart_zs_res['bart_lsum'] = bart_zs_res['bart_sum']
bart_zs_res = bart_zs_res.rename({'bart_sum': 'bart_rsum'}, axis = 1)

prim_zs_res['primera_lsum'] = prim_zs_res['primera_sum']
prim_zs_res = prim_zs_res.rename({'primera_sum': 'primera_rsum'}, axis = 1)

In [67]:
# Rouge scores for zero-shot setting
bart_zs_res = bart_zs_res.progress_apply(lambda row: rouge_and_reverse(row, 'bart'), axis = 1)
prim_zs_res = prim_zs_res.progress_apply(lambda row: rouge_and_reverse(row, 'primera'), axis = 1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:07<00:00, 93.31it/s]


In [68]:
# Self-BLEU scores for zero-shot setting
bart_zs_res['self_bleu'] = get_self_bleus_df(bart_zs_res, 'bart')
prim_zs_res['self_bleu'] = get_self_bleus_df(prim_zs_res, 'primera')

735it [00:00, 746.79it/s]
735it [00:09, 81.65it/s]


In [69]:
# BERTScores for zero-shot setting
bart_zs_res = get_bert_scores_df(bart_zs_res, 'bart')
prim_zs_res = get_bert_scores_df(prim_zs_res, 'primera')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [70]:
# Average scores across predictions
bart_stat = get_avg_metrics(bart_zs_res)
prim_stat = get_avg_metrics(prim_zs_res)

In [71]:
print('---BART---')
print_metrics(bart_stat)
print('---PRIMERA---')
print_metrics(prim_stat)

---BART---
     l_rouge1  : 17.61
     l_rouge2  : 1.29
     l_rougeL  : 13.48
     l_bscore  : 2.58
     r_rouge1  : 17.50
     r_rouge2  : 1.35
     r_rougeL  : 13.54
     r_bscore  : 2.53
     self_bleu : 29.11
---PRIMERA---
     l_rouge1  : 11.12
     l_rouge2  : 1.28
     l_rougeL  : 8.27
     l_bscore  : -18.10
     r_rouge1  : 11.23
     r_rouge2  : 1.24
     r_rougeL  : 8.29
     r_bscore  : -18.46
     self_bleu : 7.31


## Experiment 2 

In [16]:
# Calculate ROUGE Scores and reverse predictions if needed for baselines and ToGL-Decoding
lr_res      = lr_res.progress_apply(lambda row: rouge_and_reverse(row, 'lexrank'), axis = 1)
cmos_res    = cmos_res.progress_apply(lambda row: rouge_and_reverse(row, 'cmos'), axis = 1)

bart_fs_res = bart_fs_res.progress_apply(lambda row: rouge_and_reverse(row, 'bart'), axis = 1)
prim_fs_res = prim_fs_res.progress_apply(lambda row: rouge_and_reverse(row, 'primera'), axis = 1)

togl_res    = togl_res.progress_apply(lambda row: rouge_and_reverse(row, 'togl'), axis = 1)
coco_res    = coco_res.progress_apply(lambda row: rouge_and_reverse(row, 'coco'), axis = 1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 186.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 233.47it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 229.99it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:08<00:00, 87.52it/s]
100%|███████████████████████████████████████████████

In [17]:
# Calculate Self-BLEU Scores
lr_res['self_bleu']      = get_self_bleus_df(lr_res, 'lexrank')
cmos_res['self_bleu']    = get_self_bleus_df(cmos_res, 'cmos')

bart_fs_res['self_bleu'] = get_self_bleus_df(bart_fs_res, 'bart')
prim_fs_res['self_bleu'] = get_self_bleus_df(prim_fs_res, 'primera')

togl_res['self_bleu']    = get_self_bleus_df(togl_res, 'togl')
coco_res['self_bleu']    = get_self_bleus_df(coco_res, 'coco')

735it [00:02, 328.85it/s]
735it [00:00, 1131.26it/s]
735it [00:00, 845.36it/s]
735it [00:09, 74.57it/s]
735it [00:00, 744.26it/s]
735it [00:02, 345.26it/s]


In [20]:
# Calculate BERT Scores
lr_res_ = get_bert_scores_df(lr_res, 'lexrank')
cmos_res_ = get_bert_scores_df(cmos_res, 'cmos')
coco_res_ = get_bert_scores_df(coco_res, 'coco')
togl_res_ = get_bert_scores_df(togl_res, 'togl')
bart_fs_res_ = get_bert_scores_df(bart_fs_res, 'bart')
prim_fs_res_ = get_bert_scores_df(prim_fs_res, 'primera')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>
Exception ignored in: Traceback (most recent call last):
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>    <function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>
self._shutdown_workers()

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1493, in _shutdown_workers
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
        if w.is_alive():    self._shutdown_workers()
self._shutdown_workers()
  File "/usr/lib/python3.8/multiprocessi

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>Exception ignored in: 
Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__


Traceback (most recent call last):
Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b8981c310>  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
    
self._shutdown_workers()Traceback (most recent call last):
        
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __d

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [32]:
# Get average metrics and prepare for DataFrame display
lr_stat   = get_avg_metrics(lr_res)
cmos_stat = get_avg_metrics(cmos_res)
bart_stat = get_avg_metrics(bart_fs_res)
prim_stat = get_avg_metrics(prim_fs_res)
coco_stat = get_avg_metrics(coco_res)
togl_stat = get_avg_metrics(togl_res)

lr_stat['model']   = 'LexRank'
cmos_stat['model'] = 'CMOS'
bart_stat['model'] = 'BART'
prim_stat['model'] = 'PRIMERA'
coco_stat['model'] = 'CoCoSum'
togl_stat['model'] = 'ToGL-Decoding'

In [37]:
# Aggregate performance metrics across model for easy display
all_stat = pd.DataFrame([lr_stat, cmos_stat, bart_stat, prim_stat, coco_stat, togl_stat])
all_stat = all_stat[['model'] + list(all_stat.columns)[:-1]]

In [40]:
all_stat

Unnamed: 0,model,l_rouge1,l_rouge2,l_rougeL,l_bscore,r_rouge1,r_rouge2,r_rougeL,r_bscore,self_bleu
0,LexRank,0.096816,0.008024,0.076513,-0.184348,0.135319,0.017496,0.103736,-0.172808,0.126337
1,CMOS,0.126254,0.018229,0.105159,0.010428,0.12251,0.015087,0.101993,0.010075,0.261325
2,BART,0.181609,0.014011,0.140382,0.036343,0.187096,0.015722,0.145058,0.038252,0.276103
3,PRIMERA,0.106487,0.014488,0.079086,-0.202654,0.109478,0.014274,0.080863,-0.20308,0.061918
4,CoCoSum,0.170468,0.022529,0.132281,-0.010275,0.165706,0.01986,0.130348,-0.008377,0.151335
5,ToGL-Decoding,0.177729,0.012607,0.141611,0.029455,0.177537,0.012226,0.141667,0.025904,0.272091


# Experiment 3 

In [20]:
# Calculate all metrics for each value of sigma tested
for i, togl_res_w in enumerate(togl_abl):
    togl_abl[i] = togl_res_w.progress_apply(lambda row: rouge_and_reverse(row, 'togl'), axis = 1)
print('Completed Rouge Evaluation')

for i, togl_res_w in enumerate(togl_abl):
    togl_abl[i]['self_bleu'] = get_self_bleus_df(togl_abl[i], 'togl')
print('Completed Self-Bleu Evaluation')
    
for i, togl_res_w in enumerate(togl_abl):
    togl_abl[i] = get_bert_scores_df(togl_abl[i], 'lexrank')
print('Completed Bert Score Evaluation')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 225.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 227.09it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 217.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 225.75it/s]


Completed Rouge Evaluation


735it [00:00, 738.01it/s]
735it [00:00, 738.06it/s]
735it [00:01, 715.36it/s]
735it [00:01, 610.91it/s]


Completed Self-Bleu Evaluation


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f29dd33f280>Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f29dd33f280>Traceback (most recent call last):
Exception ignored in: 
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f29dd33f280>Traceback (most recent call last):
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__

    Traceback (most recent call last):
    self._shutdown_workers()  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
self._shutdown_workers()

  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1493, in _shutdown_workers
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f29dd33f280><function _MultiProcessingDataLoaderIter.__del__ at 0x7f29dd33f280>Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7f29dd33f280>Traceback (most recent call last):
Traceback (most recent call last):

  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
Traceback (most recent call last):
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
            self._shutdown_workers()self._shutdown_workers()self._shutdown_workers()


  File "/home/ndeas/envs/test_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1493, in _shutdown_workers
  File "/home/ndeas/envs/test_env/lib/python3.8/site-package

Completed Bert Score Evaluation


In [26]:
# Aggregate metrics for each sigma value and prepare for Dataframe display
togl_abl_stat = []
for i, togl_res_w in enumerate(togl_abl):
    togl_abl_stat.append(get_avg_metrics(togl_res_w))
    togl_abl_stat[i]['togl_weight'] = (i + 2) / 10.

togl_abl_stat = pd.DataFrame(togl_abl_stat)
togl_abl_stat = togl_abl_stat[['togl_weight'] + list(togl_abl_stat.columns)[:-1]]

In [27]:
togl_abl_stat

Unnamed: 0,togl_weight,l_rouge1,l_rouge2,l_rougeL,l_bscore,r_rouge1,r_rouge2,r_rougeL,r_bscore,self_bleu
0,0.2,0.177385,0.013694,0.142372,0.034799,0.179667,0.012961,0.14294,0.034284,0.260219
1,0.3,0.167495,0.01412,0.134879,0.058693,0.17362,0.014497,0.13944,0.050904,0.23436
2,0.4,0.143871,0.011253,0.120499,0.100386,0.152655,0.013568,0.124562,0.084721,0.2116
3,0.5,0.126727,0.008733,0.105333,0.119465,0.132356,0.0105,0.110143,0.119975,0.178446
