In [41]:
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

from ast import literal_eval
import json

from rouge_score import rouge_scorer
from torchmetrics.text.bert import BERTScore
from nltk.translate import bleu_score

In [416]:
RES_BASE      = '../../results/'

BART_ZS_FP    = RES_BASE + 'bart_zshot_gens.csv'
PRIMERA_ZS_FP = RES_BASE + 'primera_zshot_gens.csv'

LR_RES_FP     = RES_BASE + 'lexrank_results.csv'
CMOS_RES_FP   = RES_BASE + 'cmos_results.csv'
TOGL_RES_FP   = RES_BASE + '/togl_decoding/togl_predictions.csv'
COCO_RES_FP   = RES_BASE + 'cocosum_results.csv'
BART_FS_FP    = RES_BASE + 'bart_fshot_gens.csv'
PRIMERA_FS_FP = RES_BASE + 'primera_fshot_gens.csv'

# Data Reading 

In [428]:
# Extractive Baselines
lr_res = pd.read_csv(LR_RES_FP)
cmos_res = pd.read_csv(CMOS_RES_FP)

# Abstractive Pre-trained Baselines
bart_zs_res = pd.read_csv(BART_ZS_FP)
bart_fs_res = pd.read_csv(BART_FS_FP)
prim_zs_res = pd.read_csv(PRIMERA_ZS_FP)
prim_fs_res = pd.read_csv(PRIMERA_FS_FP)

# Abstract Contrastive Approaches
coco_res = pd.read_csv(COCO_RES_FP)
togl_res = pd.read_csv(TOGL_RES_FP)

In [429]:
lr_res['title_date'] = lr_res['title'] + '_' + lr_res['date']
lr_res = lr_res[['title_date', 'lexrank_lsum', 'lexrank_rsum', 'left_sum', 'right_sum']]
cmos_res['title_date'] = cmos_res['title'] + '_' + cmos_res['date']
cmos_res = cmos_res[['title_date', 'cmos_lsum', 'cmos_rsum', 'left_sum', 'right_sum']]

In [430]:
togl_res.columns = ['title_date', 'togl_lsum', 'togl_rsum']
togl_res = togl_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')

bart_fs_res = bart_fs_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')
prim_fs_res = prim_fs_res.merge(lr_res[['title_date', 'left_sum', 'right_sum']], on = 'title_date')

# Evaluation Setup

In [431]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer = True)

bert_scorer = BERTScore(model_name_or_path = 'roberta-large-mnli', 
                        rescale_with_baseline = True, 
                        lang = 'en',
                        verbose = True, 
                        max_length = 500)

## Functions 

In [433]:
def get_rouge(text, ref):
    '''
        Get Rouge-F1 scores given a text and reference
    '''
    
    rouge = scorer.score(text, ref)
    rouge1 = rouge['rouge1'].fmeasure
    rouge2 = rouge['rouge2'].fmeasure
    rougeL = rouge['rougeL'].fmeasure
    
    return rouge1, rouge2, rougeL

def rouge_and_reverse(row, model):
    '''
        Calculate rouge scores given a row from a results dataset.
        Also swap the left and right text for approaches that are not expected to distinguish
            left and right perspectives.
    '''
    
    lsum, rsum   = row[f'left_sum'], row['right_sum']
    ltext, rtext = row[f'{model}_lsum'], row[f'{model}_rsum']
    
    # Rouge Scores
    lrouge_1f, lrouge_2f, lrouge_Lf = get_rouge(ltext, lsum)
    rrouge_1f, rrouge_2f, rrouge_Lf = get_rouge(rtext, rsum)
    
    lrouge_1r, lrouge_2r, lrouge_Lr = get_rouge(ltext, rsum)
    rrouge_1r, rrouge_2r, rrouge_Lr = get_rouge(rtext, lsum)
    
    lrouge_1, lrouge_2, lrouge_L = lrouge_1f, lrouge_2f, lrouge_Lf
    rrouge_1, rrouge_2, rrouge_L = rrouge_1f, rrouge_2f, rrouge_Lf
    reverse = False
    
    # If the reverse scoring is better than the forward scoring, swap left and right texts
    if (lrouge_2f + rrouge_2f) < (lrouge_2r + rrouge_2r):
        temp = ltext
        ltext = rtext
        rtext = temp
        
        lrouge_1, lrouge_2, lrouge_L = lrouge_1r, lrouge_2r, lrouge_Lr
        rrouge_1, rrouge_2, rrouge_L = rrouge_1r, rrouge_2r, rrouge_Lr
        
        reverse = True
    
    row['new_l_text'] = ltext
    row['new_r_text'] = rtext
    row['l_rouge1'] = lrouge_1
    row['l_rouge2'] = lrouge_2
    row['l_rougeL'] = lrouge_L
    row['r_rouge1'] = rrouge_1
    row['r_rouge2'] = rrouge_2
    row['r_rougeL'] = rrouge_L
    
    return row

In [437]:
def get_self_bleus(preds1, preds2):
    self_bleus = []
    for pred1, pred2 in tqdm(zip(preds1, preds2)):
        bleu = bleu_score.sentence_bleu(pred1, pred2, weights = (1.0,))
        self_bleus.append(bleu)
    return self_bleus

def get_self_bleus_df(df, model):
    ltexts = df[f'{model}_lsum'].values
    rtexts = df[f'{model}_rsum'].values
    
    self_bleus = get_self_bleus(ltexts, rtexts)
    
    return self_bleus

In [439]:
def get_bert_scores_df(df, model):
    l_bscores = bert_scorer(df['new_l_text'].astype(str).tolist()[0:1], df['left_sum'].astype(str).tolist()[0:1])
    r_bscores = bert_scorer(df['new_r_text'].astype(str).tolist()[0:1], df['right_sum'].astype(str).tolist()[0:1])
    
    df['l_bscore'] = l_bscores['f1']
    df['r_bscore'] = r_bscores['f1']
    
    return df

# Evaluation 

##  Experiment 1

## Experiment 2 

In [434]:
lr_res      = lr_res.progress_apply(lambda row: rouge_and_reverse(row, 'lexrank'), axis = 1)
cmos_res    = cmos_res.progress_apply(lambda row: rouge_and_reverse(row, 'cmos'), axis = 1)

bart_fs_res = bart_fs_res.progress_apply(lambda row: rouge_and_reverse(row, 'bart'), axis = 1)
prim_fs_res = prim_fs_res.progress_apply(lambda row: rouge_and_reverse(row, 'primera'), axis = 1)

togl_res    = togl_res.progress_apply(lambda row: rouge_and_reverse(row, 'togl'), axis = 1)
coco_res    = coco_res.progress_apply(lambda row: rouge_and_reverse(row, 'coco'), axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████| 735/735 [00:04<00:00, 148.95it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 192.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 735/735 [00:04<00:00, 168.84it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 735/735 [00:07<00:00, 104.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 735/735 [00:04<00:00, 156.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 735/735 [00:12<00:00, 58.96it/s]


In [438]:
lr_res['self_bleu']      = get_self_bleus_df(lr_res, 'lexrank')
cmos_res['self_bleu']    = get_self_bleus_df(cmos_res, 'cmos')

bart_fs_res['self_bleu'] = get_self_bleus_df(bart_fs_res, 'bart')
prim_fs_res['self_bleu'] = get_self_bleus_df(prim_fs_res, 'primera')

togl_res['self_bleu']    = get_self_bleus_df(togl_res, 'togl')
coco_res['self_bleu']    = get_self_bleus_df(coco_res, 'coco')

735it [00:03, 229.42it/s]
735it [00:01, 730.91it/s]
735it [00:01, 525.71it/s]
735it [00:15, 47.66it/s]
735it [00:01, 458.65it/s]
735it [00:03, 202.68it/s]


In [441]:
lr_res_ = get_bert_scores_df(lr_res, 'lexrank')
cmos_res_ = get_bert_scores_df(cmos_res, 'cmos')
coco_res_ = get_bert_scores_df(coco_res, 'coco')
togl_res_ = get_bert_scores_df(togl_res, 'togl')
bart_fs_res_ = get_bert_scores_df(bart_fs_res, 'bart')
prim_fs_res_ = get_bert_scores_df(prim_fs_res, 'primera')

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\queues.py", line 232, in _feed
    close()
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\connection.py", line 177, in close
    self._close()
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\connection.py", line 277, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\ndeas\miniconda3\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "C:\Users\ndeas\miniconda3\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times



  0%|          | 0/1 [00:05<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x000001F77F962F78>
Traceback (most recent call last):
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\connection.py", line 132, in __del__
    self._close()
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\connection.py", line 277, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceCla

  0%|          | 0/1 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x000001F77F962F78>
Traceback (most recent call last):
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\connection.py", line 132, in __del__
    self._close()
  File "C:\Users\ndeas\miniconda3\lib\multiprocessing\connection.py", line 277, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid


  0%|          | 0/1 [00:05<?, ?it/s]

ValueError: Length of values (4) does not match length of index (735)

In [123]:
def get_avg_metrics(df):
    l_rouge1 = df['l_rouge1'].mean()
    l_rouge2 = df['l_rouge2'].mean()
    l_rougeL = df['l_rougeL'].mean()
    r_rouge1 = df['r_rouge1'].mean()
    r_rouge2 = df['r_rouge2'].mean()
    r_rougeL = df['r_rougeL'].mean()
    
    self_bleu = df['self_bleu'].mean()
    
    return {'l_rouge1': l_rouge1, 
            'l_rouge2': l_rouge2, 
            'l_rougeL': l_rougeL,
            'l_bscore': l_bscore,
            'r_rouge1': r_rouge1, 
            'r_rouge2': r_rouge2, 
            'r_rougeL': r_rougeL,
            'r_bscore': r_bscore,
            'self_bleu': self_bleu

In [128]:
get_avg_metrics(lr_res)

(0.1060946720651919,
 0.1060946720651919,
 0.0811671010399922,
 0.14589109178997778,
 0.14589109178997778,
 0.10925546591557105,
 0.12633690320695098)

In [129]:
get_avg_metrics(cmos_res)

(0.14122461171454748,
 0.14122461171454748,
 0.1155106390652887,
 0.13573061115311313,
 0.13573061115311313,
 0.11021186049868195,
 0.26132524432516635)

In [130]:
get_avg_metrics(togl_res)

(0.17772615022316526,
 0.17772615022316526,
 0.14118185834573072,
 0.17889027812804886,
 0.17889027812804886,
 0.14251107693517712,
 0.2720906059832041)