## Libraries

In [1]:
import sys

# Local directories added to the import path so the project-specific modules
# imported in the next cell can be resolved (presumably `bart_score` and
# `custom_score` live here -- confirm against your checkout).
# NOTE(review): these are absolute, machine-specific paths; adjust them to
# your environment or derive them from a configurable base directory.
BARTSCORE_DIR = r"C:\Pro\Stages\A4 - DVRC\Work\BARTScore"
MY_LIBRARIES_DIR = r"C:\Pro\Stages\A4 - DVRC\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries"

# Register each directory only once, so re-running this cell does not pile up
# duplicate entries in sys.path.
for _extra_dir in (BARTSCORE_DIR, MY_LIBRARIES_DIR):
    if _extra_dir not in sys.path:
        sys.path.append(_extra_dir)

In [6]:
from custom_score.score import BERTScoreStaticSampleTest, BERTScoreDynamicSampleTest
from custom_score.utils import serialized_to_model
from bart_score import BARTScorer
import torch
import tensorflow_datasets as tfds
from datetime import datetime
import pandas as pd

### Checkup and linkage

In [3]:
# Sanity check: confirm PyTorch can see a CUDA device before running the
# benchmark cells (the BARTScore benchmark below targets 'cuda:0').
torch.cuda.is_available()

True

## Datasets

### Billsum

In [3]:
# Load Billsum through TFDS, then keep only the two text columns of the test
# split. The values are decoded from UTF-8 bytes into Python strings.
billsum = tfds.load('huggingface:billsum')
billsum_test = (
    tfds.as_dataframe(billsum["test"])
    .loc[:, ["text", "summary"]]
    .assign(
        text=lambda frame: frame["text"].str.decode("utf-8"),
        summary=lambda frame: frame["summary"].str.decode("utf-8"),
    )
)
billsum_test.head(5)

Unnamed: 0,text,summary
0,SECTION 1. SHORT TITLE.\n\n This Act may be...,Local Innovation and Coastal Protection Act of...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Gun Show Background Check Act of 2008 - Amends...
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Recycled Roads Act of 2003 - Directs the Secre...
3,SECTION 1. SHORT TITLE.\n\n This Act may be...,Prosthetic and Custom Orthotic Parity Act of 2...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,"Investing in Neighborhood-focused, Vital, Evid..."


### Opinosis

In [None]:
# Fetch the Opinosis dataset through its TFDS builder: create the builder,
# download/prepare the data, then expose it as a dataset object.
# NOTE(review): `opinosis` is not used by any later cell visible in this
# notebook, and this cell has no recorded execution count.
opinosis_builder = tfds.builder("opinosis")
opinosis_builder.download_and_prepare()
opinosis = opinosis_builder.as_dataset()

## Benchmark

### Classic BERTScore

In [15]:
# Run the classic (contextual-embedding) BERTScore benchmark on the Billsum
# test frame; the helper returns a (scores, runtime) pair.
# NOTE(review): model and row limit are left at the helper's defaults --
# presumably roberta-large on 3 rows, judging by the recorded outputs; confirm
# in custom_score.score.
bert_scores, bert_runtime = BERTScoreDynamicSampleTest(billsum_test)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaM

### Static BERTScore

In [4]:
# Load the serialized embedding model (presumably Word2Vec, per the file name)
# that the static benchmark below uses.
# NOTE(review): the .pkl extension suggests a pickle -- only load such files
# from a trusted source. The path is absolute and machine-specific.
w2v = serialized_to_model(r'C:\Pro\Stages\A4 - DVRC\Work\Models\serialized_w2v.pkl')

In [5]:
# Static-embedding variant of the benchmark: same Billsum test frame, scored
# with the w2v model and IDF weighting switched off.
# NOTE(review): the positional 3 is presumably the row limit -- confirm
# against BERTScoreStaticSampleTest's signature.
word2vec_scores, word2vec_runtime = BERTScoreStaticSampleTest(billsum_test, w2v, 3, withIdf = False)

### BARTScore

In [11]:
def BARTScoreDynamicSampleTest(data, limit=3, device='cuda:0', checkpoint='facebook/bart-large-cnn', scorer=None):
    """
    Benchmarking function computing BARTScore on the first rows of a dataframe, as well as its runtime.

    :param1 data (DataFrame): Dataframe whose column at position 0 is passed to the scorer as source text and whose column at position 1 is passed as target text. Both must hold strings.
    :param2 limit (int or None): Maximum number of rows to score, taken from the top of the dataframe. Values <= 0 score nothing; None scores every row.
    :param3 device (string): Device handed to BARTScorer when the scorer is built here.
    :param4 checkpoint (string): Checkpoint handed to BARTScorer when the scorer is built here.
    :param5 scorer (object): Optional ready-made scorer exposing score(srcs, tgts, batch_size=...). When given, device and checkpoint are ignored and no model is loaded, which lets one loaded model be reused across calls.

    :output1 scores (list): One value per scored row (the first element of the list returned by scorer.score).
    :output2 runtime (float): Elapsed seconds spent in the scoring loop; building the scorer is not included.
    """

    if scorer is None:
        scorer = BARTScorer(device=device, checkpoint=checkpoint)

    # Positional slice of the first `limit` rows. Clamping at zero makes a
    # non-positive limit score nothing instead of silently scoring one row.
    rows = data if limit is None else data.iloc[:max(int(limit), 0)]

    scores = []
    init_time = datetime.now()
    for _, row in rows.iterrows():
        # .iloc gives explicit positional access; plain row[0] / row[1] on a
        # label-indexed Series relies on a deprecated integer fallback.
        source = row.iloc[0].replace("\n", " ")
        target = row.iloc[1].replace("\n", " ")

        # The scorer works on batches; send a batch of one and unwrap its result.
        scores.append(scorer.score([source], [target], batch_size=4)[0])
    runtime = (datetime.now() - init_time).total_seconds()
    return scores, runtime

In [12]:
# Score the first 3 Billsum test rows with BARTScore and record how long the
# scoring loop took (in seconds).
bart_scores, bart_runtime = BARTScoreDynamicSampleTest(billsum_test, limit=3)

## Evaluation

### Runtime table

In [16]:
# Gather the three measured runtimes into a single-column table with one
# labelled row per benchmarked model.
runtimeTable = [bert_runtime, word2vec_runtime, bart_runtime]
runtimeDf = pd.DataFrame(
    {"runtime": runtimeTable},
    index=["Roberta-24-layers", "Word2Vec", "BART-large-CNN"],
)
runtimeDf

Unnamed: 0,runtime
Roberta-24-layers,8.840713
Word2Vec,5.608159
BART-large-CNN,0.349999


### Quality evaluation

In [17]:
# One row per scored sample from the classic BERTScore run; the three columns
# are presumably precision, recall and F1, per the column labels.
bert_scores_Df = pd.DataFrame(bert_scores, columns=["Bert_P", "Bert_R", "Bert_F"])
bert_scores_Df

Unnamed: 0,Bert_P,Bert_R,Bert_F
0,0.849191,0.781935,0.814176
1,0.818807,0.699549,0.754495
2,0.832273,0.697599,0.759008


In [8]:
# One row per scored sample from the static (w2v) run; the three columns are
# presumably precision, recall and F1, per the column labels.
word2vec_scores_Df = pd.DataFrame(word2vec_scores, columns=["W2V_P", "W2V_R", "W2V_F"])
word2vec_scores_Df

Unnamed: 0,W2V_P,W2V_R,W2V_F
0,0.646725,0.902619,0.753539
1,0.699015,0.941868,0.80247
2,0.652091,0.959232,0.776389


In [13]:
# One row per scored sample from the BARTScore run, with a single column
# holding the score value.
bart_scores_Df = pd.DataFrame(bart_scores, columns=["BARTScore"])
bart_scores_Df

Unnamed: 0,BARTScore
0,-2.924354
1,-3.258326
2,-3.256637
