In [1]:
# %%capture
# !pip install datasets
# !pip install transformers

## Libraries and Dependencies

In [2]:
import torch
import nltk
from nltk.tokenize import sent_tokenize
from datasets import load_dataset
from tqdm.notebook import tqdm

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer
from transformers import AlbertForMaskedLM, AlbertTokenizer

# Fetch the 'punkt' tokenizer data (presumably what sent_tokenize, imported above,
# needs in order to split paragraphs into sentences — confirm for the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Liora\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Run on the GPU whenever torch can see one; otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

## Algorithm Implementation

In [21]:
def mask_sentence(sentence, mask_token, i, M, L_min):
    """
    Returns a copy of `sentence` in which every M-th eligible token is replaced by `mask_token`.

    Parameters:
    - sentence (List[str]): The tokens of one sentence.
    - mask_token (str): The token written in place of a masked token.
    - i (int): Offset of the masking stride; position j is a candidate when (j - i) % M == 0.
    - M (int): Stride between candidate positions.
    - L_min (int): Minimum string length for a candidate token to be masked on length alone.

    A candidate token is masked when its length is at least L_min, when it starts
    with '##', or when the token after it starts with '##'. For the last position
    the "token after it" is the token itself.

    Returns:
    - List[str]: A new list with the same length as `sentence`.
    """
    last = len(sentence) - 1
    masked = []
    for pos, token in enumerate(sentence):
        # Positions off the stride are copied through unchanged.
        if (pos - i) % M != 0:
            masked.append(token)
            continue

        follower = sentence[min(pos + 1, last)]
        if len(token) >= L_min or token.startswith('##') or follower.startswith('##'):
            masked.append(mask_token)
        else:
            masked.append(token)
    return masked

def no_copy_guard(sentence, summary):
    """
    Tells whether `sentence` appears verbatim inside `summary`, token by token.

    The comparison is made on whole tokens: the sentence must occur as a
    contiguous run of tokens in the summary. Comparing the space-joined strings
    instead would also match across token boundaries (e.g. ['he', 'said'] inside
    ['the', 'said']) and wrongly flag sentences that were never copied.

    Parameters:
    - sentence (List[str]): The tokens of one sentence from the text.
    - summary (List[str]): The tokens of the summary.

    Returns:
    - bool: True if the sentence is a contiguous token subsequence of the summary.
      An empty sentence is reported as contained (True).
    """
    sentence = list(sentence)
    summary = list(summary)
    span = len(sentence)

    # The empty sequence is trivially contained in any summary.
    if span == 0:
        return True

    # Slide a window of the sentence's length over the summary tokens;
    # the range is empty when the sentence is longer than the summary.
    return any(summary[start:start + span] == sentence
               for start in range(len(summary) - span + 1))

In [13]:
def BLANC_help(text, summary, model, tokenizer, M=6, L_min=4, sep='[SEP]', device='cpu'):
    """
    Calculates BLANC score between a given text and its summary using a specified model.

    Every sentence of the text is masked M times (once per stride offset). For each
    masked version the model predicts the masked tokens twice: once prefixed with a
    filler of '.' tokens as long as the summary ("base") and once prefixed with the
    summary itself ("help"). The score measures how much more often the summary
    prefix leads to a correct prediction than the filler prefix does.

    Parameters:
    - text (List[List[str]]): List of sentences represented as a list of tokens.
    - summary (List[str]): The tokenized summary of the text.
    - model: BERT-type model
    - tokenizer: The tokenizer associated with the model used.
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator between the inference help (filler/summary) and a sentence from the text (default is '[SEP]').
    - device: Device the input tensors are moved to before calling the model (default is 'cpu').

    Returns:
    - float: BLANC score for the given text and its summary. 0.0 is returned when no
      token was masked at all (empty text, or every sentence skipped by the copy guard),
      since there is then no evidence that the summary helped or hurt.
    """

    filler = ['.'] * len(summary)
    # S[k][m] counts masked tokens by outcome:
    # k = 1 if the base input predicted the token correctly, m = 1 if the help input did.
    S = [[0, 0], [0, 0]]
    # Index of the first sentence token inside the model input (prefix tokens + separator).
    offset = len(summary) + 1

    def predict_ids(tokens):
        # NOTE(review): the input is fed as-is, without [CLS] / trailing [SEP]
        # special tokens — confirm this matches the intended BLANC setup.
        input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens)).to(device)  # Shape: [sequence_length]
        logits = model(input_ids=input_ids.unsqueeze(0)).logits  # Shape: [1, sequence_length, Bert_vocab_size]
        return torch.argmax(logits.squeeze(0), dim=-1)  # Shape: [sequence_length]

    # Pure inference: no autograd graph is needed, so do not build one.
    with torch.no_grad():
        for sentence in text:
            # Sentences copied verbatim into the summary are skipped.
            if no_copy_guard(sentence, summary):
                continue
            for i in range(M):
                masked_sentence = mask_sentence(sentence, tokenizer.mask_token, i, M, L_min)
                masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == tokenizer.mask_token]

                # Nothing was masked for this offset, so the forward passes would contribute nothing.
                if not masked_tokens:
                    continue

                out_base = predict_ids(filler + [sep] + masked_sentence)
                out_help = predict_ids(summary + [sep] + masked_sentence)

                for j in masked_tokens:
                    idx = offset + j
                    predicted_word_base = tokenizer.convert_ids_to_tokens(out_base[idx].item())
                    predicted_word_help = tokenizer.convert_ids_to_tokens(out_help[idx].item())

                    k = int(predicted_word_base == sentence[j])
                    m = int(predicted_word_help == sentence[j])
                    S[k][m] += 1

    total = S[0][0] + S[1][1] + S[0][1] + S[1][0]
    if total == 0:
        # No masked token was scored; avoid dividing by zero.
        return 0.0

    # Fraction of tokens the summary fixed minus the fraction it broke.
    B = (S[0][1] - S[1][0]) / total

    return B

## Datasets

In [4]:
# cnn_dailymail_ds = load_dataset("cnn_dailymail", '3.0.0', split='test')
# print(cnn_dailymail_ds)

In [6]:
# Load the annotated DailyNews samples from the local JSON file as a single split.
DailyNews_ds = load_dataset(
    'json',
    data_files='../datasets/DailyNews_300.json',
    split='train',
)
DailyNews_ds

Dataset({
    features: ['scores', 'text', 'summary', 'annotators_ids'],
    num_rows: 300
})

## Model and Tokenizer

In [7]:
# Tokenizer and masked-LM model are loaded from the same uncased BERT checkpoint;
# the model is then moved onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = model.to(DEVICE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Preprocessing

In [8]:
# Pull the two raw string columns out of the dataset:
#   summaries (List[str]) - one summary per sample
#   texts     (List[str]) - each string is a paragraph made of a few sentences
summaries, texts = DailyNews_ds['summary'], DailyNews_ds['text']

In [9]:
# Split every paragraph into sentences: each text becomes a list of sentences
# (each sentence is a string).
# The paragraphs are read from the dataset column rather than from `texts` itself so
# that this cell is idempotent: re-running it no longer calls .strip() on the
# already-split lists produced by a previous run.
texts = [sent_tokenize(text.strip()) for text in DailyNews_ds['text']] # List[List[str]]
assert len(texts) == len(summaries) == 300

In [10]:
# Tokenize every sentence of every text, and every summary, with the model's tokenizer.
tokenized_texts = [list(map(tokenizer.tokenize, text)) for text in texts]
tokenized_summaries = list(map(tokenizer.tokenize, summaries))

## Running the Program

In [23]:
# Sanity check: score the first text/summary pair on its own.
BLANC_help(
    text=tokenized_texts[0],
    summary=tokenized_summaries[0],
    model=model,
    tokenizer=tokenizer,
    device=DEVICE,
)

['mario', 'man', '##d', '##zuki', '##c', 'po', '##unce', '##s', 'to', 'fire', 'the', 'ball', 'past', 'jordan', 'pick', '##ford', 'and', 'put', 'croatia', 'into', 'the', 'world', 'cup', 'final', '.']
False
so we keep moving
['mario', 'man', '##d', '##zuki', '##c', 'po', '##unce', '##s', 'to', 'fire', 'the', 'ball', 'past', 'jordan', 'pick', '##ford', 'and', 'put', 'croatia', 'into', 'the', 'world', 'cup', 'final', '.']
['photo', ':', 'reuters', 'independent', '.', 'ie', 'former', 'england', 'defender', 'gary', 'neville', 'suggested', 'gareth', 'south', '##gate', "'", 's', 'squad', 'had', 'done', 'more', 'than', 'could', 'have', 'been', 'expected', 'of', 'them', 'at', 'this', 'world', 'cup', 'as', 'they', 'bowed', 'out', 'with', 'a', 'semi', '-', 'final', 'defeat', 'against', 'croatia', '.']
False
so we keep moving
['photo', ':', 'reuters', 'independent', '.', 'ie', 'former', 'england', 'defender', 'gary', 'neville', 'suggested', 'gareth', 'south', '##gate', "'", 's', 'squad', 'had', 'do

0.11069418386491557

In [13]:
# Score every (text, summary) pair.
# zip() has no length, so tqdm is given the total explicitly; otherwise the bar
# only shows a running count ("0it") with no percentage or time estimate.
scores = [BLANC_help(text, summary, model, tokenizer, device=DEVICE)
          for summary, text in tqdm(zip(tokenized_summaries, tokenized_texts),
                                    total=min(len(tokenized_summaries), len(tokenized_texts)))]
scores

0it [00:00, ?it/s]

[0.11069418386491557,
 0.08967391304347826,
 0.14035087719298245,
 0.08250355618776671,
 0.06504065040650407,
 0.16971279373368145,
 0.159375,
 0.06736526946107785,
 0.08818011257035648,
 0.1277258566978193,
 0.12241653418124006,
 0.15737704918032788,
 0.12459016393442623,
 0.09556313993174062,
 0.2875816993464052,
 0.08672086720867209,
 0.13921901528013583,
 0.1620795107033639,
 0.19607843137254902,
 0.10483870967741936,
 0.146875,
 0.0517464424320828,
 0.3137254901960784,
 0.14250614250614252,
 0.12538226299694188,
 0.13870967741935483,
 0.17256637168141592,
 0.087248322147651,
 0.1478405315614618,
 0.11192660550458716,
 0.0783132530120482,
 0.19327731092436976,
 0.14363143631436315,
 0.16032608695652173,
 0.17725752508361203,
 0.10580204778156997,
 0.15217391304347827,
 0.16971279373368145,
 0.06985769728331177,
 0.19457013574660634,
 0.1188118811881188,
 0.12459016393442623,
 0.16265060240963855,
 0.09561128526645768,
 0.09987029831387809,
 0.103125,
 0.2073170731707317,
 0.1803921