In [16]:
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize

# The models the authors used:
try:
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer
except ModuleNotFoundError:
    %pip install transformers
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nazanin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
dataset = pd.read_json('./datasets/DailyNews_300.json')
summary = dataset.iloc[0,2]
text = dataset.iloc[0,3]

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
def BLANC_help(summary, text, model, M=6, L_min=4, sep=' '):
    """
    Calculate BLANC similarity between summary and text using a specified model.

    Parameters:
    - summary (str): The summary text.
    - text (List[List[str]]): List of sentences represented as a list of words.
    - model: BERT model type
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator between the inference help (filler/summary) and a sentence from the text (default is '').

    Returns:
    - float: BLANC similarity score.
    """
    
    text_sents = sent_tokenize(text)

    filler = '.' * len(summary)
    S = [[0, 0], [0, 0]]

    for sentence in text_sents:
        for i in range(1, M + 1):
            masked_sentence = ''.join(['<MASK>' if (j - i) % M == 0 and len(sentence[j]) >= L_min else sentence[j] for j in range(len(sentence))])
            print(masked_sentence)
            input_base = filler + sep + masked_sentence
            input_help = summary + sep + masked_sentence
            tokenized_input_base = tokenizer(input_base, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            tokenized_input_help = tokenizer(input_help, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            out_base = model(**tokenized_input_base)
            out_help = model(**tokenized_input_help)
            masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == '<MASK>']

            for j in masked_tokens:
                k = int(out_base[j] == sentence[j])
                m = int(out_help[j] == sentence[j])
                S[k][m] += 1
        break

    try:
      B = (S[0][1] - S[1][0]) / (S[0][0] + S[1][1] + S[0][1] + S[1][0])
    except ZeroDivisionError:
      B = 0.0
    
    return B


In [21]:
acc = BLANC_help(summary, text, model)
print(acc)

Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final.
Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final.
Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final.
Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final.
Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final.
Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final.
0.0


Ideas for improvement:
1. add error handling to ensure that the input to the function (summary, text, model) are of the expected types or shapes