In [1]:
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# The models the authors used:
try:
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer
except ModuleNotFoundError:
    %pip install transformers
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/nazanin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [8]:
dataset = pd.read_json('../datasets/DailyNews_300.json')
summaries = dataset.iloc[:,2]
texts = dataset.iloc[:,3]

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def BLANC_help(summary, text, model, M=6, L_min=4, sep=' '):
    """
    Calculate BLANC similarity between summary and text using a specified model.

    Parameters:
    - summary (str): The summary text.
    - text (List[List[str]]): List of sentences represented as a list of words.
    - model: BERT model type
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator between the inference help (filler/summary) and a sentence from the text (default is '').

    Returns:
    - float: BLANC similarity score.
    """

    text_sents = sent_tokenize(text)

    filler = '.' * len(summary)
    S = [[0, 0], [0, 0]]

    for sentence in text_sents:
        sentence = word_tokenize(sentence)
        for i in range(M):
            masked_sentence = ['[MASK]' if (j - i) % M == 0 and len(sentence[j]) >= L_min else sentence[j] for j in range(len(sentence))]
            input_base = filler + sep + ' '.join(masked_sentence)
            input_help = summary + sep + ' '.join(masked_sentence)
            tokenized_input_base = tokenizer(input_base, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            tokenized_input_help = tokenizer(input_help, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            out_base = model(**tokenized_input_base).logits
            out_help = model(**tokenized_input_help).logits
            masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == '[MASK]']
            # print(f'masked token indecies: {masked_tokens}')

            for j in masked_tokens:
                predicted_idx_base = torch.argmax(out_base[:, j], dim=-1).item()
                predicted_idx_help = torch.argmax(out_help[:, j], dim=-1).item()

                predicted_word_base = tokenizer.convert_ids_to_tokens(predicted_idx_base)
                predicted_word_help = tokenizer.convert_ids_to_tokens(predicted_idx_help)
                
                # print(f'predicted_word_base: {predicted_word_base}')
                # print(f'predicted_word_help: {predicted_word_help}')
                # print(f'sentence[{j}]: {sentence[j]}\n')
                k = int(predicted_word_base == sentence[j])
                m = int(predicted_word_help == sentence[j])
                S[k][m] += 1

    print(f'S: {S}')
    try:
      B = (S[0][1] - S[1][0]) / (S[0][0] + S[1][1] + S[0][1] + S[1][0])
    except ZeroDivisionError:
      B = 0.0

    return B



In [None]:
BLANC_help(summaries[0], texts[0], model)

In [25]:
scores = [BLANC_help(summary, text, model)
          for summary, text in zip(summaries, texts)]
scores

0.0


Ideas for improvement:
1. try other models
2. try other datasets
3. test on other problems