In [None]:
# %pip install datasets

In [1]:
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm.notebook import tqdm

# The models the authors used:
try:
    from transformers import BertConfig, BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer
except ModuleNotFoundError:
    %pip install transformers
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/nazanin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
DEVICE

device(type='cpu')

In [8]:
# Read the local JSON file through the `datasets` JSON loader and keep its 'train' split.
DailyNews_ds = load_dataset(
    'json',
    data_files='DailyNews_300.json',
    split='train',
)
DailyNews_ds

In [9]:
# WordPiece tokenizer that matches the checkpoint loaded below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

config = BertConfig.from_pretrained('bert-base-uncased')

# Load the *pretrained* masked-LM weights. Constructing the model from the config
# alone (`BertForMaskedLM(config)`) yields randomly initialised weights, so its
# mask predictions -- and every BLANC score derived from them -- would be noise.
model = BertForMaskedLM.from_pretrained('bert-base-uncased', config=config).to(DEVICE)

# Inference only: make sure dropout is disabled so predictions are deterministic.
model.eval()

print(config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def mask_sentence(sentence, i, M, L_min):
    """
    Return a copy of `sentence` in which eligible tokens on the stride are masked.

    A token at position `pos` is replaced by `tokenizer.mask_token` when
    `(pos - i) % M == 0` and at least one of the following holds:
    - the token has at least `L_min` characters,
    - the token is a word-piece continuation (starts with '##'),
    - the following token is a word-piece continuation (for the last token the
      look-ahead is clamped to the token itself).

    Parameters:
    - sentence (List[str]): Tokens of one sentence.
    - i (int): Offset of the masking stride.
    - M (int): Stride between candidate positions.
    - L_min (int): Minimum token length for masking a stand-alone token.

    Returns:
    - List[str]: New token list of the same length; the input is not modified.
    """
    last_pos = len(sentence) - 1
    result = []
    for pos, token in enumerate(sentence):
        on_stride = (pos - i) % M == 0
        if on_stride and (
            len(token) >= L_min
            or token.startswith('##')
            or sentence[min(pos + 1, last_pos)].startswith('##')
        ):
            result.append(tokenizer.mask_token)
        else:
            result.append(token)
    return result

In [10]:
def BLANC_help(summary, text, model, M=6, L_min=4, sep='[SEP]', device='cpu'):
    """
    Calculates the BLANC-help score between a summary and a text using a specified model.

    For every sentence of the text and every mask offset `i` in `range(M)`, the
    sentence is masked with `mask_sentence` and the model predicts the masked
    tokens twice: once with a filler of '.' tokens (same length as the summary)
    in front of the sentence, and once with the summary in front of it. Each
    masked token is counted in `S[k][m]`, where `k` / `m` are 1 when the
    filler-based / summary-based prediction equals the original token.

    Uses the notebook-level `tokenizer` and `mask_sentence`.

    Parameters:
    - summary (List[str]): A tokenized summary.
    - text (List[List[str]]): A tokenized text; list of sentences represented as a list of tokens.
    - model: BERT-type masked-LM model (called as `model(input_ids=...)`, must return `.logits`).
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator token between the inference help (filler/summary) and a
      sentence from the text (default is '[SEP]').
    - device: Device the input tensors are moved to (default is 'cpu'); it should
      be the device the model lives on.

    Returns:
    - float: BLANC similarity score for one text-summary pair,
      (S[0][1] - S[1][0]) / (total number of masked tokens), or 0.0 when no
      token was masked.
    """
    # NOTE(review): inputs are fed without a leading [CLS] / trailing [SEP];
    # confirm this matches the intended BLANC setup.

    filler = ['.'] * len(summary)
    prefix_len = len(summary) + 1  # summary (or filler) tokens plus the separator
    S = [[0, 0], [0, 0]]

    # Score in eval mode so dropout cannot make the result non-deterministic;
    # the caller's mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for sentence in text:
                for i in range(M):
                    masked_sentence = mask_sentence(sentence, i, M, L_min)
                    masked_positions = [pos for pos, word in enumerate(masked_sentence)
                                        if word == tokenizer.mask_token]
                    if not masked_positions:
                        # Nothing to predict for this offset: skip both forward passes.
                        continue

                    input_base = filler + [sep] + masked_sentence
                    input_help = summary + [sep] + masked_sentence

                    ids_base = torch.tensor(tokenizer.convert_tokens_to_ids(input_base)).to(device)  # Shape: [sequence_length]
                    ids_help = torch.tensor(tokenizer.convert_tokens_to_ids(input_help)).to(device)  # Shape: [sequence_length]
                    assert len(ids_base) == len(ids_help), "input_base and input_help have different lengths"

                    # Shape after squeeze: [sequence_length, vocab_size]
                    out_base = model(input_ids=ids_base.unsqueeze(0)).logits.squeeze(0)
                    out_help = model(input_ids=ids_help.unsqueeze(0)).logits.squeeze(0)

                    for j in masked_positions:
                        idx = prefix_len + j  # position of the masked token in the full input
                        predicted_idx_base = torch.argmax(out_base[idx]).item()
                        predicted_idx_help = torch.argmax(out_help[idx]).item()

                        predicted_word_base = tokenizer.convert_ids_to_tokens(predicted_idx_base)
                        predicted_word_help = tokenizer.convert_ids_to_tokens(predicted_idx_help)

                        k = int(predicted_word_base == sentence[j])
                        m = int(predicted_word_help == sentence[j])
                        S[k][m] += 1
    finally:
        if was_training:
            model.train()

    total = S[0][0] + S[1][1] + S[0][1] + S[1][0]
    if total == 0:
        return 0.0

    return (S[0][1] - S[1][0]) / total

In [None]:
# One summary string per example.
summaries = DailyNews_ds['summary']

# Split every paragraph of the 'text' column into its sentences: List[List[str]].
texts = [sent_tokenize(paragraph.strip()) for paragraph in DailyNews_ds['text']]
assert len(texts) == len(summaries) == 300

# Tokenize sentence by sentence for the texts, and each summary as a whole.
tokenized_texts = [list(map(tokenizer.tokenize, sentences)) for sentences in texts]
tokenized_summaries = list(map(tokenizer.tokenize, summaries))

In [None]:
# Quick check on a single example: score the first summary against its text.
BLANC_help(
    summary=tokenized_summaries[0],
    text=tokenized_texts[0],
    model=model,
    device=DEVICE,
)

In [None]:
# Score every (summary, text) pair. `zip` has no length, so `total` is passed
# explicitly -- without it tqdm can only show a running count, not progress or ETA.
scores = [BLANC_help(summary, text, model, device=DEVICE)
          for summary, text in tqdm(zip(tokenized_summaries, tokenized_texts),
                                    total=min(len(tokenized_summaries), len(tokenized_texts)))]
scores