In [1]:
%%capture
# !pip install datasets

In [2]:
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm.notebook import tqdm

# The models the authors used:
try:
    from transformers import BertConfig, BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer
except ModuleNotFoundError:
    %pip install transformers
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
DEVICE

device(type='cuda')

In [4]:
# Optional alternative corpus (left disabled): the CNN/DailyMail 3.0.0 test split.
# cnn_dailymail_ds = load_dataset("cnn_dailymail", '3.0.0', split='test')
# print(cnn_dailymail_ds)

In [5]:
# Read the local DailyNews JSON file; the result is requested as the
# 'train' split so that a single Dataset object is returned.
DailyNews_ds = load_dataset(
    'json',
    data_files='DailyNews_300.json',
    split='train',
)
DailyNews_ds

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['summary', 'scores', 'annotators_ids', 'text'],
    num_rows: 300
})

In [6]:
# Lower-casing word-piece tokenizer matching the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# Masked-language-model head on top of the same checkpoint, moved to DEVICE.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = model.to(DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def mask_sentence(sentence, i, M, L_min, mask_token=None):
    """
    Replace every M-th eligible token of a tokenized sentence with the mask token.

    A position j is a masking candidate when (j - i) % M == 0. A candidate is
    actually masked only if at least one of the following holds:
    - the token has at least L_min characters,
    - the token is a word-piece continuation (starts with '##'),
    - the following token is a word-piece continuation, i.e. this token
      begins a word that was split into several pieces.
    For the last token the "following token" check falls back to the token
    itself, so no index ever runs past the end of the sentence.

    Parameters:
    - sentence (List[str]): Tokens of one sentence.
    - i (int): Offset of the first candidate position.
    - M (int): Distance between candidate positions (must be non-zero).
    - L_min (int): Minimum token length for a plain token to be masked.
    - mask_token (str | None): String written at masked positions. When None
      (the default) the notebook-level `tokenizer.mask_token` is used, which
      is the original behaviour.

    Returns:
    - List[str]: A new list of the same length as `sentence`; the input list
      is not modified.
    """
    if mask_token is None:
        # Default to the global tokenizer's mask string, as before.
        mask_token = tokenizer.mask_token

    last = len(sentence) - 1

    def is_maskable(j):
        token = sentence[j]
        return (len(token) >= L_min
                or token.startswith('##')
                or sentence[min(j + 1, last)].startswith('##'))

    return [mask_token if (j - i) % M == 0 and is_maskable(j) else token
            for j, token in enumerate(sentence)]

In [8]:
def _predict_masked_tokens(model, tokens, positions, device):
    """Run `model` on `tokens` and return its top-1 predicted token at each index in `positions`."""
    # Uses the notebook-level `tokenizer` to map tokens <-> vocabulary ids.
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)  # Shape: [1, sequence_length]
    logits = model(input_ids=input_ids).logits[0]  # Shape: [sequence_length, Bert_vocab_size]
    predicted_ids = [torch.argmax(logits[position]).item() for position in positions]
    return tokenizer.convert_ids_to_tokens(predicted_ids)


def BLANC_help(summary, text, model, M=6, L_min=4, sep='[SEP]', device='cpu'):
    """
    Calculate the BLANC-help score of a summary with respect to a text.

    Each sentence of `text` is masked with `mask_sentence` at every offset
    0..M-1. Every masked sentence is reconstructed twice by `model`: once
    prefixed with the summary ("help") and once prefixed with a filler of '.'
    tokens of the same length ("base"). For each masked position the top-1
    prediction is compared with the original token, and the outcomes are
    counted in S[base_correct][help_correct].

    Relies on the notebook-level `tokenizer` and `mask_sentence`.

    Parameters:
    - summary (List[str]): The summary, already split into tokens.
    - text (List[List[str]]): The text as a list of sentences, each a list of tokens.
    - model: Masked language model; called as model(input_ids=...) and expected
      to return an object with a `.logits` attribute.
    - M (int): Distance between masked positions; also the number of masking
      offsets tried per sentence (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator token placed between the prefix (filler/summary)
      and the masked sentence (default is '[SEP]').
    - device: Device on which the input tensors are created (default is 'cpu').
      The model is expected to live on the same device.

    Returns:
    - float: (S[0][1] - S[1][0]) / total number of masked tokens, i.e. the
      share of tokens the summary helped with minus the share it hurt.
      Returns 0.0 when no token was masked at all (e.g. empty text).
    """
    # NOTE(review): inputs are fed to the model without a leading [CLS] or a
    # trailing [SEP]. BERT checkpoints are normally used with both -- confirm
    # this is intentional before comparing against reference BLANC scores.
    filler = ['.'] * len(summary)
    # Filler and summary have the same length, so sentence position j sits at
    # the same absolute index in both inputs.
    prefix_len = len(summary) + 1  # prefix tokens + separator
    S = [[0, 0], [0, 0]]

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for sentence in text:
            if not sentence:
                continue  # nothing to mask in an empty sentence

            for i in range(M):
                masked_sentence = mask_sentence(sentence, i, M, L_min)
                masked_positions = [j for j, word in enumerate(masked_sentence)
                                    if word == tokenizer.mask_token]
                if not masked_positions:
                    # No token was masked at this offset, so the forward
                    # passes could not change S -- skip them.
                    continue

                absolute_positions = [prefix_len + j for j in masked_positions]
                predicted_base = _predict_masked_tokens(
                    model, filler + [sep] + masked_sentence, absolute_positions, device)
                predicted_help = _predict_masked_tokens(
                    model, summary + [sep] + masked_sentence, absolute_positions, device)

                for j, word_base, word_help in zip(masked_positions, predicted_base, predicted_help):
                    base_correct = int(word_base == sentence[j])
                    help_correct = int(word_help == sentence[j])
                    S[base_correct][help_correct] += 1

    total = S[0][0] + S[0][1] + S[1][0] + S[1][1]
    if total == 0:
        # Nothing was masked, so the summary can neither help nor hurt.
        return 0.0

    return (S[0][1] - S[1][0]) / total



In [9]:
# Pull the two columns used below out of the dataset:
#   summaries -- (List[str]) one summary per example
#   texts     -- (List[str]) one paragraph of a few sentences per example
summaries, texts = DailyNews_ds['summary'], DailyNews_ds['text']

In [10]:
# each text in texts is a list of sentences (each sentence is a string)
texts = [sent_tokenize(text.strip()) for text in texts] # List[List[str]]
assert len(texts) == len(summaries) == 300

In [11]:
# Word-piece tokenize the summaries (one token list each) and the texts
# (one token list per sentence, grouped by text).
tokenized_summaries = list(map(tokenizer.tokenize, summaries))
tokenized_texts = [list(map(tokenizer.tokenize, text)) for text in texts]

In [12]:
# Score only the first summary/text pair as a quick check before the full run.
BLANC_help(
    summary=tokenized_summaries[0],
    text=tokenized_texts[0],
    model=model,
    device=DEVICE,
)

0.11069418386491557

In [13]:
# Score every summary/text pair. zip() has no length, so the total is passed
# explicitly; otherwise the progress bar can only show a bare counter.
scores = [BLANC_help(summary, text, model, device=DEVICE)
          for summary, text in tqdm(zip(tokenized_summaries, tokenized_texts),
                                    total=len(tokenized_summaries),
                                    desc='BLANC-help')]
scores

0it [00:00, ?it/s]

[0.11069418386491557,
 0.08967391304347826,
 0.14035087719298245,
 0.08250355618776671,
 0.06504065040650407,
 0.16971279373368145,
 0.159375,
 0.06736526946107785,
 0.08818011257035648,
 0.1277258566978193,
 0.12241653418124006,
 0.15737704918032788,
 0.12459016393442623,
 0.09556313993174062,
 0.2875816993464052,
 0.08672086720867209,
 0.13921901528013583,
 0.1620795107033639,
 0.19607843137254902,
 0.10483870967741936,
 0.146875,
 0.0517464424320828,
 0.3137254901960784,
 0.14250614250614252,
 0.12538226299694188,
 0.13870967741935483,
 0.17256637168141592,
 0.087248322147651,
 0.1478405315614618,
 0.11192660550458716,
 0.0783132530120482,
 0.19327731092436976,
 0.14363143631436315,
 0.16032608695652173,
 0.17725752508361203,
 0.10580204778156997,
 0.15217391304347827,
 0.16971279373368145,
 0.06985769728331177,
 0.19457013574660634,
 0.1188118811881188,
 0.12459016393442623,
 0.16265060240963855,
 0.09561128526645768,
 0.09987029831387809,
 0.103125,
 0.2073170731707317,
 0.1803921

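Ideas for improvement:
1. Try other masked language models (e.g., the ALBERT classes already imported above) in place of `bert-base-uncased`.
2. Try other datasets (e.g., the CNN/DailyMail test split that is commented out above).
3. Test the metric on problems other than news summarization.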