In [1]:
# %%capture
# !pip install datasets

In [2]:
import torch
import nltk
from nltk.tokenize import sent_tokenize
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm.notebook import tqdm

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer, logging
from transformers import AlbertForMaskedLM, AlbertTokenizer


nltk.download('punkt')
logging.set_verbosity_error()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

In [None]:
# cnn_dailymail_ds = load_dataset("cnn_dailymail", '3.0.0', split='test')
# print(cnn_dailymail_ds)

In [4]:
DailyNews_ds = load_dataset('json', data_files='DailyNews_300.json', split='train')
DailyNews_ds

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['summary', 'text', 'scores', 'annotators_ids'],
    num_rows: 300
})

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):

        summaries, texts = zip(*[(item['summary'], item['text']) for item in batch])

        # Pad summaries

        summaries_ids = self.tokenizer(summaries,
                                       add_special_tokens=False,
                                       truncation=True,
                                       padding='longest',
                                       return_tensors='pt')['input_ids']

        # Pad texts

        # Tokenizing each text into a list of sentences
        texts = [sent_tokenize(text.strip()) for text in texts] # List[List[str]]

        # Finding the maximum text length and the maximum sentence length across all texts
        max_text_len = max(len(text) for text in texts)
        max_sent_len = max(max(len(tokenizer.tokenize(sent)) for sent in text) for text in texts)

        # Padding each text with empty sentences to make them equal in length
        padded_texts = [text + [''] * (max_text_len - len(text)) for text in texts]

        # Tokenizing each sentence independently within each text
        tokenized_texts = []
        for text in padded_texts:
            text_tokens = self.tokenizer(text,
                                         add_special_tokens=False,
                                         truncation=True,
                                         padding='max_length',
                                         max_length=max_sent_len,
                                         return_tensors='pt')

            tokenized_texts.append(text_tokens['input_ids'])

        # Stacking the padded and tokenized texts to get a 3D tensor
        texts_ids = torch.stack(tokenized_texts, dim=0)

        return {'summaries_ids': summaries_ids, 'texts_ids': texts_ids}

In [33]:
# def BLANC_help(dataloader, model, tokenizer, M=6, L_min=4, device='cpu'):
#     """
#     Calculates BLANC score between summaries and texts using a BERT-type model.

#     Parameters:
#     - dataloader: DataLoader instance containing batches of data with 'summaries_ids' and 'texts_ids'.
#     - model: BERT-type model.
#     - tokenizer: Tokenizer corresponding to the BERT model.
#     - M (int, optional): Parameter M for the algorithm (default is 6).
#     - L_min (int, optional): Minimum length requirement for masked words (default is 4).
#     - device (str, optional): Device on which to perform computations ('cpu' or 'cuda'). Default is 'cpu'.

#     Returns:
#     - List[float]: A list of BLANC scores for each text in the dataset.
#     """

#     scores = []

#     for batch in tqdm(dataloader):
#         texts = batch['texts_ids'].to(device)                                                        # Shape: [batch_size, num_sentences, max_sentence_length]
#         summaries = batch['summaries_ids'].to(device)                                                # Shape: [batch_size, max_summary_length]
#         filler = torch.zeros_like(summaries).fill_(tokenizer.convert_tokens_to_ids('.')).to(device)  # Shape: [batch_size, max_summary_length]

#         batch_size, num_sentences, max_sentence_length = texts.size()
#         max_summary_length = summaries.size(1)

#         # Initialize S for each text in the batch
#         S = torch.zeros((batch_size, 2, 2), dtype=torch.float)

#         for i in range(M):
#             sent_indices = torch.arange(max_sentence_length).expand(batch_size, num_sentences, -1).to(device)
#             masked_texts = texts.clone()
#             mask = ((sent_indices - i) % M == 0) & (masked_texts != tokenizer.pad_token_id) # TODO: add a condition to check for the length of the tokens
#             masked_texts[mask] = tokenizer.mask_token_id  # Shape: [batch_size, num_sentences, max_sentence_length]

#             # Expanding filler and summaries along the second dimension to match num_sentences
#             expanded_filler = filler.unsqueeze(1).expand(-1, num_sentences, -1)       # Shape: [batch_size, num_sentences, max_summary_length]
#             expanded_summaries = summaries.unsqueeze(1).expand(-1, num_sentences, -1) # Shape: [batch_size, num_sentences, max_summary_length]

#             input_base = torch.cat((expanded_filler, masked_texts), dim=2).to(device)    # Shape: [batch_size, num_sentences, max_summary_length + max_sentence_length]
#             input_help = torch.cat((expanded_summaries, masked_texts), dim=2).to(device) # Shape: [batch_size, num_sentences, max_summary_length + max_sentence_length]

#             # The model expects input shapes to be [batch_size, seq_length]
#             out_base_ids_list = []
#             out_help_ids_list = []
#             for sent_idx in range(num_sentences):
#                 sent_input_base = input_base[:, sent_idx, :] # Shape: [batch_size, max_summary_length + max_sentence_length]
#                 sent_input_help = input_help[:, sent_idx, :] # Shape: [batch_size, max_summary_length + max_sentence_length]

#                 attention_mask_base = (sent_input_base != tokenizer.pad_token_id)
#                 attention_mask_help = (sent_input_help != tokenizer.pad_token_id)

#                 with torch.no_grad():
#                   out_base_logits = model(input_ids=sent_input_base, attention_mask=attention_mask_base).logits  # Shape: [batch_size, max_summary_length + max_sentence_length, Bert_vocab_size]
#                   out_help_logits = model(input_ids=sent_input_help, attention_mask=attention_mask_help).logits  # Shape: [batch_size, max_summary_length + max_sentence_length, Bert_vocab_size]

#                 # Getting predicted token IDs
#                 out_base_ids_list.append(out_base_logits.argmax(dim=-1))
#                 out_help_ids_list.append(out_help_logits.argmax(dim=-1))

#             out_base = torch.stack(out_base_ids_list, dim=1) # Shape: [batch_size, num_sentences, max_summary_length + max_sentence_length]
#             out_help = torch.stack(out_help_ids_list, dim=1) # Shape: [batch_size, num_sentences, max_summary_length + max_sentence_length]

#             masked_indices = mask.nonzero()

#             for idx in masked_indices:
#                 batch_idx, sentence_idx, token_idx = idx

#                 out_base_token = out_base[batch_idx, sentence_idx, max_summary_length + token_idx].item()
#                 out_help_token = out_help[batch_idx, sentence_idx, max_summary_length + token_idx].item()
#                 text_token = texts[batch_idx, sentence_idx, token_idx].item()

#                 # print(f'out_base_token: {tokenizer.convert_ids_to_tokens(out_base_token)}')
#                 # print(f'out_help_token: {tokenizer.convert_ids_to_tokens(out_help_token)}')
#                 # print(f'text_token: {tokenizer.convert_ids_to_tokens(text_token)}')

#                 k = int(out_base_token == text_token)
#                 m = int(out_help_token == text_token)
#                 S[batch_idx, k, m] += 1
#                 # print(f'S[{batch_idx}, {k}, {m}]: {S[batch_idx, k, m]}')

#         # Computing scores for each text in the batch, but setting 0.0 for batches with zero denominators to avoid ZeroDivisionError
#         denominator = S[:, 0, 0] + S[:, 1, 1] + S[:, 0, 1] + S[:, 1, 0]
#         nonzero_mask = denominator != 0.0
#         B = torch.zeros_like(denominator, dtype=torch.float)
#         B[nonzero_mask] = (S[:, 0, 1] - S[:, 1, 0])[nonzero_mask] / denominator[nonzero_mask]

#         # Extending the scores list with the scores for the texts in the current batch
#         scores.extend(B.tolist())

#     return scores


In [46]:
def BLANC_help(dataloader, model, tokenizer, M=6, L_min=4, device='cpu'):
    """
    Calculates BLANC score between summaries and texts using a BERT-type model.

    Parameters:
    - dataloader: DataLoader instance containing batches of data with 'summaries_ids' and 'texts_ids'.
    - model: BERT-type model.
    - tokenizer: Tokenizer corresponding to the BERT model.
    - M (int, optional): Parameter M for the algorithm (default is 6).
    - L_min (int, optional): Minimum length requirement for masked words (default is 4).
    - device (str, optional): Device on which to perform computations ('cpu' or 'cuda'). Default is 'cpu'.

    Returns:
    - List[float]: A list of BLANC scores for each text in the dataset.
    """

    scores = []

    for batch in tqdm(dataloader):
        texts = batch['texts_ids'].to(device)                                                        # Shape: [batch_size, num_sentences, max_sentence_length]
        summaries = batch['summaries_ids'].to(device)                                                # Shape: [batch_size, max_summary_length]
        filler = torch.zeros_like(summaries).fill_(tokenizer.convert_tokens_to_ids('.')).to(device)  # Shape: [batch_size, max_summary_length]

        batch_size, num_sentences, max_sentence_length = texts.size()
        max_summary_length = summaries.size(1)

        # Initialize S for each text in the batch
        S = torch.zeros((batch_size, 2, 2), dtype=torch.float)

        for i in range(M):
            sent_indices = torch.arange(max_sentence_length).expand(batch_size, num_sentences, -1).to(device)
            masked_texts = texts.clone()
            mask = ((sent_indices - i) % M == 0) & (masked_texts != tokenizer.pad_token_id) # TODO: add a condition to check for the length of the tokens
            masked_texts[mask] = tokenizer.mask_token_id  # Shape: [batch_size, num_sentences, max_sentence_length]

            # Expanding filler and summaries along the second dimension to match num_sentences
            expanded_filler = filler.unsqueeze(1).expand(-1, num_sentences, -1)       # Shape: [batch_size, num_sentences, max_summary_length]
            expanded_summaries = summaries.unsqueeze(1).expand(-1, num_sentences, -1) # Shape: [batch_size, num_sentences, max_summary_length]

            input_base = torch.cat((expanded_filler, masked_texts), dim=2).to(device)    # Shape: [batch_size, num_sentences, max_summary_length + max_sentence_length]
            input_help = torch.cat((expanded_summaries, masked_texts), dim=2).to(device) # Shape: [batch_size, num_sentences, max_summary_length + max_sentence_length]

            # The model expects input shapes to be [batch_size, seq_length]
            flattened_input_base = input_base.view(-1, max_summary_length + max_sentence_length)  # Shape: [batch_size * num_sentences, max_summary_length + max_sentence_length]
            flattened_input_help = input_help.view(-1, max_summary_length + max_sentence_length)  # Shape: [batch_size * num_sentences, max_summary_length + max_sentence_length]

            attention_mask_base = (flattened_input_base != tokenizer.pad_token_id)
            attention_mask_help = (flattened_input_help != tokenizer.pad_token_id)

            with torch.no_grad():
                out_base_logits = model(input_ids=flattened_input_base, attention_mask=attention_mask_base).logits  # Shape: [batch_size * num_sentences, max_summary_length + max_sentence_length, Bert_vocab_size]
                out_help_logits = model(input_ids=flattened_input_help, attention_mask=attention_mask_help).logits  # Shape: [batch_size * num_sentences, max_summary_length + max_sentence_length, Bert_vocab_size]

            # Getting predicted token IDs
            out_base_predictions = out_base_logits.argmax(dim=-1) # Shape: [batch_size * num_sentences, max_summary_length + max_sentence_length]
            out_help_predictions = out_help_logits.argmax(dim=-1) # Shape: [batch_size * num_sentences, max_summary_length + max_sentence_length]

            # Unflattening the tensors
            out_base = out_base_predictions.view(batch_size, num_sentences, max_summary_length + max_sentence_length)
            out_help = out_help_predictions.view(batch_size, num_sentences, max_summary_length + max_sentence_length)

            masked_indices = mask.nonzero()

            for idx in masked_indices:
                batch_idx, sentence_idx, token_idx = idx

                out_base_token = out_base[batch_idx, sentence_idx, max_summary_length + token_idx].item()
                out_help_token = out_help[batch_idx, sentence_idx, max_summary_length + token_idx].item()
                text_token = texts[batch_idx, sentence_idx, token_idx].item()

                # print(f'out_base_token: {tokenizer.convert_ids_to_tokens(out_base_token)}')
                # print(f'out_help_token: {tokenizer.convert_ids_to_tokens(out_help_token)}')
                # print(f'text_token: {tokenizer.convert_ids_to_tokens(text_token)}')

                k = int(out_base_token == text_token)
                m = int(out_help_token == text_token)
                S[batch_idx, k, m] += 1
                # print(f'S[{batch_idx}, {k}, {m}]: {S[batch_idx, k, m]}')

        # Computing scores for each text in the batch, but setting 0.0 for batches with zero denominators to avoid ZeroDivisionError
        denominator = S[:, 0, 0] + S[:, 1, 1] + S[:, 0, 1] + S[:, 1, 0]
        nonzero_mask = denominator != 0.0
        B = torch.zeros_like(denominator, dtype=torch.float)
        B[nonzero_mask] = (S[:, 0, 1] - S[:, 1, 0])[nonzero_mask] / denominator[nonzero_mask]

        # Extending the scores list with the scores for the texts in the current batch
        scores.extend(B.tolist())

    return scores


In [47]:
dataset = DailyNews_ds.select_columns(['summary', 'text'])
data_collator = DataCollator(tokenizer)

batch_size = 1

dataloader = DataLoader(
    dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=True
    )

scores = BLANC_help(dataloader, model, tokenizer, M=6, L_min=4, device=DEVICE)

  0%|          | 0/300 [00:00<?, ?it/s]

In [48]:
len(scores)

300

In [49]:
scores

[0.09479865431785583,
 0.12339331954717636,
 0.12236286699771881,
 0.20347394049167633,
 0.15760870277881622,
 0.09969788789749146,
 0.13137255609035492,
 0.2533692717552185,
 0.1082981750369072,
 0.1649484485387802,
 0.17134416103363037,
 0.19018404185771942,
 0.12384473532438278,
 0.2631579041481018,
 0.15549597144126892,
 0.08979591727256775,
 0.1375245600938797,
 0.08884297311306,
 0.1457446813583374,
 0.21975308656692505,
 0.10104011744260788,
 0.11546841263771057,
 0.13740457594394684,
 0.10173697024583817,
 0.1638796031475067,
 0.2224108725786209,
 0.08782435208559036,
 0.14341846108436584,
 0.12456747144460678,
 0.10294117778539658,
 -0.005093378480523825,
 0.261904776096344,
 0.208271786570549,
 0.26006191968917847,
 0.1859504133462906,
 0.10852713137865067,
 0.22295081615447998,
 0.2093287855386734,
 0.16170212626457214,
 0.16607773303985596,
 0.12054329365491867,
 0.1387755125761032,
 0.12939521670341492,
 0.11337209492921829,
 0.1109243705868721,
 0.2239130437374115,
 0.089

Ideas for improvement:
1. try other models
2. try other datasets
3. test on other problems

In [30]:
# summaries = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# print(summaries, summaries.shape)
# print()

# expanded_summaries = summaries.unsqueeze(1).expand(-1, 2, -1)
# print(expanded_summaries, expanded_summaries.shape)

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]) torch.Size([3, 3])

tensor([[[1, 2, 3],
         [1, 2, 3]],

        [[4, 5, 6],
         [4, 5, 6]],

        [[7, 8, 9],
         [7, 8, 9]]]) torch.Size([3, 2, 3])
