In [1]:
# Install the `datasets` package (imported below via `from datasets import load_dataset`)
# into the environment of the running kernel; `%pip` targets the kernel, unlike `!pip`.
%pip install datasets

In [2]:
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm.notebook import tqdm

# The models the authors used:
try:
    from transformers import BertConfig, BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer
except ModuleNotFoundError:
    %pip install transformers
    from transformers import BertForMaskedLM, BertTokenizer, AdamW, get_linear_schedule_with_warmup
    from transformers import AlbertForMaskedLM, AlbertTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
DEVICE

device(type='cuda')

In [6]:
# Read the local DailyNews JSON file through the generic `json` loader; asking for
# split='train' returns a single Dataset rather than a DatasetDict.
DailyNews_ds = load_dataset(
    'json',
    split='train',
    data_files='../datasets/DailyNews_300.json',
)
DailyNews_ds

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['annotators_ids', 'scores', 'summary', 'text'],
    num_rows: 300
})

In [21]:
# Uncased BERT tokenizer plus the matching masked-LM head, moved onto DEVICE.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = model.to(DEVICE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
class DataCollator:
    """
    Collate function that turns a list of {'summary', 'text'} items into padded id tensors.

    Summaries are tokenized as whole strings. Texts are split into sentences with
    `sent_tokenize`, and every sentence is tokenized on its own, so each text becomes a
    [num_sentences, max_sentence_length] grid of token ids. No special tokens are added.

    Parameters:
    - tokenizer: tokenizer used for every tokenization step. It is called like a Hugging Face
      tokenizer (`tokenizer(list_of_str, ...)['input_ids']`) and must provide `tokenize(str)`.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """
        Build the padded tensors for one batch.

        Parameters:
        - batch: list of items, each providing the keys 'summary' and 'text' (both str).

        Returns:
        - dict with
            'summaries_ids': tensor of shape [batch_size, longest_summary_length]
            'texts_ids':     tensor of shape [batch_size, num_sentences, max_sentence_length]
          where num_sentences and max_sentence_length are the maxima over this batch
          (both at least 1) and all padding uses the tokenizer's pad id.
        """
        summaries = [item['summary'] for item in batch]
        texts = [item['text'] for item in batch]

        # Pad summaries to the longest summary in the batch
        summaries_ids = self.tokenizer(
            summaries,
            add_special_tokens=False,
            truncation=True,
            padding='longest',
            return_tensors='pt')['input_ids']

        # Tokenizing each text into a list of sentences
        texts = [sent_tokenize(text.strip()) for text in texts]  # List[List[str]]

        # Maximum number of sentences per text and maximum number of tokens per sentence.
        # The collator's own tokenizer is used (not a notebook-level global), so the
        # measured lengths always match the tokenization performed below.
        # `default=0` covers texts without any sentence (empty / whitespace-only text).
        max_text_len = max((len(text) for text in texts), default=0)
        max_sent_len = max(
            (len(self.tokenizer.tokenize(sent)) for text in texts for sent in text),
            default=0,
        )

        # Keep both dimensions at least 1 so that an all-empty batch still yields a
        # well-formed (fully padded) tensor instead of failing inside the tokenizer.
        max_text_len = max(max_text_len, 1)
        max_sent_len = max(max_sent_len, 1)

        # Padding each text with empty sentences to make them equal in length;
        # an empty sentence tokenizes to a row made only of pad ids.
        padded_texts = [text + [''] * (max_text_len - len(text)) for text in texts]

        # Tokenizing each sentence independently within each text
        tokenized_texts = []
        for text in padded_texts:
            text_tokens = self.tokenizer(text,
                                         add_special_tokens=False,
                                         truncation=True,
                                         padding='max_length',
                                         max_length=max_sent_len,
                                         return_tensors='pt')

            tokenized_texts.append(text_tokens['input_ids'])

        # Stacking along a new first dimension:
        # [batch_size, num_sentences, max_sentence_length]
        texts_ids = torch.stack(tokenized_texts, dim=0)

        return {'summaries_ids': summaries_ids, 'texts_ids': texts_ids}

In [56]:
def BLANC_help(model, dataloader, M=6, L_min=4, device='cpu', tokenizer=None):
    """
    Calculates BLANC-help scores between summaries and texts using a BERT-type masked LM.

    For every sentence of a text, tokens are masked in M passes (pass `i` masks the
    non-pad tokens at positions i, i + M, i + 2M, ...), so every token is masked exactly
    once. Each masked sentence is unmasked twice: once prefixed by the summary ("help")
    and once prefixed by a filler of '.' tokens of the same length ("base"). With
    S[k][m] counting masked tokens where base was correct (k=1) / wrong (k=0) and help was
    correct (m=1) / wrong (m=0), the score of a text is

        (S[0][1] - S[1][0]) / (S[0][0] + S[0][1] + S[1][0] + S[1][1])

    Parameters:
    - model: BERT-type masked-LM; called as `model(input_ids=..., attention_mask=...)` and
      expected to return an object with a `.logits` tensor.
    - dataloader: iterable of batches with 'summaries_ids' ([batch, summary_len]) and
      'texts_ids' ([batch, num_sentences, sentence_len]).
    - M (int, optional): masking stride / number of masking passes (default is 6).
    - L_min (int, optional): minimum length requirement for masked words (default is 4).
      NOTE: currently NOT applied; kept for interface compatibility.
      TODO: add a condition to check for the length of the tokens.
    - device (str or torch.device, optional): device on which to perform computations.
      Default is 'cpu'.
    - tokenizer (optional): tokenizer providing `pad_token_id`, `mask_token_id` and
      `convert_tokens_to_ids`. When omitted, the module-level `tokenizer` is used.

    Returns:
    - List[float]: one score per text, in the order the dataloader yields them.
      Texts without any masked token get 0.0.

    Raises:
    - ValueError: if M < 1, or if no tokenizer is passed and no global `tokenizer` exists.
    """
    if M < 1:
        raise ValueError(f'M must be a positive integer, got {M}')

    if tokenizer is None:
        # Backward compatibility: earlier callers relied on the notebook-level tokenizer.
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise ValueError('BLANC_help needs a tokenizer: pass `tokenizer=...` '
                             'or define a module-level `tokenizer`.')

    pad_id = tokenizer.pad_token_id
    mask_id = tokenizer.mask_token_id
    filler_id = tokenizer.convert_tokens_to_ids('.')

    # Scoring must be deterministic, so dropout has to be off; the caller's mode is restored.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()

    scores = []
    try:
        for batch in dataloader:
            summaries = batch['summaries_ids'].to(device)  # Shape: [batch_size, max_summary_length]
            texts = batch['texts_ids'].to(device)          # Shape: [batch_size, num_sentences, max_sentence_length]

            B = _blanc_help_batch_scores(model, summaries, texts, M, pad_id, mask_id, filler_id)

            print(f'B: {B.tolist()}')
            # Extending the scores list with the scores for the texts in the current batch
            scores.extend(B.tolist())
    finally:
        if was_training:
            model.train()

    return scores


def _blanc_help_batch_scores(model, summaries, texts, M, pad_id, mask_id, filler_id):
    """Return a 1-D float tensor holding the BLANC-help score of every text in one batch."""
    batch_size, num_sentences, max_sentence_length = texts.size()
    max_summary_length = summaries.size(1)
    device = texts.device

    # Filler: '.' wherever the summary has a real token, padding elsewhere, so the base
    # and help inputs have the same effective length and the same attention mask.
    filler = summaries.masked_fill(summaries != pad_id, filler_id)  # [batch_size, max_summary_length]

    is_real_token = texts != pad_id
    positions = torch.arange(max_sentence_length, device=device)

    # Per-text counters: S[0][1], S[1][0] and the number of masked tokens (sum of all S).
    help_only = torch.zeros(batch_size, dtype=torch.float, device=device)
    base_only = torch.zeros_like(help_only)
    total = torch.zeros_like(help_only)

    for offset in range(M):
        # Pass `offset` masks positions offset, offset + M, ...; over the M passes every
        # non-pad token is masked exactly once.
        mask = (positions % M == offset) & is_real_token   # [batch_size, num_sentences, max_sentence_length]
        masked_texts = texts.masked_fill(mask, mask_id)

        # The model expects [batch_size, seq_length], so sentences are processed one index at a time.
        for sent_idx in range(num_sentences):
            sent_mask = mask[:, sent_idx, :]
            if not sent_mask.any():
                # Nothing masked at this sentence index in this pass (e.g. pure padding rows).
                continue

            sent_tokens = masked_texts[:, sent_idx, :]
            input_base = torch.cat((filler, sent_tokens), dim=1)     # [batch_size, max_summary_length + max_sentence_length]
            input_help = torch.cat((summaries, sent_tokens), dim=1)  # same shape

            with torch.no_grad():
                logits_base = model(input_ids=input_base, attention_mask=(input_base != pad_id)).logits
                logits_help = model(input_ids=input_help, attention_mask=(input_help != pad_id)).logits

            # Predicted ids for the sentence part only (the prefix is dropped).
            pred_base = logits_base.argmax(dim=-1)[:, max_summary_length:]
            pred_help = logits_help.argmax(dim=-1)[:, max_summary_length:]

            target = texts[:, sent_idx, :]
            base_ok = (pred_base == target) & sent_mask
            help_ok = (pred_help == target) & sent_mask

            help_only += (help_ok & ~base_ok).sum(dim=1)
            base_only += (base_ok & ~help_ok).sum(dim=1)
            total += sent_mask.sum(dim=1)

    # Texts without any masked token keep a score of 0.0 instead of dividing by zero.
    batch_scores = torch.zeros_like(total)
    evaluated = total != 0
    batch_scores[evaluated] = (help_only - base_only)[evaluated] / total[evaluated]
    return batch_scores


In [None]:
# Keep only the two columns the collator reads.
dataset = DailyNews_ds.select_columns(['summary', 'text'])
data_collator = DataCollator(tokenizer)

batch_size = 32

# shuffle=False: BLANC_help returns scores in iteration order, so iterating in dataset
# order keeps scores[i] aligned with row i of DailyNews_ds (and with its human `scores`
# column). Shuffling would make the result impossible to map back to the rows.
dataloader = DataLoader(
    dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=False
    )

scores = BLANC_help(model, dataloader, M=6, L_min=4, device=DEVICE)

# One score per dataset row is expected.
assert len(scores) == len(dataset), 'BLANC_help must return exactly one score per row'