In [1]:
import pandas as pd
import random
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch
import copy
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_json('./datasets/DailyNews_300.json')
print(data.shape)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

(300, 4)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.features = self.dataset.columns

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        return (self.dataset.iloc[idx, 0], self.dataset.iloc[idx, 1], self.dataset.iloc[idx, 2])
    
    def map(self, preprocessing_fn, **kwargs):
        return CustomDataset(self.dataset.apply(lambda x: preprocessing_fn(x, **kwargs), axis = 1))
    
    def select_columns(self, columns):
        new_dataset = self.dataset[columns] 
        return CustomDataset(new_dataset)
    
    def get_sentences(self):
        self.dataset['sentences'] = self.dataset['text'].apply(lambda x: x.split('.'))
        return CustomDataset(self.dataset)
    
dataset = CustomDataset(data)
dataset = dataset.get_sentences()
# print(dataset.__getitem__(0))

In [4]:
def get_word_lengths(dataset, tokenizer):
    word_lengths = {}
    all_tokens = []

    for sample in dataset:
        summary = sample[0]
        preprocessed_result = tokenizer(summary, 
                                        add_special_tokens = False,
                                        truncation = True,
                                        max_length = 512,
                                        padding = False,
                                        return_attention_mask = False)
        tokens = preprocessed_result["input_ids"]
        decoded_tokens = tokenizer.convert_ids_to_tokens(tokens)
        for token in tokens:
            if token not in all_tokens:
                all_tokens.append(token)

        i = 0
        while i < len(tokens):
            if decoded_tokens[i].startswith('##'):
                combined_word = decoded_tokens[i - 1] + decoded_tokens[i][2:]
                word_lengths[tokens[i - 1]] = len(combined_word)
                word_lengths[tokens[i]] = len(combined_word)
            else:
                word_lengths[tokens[i]] = len(decoded_tokens[i])
            i += 1

    assert len(all_tokens) == len(word_lengths), "Association of tokens with word length : FAILED."

    return word_lengths

In [5]:
def preprocessing_fn(x, tokenizer):
    x["summary_ids"] = tokenizer(
        x["summary"],
        add_special_tokens = False,
        truncation = True,
        max_length = 512,
        padding = False,
        return_attention_mask = True,
    )["input_ids"]

    x["text_ids"] = tokenizer(
        x["text"],
        add_special_tokens = False,
        truncation = True,
        max_length = 512,
        padding = False,
        return_attention_mask = True,
    )["input_ids"]

    x["sentences_ids"] = tokenizer(
        x["sentences"],
        add_special_tokens = False,
        truncation = True,
        max_length = 512,
        padding = False,
        return_attention_mask = True,
    )["input_ids"]

    return x

splitted_dataset = dataset.select_columns(["summary", "text", "sentences"])
# print(splitted_dataset.__getitem__(0))

word_lengths = get_word_lengths(splitted_dataset, tokenizer)

# Tokenize the dataset
splitted_dataset = splitted_dataset.map(
    preprocessing_fn, tokenizer = tokenizer
)
print(splitted_dataset.__getitem__(0)[2])

# Remove useless columns
splitted_dataset = splitted_dataset.select_columns(["summary_ids", "text_ids", "sentences_ids"])
print(splitted_dataset.__getitem__(0)[2])

['Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final', ' Photo: Reuters Independent', "ie \n \nFormer England defender Gary Neville suggested Gareth Southgate's squad had done more than could have been expected of them at this World Cup as they bowed out with a semi-final defeat against Croatia", ' \n \nhttps://www', 'independent', 'ie/sport/soccer/world-cup-2018/gary-neville-salutes-englands-overachievers-as-alan-shearer-and-rio-ferdinand-give-their-verdicts-37108667', 'html \n \nhttps://www', 'independent', 'ie/incoming/article37108634', 'ece/7571a/AUTOCROP/h342/52Man1', "jpg \n   Email     \nFormer England defender Gary Neville suggested Gareth Southgate's squad had done more than could have been expected of them at this World Cup as they bowed out with a semi-final defeat against Croatia", ' \n  \nA jaded England faded after had Kieran Trippier fired England ahead after just five minutes with a superb free-kick, with goals from Iv

In [6]:
def collate_fn(batch):
    # Get the tokenized sequences for each item in the batch
    text_ids_batch = [torch.tensor(item[1], dtype = torch.int) for item in batch]
    summary_ids_batch = [torch.tensor(item[0], dtype = torch.int) for item in batch]
    sentences_ids_batch = [
        [torch.tensor(sentence, dtype = torch.int) for sentence in item[2]]
        for item in batch
    ]

    # Pad sequences to the maximum length in the batch
    padded_text_ids = pad_sequence([torch.cat([item, torch.zeros(max(0, 512 - len(item)))]) for item in text_ids_batch], batch_first = True, padding_value = 0)
    padded_summary_ids = pad_sequence([torch.cat([item, torch.zeros(max(0, 512 - len(item)))]) for item in summary_ids_batch], batch_first = True, padding_value = 0)
    padded_sentences_ids = [
        pad_sequence(
            [torch.cat([sentence, torch.zeros(max(0, 512 - len(sentence)), dtype=torch.int)]) for sentence in item],
            batch_first=True,
            padding_value=0
        )
        for item in sentences_ids_batch
    ]

    return {"text_ids": padded_text_ids, "summary_ids": padded_summary_ids, "sentences_ids": padded_sentences_ids}

batch_size = 32
dataloader = DataLoader(splitted_dataset, batch_size = batch_size, collate_fn = collate_fn)

epochs = 3
def training(summary, text, model, epochs = 10):
    model_copy = copy.deepcopy(model)
    model_copy.train()

    summary = summary.unsqueeze(0)
    text = text.unsqueeze(0)
    if summary.size(1) != text.size(1):
        raise RuntimeError("Sizes along the sequence length dimension must match.")
    
    for epochs in range(epochs):
        whole_input = torch.cat((summary, text), dim = 0).long()
        outputs = model_copy(whole_input, labels = whole_input)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return model_copy

In [None]:
def modified_BLANC_help(text, model, model_tuned, p_mask = 0.15, l_min = 4):
    S = [[0, 0], [0, 0]]
    
    for sentence in text_sents:
        sentence = word_tokenize(sentence)
        for i in range(M):
            masked_sentence = ''.join(['<MASK>' if (j - i) % M == 0 and len(sentence[j]) >= L_min else sentence[j] for j in range(len(sentence))])
            input_base = filler + sep + masked_sentence
            input_help = summary + sep + masked_sentence
            tokenized_input_base = tokenizer(input_base, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            tokenized_input_help = tokenizer(input_help, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            out_base = model(**tokenized_input_base)
            out_help = model(**tokenized_input_help)
            masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == '<MASK>']

            for j in masked_tokens:
                k = int(out_base[j] == sentence[j])
                m = int(out_help[j] == sentence[j])
                S[k][m] += 1
        break
    try:
      B = (S[0][1] - S[1][0]) / (S[0][0] + S[1][1] + S[0][1] + S[1][0])
    except ZeroDivisionError:
      B = 0.0
    
    return B

In [7]:
def blanc_tune(summary, text, model, p_mask = 0.15, l_min = 4, N = 10, epochs = 10):
    N_summary = len(summary)
    N_mask = int(N_summary*p_mask)
    set_tune = pd.DataFrame(columns = ['summary', 'text'])

    for j in range(0, N):
        print(j)
        pos = [i for i, token in enumerate(summary.tolist()) if token in word_lengths and word_lengths[token] >= l_min]
        random.shuffle(pos)
        while len(pos) != 0:
            masked_summary = summary.tolist().copy()
            for pos_to_mask in pos[:N_mask]:
                masked_summary[pos_to_mask] = '<MASK>'
                set_tune.loc[set_tune.shape[0]] = [masked_summary, text]
            pos = pos[N_mask:]

    model_tuned = training(summary, text, model, epochs)
    print('\n')      
    return model_tuned

def blanc_tune_batch(batch, model, p_mask = 0.15, l_min = 4, N = 10, epochs = 10):
    batch_tuned_models = []
    
    i = 0
    for summary, text in zip(batch['summary_ids'], batch['text_ids']):
        print(f"Summary {i} of batch")
        i += 1
        start_time = time.time()
        tuned_model = blanc_tune(summary, text, model, p_mask, l_min, N, epochs)
        end_time = time.time()
        batch_tuned_models.append(tuned_model)
        elapsed_time = end_time - start_time
        print(f"Elapsed Time: {elapsed_time} seconds")

    return batch_tuned_models

for batch in dataloader:
    print(batch)
    tuned_models = blanc_tune_batch(batch, model, epochs = epochs)

{'text_ids': tensor([[ 7986.,  2158.,  2094.,  ...,  4100.,  1011.,  2345.],
        [ 2005.,  1996.,  3822.,  ...,  2606., 11917.,  3737.],
        [ 6264.,  1517.,  1996.,  ...,  2005.,  1996.,  2110.],
        ...,
        [ 5655., 16216., 19020.,  ..., 19723., 24454.,  3686.],
        [ 1996.,  4501.,  2966.,  ..., 12763.,  3378.,  2007.],
        [ 2062.,  2084.,  1002.,  ...,  2095.,  1011.,  2214.]]), 'summary_ids': tensor([[ 2280.,  2563.,  8291.,  ...,     0.,     0.,     0.],
        [ 1048.,  1005., 10848.,  ...,     0.,     0.,     0.],
        [ 2655., 21293.,  2100.,  ...,     0.,     0.,     0.],
        ...,
        [ 5655., 16216., 19020.,  ...,     0.,     0.,     0.],
        [ 2151.,  6926.,  1010.,  ...,     0.,     0.,     0.],
        [ 3533.,  7226.,  2368.,  ...,     0.,     0.,     0.]]), 'sentences_ids': [tensor([[ 7986,  2158,  2094,  ...,     0,     0,     0],
        [ 6302,  1024, 26665,  ...,     0,     0,     0],
        [29464,  2280,  2563,  ...,     

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


9


Elapsed Time: 7.626105546951294 seconds
Summary 1 of batch
0
1
2
3
4
5
6
7
8
9


Elapsed Time: 6.819201231002808 seconds
Summary 2 of batch
0
1
2
3
4
5
6
7
8
9


Elapsed Time: 7.377008676528931 seconds
Summary 3 of batch
0
1
2
3
4
5
6
7
8
9


Elapsed Time: 7.52235221862793 seconds
Summary 4 of batch
0
1
2
3
4
5
6
7
8
9
