In [1]:
import pandas as pd
import random as rd
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch

In [21]:
data = pd.read_json('./datasets/DailyNews_300.json')
print(data.shape)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

(300, 4)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.features = self.dataset.columns

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        return (self.dataset.iloc[idx, 0], self.dataset.iloc[idx, 1])
    
    def map(self, preprocessing_fn, **kwargs):
        return CustomDataset(self.dataset.apply(lambda x: preprocessing_fn(x, **kwargs), axis = 1))
    
    def select_columns(self, columns):
        new_dataset = self.dataset[columns] 
        return CustomDataset(new_dataset)
    
dataset = CustomDataset(data)
# print(dataset.__getitem__(0))

In [42]:
def preprocessing_fn(x, tokenizer):
    x["summary_ids"] = tokenizer(
        x["summary"],
        add_special_tokens = False,
        truncation = True,
        max_length = 512,
        padding = False,
        return_attention_mask = False,
    )["input_ids"]

    x["text_ids"] = tokenizer(
        x["text"],
        add_special_tokens = False,
        truncation = True,
        max_length = 512,
        padding = False,
        return_attention_mask = False,
    )["input_ids"]

    return x

splitted_dataset = dataset.select_columns(["summary", "text"])
# print(splitted_dataset.__getitem__(0))

# Tokenize the dataset
splitted_dataset = splitted_dataset.map(
    preprocessing_fn, tokenizer = tokenizer
)
print(splitted_dataset.__getitem__(0))

# Remove useless columns
splitted_dataset = splitted_dataset.select_columns(["summary_ids", "text_ids"])
print(splitted_dataset.__getitem__(0))

('Former England defender Gary Neville suggested Gareth Southgate \' s squad had done more than could have been expected of them at this World Cup.\nIvan Perisic and Mario Mandzukic goals earned Croatia a date with France after a 2 - 1 extra - time win in Moscow.\nNeville : " This team has taken us to a place we never imagined we could get "\nEngland \' s Dejan Lovren and Domagoj Vida were sent off in the second half.\nDefender Rio Ferdinand says experience was a telling factor in the England squad.\nEngland face France in the World Cup final in Brazil.', 'Mario Mandzukic pounces to fire the ball past Jordan Pickford and put Croatia into the World Cup final. Photo: Reuters Independent.ie \n \nFormer England defender Gary Neville suggested Gareth Southgate\'s squad had done more than could have been expected of them at this World Cup as they bowed out with a semi-final defeat against Croatia. \n \nhttps://www.independent.ie/sport/soccer/world-cup-2018/gary-neville-salutes-englands-overa

In [47]:
def collate_fn(batch):
    # Get the tokenized sequences for each item in the batch
    text_ids_batch = [torch.tensor(item[1], dtype = torch.int) for item in batch]
    summary_ids_batch = [torch.tensor(item[0], dtype = torch.int) for item in batch]

    # Pad sequences to the maximum length in the batch
    padded_text_ids = pad_sequence(text_ids_batch, batch_first = True, padding_value = 0)
    padded_summary_ids = pad_sequence(summary_ids_batch, batch_first = True, padding_value = 0)

    return {"text_ids": padded_text_ids, "summary_ids": padded_summary_ids}

batch_size = 32
dataloader = DataLoader(splitted_dataset, batch_size = batch_size, collate_fn = collate_fn)

epochs = 3
def training(summary, text, model, epochs = 10):
    model_copy = model.copy()
    model_copy.train()
    for epochs in range(epochs):
        outputs = model_copy(summary, text)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return model_copy

In [53]:
def blanc_tune(summary, text, model, p_mask = 0.15, l_min = 4, N = 10, epochs = 10):
    N_summary = len(summary)
    N_mask = int(N_summary*p_mask)
    summary_detokenized = tokenizer.decode(summary.numpy().tolist())
    print(summary_detokenized)

    set_tune = pd.DataFrame(columns = ['summary', 'text'])
    for _ in range(1, N + 1):
        pos = [i for i, word in enumerate() if len(word) >= l_min]
        pos = rd.shuffle(pos)
        while len(pos) != 0:
            masked_summary = words_in_summary.copy()
            for pos_to_mask in pos[:N_mask]:
                masked_summary[pos_to_mask] = '<MASK>'
                set_tune.loc[set_tune.shape[0]] = [masked_summary, text]
    model_tuned = training(summary, text, model, epochs)
            
    return model_tuned

def blanc_tune_batch(batch, model, p_mask = 0.15, l_min = 4, N = 10):
    batch_tuned_models = []
    for summary, text in zip(batch['summary_ids'], batch['text_ids']):
        tuned_model = blanc_tune(summary, text, model, p_mask, l_min, N)
        batch_tuned_models.append(tuned_model)

    return batch_tuned_models

for batch in dataloader:
    print(batch)
    tuned_models = blanc_tune_batch(batch, model)

{'text_ids': tensor([[ 7986,  2158,  2094,  ...,  4100,  1011,  2345],
        [ 2005,  1996,  3822,  ...,  2606, 11917,  3737],
        [ 6264,  1517,  1996,  ...,  2005,  1996,  2110],
        ...,
        [ 5655, 16216, 19020,  ..., 19723, 24454,  3686],
        [ 1996,  4501,  2966,  ..., 12763,  3378,  2007],
        [ 2062,  2084,  1002,  ...,  2095,  1011,  2214]], dtype=torch.int32), 'summary_ids': tensor([[ 2280,  2563,  8291,  ...,     0,     0,     0],
        [ 1048,  1005, 10848,  ...,     0,     0,     0],
        [ 2655, 21293,  2100,  ...,     0,     0,     0],
        ...,
        [ 5655, 16216, 19020,  ...,     0,     0,     0],
        [ 2151,  6926,  1010,  ...,     0,     0,     0],
        [ 3533,  7226,  2368,  ...,     0,     0,     0]], dtype=torch.int32)}


AttributeError: module 'tensorflow' has no attribute 'Tensor'