In [50]:
import pandas as pd
import random as rd
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch

In [48]:
# Load the news dataset and report its (rows, columns) shape.
data = pd.read_json('./datasets/DailyNews_300.json')
print(data.shape)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained uncased BERT: tokenizer plus masked-language-model head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# Optimizer bound to the parameters of `model`.
optimizer = AdamW(model.parameters(), lr=2e-5)

(300, 4)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class CustomDataset(Dataset):
    """Thin ``torch.utils.data.Dataset`` wrapper around a pandas DataFrame.

    Attributes:
        dataset: the wrapped DataFrame.
        features: the column index of the wrapped DataFrame.
    """

    def __init__(self, dataset):
        """Store the DataFrame and remember its columns."""
        self.dataset = dataset
        self.features = self.dataset.columns

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        # The size comes from the wrapped frame; there is no `texts`
        # attribute on this class.
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return a printable string for row ``idx``.

        The string is built from the values at column positions 3 and 2.
        NOTE(review): assumes position 3 holds the text and position 2 the
        summary — confirm against the JSON schema.
        """
        return f'Text : {self.dataset.iloc[idx, 3]}, \nSummary : {self.dataset.iloc[idx, 2]}\n'

    def map(self, preprocessing_fn, **kwargs):
        """Apply ``preprocessing_fn(row, **kwargs)`` to every row.

        Returns:
            A new ``CustomDataset`` wrapping the result of the row-wise apply.
        """
        return CustomDataset(self.dataset.apply(lambda x: preprocessing_fn(x, **kwargs), axis=1))

    def select_columns(self, columns):
        """Return the requested columns of the wrapped DataFrame.

        Note that the result is the plain pandas selection, not a
        ``CustomDataset``.
        """
        return self.dataset[columns]
    
# Wrap the loaded DataFrame so the map / select_columns helpers can be used on it.
dataset = CustomDataset(data)
# print(dataset.__getitem__(0))

In [49]:
def preprocessing_fn(x, tokenizer):
    """Attach token-id lists for the summary and the text of one record.

    Args:
        x: mapping-like record with "summary" and "text" entries; it is
            modified in place.
        tokenizer: callable that takes the raw value plus keyword options and
            returns a mapping containing "input_ids".

    Returns:
        The same record ``x`` with "summary_ids" and "text_ids" added.
    """
    # Both fields are encoded with the same settings: no special tokens,
    # truncated to 512 ids, no padding and no attention mask.
    encode_options = dict(
        add_special_tokens=False,
        truncation=True,
        max_length=512,
        padding=False,
        return_attention_mask=False,
    )

    field_pairs = (("summary", "summary_ids"), ("text", "text_ids"))
    for source_key, target_key in field_pairs:
        encoded = tokenizer(x[source_key], **encode_options)
        x[target_key] = encoded["input_ids"]

    return x

# Tokenize every record, then keep only the two token-id columns.
splitted_dataset = dataset.map(preprocessing_fn, tokenizer=tokenizer).select_columns(
    ["summary_ids", "text_ids"]
)

In [52]:
def collate_fn(batch):
    """Turn a list of tokenized items into two right-padded id tensors.

    Args:
        batch: sequence of items, each indexable by "text_ids" and
            "summary_ids" and holding a list of integer token ids.

    Returns:
        dict with "text_ids" and "summary_ids", each an int64 tensor of shape
        (batch size, longest sequence in the batch), padded with 0.
    """
    # Token ids are built as int64 (torch.long): that is the index dtype
    # PyTorch loss functions require for class targets, and int32 ids fail
    # there.
    text_ids_batch = [torch.tensor(item["text_ids"], dtype = torch.long) for item in batch]
    summary_ids_batch = [torch.tensor(item["summary_ids"], dtype = torch.long) for item in batch]

    # Pad sequences to the maximum length in the batch
    padded_text_ids = pad_sequence(text_ids_batch, batch_first = True, padding_value = 0)
    padded_summary_ids = pad_sequence(summary_ids_batch, batch_first = True, padding_value = 0)

    return {"text_ids": padded_text_ids, "summary_ids": padded_summary_ids}

# Training hyper-parameters: mini-batch size and number of passes over the data.
batch_size = 32
epochs = 3

# Shuffled mini-batches, padded per batch by collate_fn.
dataloader = DataLoader(
    splitted_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

def training(dataloader, model, epochs = 10, lr = 2e-5):
    """Fine-tune a deep copy of ``model`` and return that copy.

    The model passed in is left untouched; all updates are applied to the copy.

    Args:
        dataloader: iterable of batches, each a mapping with "text_ids" and
            "summary_ids" tensors of identical shape.
        model: a ``torch.nn.Module`` whose forward accepts the input ids
            positionally and a ``labels`` keyword, and returns an object with
            a ``loss`` attribute.
        epochs: number of passes over ``dataloader``.
        lr: learning rate of the AdamW optimizer created for the copy.

    Returns:
        The fine-tuned copy of ``model``.

    Raises:
        ValueError: if a batch's "summary_ids" and "text_ids" differ in shape.
    """
    import copy  # local import so this cell does not depend on another cell's imports

    # nn.Module has no .copy(); deepcopy duplicates parameters and buffers.
    model_copy = copy.deepcopy(model)
    # Batches are moved to wherever the copied parameters live.
    target_device = next(model_copy.parameters()).device
    # The optimizer must own the *copy's* parameters; an optimizer built on the
    # original model would never update model_copy.
    optimizer = torch.optim.AdamW(model_copy.parameters(), lr = lr)

    for _ in range(epochs):
        model_copy.train()
        for batch in dataloader:
            # int64 ids on the model's device, whatever the collate step produced.
            text_ids = batch["text_ids"].to(device = target_device, dtype = torch.long)
            summary_ids = batch["summary_ids"].to(device = target_device, dtype = torch.long)

            if summary_ids.shape != text_ids.shape:
                raise ValueError(
                    "summary_ids must have the same shape as text_ids to be used as labels: "
                    f"got {tuple(summary_ids.shape)} and {tuple(text_ids.shape)}"
                )

            optimizer.zero_grad()
            # Pass the targets as `labels`: a second positional argument would
            # be taken as the attention mask and no loss would be computed.
            outputs = model_copy(text_ids, labels = summary_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    return model_copy

In [46]:
# dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)
# print(dataloader)

In [None]:
# summary and text should be tokenized beforehand
# NOTE(review): `summary` and `text` are not defined in any visible cell; they
# must exist before this cell is run.
# max_length is capped at 512 because bert-base-uncased has only 512 position
# embeddings; longer inputs cannot be fed to the model.
tokenized_summary = tokenizer(summary, return_tensors='pt', max_length=512, truncation=True)
tokenized_text = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)

def blanc_tune(summary, text, model, p_mask = 0.15, l_min = 4, N = 10, mask_token = '[MASK]'):
    """Build the masked-summary tuning set used for BLANC-tune.

    For each of ``N`` passes, the positions of all summary words with at least
    ``l_min`` characters are shuffled and consumed in groups of ``N_mask``.
    Each group yields one row in which exactly those words are replaced by
    ``mask_token``, so every eligible word is masked once per pass.

    Args:
        summary: raw summary string; it is split on whitespace.
        text: the document paired with the summary, copied into every row.
        model: currently unused; the fine-tuning step is still to be written.
        p_mask: fraction of the summary's words masked in each row.
        l_min: minimum word length (in characters) for a word to be masked.
        N: number of shuffled passes over the eligible positions.
        mask_token: string written in place of a masked word.

    Returns:
        pandas DataFrame with columns 'summary' (the masked summary, words
        re-joined with single spaces) and 'text'. It is empty when no word is
        eligible or ``N`` is 0.
    """
    words_in_summary = summary.split()
    N_summary = len(words_in_summary)
    # At least one word per row; with 0 the position list would never shrink.
    N_mask = max(1, int(N_summary * p_mask))
    eligible = [idx for idx, word in enumerate(words_in_summary) if len(word) >= l_min]

    records = []
    for _ in range(N):
        pos = eligible.copy()
        rd.shuffle(pos)  # shuffles in place and returns None
        while pos:
            masked_summary = words_in_summary.copy()
            for pos_to_mask in pos[:N_mask]:
                masked_summary[pos_to_mask] = mask_token
            # One row per fully masked group, stored as an independent string.
            records.append({'summary': ' '.join(masked_summary), 'text': text})
            # Drop the positions just used so the loop terminates.
            pos = pos[N_mask:]

    set_tune = pd.DataFrame(records, columns = ['summary', 'text'])
    # TODO: add tuning of model on set_tune
    return set_tune

# NOTE(review): blanc_tune calls summary.split(), so it presumably expects the raw
# summary/text strings rather than these tokenizer outputs — confirm before running.
model_tuned = blanc_tune(tokenized_summary, tokenized_text, model)

In [None]:
# Example training loop:
# NOTE(review): `cloze_dataloader` is not defined in any visible cell; each batch
# is expected to expose 'text' (model input) and 'summary' (target) strings.
for epoch in range(3):  # Replace with the desired number of epochs
    model.train()
    for batch in cloze_dataloader:
        # return_special_tokens_mask is not requested: it would add a
        # 'special_tokens_mask' entry that model(**inputs) does not accept.
        inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        labels = tokenizer(batch['summary'], return_tensors='pt', padding=True, truncation=True)['input_ids'].to(device)

        # Compute the loss only at the [MASK] positions: every other position is
        # set to -100, the label value the loss ignores.
        # NOTE(review): requires labels and inputs['input_ids'] to have the same
        # shape — confirm both tokenizations produce equal lengths.
        labels[inputs['input_ids'] != tokenizer.mask_token_id] = -100

        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [None]:
# Step 4: Save the fine-tuned model
# The target is the 'fine_tuned_bert_cloze_model' path, relative to the
# current working directory.
model.save_pretrained('fine_tuned_bert_cloze_model')