In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load the tokenizer and the masked-LM model from the same pretrained
# checkpoint so that token ids and embedding rows line up.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [None]:
import pandas as pd 

# Read the train and test CSV files (paths are relative to the working directory).
df1, df2 = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Collect the 'Text' column of the training frame into a plain Python list.
text = [*df1['Text'].values]

# Tokenize every row into PyTorch tensors; each sequence is truncated or
# padded to exactly 512 token ids.
inputs = tokenizer(
    text,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [None]:
# Store an independent copy of the token ids under 'labels' so the targets
# are unaffected by later in-place edits to input_ids.
inputs['labels'] = inputs['input_ids'].clone().detach()

In [None]:
# Draw one uniform number in [0, 1) per token position. A privately seeded
# generator makes the mask reproducible across kernel restarts without
# touching the global RNG state used elsewhere (shuffling, dropout).
mask_rng = torch.Generator().manual_seed(0)
rand = torch.rand(inputs.input_ids.shape, generator=mask_rng)

# Select roughly 15% of the positions, but never the special tokens.
# The ids come from the tokenizer rather than being hard-coded, so the cell
# keeps working if the checkpoint (and therefore the vocabulary) changes.
mask_arr = (
    (rand < 0.15)
    & (inputs.input_ids != tokenizer.cls_token_id)   # [CLS]
    & (inputs.input_ids != tokenizer.sep_token_id)   # [SEP]
    & (inputs.input_ids != tokenizer.pad_token_id)   # [PAD]
)

In [None]:
# Per-row list of the column indices that were chosen for masking.
selection = [row.nonzero().flatten().tolist() for row in mask_arr]

# Boolean view of the mask, used for direct tensor indexing below.
masked_positions = mask_arr.bool()

# Only the masked positions should contribute to the MLM loss. Label -100 is
# the value the loss ignores, so every other position (including padding) is
# excluded instead of diluting the loss with trivially predictable tokens.
inputs['labels'][~masked_positions] = -100

# Replace the chosen tokens with the tokenizer's [MASK] id in a single
# vectorized assignment rather than a per-row Python loop.
inputs.input_ids[masked_positions] = tokenizer.mask_token_id

In [None]:
class BiasDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of column name -> per-example data.

    ``encodings`` must support ``.items()`` and ``['input_ids']`` lookup;
    each value must be indexable by example position (for instance a 2-D
    tensor or a list of lists).
    """

    def __init__(self, encodings):
        """Keep a reference to the encodings; nothing is copied here."""
        self.encodings = encodings

    @staticmethod
    def _copy_row(row):
        """Return ``row`` as a tensor that does not share storage with the source."""
        if isinstance(row, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended way to copy.
            return row.clone().detach()
        return torch.tensor(row)

    def __getitem__(self, idx):
        """Return a dict with one independent tensor per encoding key for example ``idx``."""
        return {key: self._copy_row(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the length of the 'input_ids' entry."""
        # Item access works for plain dicts as well as attribute-style containers.
        return len(self.encodings['input_ids'])

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Wrap the encodings and serve them in shuffled mini-batches of 8 examples.
dataset = BiasDataset(inputs)
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=8, shuffle=True)

# Move the model's weights onto the selected device.
model.to(device)

# Training

In [None]:
# Put the model into training mode before optimizing.
model.train()

# transformers.AdamW is deprecated in favour of the PyTorch implementation and
# is not importable from recent transformers releases, so use torch.optim.AdamW.
# eps and weight_decay are pinned to the old transformers defaults so the
# update rule stays the same as before the switch.
optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)

In [None]:
from tqdm import tqdm  # progress bar around the batch loop

epochs = 3

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # Clear gradients left over from the previous optimization step.
        optim.zero_grad()

        # Move the three tensors the model consumes onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Labels are passed in so the loss can be read from the model output.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        # Backpropagate and apply the parameter update.
        loss.backward()
        optim.step()

        # Show the epoch index and the latest batch loss on the progress bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
model.save_pretrained(save_directory="Models/")
tokenizer.save_pretrained(save_directory="Models/")