In [2]:
import torch
from transformers import AutoModelForMaskedLM, BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Bert

In [15]:
import re

def cleaning_func(text):
    text = re.sub(r'[\t\n]', ' ', text)
    text = re.sub(r'\.(\w)', r'. \1', text)
    text = re.sub(' +', ' ', text)
    text = re.split(r'\.\s*', text)
    
    return [s.strip()+'.' for s in text if s.strip()]

with open('five.txt', 'r') as text_file:
    data = cleaning_func(text_file.read())

print(data)



In [17]:

sample_data = ' '.join(data[:3])
masked_sample_data = sample_data.replace("between", '[MASK]').replace("many", '[MASK]').replace("easy", '[MASK]').replace("through", '[MASK]').replace("have", '[MASK]')
masked_sample_data

"When I glance over my notes and records of the Sherlock Holmes cases [MASK] the years '82 and '90, I am faced by so [MASK] which present strange and interesting features that it is no [MASK] matter to know which to choose and which to leave. Some, however, [MASK] already gained publicity [MASK] the papers, and others [MASK] not offered a field for those peculiar qualities which my friend possessed in so high a degree, and which it is the object of these papers to illustrate. Some, too, [MASK] baffled his analytical skill, and would be, as narratives, beginnings without an ending, while others [MASK] been but partially cleared up, and [MASK] their explanations founded rather upon conjecture and surmise than on that absolute logical proof which was so dear to him."

In [20]:
test_tokens = tokenizer(masked_sample_data, return_tensors='pt')
type(test_tokens)

transformers.tokenization_utils_base.BatchEncoding

In [22]:
test_tokens.keys()

KeysView({'input_ids': tensor([[  101,  2043,  1045,  6054,  2058,  2026,  3964,  1998,  2636,  1997,
          1996, 20052,  9106,  3572,   103,  1996,  2086,  1005,  6445,  1998,
          1005,  3938,  1010,  1045,  2572,  4320,  2011,  2061,   103,  2029,
          2556,  4326,  1998,  5875,  2838,  2008,  2009,  2003,  2053,   103,
          3043,  2000,  2113,  2029,  2000,  5454,  1998,  2029,  2000,  2681,
          1012,  2070,  1010,  2174,  1010,   103,  2525,  4227, 11845,   103,
          1996,  4981,  1010,  1998,  2500,   103,  2025,  3253,  1037,  2492,
          2005,  2216, 14099, 11647,  2029,  2026,  2767,  8679,  1999,  2061,
          2152,  1037,  3014,  1010,  1998,  2029,  2009,  2003,  1996,  4874,
          1997,  2122,  4981,  2000, 19141,  1012,  2070,  1010,  2205,  1010,
           103, 29088,  2010, 17826,  8066,  1010,  1998,  2052,  2022,  1010,
          2004, 22143,  1010, 16508,  2302,  2019,  4566,  1010,  2096,  2500,
           103,  2042,  2021,

In [24]:
# masked_token_value = 103

In [28]:
input = tokenizer(data, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [30]:
input['labels'] = input.input_ids.detach().clone()

In [32]:
## Creating masking logic

rand = torch.rand(input.input_ids.shape)
rand.shape, input.input_ids.shape

(torch.Size([427, 512]), torch.Size([427, 512]))

In [33]:
rand

tensor([[0.5935, 0.7336, 0.6031,  ..., 0.8877, 0.3073, 0.7764],
        [0.3996, 0.4269, 0.4135,  ..., 0.2101, 0.6893, 0.6130],
        [0.3558, 0.6497, 0.9041,  ..., 0.2630, 0.1745, 0.6749],
        ...,
        [0.3562, 0.6681, 0.0383,  ..., 0.8598, 0.6547, 0.0205],
        [0.2237, 0.4863, 0.1467,  ..., 0.7530, 0.8038, 0.3825],
        [0.9586, 0.5135, 0.8666,  ..., 0.7155, 0.5288, 0.6403]])

In [38]:
masked_tokens = (rand<0.15)*(input.input_ids != 101)*(input.input_ids != 102)*(input.input_ids != 0)
masked_tokens

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False,  True,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [41]:
masked_tokens[1].nonzero()

sentences = []
for sent in range(masked_tokens.shape[0]):
    sentences.append(torch.flatten(masked_tokens[sent].nonzero()).tolist())

### Masking values from the specified index in sentences

In [42]:
for i in range(masked_tokens.shape[0]):
    input.input_ids[i, sentences[i]] = 103

In [43]:
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [44]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [47]:
class Myowndataset(torch.utils.data.Dataset):
    def __init__(self, encodings) -> None:
        self.encodings = encodings
        
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings.input_ids)

In [48]:
dataset = Myowndataset(input)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

## Training starts here

In [49]:
from tqdm import tqdm
from torch.optim import AdamW
epochs = 4

optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(epochs):
    
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        
        optimizer.zero_grad()
        output = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = output.loss
        loss.backward()
        optimizer.step()
        
        print(f"epochs: {epoch}, loss: {loss}")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 0/27 [01:10<?, ?it/s]


KeyboardInterrupt: 