<a href="https://colab.research.google.com/github/JayThibs/pretrain-nlp-models/blob/main/pretrain_bert_with_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-training BERT with the text from Meditations by Marcus Aurelius

We will use the transformers library from huggingface to grab the bert model and tokenizer. We will use pure PyTorch to pre-train the model.|

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 10.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 44.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.6 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled 

In [2]:
import requests
from transformers import BertTokenizer, BertForPreTraining
import torch

In [3]:
# load toknizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
!wget https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt

--2021-08-20 18:59:16--  https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 241387 (236K) [text/plain]
Saving to: ‘clean.txt’


2021-08-20 18:59:17 (12.8 MB/s) - ‘clean.txt’ saved [241387/241387]



In [5]:
# create a list of our dataset with the sentences
with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')

In [6]:
text[:3]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

In [7]:
bag = [sentence for para in text for sentence in para.split(',') if sentence != '']
bag_size = len(bag)

In [8]:
# we create sentence a and b where a is the first sentence and b
# is either a sentence that comes after a or not (50% chance)
# this is to help BERT understand the context of the sentences
import random

sentence_a = []
sentence_b = []
label = []

for paragraph in text:
  sentences = [
               sentence for sentence in paragraph.split('.') if sentence != '' 
  ]
  num_sentences = len(sentences)
  if num_sentences > 1:
    start = random.randint(0, num_sentences-2)
    sentence_a.append(sentences[start])
    if random.random() > 0.5:
      sentence_b.append(sentences[start+1])
      label.append(0)
    else:
      sentence_b.append(bag[random.randint(0, bag_size-1)])
      label.append(1)

In [9]:
# we tokenize our dataset
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512,
                   truncation=True, padding='max_length')

inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs['next_sentence_label'][:10]

tensor([[0],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0]])

In [10]:
# we create a clone of our input in order to create a mask
inputs['labels'] = inputs.input_ids.detach().clone()

In [11]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [12]:
# create a tensor with random values from 0 to 1 to create our mask
rand = torch.rand(inputs.input_ids.shape)

In [14]:
# CLS token: 101
# SEP token: 102
# PAD token: 0
# Here we mask 15% of the tokens, but also make sure we don't mask our special tokens
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102) * (inputs.input_ids != 1)

In [16]:
# Masking token: 103
# iterating through each row the mask our input_ids
for i in range(inputs.input_ids.shape[0]):
  selection = torch.flatten(mask_arr[i].nonzero()).tolist()
  inputs.input_ids[i, selection] = 103 # mask input_ids

In [17]:
class MeditationsDataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids)

In [18]:
# create our torch dataset
dataset = MeditationsDataset(inputs)

In [19]:
# create our torch dataloader
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
# set model to the correct device for training
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [22]:
# put model in train mode
model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [23]:
# set the optimizer
from transformers import AdamW

optim = AdamW(model.parameters(), lr=5e-5)

In [25]:
# train the model
from tqdm import tqdm

for epoch in range(2):
  loop = tqdm(loader, leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    next_sentence_label = batch['next_sentence_label'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, token_type_ids=token_type_ids,
                    attention_mask=attention_mask,
                    next_sentence_label=next_sentence_label,
                    labels=labels)
    loss = outputs.loss
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())

  """
Epoch 0: 100%|██████████| 317/317 [00:32<00:00,  9.79it/s, loss=0.253]
Epoch 1: 100%|██████████| 317/317 [00:32<00:00,  9.80it/s, loss=0.11]
