In [1]:
# Reference: https://github.com/jamescalam/transformers/blob/main/course/training/08_mlm_and_nsp_training.ipynb
# Mount Google Drive at /content/drive; the corpus path and the model output
# directory used in later cells both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Pinned so a rerun gets the same library version. %pip (rather than !pip)
# installs into the environment of the running kernel, not whatever `pip`
# happens to be first on the shell PATH.
%pip install transformers==4.1.1



In [3]:
from transformers import BertTokenizer, BertForPreTraining
import torch

In [4]:
# Pretrained checkpoint identifier shared by the tokenizer and the model so
# that vocabulary and weights come from the same source.
checkpoint = 'bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForPreTraining.from_pretrained(checkpoint)

In [5]:
# Read the preprocessed corpus and split it on newlines; each entry of `text`
# is treated as one paragraph by the later cells.
# The encoding is explicit because the corpus contains non-ASCII characters
# (e.g. '©'); without it, decoding depends on the platform's default locale.
with open('/content/drive/MyDrive/GitHub/exBERT/data/preprocessed_data/text_data_step2.txt', 'r', encoding='utf-8') as fp:
    text = fp.read().split('\n')

In [6]:
# Peek at the first three entries to sanity-check what was loaded.
text[:3]

['698 haematologica | 2018; 103(4) Received: August 2, 2017. Accepted: January 22, 2018. Pre-published: February 1, 2018. ©2018 Ferrata Storti Foundation Material published in Haematologica is covered by copyright. All rights are reserved to the Ferrata Storti Foundation. Use of published material is allowed under the following terms and conditions:  https://creativecommons.org/licenses/by-nc/4.0/legalcode. Copies of published material are allowed for personal or inter- nal use. Sharing published material for non-commercial pur- poses is subject to the following conditions:  https://creativecommons.org/licenses/by-nc/4.0/legalcode, sect. 3. Reproducing and sharing published material for com- mercial purposes is not allowed without permission in writing from the publisher. Correspondence:  veronique.leblond@aphp.fr Ferrata Storti Foundation Haematologica 2018 Volume 103(4):698-706 ARTICLE Chronic Lymphoblastic Leukemia doi:10.3324/haematol.2017.170480 Check the online version for the mo

In [7]:
# Flat pool of every '.'-delimited fragment in the corpus; random
# "not next sentence" candidates are drawn from it.
# Fragments that are empty OR whitespace-only (e.g. the ' ' left behind by a
# trailing ". ") are dropped so a blank string is never sampled as a sentence.
# Kept fragments are stored unmodified (no stripping of their content).
bag = [
    item
    for sentence in text
    for item in sentence.split('.')
    if item.strip()
]
bag_size = len(bag)

In [8]:
import random

# Parallel lists describing the next-sentence-prediction examples:
# sentence_a[i] / sentence_b[i] form a pair, and label[i] is 0 when
# sentence_b really follows sentence_a in the corpus, 1 when it was drawn
# at random from `bag`.
sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    # Drop empty AND whitespace-only fragments; a bare ' ' (left by a trailing
    # ". ") would otherwise count as a sentence and yield a blank A or B side.
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence.strip()
    ]
    num_sentences = len(sentences)
    if num_sentences < 2:
        # a pair needs at least two sentences in the same paragraph
        continue
    # pick a start index that always leaves a following sentence available
    start = random.randint(0, num_sentences - 2)
    sentence_a.append(sentences[start])
    # 50/50 whether the pair is IsNextSentence or NotNextSentence
    if random.random() >= 0.5:
        # IsNextSentence: the true successor from the same paragraph
        sentence_b.append(sentences[start + 1])
        label.append(0)
    else:
        # NotNextSentence: a random fragment from the whole corpus.
        # NOTE(review): the draw is not checked against the true successor,
        # so a small fraction of "negative" pairs may actually be positive.
        sentence_b.append(bag[random.randint(0, bag_size - 1)])
        label.append(1)

In [9]:
# Encode every (sentence_a, sentence_b) pair as PyTorch tensors, truncating
# or padding each pair to exactly 512 tokens.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)


In [10]:
# Inspect which tensors the tokenizer returned.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
# Store the NSP targets as a column vector: one int64 label per example,
# shape (num_examples, 1).
inputs['next_sentence_label'] = torch.tensor(label, dtype=torch.long).unsqueeze(1)

In [12]:
# Snapshot the token ids before any of them are overwritten by masking;
# this independent copy serves as the masked-language-model target.
inputs['labels'] = inputs['input_ids'].clone().detach()

In [13]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [14]:
# For every example, the list of token positions flagged in mask_arr.
selection = [
    mask_arr[row].nonzero().flatten().tolist()
    for row in range(inputs.input_ids.shape[0])
]

In [15]:
# Apply the masking decision to both the model inputs and the MLM targets.
masked_positions = mask_arr.bool()
# Overwrite the selected input tokens with id 103.
# NOTE(review): presumably the [MASK] id of this vocabulary — confirm against
# tokenizer.mask_token_id.
inputs['input_ids'][masked_positions] = 103
# Every position that was NOT selected (unmasked tokens, special tokens and
# padding) gets the ignore index -100. Without this the MLM loss is averaged
# over all 512 positions, so it is dominated by trivially-copyable unmasked
# tokens and padding instead of the masked tokens the model must predict.
inputs['labels'][~masked_positions] = -100

In [16]:
# Confirm the encodings now also carry the NSP and MLM targets.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [17]:
class PMDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long, index-able columns.

    ``encodings`` is any mapping (a tokenizer ``BatchEncoding`` or a plain
    ``dict``) from field name to a tensor or sequence whose first dimension
    indexes the examples. It must contain an ``'input_ids'`` entry, which
    defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a ``{field name: tensor}`` dict.

        Each returned tensor is an independent copy, so modifying it does not
        write back into ``encodings``.
        """
        example = {}
        for key, val in self.encodings.items():
            item = val[idx]
            if torch.is_tensor(item):
                # torch.tensor(existing_tensor) emits a copy-construct
                # UserWarning on every call; clone().detach() is the
                # recommended equivalent and yields the same copy.
                example[key] = item.clone().detach()
            else:
                example[key] = torch.tensor(item)
        return example

    def __len__(self):
        # Key lookup (instead of attribute access) works for a BatchEncoding
        # and for a plain dict alike.
        return len(self.encodings['input_ids'])

In [18]:
# Wrap the tokenized, masked encodings so a DataLoader can index them per example.
dataset = PMDataset(inputs)

In [19]:
# Serve the dataset in shuffled mini-batches of 8 examples.
loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=8,
)

In [20]:
# Prefer the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto that device; as the last expression, the returned
# module is also what the cell displays.
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [21]:
from transformers import AdamW

# Put the model into training mode before any optimisation step runs.
model.train()
# AdamW over every model parameter, with a learning rate of 5e-5.
learning_rate = 5e-5
optim = AdamW(params=model.parameters(), lr=learning_rate)

In [23]:
from tqdm.notebook import tqdm  # for our progress bar

epochs = 8

# Fields of each batch that are forwarded to the model, in the order they
# are moved to the device.
batch_fields = (
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'next_sentence_label',
    'labels',
)

for epoch in range(epochs):
    # progress bar over the batches of this epoch
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # clear the gradients accumulated by the previous step
        optim.zero_grad()
        # move just the tensors the model consumes onto the training device
        model_inputs = {field: batch[field].to(device) for field in batch_fields}
        # forward pass; passing both label tensors makes the model return a loss
        outputs = model(**model_inputs)
        loss = outputs.loss
        # backpropagate, then apply the parameter update
        loss.backward()
        optim.step()
        # surface the epoch number and current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/70 [00:00<?, ?it/s]

  """


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

In [25]:
# List the repository directory on Drive before creating the output folder.
!ls /content/drive/MyDrive/GitHub/bluebert

1_data_prep.ipynb  elmo		mt-bluebert  requirements.txt
bert		   LICENSE.txt	NER_output   tokenizer
bluebert	   mribert	README.md


In [26]:
# Create the output directory for the trained model. -p makes the cell safe
# to re-run: plain mkdir reports an error once the directory already exists.
!mkdir -p /content/drive/MyDrive/GitHub/bluebert/mlm_output

In [27]:
# List the directory again to confirm that mlm_output now exists.
!ls /content/drive/MyDrive/GitHub/bluebert

1_data_prep.ipynb  elmo		mribert      README.md
bert		   LICENSE.txt	mt-bluebert  requirements.txt
bluebert	   mlm_output	NER_output   tokenizer


In [28]:
# Destination on Drive for the trained weights and config.
output_dir = '/content/drive/MyDrive/GitHub/bluebert/mlm_output/'
# If the model carries a `.module` attribute (i.e. it is wrapped), save the
# wrapped model; otherwise save the model itself.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)