In [1]:
from datasets import load_dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_dataset = dataset["train"]
train_dataset

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [3]:
train_dataset["text"][0:10]

['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more f

In [4]:
import re
cleaned_train_dataset = []
for sentence in train_dataset["text"]:
    sentence = re.sub(r'\n','',sentence)
    sentence = re.sub(r'\s+',' ',sentence)
    sentence = sentence.strip()
    if sentence:
        cleaned_train_dataset.append(sentence)

In [5]:
from tqdm import tqdm
from transformers import BertTokenizerFast

# create a python generator to dynamically load the data
# def batch_iterator(batch_size=10000):
#     for i in tqdm(range(0, len(train_dataset), batch_size)):
#         yield train_dataset[i : i + batch_size]["text"]

# create a tokenizer from existing one to re-use special tokens
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# bert_tokenizer = tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=32000)
tokenizer.save_pretrained("tokenizer")


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tokenizer")

print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

# def group_texts(examples):
#     tokenized_inputs = tokenizer(
#        examples["text"], return_special_tokens_mask=True, return_tensors = 'pt', truncation=True, max_length=tokenizer.model_max_length, padding = "max_length"
#     )
#     return tokenized_inputs

# preprocess dataset
tokenized_datasets =  tokenizer(cleaned_train_dataset, return_special_tokens_mask=True, return_tensors = 'pt', truncation=True, max_length=tokenizer.model_max_length, padding = "max_length"
    )


The max length for the tokenizer is: 512


In [7]:
tokenized_datasets['labels'] = tokenized_datasets.input_ids.detach().clone()

In [7]:
import torch

In [9]:
input_ids = tokenized_datasets.input_ids
mini = input_ids[input_ids!=0].min()
print(mini)
print(input_ids.max())
#min = 100 max = 30265



tensor(100)
tensor(30265)


In [10]:
input_ids

tensor([[  101,  1027, 11748,  ...,     0,     0,     0],
        [  101, 12411,  5558,  ...,     0,     0,     0],
        [  101,  1996,  2208,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  2691,  ...,     0,     0,     0],
        [  101, 13177,  2018,  ...,     0,     0,     0],
        [  101,  2691,  2732,  ...,     0,     0,     0]])

In [11]:
sentence = "[CLS] This [SEP]"
tokens = tokenizer(sentence, return_special_tokens_mask=True, return_tensors = 'pt', truncation=True, max_length=tokenizer.model_max_length)
tokens

{'input_ids': tensor([[ 101,  101, 2023,  102,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]]), 'special_tokens_mask': tensor([[1, 0, 0, 0, 1]])}

In [12]:
# create random array of floats with equal dimensions to input_ids tensor
torch.random.seed()
rand = torch.rand(tokenized_datasets.input_ids.shape)
# create mask array
#101 is for CLS, 102 is for SEP, 0 is for PAD
#15% masking
mask_arr = (rand < 0.15) * (tokenized_datasets.input_ids != 101) * (tokenized_datasets.input_ids != 102) * (tokenized_datasets.input_ids != 0)

In [13]:
tokenized_datasets.input_ids.shape

torch.Size([23767, 512])

In [14]:
selection = []
#storing indices for masked tokens
for i in range(tokenized_datasets.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [15]:
sent = "[MASK]"
tokens = tokenizer(sent, return_special_tokens_mask=True, return_tensors = 'pt', truncation=True, max_length=tokenizer.model_max_length)
tokens

{'input_ids': tensor([[101, 103, 102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]]), 'special_tokens_mask': tensor([[1, 0, 1]])}

In [8]:
import random
random.seed(42)

**MASKING**

In [17]:
for i in range(tokenized_datasets.input_ids.shape[0]):
  for j in selection[i]:
    prob = random.random()
    if(prob < 0.8):
      tokenized_datasets.input_ids[i,j] = 103
    elif (prob < 0.9):
      tokenized_datasets.input_ids[i,j] = 104+random.randint(0,30161)

In [9]:
class WikiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [19]:
data = WikiDataset(tokenized_datasets)

In [20]:
loader = torch.utils.data.DataLoader(data, batch_size=32, shuffle=True)

In [21]:
from transformers import BertConfig,BertForMaskedLM,BertForPreTraining
config = BertConfig.from_pretrained("bert-base-uncased", is_decoder=True)
model = BertForPreTraining(config=config)
model.init_weights()


In [22]:
#use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [23]:
LEARNING_RATE = 1e-4

In [24]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=LEARNING_RATE)



In [25]:
EPOCHS = 5

for epoch in range(EPOCHS):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # print("here")
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # print(input_ids.shape, attention_mask.shape, labels.shape)
        # process
        # try:
        # outputs = model(input_ids, attention_mask=attention_mask, masked_lm_labels=labels)
        # print(outputs)
        # loss = outputs.loss

        outputs = model(input_ids, attention_mask=attention_mask)
        prediction_scores = outputs.prediction_logits
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(prediction_scores.view(-1, config.vocab_size), labels.view(-1))

        loss.backward()

        # Update parameters
        optim.step()

        # Print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


        # except Exception as e:
        #     print(f"Error during forward/backward pass: {e}")
        #     print(f"Input shapes: {input_ids.shape}, {attention_mask.shape}, {labels.shape if labels is not None else None}")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 743/743 [11:53<00:00,  1.04it/s, loss=0.3]  
Epoch 1: 100%|██████████| 743/743 [12:02<00:00,  1.03it/s, loss=0.205] 
Epoch 2: 100%|██████████| 743/743 [12:04<00:00,  1.03it/s, loss=0.154]
Epoch 3: 100%|██████████| 743/743 [12:03<00:00,  1.03it/s, loss=0.203] 
Epoch 4: 100%|██████████| 743/743 [12:02<00:00,  1.03it/s, loss=0.151] 


In [26]:
import os
dir_path = 'pretrainedMLM'
os.makedirs(dir_path, exist_ok=True)
model.save_pretrained(dir_path)


In [26]:
pip install huggingface-hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 0.18ubuntu0.18.04.1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [27]:
model.push_to_hub("pretrainedMLM")
tokenizer.push_to_hub("pretrainedMLM")

model.safetensors: 100%|██████████| 440M/440M [00:50<00:00, 8.65MB/s]   


CommitInfo(commit_url='https://huggingface.co/PinkiKumari22/pretrainedMLM/commit/4b0aeb5702262add34fc34e808d42f06ccfab010', commit_message='Upload tokenizer', commit_description='', oid='4b0aeb5702262add34fc34e808d42f06ccfab010', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
import re
cleaned_train_dataset = []
for para in train_dataset['text']:
    para = re.sub(r'\n', ' ', para)  # Remove \n
    para = re.sub(r'\s+', ' ', para)  # Remove extra spaces
    para = para.strip()  # Remove leading/trailing spaces
    if para:  # Append non-empty sentences
        cleaned_train_dataset.append(para)

In [11]:
all_sentences = [sentence for para in cleaned_train_dataset for sentence in para.split('.') if sentence != '']
num_sent = len(all_sentences)

In [12]:
import random
first_sent = []
second_sent = []
isNext = []

for para in cleaned_train_dataset:
    sentences = [
        sentence for sentence in para.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50% prob that the second sentence would be the actual next sentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            first_sent.append(sentences[start])
            second_sent.append(sentences[start+1])
            isNext.append(0)
        else:
            index = random.randint(0, num_sent-1)
            # this is NotNextSentence
            first_sent.append(sentences[start])
            second_sent.append(all_sentences[index])
            isNext.append(1)

In [13]:
for i in range(3):
    print(isNext[i])
    print(first_sent[i] + '\n---')
    print(second_sent[i] + '\n')

1
Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit 
---
 In his diaries he noted significant family events including birthdays , not by their Gregorian calendar occurrence , but by their Jewish calendar dates 

1
 While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers 
---
Yue lives in a shed by his grave through the winter and in the second lunar month of the following year , his martial brothers come and pull the building down , forcing him to return home and take care of his mother 

1
 Media
---
 The writing , although inspired by Vadiraja 's Sanskrit classic of the same name , is noted for its original interpretation , imagery and style 



In [14]:
inputs = tokenizer(first_sent, second_sent, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs['labels'] = torch.LongTensor([isNext]).T

In [15]:
NSP_data = WikiDataset(inputs)
loader = torch.utils.data.DataLoader(NSP_data, batch_size=32, shuffle=True)

In [16]:
from transformers import BertForPreTraining

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForPreTraining.from_pretrained("pretrainedMLM").to(device)

In [17]:
from transformers import AdamW

# activate training mode
# model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-5)



In [18]:
torch.cuda.empty_cache()

In [18]:
from tqdm import tqdm  # for our progress bar

epochs = 5
nsp_loss_function = torch.nn.CrossEntropyLoss()
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        # print(input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape)
        # process

        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        
        logits_nsp = outputs.seq_relationship_logits
        loss = nsp_loss_function(logits_nsp,labels.view(-1))
        
        # # extract loss
        # loss_nsp = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    tokenizer.push_to_hub("FinalPreTrainedModel_epoch{}".format(epoch+1))
    model.push_to_hub("FinalPreTrainedModel_epoch{}".format(epoch+1))


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 463/463 [05:55<00:00,  1.30it/s, loss=0.688]
model.safetensors: 100%|██████████| 440M/440M [00:36<00:00, 12.1MB/s] 
Epoch 1: 100%|██████████| 463/463 [05:59<00:00,  1.29it/s, loss=0.69] 
model.safetensors: 100%|██████████| 440M/440M [00:32<00:00, 13.6MB/s]   
Epoch 2: 100%|██████████| 463/463 [06:00<00:00,  1.28it/s, loss=0.689]
model.safetensors: 100%|██████████| 440M/440M [00:39<00:00, 11.2MB/s] 
Epoch 3: 100%|██████████| 463/463 [06:00<00:00,  1.29it/s, loss=0.694]
model.safetensors: 100%|██████████| 440M/440M [01:03<00:00, 6.94MB/s]   
Epoch 4: 100%|██████████| 463/463 [05:59<00:00,  1.29it/s, loss=0.692]
model.safetensors: 100%|██████████| 440M/440M [00:29<00:00, 14.8MB/s] 


In [19]:
import os
dir_path = 'FinalPreTrainedModel'
os.makedirs(dir_path, exist_ok=True)
model.save_pretrained(dir_path)

In [20]:
model.push_to_hub("FinalPreTrainedModel")
tokenizer.push_to_hub("FinalPreTrainedModel")

model.safetensors: 100%|██████████| 440M/440M [01:01<00:00, 7.20MB/s]   


CommitInfo(commit_url='https://huggingface.co/PinkiKumari22/FinalPreTrainedModel/commit/f11e1c736f95dcc34cd453b950f87384ce02679b', commit_message='Upload tokenizer', commit_description='', oid='f11e1c736f95dcc34cd453b950f87384ce02679b', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
from transformers import AutoModelForNextSentencePrediction
model_loaded = AutoModelForNextSentencePrediction.from_pretrained("PinkiKumari22/FinalPreTrainedModel")

Downloading model.safetensors: 100%|██████████| 440M/440M [00:16<00:00, 27.1MB/s] 


In [33]:
model_loaded

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [35]:
from transformers import AutoModelForQuestionAnswering
model_loaded2 = AutoModelForQuestionAnswering.from_pretrained("PinkiKumari22/FinalPreTrainedModel")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at PinkiKumari22/FinalPreTrainedModel and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
model_loaded2

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [22]:
print(input_ids.shape, attention_mask.shape, token_type_ids.shape, labels.shape)

torch.Size([11, 512]) torch.Size([11, 512]) torch.Size([11, 512]) torch.Size([11, 1])


In [23]:
print(logits_nsp.shape)

torch.Size([11, 2])


In [24]:
attr = dir(outputs)
attr

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'loss',
 'move_to_end',
 'pop',
 'popitem',
 'prediction_logits',
 'seq_relationship_logits',
 'setdefault',
 'to_tuple',
 'update',
 'values']