# This notebook will pretrain a BERT model on the WikiText-2 dataset

In [465]:
#let's get the data..let's try to use the Hugging Face API

from datasets import load_dataset

# Load the wikitext-2-raw-v1 configuration of the wikitext dataset
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")

# The 'raw_datasets' object will now contain the dataset, typically split into
# 'train', 'validation', and 'test' splits.
# You can access them like this:
train_data = raw_datasets["train"]
validation_data = raw_datasets["validation"]
test_data = raw_datasets["test"]

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(validation_data)}")
print(f"Number of test examples: {len(test_data)}")

Number of training examples: 36718
Number of validation examples: 3760
Number of test examples: 4358


In [499]:
import random
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
import torch

checkpoint = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, data, max_length):
        self.tokenizer=tokenizer
        self.max_length = max_length
        
        L = [p["text"] for p in data]
        self.NSP_data = []

        for paragraph in L:
            cand_sentences = paragraph.split(' . ')
            number_of_sentences = len(cand_sentences)
            if number_of_sentences<2:
                continue
            else:
                for i in range(number_of_sentences-1):
                    if random.random()<0.5:
                        self.NSP_data.append(((cand_sentences[i], cand_sentences[i+1]), True))
                else:
                    repla_sent = random.choice(L)
                    repla_sent = repla_sent.split(' . ')
                    repla_sent = random.choice(repla_sent)
                    self.NSP_data.append(((cand_sentences[i], repla_sent), False))


    
    def __getitem__(self, idx):
        pair, label = self.NSP_data[idx]
        return self.tokenizer(*pair, truncation=True, max_length=self.max_length), label
        
    def __len__(self):
        return len(self.NSP_data)

dataset = MyDataset(tokenizer, train_data, 15)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

def collate_fn(batch):
    tokens = [item[0] for item in batch]
    tokens = data_collator(tokens)  #output will be {"input_ids" : ? , "token_type_ids" : ?, "attention_mask" : ?, "labels" : ?}

    mlm_pred_positions = tokens["labels"].clone()

    mask = (tokens["labels"] != -100)
    indices = torch.arange(tokens["labels"].shape[1])
    for i in range(tokens["labels"].shape[0]):
        mlm_pred_positions[i][mask[i]] = indices[mask[i]]
        mlm_pred_positions[i][~mask[i]] = 0
        tokens["labels"][i][~mask[i]] = 0
    mlm_pred_positions = torch.flatten(mlm_pred_positions)
            
    nsp_labels = [item[1] for item in batch]
    all_labels = tokens["labels"]
    X = {k:v for k, v in tokens.items() if k != "labels"}  #X excludes "labels", as this concerns more the output.
    X["labels"] = mlm_pred_positions
    y = (tokens["labels"], nsp_labels)
    return X,y
    
    

In [449]:
#let's build the model now
from skorch import NeuralNetClassifier
import torch.nn as nn

class BertPretraining(nn.Module):
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads, num_blks, dropout, max_len):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, num_hiddens)
        self.segment_embedding = nn.Embedding(2, num_hiddens)
        self.pos_embedding = nn.Parameter(torch.randn(1,max_len, num_hiddens))
        self.blks = nn.Sequential()
        for i in range(num_blks):
            self.blks.add_module(f"{i}", nn.TransformerEncoderLayer(d_model=num_hiddens, 
                                                                   nhead=num_heads,
                                                                   dim_feedforward=ffn_num_hiddens,
                                                                   dropout=dropout, 
                                                                   batch_first=True))
            

    def __forward__(self):


class CustomLoss(nn.Module):

    def __init__(self):
        super().__init__()
    
    

IndentationError: expected an indented block after function definition on line 10 (3304479224.py, line 13)

In [503]:
forward = nn.Embedding(4,3)

In [507]:
forward(torch.tensor([1,1,1,2]))

tensor([[ 0.7540, -2.0427,  0.7257],
        [ 0.7540, -2.0427,  0.7257],
        [ 0.7540, -2.0427,  0.7257],
        [ 0.3545, -1.3122, -0.5113]], grad_fn=<EmbeddingBackward0>)

In [453]:
forward(torch.tensor(1))

tensor([-0.3898, -1.0870,  0.4674], grad_fn=<EmbeddingBackward0>)

In [389]:
pair, label = NSP_data[5]

In [390]:
pair

('While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers',
 'Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa')

In [391]:
x = tokenizer(*pair, truncation=True, max_length=70)
x  #input_ids have length 63

{'input_ids': [101, 1799, 1122, 5366, 1103, 2530, 1956, 1104, 1103, 1326, 117, 1122, 1145, 9315, 2967, 27939, 117, 1216, 1112, 1543, 1103, 1342, 1167, 1111, 5389, 3970, 1111, 1326, 25551, 1116, 102, 23543, 5592, 20089, 1777, 10942, 25028, 1105, 3996, 15375, 22437, 17784, 18504, 12355, 1241, 1608, 1121, 2166, 10813, 117, 1373, 1114, 12226, 3781, 3464, 17758, 1563, 1900, 26713, 3031, 16075, 10946, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [392]:
pair2, label2 = NSP_data[6]

In [393]:
y = tokenizer(*pair2, truncation=True, max_length=70)
y  #input_ids have length 42

{'input_ids': [101, 23543, 5592, 20089, 1777, 10942, 25028, 1105, 3996, 15375, 22437, 17784, 18504, 12355, 1241, 1608, 1121, 2166, 10813, 117, 1373, 1114, 12226, 3781, 3464, 17758, 1563, 1900, 26713, 3031, 16075, 10946, 102, 134, 134, 134, 21452, 1158, 134, 134, 134, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [394]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

In [307]:
round(tokens["input_ids"].shape[1]*0.15)

9

In [395]:
tokens = data_collator([x,y])
tokens

{'input_ids': tensor([[  101,  1799,  1122,  5366,  1103,  2530,  1956,  1104,  1103,  1326,
           117,   103,  1145,  9315,  2967, 27939,   117,  1216,  1112,  1543,
          1103,   103,  1167,  1111,   103,  3970,  1111,  1326, 19623,  1116,
           102, 23543,  5592, 20089,  1777, 10942, 25028,  1105,  3996, 15375,
         22437,   103, 18504, 12355,   103,  1608,  1121,  2166, 10813,   117,
          1373,  1114, 12226,  3781,  3464, 17758,  1563,  1900, 26713,  3031,
         16075, 10946,   102],
        [  101, 23543,  5592, 20089,  1777, 10942, 25028,  1105,  3996, 15375,
         22437, 17784, 18504,   103,  1241,  1608,   103,   103, 10813,   117,
          1373,  1114, 12226,  3781,  3464,   103,  1563,   103, 26713,  3031,
         16075, 10946,   102,   134,   134,   134,   103,  1158,   134,   134,
           103,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
       

In [396]:
z = tokens["labels"]
z

tensor([[ -100,  -100,  -100,  5366,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  1122,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  1342,  -100,  -100,  5389,  -100,  -100,  -100, 25551,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100, 17784,  -100,  -100,  1241,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100, 12355,  -100,  -100,  1121,  2166,  -100,  -100,
          -100,  -100,  -100,  -100,  -100, 17758,  -100,  1900,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100, 21452,  -100,  -100,  -100,
           134,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100

In [397]:
z.requires_grad

False

In [333]:
mask = (z!=-100)
z_copy = z.clone().detach()
z_copy[0][mask[0]] = indices[mask[0]]
z_copy[0][~mask[0]]= 0
z_copy

tensor([[    0,     0,     2,     0,     0,     5,     0,     0,     0,     0,
            10,     0,     0,     0,     0,     0,     0,    17,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,    42,     0,     0,    45,    46,     0,    48,     0,
             0,     0,    52,     0,     0,     0,     0,     0,    58,     0,
             0,     0,     0],
        [ -100,  -100,  -100,  -100,  -100, 10942,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  1241,  -100,  1121,  2166,  -100,  -100,
          1373,  -100,  -100,  -100,  -100,  -100,  -100,  1900,  -100,  -100,
          -100,  -100,  -100,  -100,   134,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100

In [334]:
z_copy = torch.flatten(z_copy)
z_copy

tensor([    0,     0,     2,     0,     0,     5,     0,     0,     0,     0,
           10,     0,     0,     0,     0,     0,     0,    17,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,    42,     0,     0,    45,    46,     0,    48,     0,
            0,     0,    52,     0,     0,     0,     0,     0,    58,     0,
            0,     0,     0,  -100,  -100,  -100,  -100,  -100, 10942,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  1241,  -100,  1121,
         2166,  -100,  -100,  1373,  -100,  -100,  -100,  -100,  -100,  -100,
         1900,  -100,  -100,  -100,  -100,  -100,  -100,   134,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100])

In [492]:
L = [p["text"] for p in train_data]

In [493]:
#at this point L is a list, where each element is paragraph or a collection of consecutive sentences
#let's try to create a list with elements (sen1, sen2, True/False)
#
import random
NSP_data = []  #elements will be (sen1, sen2, True/False) depending on whether sen2 follows sen1

for paragraph in L:
    cand_sentences = paragraph.split(' . ')
    number_of_sent = len(cand_sentences)
    if number_of_sent<2:
        continue
    else:
        for i in range(number_of_sent-1):
            if random.random()<0.5:
                NSP_data.append(((cand_sentences[i], cand_sentences[i+1]), True))
            #look at cand_sentences[i] and cand_sentences[i+1]
            else:
                repla_sent = random.choice(L)
                repla_sent = repla_sent.split(' . ')
                repla_sent = random.choice(repla_sent)
                NSP_data.append(((cand_sentences[i], repla_sent), False))
                
    

In [494]:
len(NSP_data)

73380

In [114]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer


checkpoint = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

In [119]:
pair, label = NSP_data[4]

In [120]:
pair

(' The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II',
 'While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers')

In [121]:
label

True

In [140]:
x = tokenizer(*pair, truncation=True, max_length=15)
x

{'input_ids': [101, 1109, 1342, 1310, 1718, 1107, 1333, 102, 1799, 1122, 5366, 1103, 2530, 1956, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [141]:
data_collator([x])

{'input_ids': tensor([[ 101, 1109, 1342, 1310, 1718,  103, 1333,  102, 1799, 1122, 5366,  103,
         2530, 1956,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, -100, -100, 1107, -100, -100, -100, -100, -100, 1103,
         -100, -100, -100]])}

In [281]:
tokenizer.decode([0,131])

'[PAD] :'

In [430]:
x = torch.tensor([1,2,3,4], dtype=torch.float64, requires_grad=True)
x

tensor([1., 2., 3., 4.], dtype=torch.float64, requires_grad=True)

In [431]:
y = torch.dot(x,x)

In [432]:
y.backward()

In [434]:
x.grad

tensor([2., 4., 6., 8.], dtype=torch.float64)

In [435]:
x = torch.tensor([2,3,4], dtype=torch.float32)

In [436]:
x**2

tensor([ 4.,  9., 16.])

In [437]:
x+x

tensor([4., 6., 8.])

In [448]:
x = torch.tensor(2, requires_grad=True)

RuntimeError: Only Tensors of floating point and complex dtype can require gradients

In [445]:
y = x**2 + 3*x

In [446]:
y.backward()

In [447]:
x.grad

tensor(7.)

In [456]:
print(f"{i}")

5


In [480]:
L = [1,2,3,4]

next(iter(L))
next(iter(L))

1

In [491]:
L = "Hi Ho are you . jwjfei"
L.split(' . ')

['Hi Ho are you', 'jwjfei']