In [1]:
import os
import random
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import tqdm
import transformers
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, get_cosine_schedule_with_warmup
from torch import optim
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pretrained RoBERTa byte-level BPE tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

special_tokens = tokenizer.special_tokens_map
print("Special Tokens in Tokenizer")
print(special_tokens)

# Map every special token string to its vocabulary id.
# encode() wraps its input in <s> ... </s>, so the id of the token itself
# sits at position 1 of the encoded sequence:
#   '<s>'    -> [0, 0, 2]
#   '</s>'   -> [0, 2, 2]
#   '<unk>'  -> [0, 3, 2]
#   '<pad>'  -> [0, 1, 2]
#   '<mask>' -> [0, 50264, 2]
special_tokens_dict = {
    special_token: tokenizer.encode(special_token)[1]
    for special_token in special_tokens.values()
}
print("Special_token_dict")
print(special_tokens_dict)

# Id pools: the whole vocabulary, the special ids, and everything else.
# The last pool is the source of "random replacement" tokens during masking.
all_tokens_idx = list(range(tokenizer.vocab_size))
all_special_tokens_idx = sorted(special_tokens_dict.values())
all_non_special_tokens_idx = [
    token_id
    for token_id in all_tokens_idx
    if token_id not in all_special_tokens_idx
]


Special Tokens in Tokenizer
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}
Special_token_dict
{'<s>': 0, '</s>': 2, '<unk>': 3, '<pad>': 1, '<mask>': 50264}


### Prepare Dataset

In [3]:
# Directory holding one plain-text file per book. It can be overridden with the
# HARRY_POTTER_DATA_DIR environment variable; the default is the original local path.
path_to_data = os.environ.get(
    "HARRY_POTTER_DATA_DIR",
    "/Users/jhanvi/Desktop/DeepLearning/RoBERTa_Implementation/HarryPotter",
)

# Sort for a deterministic book order, and skip hidden files (e.g. macOS
# ".DS_Store") and sub-directories, which cannot be read as book text.
text_files = sorted(
    name
    for name in os.listdir(path_to_data)
    if not name.startswith(".") and os.path.isfile(os.path.join(path_to_data, name))
)

book_texts = []
for book in text_files:
    with open(os.path.join(path_to_data, book), "r", encoding="utf-8") as f:
        # Remove lines with page numbers
        lines = [line for line in f if "Page" not in line]
    text = " ".join(lines).replace("\n", "")  # Remove all newline characters
    # Collapse runs of spaces by dropping the empty "words" they produce
    text = " ".join(word for word in text.split(" ") if len(word) > 0)
    book_texts.append(text)

# Join books with a space so the last word of one book is not fused with the
# first word of the next one.
all_text = " ".join(book_texts)

### Split Data by "sentences" ###
all_text = all_text.split(".")

### Grab 5 Sentences at a time and put them together by the period ###
all_text_chunked = [".".join(all_text[i:i+5]) for i in range(0, len(all_text), 5)]

### Tokenize all the text! ###
# Chunks longer than the tokenizer's maximum length only trigger a warning here;
# MaskedLMLoader crops every example to max_seq_len before it reaches the model.
tokenized_text = [tokenizer.encode(text) for text in all_text_chunked]

Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors


In [4]:
class MaskedLMLoader(Dataset):
    """Dataset that turns tokenized text into masked-language-modelling examples.

    Each item is a ``(masked_tokens, labels)`` pair of 1-D tensors of equal length.
    ``labels`` is -100 everywhere except at the positions selected for prediction,
    where it holds the original token id.

    Relies on the module-level ``tokenizer``, ``special_tokens_dict`` and
    ``all_non_special_tokens_idx`` defined earlier in the notebook.

    Args:
        tokenized_data: sequence of token-id lists (one list per example).
        max_seq_len: longer examples are cropped to a random window of this length.
        masking_ratio: probability that a non-special token is selected for prediction.
    """

    def __init__(self, tokenized_data, max_seq_len=100, masking_ratio=0.15):
        self.data = tokenized_data
        self.mask_ratio = masking_ratio
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.data)

    def _crop_to_max_len(self, tokens):
        """Return ``tokens`` unchanged if short enough, else a random ``max_seq_len`` window."""
        overflow = len(tokens) - self.max_seq_len
        if overflow <= 0:
            return tokens

        # randint is inclusive on both ends, so the last valid window (the one
        # ending on the final token) can be chosen too.
        start_idx = random.randint(0, overflow)
        return tokens[start_idx:start_idx + self.max_seq_len]

    def _random_mask_text(self, tokens):
        """Apply 80/10/10 masking to ``tokens`` IN PLACE and build the labels.

        Of the selected positions, ~80% become ``<mask>``, ~10% become a random
        non-special token and ~10% keep their original token.

        Returns:
            (tokens, labels): the modified input tensor and a same-shaped label
            tensor holding -100 at every position that was not selected.
        """
        ### One uniform draw per position decides whether it is selected ###
        selection_draw = torch.rand(tokens.shape)

        ### Special tokens get a draw of 1 so they are NEVER selected ###
        special_positions = torch.tensor(
            tokenizer.get_special_tokens_mask(tokens.tolist(), already_has_special_tokens=True),
            dtype=torch.bool,
        )
        selection_draw[special_positions] = 1.0
        selected = selection_draw < self.mask_ratio

        ### Labels: original id at selected positions, -100 elsewhere ###
        labels = torch.full(tokens.shape, -100, dtype=tokens.dtype)
        labels[selected] = tokens[selected]

        ### Split the selected positions 80% / 10% / 10% with a single draw ###
        selected_idx = selected.nonzero(as_tuple=True)[0]
        action_draw = torch.rand(selected_idx.shape)
        selected_idx_for_masking = selected_idx[action_draw < 0.8]
        selected_idx_for_random_filling = selected_idx[(action_draw >= 0.8) & (action_draw < 0.9)]
        # Positions with action_draw >= 0.9 are left alone: the label is kept,
        # the input token is unchanged.

        ### Fill Mask Tokens ###
        if len(selected_idx_for_masking) > 0:
            tokens[selected_idx_for_masking] = special_tokens_dict["<mask>"]

        ### Fill Random Tokens ###
        if len(selected_idx_for_random_filling) > 0:
            randomly_selected_tokens = torch.tensor(
                random.sample(all_non_special_tokens_idx, len(selected_idx_for_random_filling))
            )
            tokens[selected_idx_for_random_filling] = randomly_selected_tokens

        return tokens, labels

    def __getitem__(self, idx):
        # A fresh tensor is built on every call, so the in-place masking below
        # never modifies the stored token lists.
        data = torch.tensor(self.data[idx])

        ### Make sure sequence is within context length ###
        data = self._crop_to_max_len(data)

        ### Uniform Random Masking ###
        masked_tokens, label = self._random_mask_text(data)

        return masked_tokens, label

mlm = MaskedLMLoader(tokenized_text)

# Inspect the first example only: the selected positions first (inputs vs.
# original ids), then the full masked sequence and the full label tensor.
first_example = next(iter(mlm), None)
if first_example is not None:
    masked_tokens, labels = first_example
    print(masked_tokens[labels!=-100])
    print(labels[labels!=-100])
    print(masked_tokens)
    print(labels)

# For example, suppose the selected positions are split as follows:
# selected_idx_for_masking = tensor([4,52,35,25,63,633,156,246,1356,90]) ## positions replaced with the <mask> token [~80% of the selected positions]
# select_index_for_left_alone = tensor([52]) ## positions left untouched: neither masked nor randomly filled [~10% of the selected positions]
# select_index_for_random_filling = tensor([93]) ## positions filled with a random token instead of the original one [~10% of the selected positions]

# Token id 0 is the special start-of-sentence token (<s>) and token id 2 is the end-of-sentence token (</s>)
# tensor([    0,   863,   479, 50264,   479,   248,   384,   305,   226, 27785,
#           234,   272, 32721, 16802,   221,  3293,  7831,  1589,  1941, 31534,
#          4014,   163,  5216,  3732, 50264,  1491, 50264,     5,    78, 50264,
#         40588,    41,  4795,    56,  3187,    66,    81,  7080, 14673,   346,
#           237,     6,  9522, 18068,  3936,     4,   427,     4, 14571,   211,
#          4668,   607,    56, 50264,   885, 22036,    11,     5,   419,   722,
#             9,     5,   662,    30,    10,  7337,     6, 50264, 23894,  6496,
#            31,    39, 11884,  3268,    17,    27, 50264,   929,     2])
# The labels are -100 everywhere except at the selected positions, which keep the original token ids; those ids are the prediction targets (nn.CrossEntropyLoss ignores -100 by default)
# tensor([-100, -100, -100,  229, -100, -100, -100, -100, -100, -100, -100, -100,
#         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#         2774, -100,   13, -100, -100,   86,    6, -100, -100, -100, -100, -100,
#         -100, -100,   23, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#         -100, -100, -100, -100, -100,   57, -100, -100, -100, -100, -100, -100,
#         -100, -100, -100, -100, -100, -100, -100, 1368, -100, -100, -100, -100,
#         -100, -100, -100, -100,   29, -100, -100])

tensor([ 2121, 50264, 50264, 12729, 50264, 16877, 50264, 50264, 50264, 50264,
        50264, 50264])
tensor([ 1589,  1941,    13,    23,   427,  1368, 23894,    31,    39,  3268,
           17,    27])
tensor([    0,   863,   479,   229,   479,   248,   384,   305,   226, 27785,
          234,   272, 32721, 16802,   221,  3293,  7831,  2121, 50264, 31534,
         4014,   163,  5216,  3732,  2774,  1491, 50264,     5,    78,    86,
            6,    41,  4795,    56,  3187,    66,    81,  7080, 12729,   346,
          237,     6,  9522, 18068,  3936,     4, 50264,     4, 14571,   211,
         4668,   607,    56,    57,   885, 22036,    11,     5,   419,   722,
            9,     5,   662,    30,    10,  7337,     6, 16877, 50264,  6496,
        50264, 50264, 11884, 50264, 50264, 50264,    29,   929,     2])
tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  1589,  1941,  -100,
         -100,  -100,  -

### Collate Function

In [5]:
def collate_fn(batch, pad_token_id=None, ignore_index=-100):
    """Pad a list of ``(tokens, labels)`` pairs to a common length and batch them.

    Args:
        batch: list of ``(tokens, labels)`` pairs of equal-length 1-D tensors,
            as produced by ``MaskedLMLoader.__getitem__``.
        pad_token_id: id used to pad ``input_ids``. Defaults to the ``<pad>``
            entry of the module-level ``special_tokens_dict``.
        ignore_index: value used to pad ``labels`` (default -100).

    Returns:
        dict with
            "input_ids":      (batch, max_len) tokens, right-padded with ``pad_token_id``
            "labels":         (batch, max_len) labels, right-padded with ``ignore_index``
            "attention_mask": (batch, max_len) bool, True on real tokens, False on padding

        ``max_len`` is the longest sequence in THIS batch, not in the whole dataset.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_fn received an empty batch")

    if pad_token_id is None:
        pad_token_id = special_tokens_dict['<pad>']

    token_samples = [tokens for tokens, _ in batch]
    label_samples = [labels for _, labels in batch]

    batch_size = len(token_samples)
    max_len = max(len(tokens) for tokens in token_samples)

    # Pre-fill with the padding values, then copy each sequence into the left
    # part of its row; whatever is not overwritten is padding.
    input_ids = torch.full((batch_size, max_len), pad_token_id, dtype=token_samples[0].dtype)
    labels = torch.full((batch_size, max_len), ignore_index, dtype=label_samples[0].dtype)

    # The mask is derived from the sequence lengths, so it is always bool and
    # is built the same way for padded and unpadded rows.
    padding_masks = torch.zeros((batch_size, max_len), dtype=torch.bool)

    for row, (sample, label) in enumerate(zip(token_samples, label_samples)):
        seq_len = len(sample)
        input_ids[row, :seq_len] = sample
        labels[row, :seq_len] = label
        padding_masks[row, :seq_len] = True

    batch = {
        "input_ids" : input_ids,
        "labels" : labels,
        "attention_mask": padding_masks
    }
    return batch
        
loader = DataLoader(mlm, batch_size=64, collate_fn=collate_fn)

# Pull a single batch to sanity-check the shape of the attention mask.
sample = next(iter(loader), None)
if sample is not None:
    print(sample['attention_mask'].shape)

100
torch.Size([64, 100])


### Attention Mechanism

In [6]:
class SelfAttentionEncoder(nn.Module):
    """Multi-head bidirectional self-attention with an optional key-padding mask.

    Args:
        embed_dim: model width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        proj_p: dropout probability applied after the output projection.
        attn_p: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self,
        embed_dim=768,
        num_heads = 12,
        proj_p = 0.0, ## dropout
        attn_p = 0.0 ):## Attention Dropout

        super().__init__()

        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = self.embed_dim // self.num_heads

        # Scaled dot-product attention divides the scores by sqrt(head_dim).
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(embed_dim,embed_dim)
        self.k_proj = nn.Linear(embed_dim,embed_dim)
        self.v_proj = nn.Linear(embed_dim,embed_dim)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(embed_dim,embed_dim)
        self.proj_drop = nn.Dropout(proj_p)

    def _split_heads(self, projected, batch_size, seq_len):
        """(batch, seq, embed) -> (batch, heads, seq, head_dim)."""
        return projected.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1,2)

    def forward(self, x, attention_mask=None):
        """Apply self-attention.

        Args:
            x: tensor of shape (batch, seq_len, embed_dim).
            attention_mask: optional (batch, seq_len) tensor, truthy on real
                tokens and falsy on padding. Padded positions are excluded as
                keys, so no query attends to them.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).
        """
        batch_size, seq_len, embed_dim = x.shape

        q = self._split_heads(self.q_proj(x), batch_size, seq_len)
        k = self._split_heads(self.k_proj(x), batch_size, seq_len)
        v = self._split_heads(self.v_proj(x), batch_size, seq_len)

        # (batch, heads, seq, seq) similarity scores
        attn = (q @ k.transpose(-2,-1)) * self.scale

        if attention_mask is not None:
            # (batch, seq) -> (batch, 1, 1, seq): broadcast over heads and queries.
            # Cast to bool so integer/float masks also work with `~`.
            key_mask = attention_mask.to(torch.bool).unsqueeze(1).unsqueeze(1)
            attn = attn.masked_fill(~key_mask, float("-inf"))

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = attn @ v

        # (batch, heads, seq, head_dim) -> (batch, seq, embed_dim)
        x = x.transpose(1,2).flatten(2)

        x = self.proj(x)
        x = self.proj_drop(x)

        return x
        
        

# Toy batch: 2 sequences, 5 tokens each, embedding width 16.
rand_x = torch.randn(2, 5, 16)

# First sequence is fully real; the last two positions of the second are padding.
padding = torch.tensor([
    [True] * 5,
    [True] * 3 + [False] * 2,
])

attn = SelfAttentionEncoder(num_heads=2, embed_dim=16)
output = attn(rand_x, attention_mask=padding)

# attention_mask.shape = torch.Size([2, 5])
# attn.shape = torch.Size([2, 2, 5, 5]), so the (2, 5) mask is unsqueezed to (2, 1, 1, 5) and broadcast to (2, 2, 5, 5)


## Output of print(attn) right after masked_fill (padded key positions become -inf):
# tensor([[[[ 6.0092e-01, -1.1851e+00, -8.4532e-02,  8.2474e-01, -2.0042e-01],
#           [ 2.9442e-01, -1.4769e-01,  8.2316e-01,  2.7357e-01, -6.7785e-01],
#           [-2.1567e-01,  1.1731e-01, -4.4792e-02, -4.8720e-02, -1.1119e-03],
#           [-1.3498e+00,  1.8599e+00, -2.9678e-01, -7.1736e-01,  7.5775e-01],
#           [ 1.9210e-01, -1.4102e-01,  2.0816e-01,  3.2017e-01,  1.2134e-01]],

#          [[-6.3280e-01,  4.4333e-01, -4.9401e-01,  5.8445e-01,  4.5367e-01],
#           [-6.1286e-02, -4.8294e-01,  9.6761e-02, -1.2049e+00, -4.4538e-02],
#           [-1.4047e+00,  4.5752e-01, -5.5535e-01, -1.3051e+00,  8.8260e-01],
#           [ 6.9645e-01, -1.8630e-01, -3.5097e-01,  7.6276e-01, -2.7011e-01],
#           [ 1.5986e+00, -2.6503e-01, -5.9812e-01,  1.5880e+00,  5.5917e-01]]],


#         [[[ 8.8883e-01, -6.4672e-01,  1.5378e+00,        -inf,        -inf],
#           [ 6.9943e-01, -1.6775e-01, -7.7830e-01,        -inf,        -inf],
#           [ 6.8239e-01, -1.6550e+00, -3.5772e-01,        -inf,        -inf],
#           [ 1.1485e+00, -5.3262e-01,  1.1484e+00,        -inf,        -inf],
#           [-9.4539e-01, -2.0303e-01, -1.9375e+00,        -inf,        -inf]],

#          [[-7.7953e-01,  2.3295e-01,  7.3413e-03,        -inf,        -inf],
#           [-5.9923e-01,  1.8716e+00,  6.6245e-01,        -inf,        -inf],
#           [-4.8800e-02,  6.2469e-01, -4.9597e-01,        -inf,        -inf],
#           [-1.4099e+00,  1.6088e+00, -1.7086e-01,        -inf,        -inf],
#           [ 6.6090e-01, -1.9622e+00,  6.8945e-01,        -inf,        -inf]]]],
#        grad_fn=<MaskedFillBackward0>)
        
        
        

### Fully Connected Layer

In [None]:
class MLP(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        in_features: size of the last dimension of the input.
        hidden_features: width of the hidden layer.
        out_features: size of the last dimension of the output.
        mlp_p: dropout probability used after each linear layer.
    """

    def __init__(self,
                 in_features,
                 hidden_features,
                 out_features,
                 mlp_p = 0):

        super().__init__()

        # Expansion half
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.drop1 = nn.Dropout(mlp_p)

        # Projection half
        self.fc2 = nn.Linear(hidden_features,out_features)
        self.drop2 = nn.Dropout(mlp_p)

    def forward(self, x):
        """Map (..., in_features) to (..., out_features)."""
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))

class Block(nn.Module):
    """Pre-norm transformer encoder block.

    Computes ``x + Attention(LayerNorm(x))`` followed by ``x + MLP(LayerNorm(x))``.

    Args:
        mlp_ratio: hidden width of the MLP as a multiple of ``embed_dim``.
        embed_dim: model width.
        num_heads: number of attention heads.
        attn_p: dropout probability on the attention weights.
        mlp_p: dropout probability inside the MLP.
        proj_p: dropout probability after the attention output projection.
    """

    def __init__(self,
                 mlp_ratio=4,
                 embed_dim=768,
                 num_heads=12,
                 attn_p = 0,
                 mlp_p = 0,
                 proj_p = 0):

        super().__init__()

        mlp_hidden_dim = int(embed_dim*mlp_ratio)

        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.attention = SelfAttentionEncoder(
            embed_dim=embed_dim,
            num_heads=num_heads,
            attn_p=attn_p,
            proj_p=proj_p,
        )
        self.mlp = MLP(embed_dim, mlp_hidden_dim, embed_dim, mlp_p)
        self.layer_norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, attention_mask=None):
        """Run the attention and MLP sub-layers, each wrapped in a residual connection."""
        # Attention sub-layer
        attended = self.attention(self.layer_norm1(x), attention_mask)
        x = x + attended

        # Feed-forward sub-layer
        transformed = self.mlp(self.layer_norm2(x))
        return x + transformed
        

In [None]:
class RoBERTa(nn.Module):
    """Small RoBERTa-style encoder with a masked-language-modelling head.

    A learned CLS embedding is prepended to every sequence, learned absolute
    position embeddings are added, the result goes through ``depth`` pre-norm
    transformer blocks and a final LayerNorm, and every original token position
    is projected to vocabulary logits.

    Args:
        max_seq_len: maximum number of input tokens (excluding the CLS slot).
        vocab_size: vocabulary size; defaults to the module-level tokenizer's.
        embed_dim: model width.
        num_heads: attention heads per block.
        depth: number of transformer blocks.
        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
        attn_p: dropout on attention weights.
        pos_p: dropout after adding the position embeddings.
        proj_p: dropout after the attention output projection.
        mlp_p: dropout inside the MLP.
    """

    def __init__(self,
                 max_seq_len=512,
                 vocab_size=tokenizer.vocab_size,
                 embed_dim=768,
                 num_heads=12,
                 depth=12,
                 mlp_ratio=4,
                 attn_p=0,
                 pos_p=0,
                 proj_p=0,
                 mlp_p=0):
        super().__init__()

        self.max_seq_len = max_seq_len
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1,1,embed_dim))
        # +1 position for the prepended CLS slot
        self.pos_embeddings = nn.Embedding(max_seq_len+1, embed_dim)
        self.pos_drop = nn.Dropout(pos_p)

        self.blocks = nn.ModuleList(
            [
                Block(
                    mlp_ratio=mlp_ratio,
                    embed_dim=embed_dim,
                    num_heads=num_heads,
                    attn_p=attn_p,
                    proj_p=proj_p,
                    mlp_p=mlp_p)

                    for _ in range(depth)

            ]
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, attention_mask=None):
        """Compute per-token vocabulary logits.

        Args:
            x: (batch, seq_len) tensor of token ids. If ``seq_len`` exceeds
                ``max_seq_len`` only the last ``max_seq_len`` tokens are kept.
            attention_mask: optional (batch, seq_len) tensor, truthy on real
                tokens and falsy on padding.

        Returns:
            (batch, kept_seq_len, vocab_size) logits, one row per kept input token.
        """
        device = x.device

        # Truncate BEFORE reading the sequence length, so the position indices
        # below match the tokens that are actually kept; the mask is truncated
        # the same way.
        if x.shape[1] > self.max_seq_len:
            x = x[:,-self.max_seq_len:]
            if attention_mask is not None:
                attention_mask = attention_mask[:,-self.max_seq_len:]

        batch_size, seq_len = x.shape

        # Positions 0..seq_len: slot 0 is the CLS token, the rest are the tokens
        avail_idx = torch.arange(0, seq_len+1, dtype=torch.long, device=device)

        tok_embed = self.embeddings(x)  # (batch, seq_len, embed_dim)

        ## Prepend the learned CLS embedding to every sequence
        cls_token = self.cls_token.expand(batch_size, -1,-1)  # (batch, 1, embed_dim)
        tok_embed = torch.cat((cls_token,tok_embed),dim=1)  # (batch, seq_len+1, embed_dim)

        ## Add position information
        pos_embed = self.pos_embeddings(avail_idx)
        x = tok_embed + pos_embed
        x = self.pos_drop(x)

        ## Extend the mask with an always-visible CLS column so it lines up
        ## with the (seq_len+1)-long sequence seen by the blocks
        if attention_mask is not None:
            attention_mask = attention_mask.to(device=device, dtype=torch.bool)
            cls_mask = torch.ones(batch_size, 1, dtype=torch.bool, device=device)
            attention_mask = torch.cat((cls_mask, attention_mask), dim=1)

        ## Run the encoder blocks, masking out padded key positions
        for block in self.blocks:
            x = block(x, attention_mask)

        ## The blocks are pre-norm, so the residual stream is normalized once
        ## more before the prediction head
        x = self.norm(x)

        ## Drop the CLS slot; only the token positions are predicted
        x = x[:,1:]

        x = self.head(x)

        return x
        
## Test Case
        
# Two random 5-token sequences with ids in [0, 10).
rand_x = torch.randint(0, 10, (2, 5))

# Everything is a real token except the last position of the second sequence.
padding = torch.ones(2, 5, dtype=torch.bool)
padding[1, -1] = False

roberta = RoBERTa()

out = roberta(rand_x, attention_mask=padding)

# One logit vector over the vocabulary for each input token.
print(out.shape)

torch.Size([2, 5, 50265])


In [23]:
### DEFINE TRAINING PARAMETERS ###
iterations = 15000
max_len = 100
evaluate_interval = 100
embedding_dim = 384
depth = 4
num_heads = 4
lr = 0.0005
batch_size = 64
warmup_steps = 1500      # linear warmup before the cosine decay
train_fraction = 0.95    # share of the examples used for training

### PICK THE BEST AVAILABLE DEVICE ###
# torch.backends.mps is the long-standing MPS availability check;
# torch.mps.is_available() does not exist on older torch versions.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print(DEVICE)

### DEFINE MODEL AND OPTIMIZER ###
model = RoBERTa(max_seq_len=max_len,
                embed_dim=embedding_dim,
                depth=depth,
                num_heads=num_heads,
                attn_p=0.1,
                mlp_p=0.1,
                proj_p=0.1,
                pos_p=0.1)

model = model.to(DEVICE)
print(model)
optimizer = optim.AdamW(model.parameters(), lr=lr)

### DEFINE LOSS FUNCTION ###
# CrossEntropyLoss ignores targets equal to -100 by default, which is the
# value the dataset and collate_fn put on every non-selected / padded position.
loss_fn = nn.CrossEntropyLoss()

### Build DataLoader ###
dataset = MaskedLMLoader(tokenized_text, max_seq_len=max_len)
num_train = int(train_fraction*len(dataset))
num_test = len(dataset) - num_train
trainset, testset = torch.utils.data.random_split(dataset, [num_train, num_test])
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

### Define Scheduler ###
scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=iterations)

mps
RoBERTa(
  (embeddings): Embedding(50265, 384)
  (pos_embeddings): Embedding(101, 384)
  (pos_drop): Dropout(p=0.1, inplace=False)
  (blocks): ModuleList(
    (0-3): 4 x Block(
      (layer_norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (attention): SelfAttentionEncoder(
        (q_proj): Linear(in_features=384, out_features=384, bias=True)
        (k_proj): Linear(in_features=384, out_features=384, bias=True)
        (v_proj): Linear(in_features=384, out_features=384, bias=True)
        (attn_drop): Dropout(p=0.1, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.1, inplace=False)
      )
      (mlp): MLP(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0, inplace=False)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop2): Dropout(p=0, inplace=False)
      )
      (layer_norm