# Building an Encoder-only Transformer Model using Masked Language Modelling

## Steps:
  1. read the dataset
  2. WordPiece tokenization
  3. Masking 15% of words
  4. Word Embedding
  5. Positional Embedding
### Encoder Block
  1. Self Attention
  2. Layer Normalisation
  3. Feed-forward network (FFN)
  4. Layer normalisation
  5. Softmax over vocab

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pandas as pd

In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Args:
        embed_size: Size of the last dimension of every input; each of the
            three projections maps ``embed_size -> embed_size``.
    """

    def __init__(self, embed_size):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size

        # Define linear transformations for queries, keys, and values
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)

        # Scaling factor sqrt(embed_size) for the attention scores. It is a
        # non-persistent buffer so that it follows the module through
        # .to()/.cuda()/.half() without adding a key to the state_dict.
        self.register_buffer(
            "scale",
            torch.sqrt(torch.tensor([float(embed_size)])),
            persistent=False,
        )

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` over ``keys``/``values``.

        Args:
            values, keys, query: Tensors whose last dimension is
                ``embed_size``; the second-to-last dimension is the sequence.
            mask: ``None`` or a tensor in which entries equal to 0 mark
                positions that must not be attended to. A mask with the same
                shape as ``keys`` minus its last dimension is treated as a
                per-key padding mask; any other mask must broadcast against
                the ``(..., query_len, key_len)`` score tensor.

        Returns:
            Tensor with the same leading shape as ``query`` and last dimension
            ``embed_size``.
        """
        # Project the values, keys, and queries into their respective spaces
        Q = self.query(query)
        K = self.key(keys)
        V = self.value(values)

        # Scaled dot products QK^T / sqrt(d)
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            # A padding mask has one entry per key position. Insert a query
            # axis so it broadcasts over every query row, including for
            # batched inputs where (B, S) would not align with (B, S, S).
            if mask.dim() == energy.dim() - 1 and mask.shape == K.shape[:-1]:
                mask = mask.unsqueeze(-2)
            # Use the most negative finite value of the score dtype so the
            # fill also works in reduced precision.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        # Softmax over the key axis gives the attention weights
        attention = F.softmax(energy, dim=-1)

        # Weighted sum of the value vectors
        return torch.matmul(attention, V)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then a linear layer, each followed by
    a residual connection and layer normalisation.

    The "feed-forward" part here is a single ``d_model -> d_model`` linear
    layer with no activation function.
    """

    def __init__(self, d_model):
        super().__init__()
        self.attention = SelfAttention(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, src, mask):
        """Return the transformed sequence, same shape as ``src``."""
        # Sub-layer 1: the sequence attends to itself, then add & norm.
        attended = self.attention(src, src, src, mask)
        attended = self.norm1(attended + src)

        # Sub-layer 2: position-wise linear projection, then add & norm.
        projected = self.fc(attended)
        return self.norm2(projected + attended)

In [None]:
class Encoder(nn.Module):
    """Token embedding, a stack of ``EncoderLayer`` blocks and a vocabulary head.

    Only a token embedding is applied to the ids; no positional information is
    added in this class.
    """

    def __init__(self, vocab_size, d_model, num_encoders):
        super().__init__()

        self.embed_size = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoders = nn.ModuleList(
            [EncoderLayer(d_model) for _ in range(num_encoders)]
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        """Run the encoder over token ids ``x``.

        Returns:
            A pair ``(probabilities, hidden)``: the softmax over the vocabulary
            for every position, and the output of the last encoder layer.
            The first element is already normalised, so it is not a logit
            tensor.
        """
        hidden = self.embedding(x)
        for block in self.encoders:
            hidden = block(hidden, mask)

        vocab_scores = self.fc_out(hidden)
        return F.softmax(vocab_scores, dim=-1), hidden

In [None]:
def preprocess_sentence(sentence, tokenizer, max_length=128, mask_prob=0.15):
    """Tokenize one sentence and build a masked-language-modelling pair.

    The sentence is tokenized, truncated, wrapped in ``[CLS]`` / ``[SEP]`` and
    padded with ``[PAD]`` to exactly ``max_length`` tokens. Each real token is
    then replaced by the tokenizer's mask id with probability ``mask_prob``;
    ``[CLS]``, ``[SEP]`` and ``[PAD]`` positions are never masked.

    Args:
        sentence: Raw text to encode.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``mask_token_id``.
        max_length: Length of both returned id lists (at least 2).
        mask_prob: Per-token probability of being replaced by the mask id.

    Returns:
        ``(masked_input_ids, input_ids)``: two lists of ``max_length`` ints,
        the corrupted model input and the original ids used as targets.

    Raises:
        ValueError: If ``max_length`` is too small to hold the two special
            tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to fit [CLS] and [SEP]")

    special_tokens = ('[CLS]', '[SEP]', '[PAD]')

    # Leave room for the two special tokens that frame the sentence.
    word_pieces = tokenizer.tokenize(sentence)[:max_length - 2]
    tokens = ['[CLS]'] + word_pieces + ['[SEP]']
    padded_tokens = tokens + ['[PAD]'] * (max_length - len(tokens))

    # Convert tokens to indices
    input_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

    # One uniform draw per position (so torch.manual_seed keeps the masking
    # reproducible), but only real tokens are eligible for masking.
    selected = (torch.rand(len(input_ids)) < mask_prob).tolist()
    masked_input_ids = [
        tokenizer.mask_token_id
        if chosen and token not in special_tokens
        else token_id
        for token_id, token, chosen in zip(input_ids, padded_tokens, selected)
    ]

    return masked_input_ids, input_ids

In [None]:
# Load the hotel-review CSV directly from GitHub and preview the first rows
reviews_csv_url = "https://raw.githubusercontent.com/Lordvarun23/PSG-MSc-Data-Science-Lab-Courses/main/Natural%20Language%20Processing/Encoder-Masked%20Language%20Modelling/tripadvisor_hotel_reviews.csv"
df = pd.read_csv(reviews_csv_url)

df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [None]:
# Print the column dtypes, non-null counts and memory usage of the reviews frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [None]:
# Work on the first 100 reviews only
sentences = df['Review'].head(100).tolist()

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every review into a (masked ids, original ids) pair, in order
encoded_pairs = [preprocess_sentence(text, tokenizer) for text in sentences]

# Stack the per-review id lists into two (num_sentences, max_length) tensors
input_tensors = torch.stack([torch.tensor(masked) for masked, _ in encoded_pairs])
output_tensors = torch.stack([torch.tensor(target) for _, target in encoded_pairs])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Display the shape of the stacked masked inputs (the notebook shows the value
# of the cell's last expression).
input_tensors.shape

torch.Size([100, 128])

In [None]:
class CustomDataset(Dataset):
    """Index-aligned pairs taken from two parallel sequences.

    Item ``i`` is the tuple ``(inputs[i], outputs[i])``.
    """

    def __init__(self, inputs, outputs):
        self.inputs, self.outputs = inputs, outputs

    def __len__(self):
        # The number of items is defined by the inputs.
        return len(self.inputs)

    def __getitem__(self, idx):
        sample = self.inputs[idx]
        target = self.outputs[idx]
        return sample, target

# Pair the masked inputs with their targets and serve them one sequence per
# batch, in a freshly shuffled order on every pass
dataset = CustomDataset(inputs=input_tensors, outputs=output_tensors)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

In [None]:
# Pull a single batch from the dataloader and show the tensor shapes it yields
batch = next(iter(dataloader), None)
if batch is not None:
    inputs, outputs = batch
    print(f"Input: {inputs.shape}")
    print(f"Output: {outputs.shape}")

Input: torch.Size([1, 128])
Output: torch.Size([1, 128])


In [None]:
# Number of entries reported by the tokenizer; passed to Encoder as the size of
# the embedding table and of the output projection.
vocab_size = len(tokenizer)
# Width of the token embeddings; passed to Encoder as d_model.
embed_size = 128

In [None]:
def cosine_similarity2(word_embedding1, word_embedding2):
    """Return the cosine similarity between the first token vectors of two embeddings.

    Only ``word_embedding[0][0]`` (first batch item, first token) of each
    argument is compared. The result is the plain cosine in ``[-1, 1]`` with
    no offset applied, so identical directions give 1.0.

    Args:
        word_embedding1: Tensor indexed as ``[batch][token][feature]``.
        word_embedding2: Tensor with the same layout.

    Returns:
        The cosine similarity as a Python float.
    """
    first_vector = word_embedding1[0][0].detach().flatten()
    second_vector = word_embedding2[0][0].detach().flatten()
    return F.cosine_similarity(first_vector, second_vector, dim=0).item()


In [None]:
model = Encoder(vocab_size, embed_size, 12)  # 12 stacked encoder layers

# Define loss function and optimizer.
# The model's first output is already a softmax distribution. CrossEntropyLoss
# applies its own log-softmax, so feeding it probabilities (all in [0, 1])
# keeps the loss from dropping below roughly log(vocab_size) - 1 and leaves
# almost no gradient. NLLLoss on log-probabilities is the matching criterion.
# Padding targets are ignored so they do not dominate the loss.
criterion = nn.NLLLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for inputs, outputs in dataloader:
        optimizer.zero_grad()
        # The dataloader yields batches of one sequence; drop the batch axis.
        inputs = inputs.squeeze(0)
        outputs = outputs.squeeze(0)
        # Non-zero where the position holds something other than padding.
        mask = (inputs != tokenizer.pad_token_id)
        predictions = model(inputs, mask)[0]
        # Clamp before the log so an exact zero probability cannot yield -inf.
        log_probs = torch.log(predictions.clamp_min(1e-9))
        loss = criterion(log_probs.view(-1, vocab_size), outputs.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

Epoch 1, Loss: 10.109347848892211
Epoch 2, Loss: 10.059630498886108
Epoch 3, Loss: 10.059630498886108
Epoch 4, Loss: 10.059630498886108
Epoch 5, Loss: 10.059630498886108
Epoch 6, Loss: 10.059630498886108
Epoch 7, Loss: 10.059630498886108
Epoch 8, Loss: 10.059630498886108
Epoch 9, Loss: 10.059630498886108
Epoch 10, Loss: 10.059630498886108


In [None]:
# Tokenize the three probe words.
# The third word is "cat", matching the *_cat variable names and the label
# printed for this comparison.
# NOTE(review): "accomodation" is misspelled and is presumably split into
# several word pieces, while the similarity helpers only read the first token
# vector. Confirm whether "accommodation" was intended.
tokens_hotel = tokenizer.tokenize("hotel")
tokens_room = tokenizer.tokenize("accomodation")
tokens_cat = tokenizer.tokenize("cat")

# Convert tokens to IDs
input_ids_room = tokenizer.convert_tokens_to_ids(tokens_room)
input_ids_cat = tokenizer.convert_tokens_to_ids(tokens_cat)
input_ids_hotel = tokenizer.convert_tokens_to_ids(tokens_hotel)

# Convert input_ids to tensor and add batch dimension
input_tensor_hotel = torch.tensor([input_ids_hotel])
input_tensor_room = torch.tensor([input_ids_room])
input_tensor_cat = torch.tensor([input_ids_cat])

# Inference only: switch to eval mode and skip autograd bookkeeping.
# Index 1 of the model output is the last encoder layer's hidden states.
model.eval()
with torch.no_grad():
    word_embedding_hotel = model(input_tensor_hotel, None)[1]
    word_embedding_room = model(input_tensor_room, None)[1]
    word_embedding_cat = model(input_tensor_cat, None)[1]


def cosine_similarity1(word_embedding1, word_embedding2):
    """Return the cosine similarity between the first token vectors of two embeddings.

    Only ``word_embedding[0][0]`` (first batch item, first token) of each
    argument is compared. The result is the plain cosine in ``[-1, 1]`` with
    no offset applied, so identical directions give 1.0.

    Args:
        word_embedding1: Tensor indexed as ``[batch][token][feature]``.
        word_embedding2: Tensor with the same layout.

    Returns:
        The cosine similarity as a Python float.
    """
    first_vector = word_embedding1[0][0].detach().flatten()
    second_vector = word_embedding2[0][0].detach().flatten()
    return F.cosine_similarity(first_vector, second_vector, dim=0).item()


# Compute both similarity scores first, then print one caption/score per line
similarity_report = (
    ("Cosine similarity of hotel and cat:",
     cosine_similarity2(word_embedding_hotel, word_embedding_cat)),
    ("Cosine similarity of hotel and accomodation :",
     cosine_similarity1(word_embedding_hotel, word_embedding_room)),
)
for caption, score in similarity_report:
    print(caption, score)


Cosine similarity of hotel and cat: 0.39999994039535525
Cosine similarity of hotel and accomodation : 0.8000001192092896


In [None]:
# numpy is imported only here, although cosine_similarity1/2 in earlier cells
# already reference `np`.
import numpy as np
# Cosine similarity of the first token vector of `word_embedding` with itself.
# NOTE(review): `word_embedding` is not defined in any cell visible here;
# presumably it is left over from an earlier session. Confirm before re-running.
(np.dot(word_embedding[0][0].detach().numpy(),word_embedding[0][0].detach().numpy()))/(np.linalg.norm(word_embedding[0][0].detach().numpy())*np.linalg.norm(word_embedding[0][0].detach().numpy()))

0.99999994

In [None]:
# Print the module hierarchy of the encoder together with its layer sizes.
print(model)

Encoder(
  (embedding): Embedding(30522, 128)
  (encoders): ModuleList(
    (0-11): 12 x EncoderLayer(
      (attention): SelfAttention(
        (query): Linear(in_features=128, out_features=128, bias=True)
        (key): Linear(in_features=128, out_features=128, bias=True)
        (value): Linear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (fc): Linear(in_features=128, out_features=128, bias=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
  )
  (fc_out): Linear(in_features=128, out_features=30522, bias=True)
)
