In [1]:
import torch
import math
from torch import nn

In [2]:
class InputEmbeddings(nn.Module):
  """Token-id lookup table whose output is scaled by sqrt(d_model).

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector.

  The scaling keeps the token signal comparable in magnitude to the
  sinusoidal position signal that is added right after this layer; without
  it the (small, Xavier-initialised) embedding values are dominated by the
  unit-scale sin/cos terms.
  """
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.vocab_size = vocab_size
    self.d_model = d_model
    self.embeddings = nn.Embedding(vocab_size, d_model)

  def forward(self, x):
    """Map integer token ids `x` to vectors; output gains a trailing d_model axis."""
    # Multiply by sqrt(d_model) as in the original Transformer formulation.
    return self.embeddings(x) * math.sqrt(self.d_model)


In [3]:
class PositionalEmbeddings(nn.Module):
  """Adds fixed sinusoidal position encodings to a batch of embeddings, then applies dropout.

  Args:
    seq_len: maximum number of positions the precomputed table covers.
    d_model: width of each embedding vector (even or odd).
    dropout: dropout probability applied to the summed result.

  The table is stored as the non-trainable buffer `pe` with shape
  (1, seq_len, d_model), so it follows the module across devices.
  """
  def __init__(self, seq_len, d_model, dropout):
    super().__init__()
    self.seq_len = seq_len
    self.d_model = d_model
    self.dropout = nn.Dropout(dropout)
    # Table of shape (seq_len, d_model): even columns hold sin, odd columns hold cos.
    pe = torch.zeros((seq_len, d_model))
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000) / d_model))
    positions = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1) # (seq_len, 1) so it broadcasts against div_term
    angles = positions * div_term
    pe[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even column, so trim
    # the angle table to match instead of failing with a shape mismatch.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    pe = pe.unsqueeze(0) # leading batch axis for broadcasting in forward()
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Add the first `x.shape[1]` position rows to `x` and apply dropout.

    Raises:
      ValueError: if `x` has more positions than the table was built for.
    """
    if x.shape[1] > self.seq_len:
      raise ValueError(
        f"sequence length {x.shape[1]} exceeds the positional table size {self.seq_len}"
      )
    x = x + self.pe[:, : x.shape[1], :]
    return self.dropout(x)



In [4]:
class LayerNormalization(nn.Module):
  """Normalises the last axis to zero mean / unit spread, then rescales.

  Args:
    eps: small constant added to the standard deviation to avoid division by zero.

  `alpha` (multiplicative) and `bias` (additive) are single-element learnable
  parameters shared across every feature.
  """
  def __init__(self, eps=1e-6):
    super().__init__()
    self.eps = eps
    self.alpha = nn.Parameter(torch.ones(1))  # learnable gain
    self.bias = nn.Parameter(torch.zeros(1))  # learnable shift

  def forward(self, x):
    """Return `alpha * (x - mean) / (std + eps) + bias`, statistics taken over the last axis."""
    centered = x - x.mean(dim=-1, keepdim=True)
    # eps is added to the standard deviation itself (not to the variance).
    spread = x.std(dim=-1, keepdim=True) + self.eps
    normalised = centered / spread
    return normalised * self.alpha + self.bias

In [5]:
class FeedForward(nn.Module):
  """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

  Args:
    d_model: input and output width.
    d_ff: hidden width between the two linear layers.
    dropout: dropout probability applied after the activation.
  """
  def __init__(self, d_model, d_ff, dropout):
    super().__init__()
    self.linear_1 = nn.Linear(d_model, d_ff)   # expand
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)
    self.linear_2 = nn.Linear(d_ff, d_model)   # project back

  def forward(self, x):
    """Apply the MLP to the last axis of `x`; the output has the same shape as `x`."""
    hidden = self.linear_1(x)
    hidden = self.relu(hidden)
    hidden = self.dropout(hidden)
    return self.linear_2(hidden)

In [6]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention with bias-free projections.

  Args:
    d_model: model width; must be divisible by `h`.
    h: number of attention heads.
    dropout: dropout probability applied to the attention weights.
  """
  def __init__(self, d_model, h, dropout):
    super().__init__()
    self.d_model = d_model
    self.h = h
    self.dropout = nn.Dropout(dropout)
    assert d_model % h == 0, "d_model must be divided by h"
    self.d_k = d_model // h  # per-head width
    self.w_q = nn.Linear(d_model, d_model, bias=False)
    self.w_k = nn.Linear(d_model, d_model, bias=False)
    self.w_v = nn.Linear(d_model, d_model, bias=False)
    self.w_o = nn.Linear(d_model, d_model, bias=False)

  def _split_heads(self, t):
    """(batch, seq_len, d_model) -> (batch, h, seq_len, d_k)."""
    return t.reshape(t.shape[0], t.shape[1], self.h, self.d_k).transpose(1, 2)

  def forward(self, q, k, v, mask):
    """Attend from `q` over `k`/`v`.

    Args:
      q, k, v: tensors of shape (batch, seq_len, d_model); `k` and `v` share
        their sequence length, which may differ from that of `q`.
      mask: optional tensor broadcastable to (batch, h, query_len, key_len).
        Positions where `mask == 0` are excluded from attention. `None`
        disables masking.

    Returns:
      Tensor of shape (batch, query_len, d_model).
    """
    query = self._split_heads(self.w_q(q))
    key = self._split_heads(self.w_k(k))
    value = self._split_heads(self.w_v(v))

    # Scaled dot product: dividing by sqrt(d_k) keeps the logits from growing
    # with the head width and saturating the softmax.
    attention_scores = torch.einsum('bhqd,bhkd->bhqk', query, key) / math.sqrt(self.d_k)

    if mask is not None:
      # Use the most negative finite value rather than -inf so that a row
      # whose keys are all masked produces finite weights instead of NaN.
      blocked_value = torch.finfo(attention_scores.dtype).min
      attention_scores = attention_scores.masked_fill(mask == 0, blocked_value)

    attention_scores = torch.softmax(attention_scores, dim=-1) # (batch, h, query_len, key_len)
    attention_scores = self.dropout(attention_scores)

    # Weighted sum of values, then merge the heads back into d_model.
    output = torch.einsum('bhqk,bhkd->bhqd', attention_scores, value)
    output = output.transpose(1, 2)
    output = output.reshape(output.shape[0], output.shape[1], -1)
    return self.w_o(output)

In [7]:
class EncoderBlock(nn.Module):
  """One pre-norm encoder layer: self-attention and feed-forward, each wrapped in a residual.

  Args:
    attention_block: the self-attention module for this layer.
    ffn: the position-wise feed-forward module for this layer.
    dropout: dropout probability applied to each sub-layer output before the residual add.
  """
  def __init__(self, attention_block: MultiHeadAttention, ffn: FeedForward, dropout):
    super().__init__()
    self.attention_block = attention_block
    self.norm1 = LayerNormalization()
    self.norm2 = LayerNormalization()
    self.ffn = ffn
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, encode_mask):
    """Run the layer on `x`; `encode_mask` is forwarded unchanged to the attention module."""
    # Sub-layer 1: normalise, self-attend (q = k = v), dropout, residual add.
    attn_in = self.norm1(x)
    x = x + self.dropout(self.attention_block(attn_in, attn_in, attn_in, encode_mask))
    # Sub-layer 2: normalise, feed-forward, dropout, residual add.
    return x + self.dropout(self.ffn(self.norm2(x)))

class Encoder(nn.Module):
  """Stack of encoder layers followed by a final layer normalisation.

  Args:
    layers: the encoder layers, applied in list order.
  """
  def __init__(self, layers: nn.ModuleList):
    super().__init__()
    self.layers = layers
    self.norm = LayerNormalization()

  def forward(self, x, encode_mask):
    """Pass `x` through every layer with the same `encode_mask`, then normalise."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, encode_mask)
    return self.norm(hidden)


In [8]:
class DecoderBlock(nn.Module):
  """One pre-norm decoder layer: self-attention, cross-attention, feed-forward.

  Args:
    cross_attention: attention module whose keys/values come from the encoder output.
    self_attention: attention module applied to the decoder stream itself.
    ffn: the position-wise feed-forward module.
    dropout: dropout probability applied to each sub-layer output before the residual add.

  Note the constructor takes `cross_attention` before `self_attention`.
  """
  def __init__(self, cross_attention: MultiHeadAttention, self_attention: MultiHeadAttention, ffn: FeedForward, dropout):
    super().__init__()
    self.cross_attention = cross_attention
    self.self_attention = self_attention
    self.ffn = ffn
    self.norm1 = LayerNormalization()
    self.norm2 = LayerNormalization()
    self.norm3 = LayerNormalization()
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, encode_out, src_mask, tgt_mask):
    """Run the layer.

    `tgt_mask` is given to self-attention; `src_mask` is given to
    cross-attention, where `encode_out` supplies both keys and values.
    """
    # Sub-layer 1: self-attention over the normalised decoder stream.
    self_in = self.norm1(x)
    x = x + self.dropout(self.self_attention(self_in, self_in, self_in, tgt_mask))

    # Sub-layer 2: queries from the decoder, keys/values from the encoder output.
    cross_in = self.norm2(x)
    x = x + self.dropout(self.cross_attention(cross_in, encode_out, encode_out, src_mask))

    # Sub-layer 3: position-wise feed-forward.
    return x + self.dropout(self.ffn(self.norm3(x)))

class Projection(nn.Module):
  """Linear map from the model width to one score per target-vocabulary entry.

  Args:
    d_model: width of the incoming vectors.
    tgt_vocab_size: number of output scores per position.
  """
  def __init__(self, d_model, tgt_vocab_size):
    super().__init__()
    self.d_model = d_model
    self.tgt_vocab_size = tgt_vocab_size
    self.projection = nn.Linear(d_model, tgt_vocab_size)

  def forward(self, x):
    """Return unnormalised scores; the last axis goes from d_model to tgt_vocab_size."""
    scores = self.projection(x)
    return scores

class Decoder(nn.Module):
  """Stack of decoder layers, a final normalisation, and the vocabulary head.

  Args:
    layers: the decoder layers, applied in list order.
    projection: module mapping the normalised output to vocabulary scores.
  """
  def __init__(self, layers: nn.ModuleList, projection: Projection):
    super().__init__()
    self.layers = layers
    self.norm = LayerNormalization()
    self.projection = projection

  def forward(self, x, encode_out, src_mask, tgt_mask):
    """Return log-probabilities over the last axis (log-softmax of the projected output)."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, encode_out, src_mask, tgt_mask)
    scores = self.projection(self.norm(hidden))
    # Log-probabilities, so the matching training criterion is a negative log-likelihood loss.
    return torch.log_softmax(scores, dim=-1)


In [9]:
class Transformer(nn.Module):
  """Container wiring embeddings, position encodings, encoder and decoder together.

  The model is driven through `encode` and `decode`; no `forward` is defined.
  """
  def __init__(self, src_embeddings: InputEmbeddings, tgt_embeddings: InputEmbeddings, src_pos: PositionalEmbeddings, tgt_pos: PositionalEmbeddings, encoder: Encoder, decoder: Decoder) -> None:
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embeddings
    self.tgt_embed = tgt_embeddings
    self.src_pos = src_pos
    self.tgt_pos = tgt_pos

  def encode(self, src, src_mask):
    """Embed source ids, add positions, and run the encoder stack."""
    embedded = self.src_pos(self.src_embed(src))
    return self.encoder(embedded, src_mask)

  def decode(self, tgt, encode_out, src_mask, tgt_mask):
    """Embed target ids, add positions, and run the decoder stack against `encode_out`."""
    embedded = self.tgt_pos(self.tgt_embed(tgt))
    return self.decoder(embedded, encode_out, src_mask, tgt_mask)


In [10]:
def build_transformer(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len, d_model = 512, num_layers = 6, num_heads = 8, d_ff = 2048, dropout=0.2):
  """Assemble a full encoder-decoder Transformer and Xavier-initialise its weight matrices.

  Args:
    src_vocab_size / tgt_vocab_size: sizes of the source and target embedding tables.
    src_seq_len / tgt_seq_len: lengths of the source and target positional tables.
    d_model: model width.
    num_layers: number of encoder layers and, separately, of decoder layers.
    num_heads: attention heads per attention module.
    d_ff: hidden width of every feed-forward module.
    dropout: dropout probability used throughout.

  Returns:
    The constructed `Transformer`.
  """
  # Modules are created in a fixed order so random initialisation is reproducible under a seed.
  src_embeddings = InputEmbeddings(src_vocab_size, d_model)
  tgt_embeddings = InputEmbeddings(tgt_vocab_size, d_model)
  src_pos_embeddings = PositionalEmbeddings(src_seq_len, d_model, dropout)
  tgt_pos_embeddings = PositionalEmbeddings(tgt_seq_len, d_model, dropout)

  encoder_blocks = [
    EncoderBlock(
      MultiHeadAttention(d_model, num_heads, dropout),
      FeedForward(d_model, d_ff, dropout),
      dropout,
    )
    for _ in range(num_layers)
  ]
  encoder = Encoder(nn.ModuleList(encoder_blocks))

  # DecoderBlock takes the cross-attention module first, then the self-attention module.
  decoder_blocks = [
    DecoderBlock(
      MultiHeadAttention(d_model, num_heads, dropout),
      MultiHeadAttention(d_model, num_heads, dropout),
      FeedForward(d_model, d_ff, dropout),
      dropout,
    )
    for _ in range(num_layers)
  ]
  projection_layer = Projection(d_model, tgt_vocab_size)
  decoder = Decoder(nn.ModuleList(decoder_blocks), projection_layer)

  transformer = Transformer(src_embeddings, tgt_embeddings, src_pos_embeddings, tgt_pos_embeddings, encoder, decoder)

  # Re-initialise every parameter with more than one dimension; 1-D parameters keep their defaults.
  for weight in (p for p in transformer.parameters() if p.dim() > 1):
    nn.init.xavier_uniform_(weight)
  return transformer

In [11]:
# Load the "ncduy/mt-en-vi" dataset by name via `load_dataset` and display
# the resulting object as the cell output.
from datasets import load_dataset, DatasetDict
dataset_name = "ncduy/mt-en-vi"
mt_dataset = load_dataset(dataset_name)
mt_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 2884451
    })
    validation: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 11316
    })
    test: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 11225
    })
})

In [12]:
# Preview the first ten training rows as a sanity check on the loaded data.
mt_dataset['train'][:10]

{'en': ["- Sorry, that question's not on here.",
  'He wants you to come with him immediately.',
  'I thought we could use some company.',
  'It was founded in 2008 by this anonymous programmer using a pseudonym Satoshi Nakamoto.',
  'With both of these methods, no two prints are exactly alike, but both reveal dramatic images of the fish.',
  'From these contexts was born an installation in the art space of the Queensland University of Technology in Brisbane.',
  'I have lived to see something which I never expected.',
  'It is the model for all future relationships with friends, with partners and with their own children.',
  'Welcome him as your brother.',
  'So biologists can make all the mutant fruit flies they want without worrying about it.'],
 'vi': ['- Xin lỗi, nhưng mà ở đây không có câu hỏi đấy.',
  'Ông ấy muốn bố đi với ông ấy ngay lập tức',
  'Tôi nghĩ chúng ta có thể muốn vài người bạn đồng hành.',
  'Nó được sáng lập vào năm 2008 bởi một lập trình viên vô danh dưới bút da

In [13]:
# A single pretrained tokenizer ("xlm-roberta-base") is used for both the
# English and the Vietnamese side.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


In [14]:
# Fixed-seed subsample of each split: 200,000 training rows and 100 rows each
# for validation and test. The validation split is stored under the key "val".
small_dataset = DatasetDict({
    "train": mt_dataset["train"].shuffle(seed=1111).select(range(200000)),
    "val": mt_dataset["validation"].shuffle(seed=1111).select(range(100)),
    "test": mt_dataset["test"].shuffle(seed=1111).select(range(100)),
})

In [15]:
def preprocess(example, max_length=256):
  """Tokenize a batch of sentence pairs for sequence-to-sequence training.

  Args:
    example: mapping with an 'en' entry (source sentences) and a 'vi' entry
      (target sentences).
    max_length: every sequence is padded or truncated to this many tokens.

  Returns:
    The tokenizer output for the English text, with an added "labels" entry
    holding the token ids of the Vietnamese text.

  The deprecated `as_target_tokenizer()` context manager is not used: both
  sides are encoded by the same module-level `tokenizer`, so a plain second
  call is sufficient here.
  """
  model_inputs = tokenizer(example['en'], padding="max_length", max_length=max_length, truncation=True)
  labels = tokenizer(example['vi'], padding="max_length", max_length=max_length, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [16]:
# Prepare dataset for training
small_train_set = small_dataset.map(
    preprocess,
    batched=True,
    batch_size=64,
)


Map:   0%|          | 0/200000 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [17]:
# Request torch-typed output for the three columns the model consumes.
small_train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [18]:
# One loader per split. No `shuffle` argument is passed, so batches follow the
# stored dataset order. The test loader uses batch_size=1.
from torch.utils.data import DataLoader
train_dataloader = DataLoader(small_train_set['train'], batch_size=4)
val_dataloader = DataLoader(small_train_set['val'], batch_size=4)
test_dataloader = DataLoader(small_train_set['test'], batch_size=1)

In [19]:
# Maximum source / target sequence lengths used to size the positional tables.
# NOTE(review): these match the max_length=256 hard-coded in `preprocess`; keep them in sync.
src_max_len = 256
tgt_max_len = 256

In [20]:
# Build the model with default hyper-parameters; the source and target sides
# both use the shared tokenizer's vocabulary size.
model = build_transformer(src_vocab_size=tokenizer.vocab_size, tgt_vocab_size = tokenizer.vocab_size, src_seq_len = src_max_len, tgt_seq_len = tgt_max_len)

In [21]:
def causal_mask(size: int):
  """Return a (1, size, size) boolean mask that is True where attention is allowed.

  Entry [0, i, j] is True when j <= i, i.e. position i may look at itself and
  at earlier positions only.

  The mask is built by comparing with zero rather than with `~`: bitwise NOT
  on an integer tensor maps 1 -> -2 and 0 -> -1, neither of which is 0, so a
  mask produced that way blocks nothing when tested with `mask == 0`.
  """
  above_diagonal = torch.triu(torch.ones(1, size, size, dtype=torch.int64), diagonal=1)
  return above_diagonal == 0

In [22]:
def greedy_decode(model: Transformer, tokenizer, source, source_mask, max_seq_len, device):
  """Generate target token ids one at a time, always taking the highest-scoring token.

  Args:
    model: the Transformer to decode with.
    tokenizer: supplies `bos_token_id` (first decoder token) and
      `eos_token_id` (stop token).
    source: source token ids; intended for a single sentence, since the
      chosen token is read with `.item()`.
    source_mask: attention mask for the source, passed to both encoder and
      decoder.
    max_seq_len: upper bound on the length of the returned sequence,
      including the start token.
    device: device on which the decoder input is created.

  Returns:
    1-D tensor of token ids starting with the BOS id and, if generation
    stopped early, ending with the EOS id.

  Runs without gradient tracking. NOTE(review): the model's train/eval mode is
  not changed here; callers presumably want `model.eval()` first.
  """
  with torch.no_grad():
    # The source is encoded once and reused at every decoding step.
    encoder_output = model.encode(source, source_mask)
    decoder_input = torch.tensor([[tokenizer.bos_token_id]], dtype=source.dtype, device=device)

    while decoder_input.size(1) < max_seq_len:
      decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask)

      out = model.decode(decoder_input, encoder_output, source_mask, decoder_mask)

      # Only the scores at the last position decide the next token.
      out = out[:, -1, :]
      _, next_word = torch.max(out, dim=1)

      # Append the chosen token to the decoder input for the next step.
      next_word = next_word.item()
      next_token = torch.tensor([[next_word]], dtype=source.dtype, device=device)
      decoder_input = torch.cat([decoder_input, next_token], dim=1)

      # Stop at end-of-sequence. (The CLS id must not be used here: for
      # tokenizers where CLS is the same token as BOS it is the *start*
      # marker, so generation would practically never terminate early.)
      if next_word == tokenizer.eos_token_id:
        break

  return decoder_input.squeeze(0) # remove batch dim


In [23]:
def run_validation(model: Transformer, val_loader, tokenizer, loss_fn, device):
  """Compute and print the mean teacher-forced loss over `val_loader`.

  Args:
    model: the Transformer to evaluate; it is switched to eval mode and left there.
    val_loader: iterable of batches with "input_ids", "attention_mask" and "labels" tensors.
    tokenizer: supplies `pad_token_id` for the decoder padding mask.
    loss_fn: criterion called as `loss_fn(log_probs, targets)` on flattened tensors.
    device: device the batch tensors are moved to.

  Returns:
    The mean per-batch loss (NaN if the loader yields no batches).

  Every tensor used here is derived from the current validation batch; the
  function does not read any notebook-level training variables.
  """
  model.eval()
  total_loss = 0.0
  num_batches = 0
  with torch.no_grad():
    for batch in val_loader:
      enc_input_ids = batch["input_ids"].to(device)
      # (batch, src_len) -> (batch, 1, 1, src_len), the same layout the training loop uses,
      # so the mask broadcasts over heads and query positions.
      encoder_mask = batch["attention_mask"].unsqueeze(1).unsqueeze(2).to(device)

      labels = batch['labels'].to(device)
      dec_input_ids = labels[:, :-1]  # decoder sees tokens 0..n-2
      target = labels[:, 1:]          # and is scored on tokens 1..n-1
      current_seq_len = dec_input_ids.size(1)
      # Hide padding keys and future positions; both operands must be on `device`.
      padding_mask = (dec_input_ids != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
      decoder_mask = padding_mask & causal_mask(current_seq_len).to(device)

      enc_output = model.encode(enc_input_ids, encoder_mask)
      dec_output = model.decode(dec_input_ids, enc_output, encoder_mask, decoder_mask)

      # Flatten to (batch * seq_len, vocab) vs (batch * seq_len,) for the criterion.
      dec_output = dec_output.reshape(-1, dec_output.size(-1))
      loss = loss_fn(dec_output, target.reshape(-1))
      total_loss += loss.item()
      num_batches += 1

  mean_loss = total_loss / num_batches if num_batches else float('nan')
  print(f"Val loss: {mean_loss}")
  return mean_loss

In [24]:
# Training cell: moves the model to the available device and runs a
# teacher-forced training loop, validating at the end of every epoch.
# NOTE(review): every name assigned below (including the per-batch tensors) is
# a notebook global and remains visible to other cells after this one runs.
from tqdm import tqdm
import warnings
from torch.optim import AdamW
# NOTE(review): this silences *all* warnings for the rest of the session.
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}")
model = model.to(device)
# NOTE(review): AdamW is imported above, but the optimizer actually used is plain Adam.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, eps=1e-9)
# NLLLoss expects log-probabilities, which Decoder.forward produces via log_softmax.
# Targets equal to the pad id are ignored.
loss_fn = nn.NLLLoss(ignore_index=tokenizer.pad_token_id).to(device)

num_epochs = 1
for epoch in range(num_epochs):
  model.train()
  batch_iterator = tqdm(train_dataloader, desc=f"Processing epoch {epoch}")
  for batch in batch_iterator:
    encoder_input_ids = batch['input_ids'].to(device)
    # (batch, src_len) -> (batch, 1, 1, src_len) so it broadcasts over heads and query positions.
    encoder_attention_mask = batch['attention_mask'].unsqueeze(1).unsqueeze(2).to(device)

    decoder_input_ids = batch['labels'][:, :-1].to(device) # Teacher-forcing style training: feed the gold tokens shifted by one
    current_seq_len = decoder_input_ids.size(1)
    # Combine the padding mask with the causal mask for decoder self-attention.
    decoder_attention_mask = ((decoder_input_ids != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2) & causal_mask(current_seq_len).to(device)).to(device)

    # Run the tensors through the transformer
    encoder_output = model.encode(encoder_input_ids, encoder_attention_mask)
    decoder_output = model.decode(decoder_input_ids, encoder_output, encoder_attention_mask, decoder_attention_mask)
    # Targets are the labels shifted left by one relative to the decoder input.
    label = batch['labels'][:, 1:].to(device)

    # Flatten to (batch * seq_len,) and (batch * seq_len, vocab) for the criterion.
    label = label.reshape(-1).to(device)
    decoder_output = decoder_output.reshape(-1, tokenizer.vocab_size).to(device)
    loss = loss_fn(decoder_output, label)

    batch_iterator.set_postfix(loss=f"{loss.item():.3f}")

    loss.backward()

    # Gradients are cleared after the step, so each batch starts from zero.
    optimizer.step()
    optimizer.zero_grad()
  run_validation(model, val_dataloader, tokenizer, loss_fn, device)


Using device cuda


Processing epoch 0:   0%|          | 48/50000 [00:25<7:26:20,  1.87it/s, loss=12.318]


KeyboardInterrupt: 