# **get_config()**

In [17]:
from src.config import get_config

config = get_config()
config

{'batch_size': 8,
 'num_epochs': 20,
 'lr': 0.0001,
 'seq_len': 350,
 'd_model': 512,
 'datasource': 'opus_books',
 'lang_src': 'en',
 'lang_tgt': 'it',
 'model_folder': 'weights',
 'model_basename': 'tmodel_',
 'preload': 'latest',
 'tokenizer_file': 'assets/tokenizer_{0}.json',
 'experiment_name': 'runs/tmodel'}

# **get_all_sentences()**

## eng-to-it

In [52]:
import json

eng_it_data = []
with open(
    file = r"datasets/train_eng_it.json", 
    mode = "r", 
    encoding = "utf-8") as f:

    for line in f:
        eng_it_data.append(json.loads(line))

eng_it_data

[{'id': '0',
  'translation': {'en': 'Source: Project Gutenberg',
   'it': 'Source: www.liberliber.it/Audiobook available here'}},
 {'id': '1', 'translation': {'en': 'Jane Eyre', 'it': 'Jane Eyre'}},
 {'id': '2',
  'translation': {'en': 'Charlotte Bronte', 'it': 'Charlotte Brontë'}},
 {'id': '3', 'translation': {'en': 'CHAPTER I', 'it': 'PARTE PRIMA'}},
 {'id': '4',
  'translation': {'en': 'There was no possibility of taking a walk that day.',
   'it': 'I. In quel giorno era impossibile passeggiare.'}},
 {'id': '5',
  'translation': {'en': 'We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further out-door exercise was now out of the question.',
   'it': "La mattina avevamo errato per un'ora nel boschetto spogliato di foglie, ma dopo pranzo (quando non vi erano invitati, la signora Reed desinava pr

In [53]:
from src.train import get_all_sentences

### source

In [54]:
eng_it_source = get_all_sentences(
    ds = eng_it_data,
    lang = "en"
)

eng_it_source

<generator object get_all_sentences at 0x000002251F31A430>

In [22]:
next(eng_it_source)

'Source: Project Gutenberg'

### target

In [55]:
eng_it_target = get_all_sentences(
    ds = eng_it_data,
    lang = "it"
)

eng_it_target

<generator object get_all_sentences at 0x000002251F31A4A0>

In [25]:
next(eng_it_target)

'Source: www.liberliber.it/Audiobook available here'

## eng-to-fil

In [56]:
from datasets import load_dataset

eng_fil_data = load_dataset(
    "rhyliieee/tagalog-filipino-english-translation",
    split = "train")

eng_fil_data

Dataset({
    features: ['tagalog', 'english'],
    num_rows: 84177
})

In [57]:
from src.train import get_all_sentences

### source

In [58]:
eng_fil_source = get_all_sentences(
    ds = eng_fil_data,
    lang = "english"
)

eng_fil_source

<generator object get_all_sentences at 0x000002251F31A9E0>

In [9]:
next(eng_fil_source)

'Describe what you would see if you went to the Grand Canyon.'

### target

In [59]:
eng_fil_target = get_all_sentences(
    ds = eng_fil_data,
    lang = "tagalog"
)

eng_fil_target

<generator object get_all_sentences at 0x000002251F3533C0>

In [12]:
next(eng_fil_target)

' Ilarawan kung ano ang makikita mo kung pupunta ka sa Grand Canyon.'

# **get_or_build_tokenizer() -> Tokenizer**

In [60]:
from src.train import get_or_build_tokenizer

## eng-to-fil

### source

In [61]:
eng_fil_src_tokenizer = get_or_build_tokenizer(
    config = config,
    ds = eng_fil_data,
    lang = "english"
)

eng_fil_src_tokenizer

<tokenizers.Tokenizer at 0x2251d64f490>

#### .get_vocab()

In [62]:
eng_fil_src_tokenizer.get_vocab(with_added_tokens = True)

{'glucose': 3550,
 'Everyone': 6142,
 'Em': 18501,
 '1812': 13307,
 'calculating': 4148,
 'gravely': 26842,
 'Bordeaux': 28050,
 'event': 585,
 'Telephone': 18712,
 'See': 9270,
 'glimmered': 22715,
 'unfolds': 14483,
 'GPUs': 12084,
 'origins': 5905,
 'servings': 14963,
 'yo': 11768,
 'Adults': 20471,
 'flavonoids': 26789,
 'Arkansas': 16823,
 'workshop': 11028,
 'Caves': 23460,
 'CTA': 13337,
 'contributed': 4928,
 'Repetitive': 23933,
 'trenches': 25060,
 'gene': 5010,
 'measurement': 6237,
 'Road': 5782,
 'Paella': 19597,
 'tapestry': 12027,
 'Cardio': 16188,
 'resided': 24857,
 'jealousy': 22798,
 'Provide': 678,
 'Zhang': 29250,
 'keeps': 4051,
 'chilies': 19817,
 'evident': 6173,
 'Allocate': 12618,
 'clot': 24258,
 'Dollar': 12643,
 'deregulation': 22603,
 'pharmacy': 27236,
 'ethnicities': 19921,
 'internet': 521,
 'gray': 5987,
 'toilets': 8574,
 'MagSafe': 20746,
 'Startups': 19666,
 'Approval': 25343,
 'brush': 4721,
 'suffix': 19202,
 'Substance': 24006,
 'Oklahoma': 16300

#### .get_vocab_size()

In [63]:
eng_fil_src_tokenizer.get_vocab_size(with_added_tokens = True)

30000

#### .encode()

In [64]:
tokenized_src_text = eng_fil_src_tokenizer.encode(
    sequence = "Describe what you would see if you went to the Grand Canyon."
)

tokenized_src_text

Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

##### .tokens

In [65]:
tokenized_src_text.tokens

['Describe',
 'what',
 'you',
 'would',
 'see',
 'if',
 'you',
 'went',
 'to',
 'the',
 'Grand',
 'Canyon',
 '.']

##### .ids

In [66]:
tokenized_src_text.ids

[240, 152, 29, 80, 340, 116, 29, 1379, 8, 6, 4881, 6072, 4]

##### .attention_mask

In [67]:
tokenized_src_text.attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

#### .token_to_id()

In [68]:
sos_token = eng_fil_src_tokenizer.token_to_id(
    token = "[SOS]"
)

sos_token

2

In [69]:
eos_token = eng_fil_src_tokenizer.token_to_id(
    token = "[EOS]"
)

eos_token

3

In [70]:
pad_token = eng_fil_src_tokenizer.token_to_id(
    token = "[PAD]"
)

pad_token

1

In [71]:
enc_num_padding_tokens = config["seq_len"] - len(tokenized_src_text.ids) - 2
enc_num_padding_tokens

335

#### **ENCODER INPUT**

In [72]:
import torch

encoder_input = torch.cat(
            [
                torch.tensor([sos_token], dtype = torch.int64),
                torch.tensor(tokenized_src_text.ids, dtype = torch.int64),
                torch.tensor([eos_token], dtype = torch.int64),
                torch.tensor([pad_token] * enc_num_padding_tokens, dtype = torch.int64),
            ],
            dim = 0,
        ) 

encoder_input

tensor([   2,  240,  152,   29,   80,  340,  116,   29, 1379,    8,    6, 4881,
        6072,    4,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [73]:
encoder_input.shape

torch.Size([350])

#### **ENCODER MASK**

In [74]:
encoder_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).int()
encoder_mask

tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [75]:
encoder_mask.shape

torch.Size([1, 1, 350])

#### .save()

In [76]:
eng_fil_src_tokenizer.save("path")

#### .decode()

In [77]:
eng_fil_src_tokenizer.decode(encoder_input.tolist(), 
                             skip_special_tokens = False)

'[SOS] Describe what you would see if you went to the Grand Canyon . [EOS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

### target

In [78]:
eng_fil_tgt_tokenizer = get_or_build_tokenizer(
    config = config,
    ds = eng_fil_data,
    lang = "tagalog"
)

eng_fil_tgt_tokenizer

<tokenizers.Tokenizer at 0x2251d650890>

#### .get_vocab()

In [79]:
eng_fil_tgt_tokenizer.get_vocab(with_added_tokens = True)

{'16π': 29352,
 'Janeiro': 16693,
 'potluck': 21008,
 'Explore': 24642,
 'magwasak': 28465,
 'sand': 26265,
 'cool': 6240,
 'paggapas': 26060,
 'Concord': 21312,
 'here': 15107,
 'Monte': 17436,
 'Bengal': 15983,
 'keyword': 1361,
 'Usain': 23318,
 'x1': 4516,
 'pamahalaang': 11757,
 'Mahaba': 11081,
 'laban': 755,
 'intercept': 6129,
 'rebolusyong': 11244,
 'Nurse': 24993,
 'Chinatown': 24551,
 '754': 29447,
 'kabataang': 6547,
 'tk': 21134,
 'singaw': 3355,
 'daang': 5103,
 'distributor': 20598,
 'lasa': 873,
 'macroeconomics': 19610,
 'Kahalagahan': 5285,
 'tingga': 22518,
 'Galahad': 18156,
 'Munich': 15485,
 'simpleng': 555,
 'nakadapo': 25974,
 'smart': 2322,
 'toothbrush': 8655,
 'Parthenon': 23161,
 'pagkakabukod': 6322,
 'pagtitiwala': 5070,
 'Party': 4795,
 'almond': 4439,
 'tagal': 2678,
 'ideya': 241,
 'Cut': 15415,
 'land': 15710,
 'activated': 9169,
 'Magento': 16712,
 'binaliktad': 18410,
 '>");': 18956,
 'right': 4095,
 'pagpapaalala': 18729,
 'Capital': 12754,
 'iniksy

#### .get_vocab_size()

In [80]:
eng_fil_tgt_tokenizer.get_vocab_size(with_added_tokens = True)

30000

#### .encode()

In [81]:
tokenized_tgt_text = eng_fil_tgt_tokenizer.encode(
    sequence = ' Ilarawan kung ano ang makikita mo kung pupunta ka sa Grand Canyon.'
)

tokenized_tgt_text

Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

##### .tokens

In [82]:
tokenized_tgt_text.tokens

['Ilarawan',
 'kung',
 'ano',
 'ang',
 'makikita',
 'mo',
 'kung',
 'pupunta',
 'ka',
 'sa',
 'Grand',
 'Canyon',
 '.']

##### .ids

In [83]:
tokenized_tgt_text.ids

[260, 27, 256, 9, 1028, 66, 27, 4868, 100, 5, 4662, 5827, 7]

##### .attention_mask

In [84]:
tokenized_tgt_text.attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [85]:
dec_num_padding_tokens = config["seq_len"] - len(tokenized_tgt_text.ids) - 1
dec_num_padding_tokens

336

#### **DECODER INPUT**

In [86]:
decoder_input = torch.cat(
            [
                torch.tensor([sos_token], dtype = torch.int64),
                torch.tensor(tokenized_tgt_text.ids, dtype = torch.int64),
                torch.tensor([pad_token] * dec_num_padding_tokens, dtype = torch.int64),
            ],
            dim = 0,
        )
decoder_input

tensor([   2,  260,   27,  256,    9, 1028,   66,   27, 4868,  100,    5, 4662,
        5827,    7,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [87]:
decoder_input.shape

torch.Size([350])

#### **DECODER MASK**

In [88]:
from src.dataset import causal_mask

decoder_mask = (decoder_input != pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0))
decoder_mask

tensor([[[1, 0, 0,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]], dtype=torch.int32)

In [89]:
decoder_mask.shape

torch.Size([1, 350, 350])

#### .decode()

- The output for the decoder is `shifted-right` (added [SOS]).

In [90]:
eng_fil_tgt_tokenizer.decode(decoder_input.tolist(), 
                             skip_special_tokens = False)

'[SOS] Ilarawan kung ano ang makikita mo kung pupunta ka sa Grand Canyon . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

## label

### **LABEL**

In [91]:
label = torch.cat(
            [
                torch.tensor(tokenized_tgt_text.ids, dtype = torch.int64),
                torch.tensor([eos_token], dtype = torch.int64),
                torch.tensor([pad_token] * dec_num_padding_tokens, dtype = torch.int64),
            ],
            dim = 0,
        )

label

tensor([ 260,   27,  256,    9, 1028,   66,   27, 4868,  100,    5, 4662, 5827,
           7,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

### .decode()

In [92]:
eng_fil_tgt_tokenizer.decode(label.tolist(), 
                             skip_special_tokens = False)

'Ilarawan kung ano ang makikita mo kung pupunta ka sa Grand Canyon . [EOS] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

# **get_bundled_input_and_mask()**

In [32]:
from src.train import get_bundled_input_and_mask

## **ENCODER INPUT**

In [38]:
encoder_input_and_mask = get_bundled_input_and_mask(
    sos_token = sos_token,
    eos_token = eos_token,
    pad_token = pad_token,
    tokenized_src_text = tokenized_src_text,
    enc_num_padding_tokens = enc_num_padding_tokens
)

encoder_input, encoder_mask = encoder_input_and_mask

In [39]:
encoder_input

tensor([   2,  240,  152,   29,   80,  340,  116,   29, 1379,    8,    6, 4881,
        6072,    4,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [40]:
encoder_input.shape

torch.Size([350])

## **ENCODER MASK**

In [41]:
encoder_mask

tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [42]:
encoder_mask.shape

torch.Size([1, 1, 350])

## **DECODER INPUT**

In [55]:
decoder_input_and_mask = get_bundled_input_and_mask(
    sos_token = sos_token,
    pad_token = pad_token,
    eos_token = eos_token,
    tokenized_tgt_text = tokenized_tgt_text,
    dec_num_padding_tokens = dec_num_padding_tokens
)

decoder_input, decoder_mask = decoder_input_and_mask

In [56]:
decoder_input

tensor([   2,  260,   27,  256,    9, 1028,   66,   27, 4868,  100,    5, 4662,
        5827,    7,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [57]:
decoder_input.shape

torch.Size([350])

## **DECODER MASK**

In [58]:
decoder_mask

tensor([[[1, 0, 0,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]], dtype=torch.int32)

In [59]:
decoder_mask.shape

torch.Size([1, 350, 350])

# **get_model() -> Transformer**

In [94]:
from src.train import get_model

In [95]:
from torch import device

device = "cpu"

model = get_model(
    config = config,
    vocab_src_len = eng_fil_src_tokenizer.get_vocab_size(),
    vocab_tgt_len = eng_fil_tgt_tokenizer.get_vocab_size()
).to(device)

model

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (w_o): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization

In [96]:
type(model)

src.model.Transformer

In [97]:
model.state_dict()

OrderedDict([('encoder.layers.0.self_attention_block.w_q.weight',
              tensor([[-0.0651, -0.0644, -0.0467,  ..., -0.0197,  0.0674, -0.0147],
                      [-0.0437,  0.0225,  0.0072,  ...,  0.0409, -0.0559, -0.0642],
                      [-0.0299, -0.0045, -0.0492,  ..., -0.0454, -0.0519,  0.0244],
                      ...,
                      [ 0.0404,  0.0404,  0.0701,  ...,  0.0191,  0.0164,  0.0339],
                      [-0.0507, -0.0274, -0.0642,  ..., -0.0217,  0.0421, -0.0128],
                      [ 0.0565, -0.0170, -0.0552,  ..., -0.0199,  0.0032,  0.0305]])),
             ('encoder.layers.0.self_attention_block.w_k.weight',
              tensor([[ 0.0622,  0.0594,  0.0558,  ..., -0.0683,  0.0518, -0.0060],
                      [ 0.0561,  0.0092,  0.0465,  ..., -0.0171,  0.0711, -0.0700],
                      [-0.0730, -0.0415,  0.0068,  ..., -0.0181,  0.0536,  0.0108],
                      ...,
                      [-0.0040,  0.0498,  0.0600,  ...,

## .encode() - ENCODER OUTPUT

In [98]:
encoder_output = model.encode(
    src = encoder_input,
    src_mask = encoder_mask
)

encoder_output

tensor([[[-1.0758, -0.3183,  0.3030,  ...,  0.9124, -0.7326,  0.5934],
         [-0.6861, -1.5884,  1.5389,  ...,  1.4303, -0.6450, -0.4349],
         [ 0.3746, -0.9457,  1.3289,  ...,  0.5405, -1.2694, -0.7005],
         ...,
         [ 0.1409, -1.2523,  1.0696,  ...,  1.6832, -0.8411, -0.4607],
         [ 0.0254, -2.2847,  1.1141,  ...,  1.2877, -0.9019, -0.2405],
         [-0.2786, -1.8332,  1.0711,  ...,  0.9532, -0.9232,  0.9248]]],
       grad_fn=<AddBackward0>)

In [99]:
encoder_output.shape

torch.Size([1, 350, 512])

## .decode() - DECODER OUTPUT

In [100]:
decoder_output = model.decode(
    encoder_output = encoder_output,
    src_mask = encoder_mask,
    tgt = decoder_input,
    tgt_mask = decoder_mask
)

decoder_output

tensor([[[-0.9673, -0.0945,  0.6777,  ...,  0.1936,  0.0362,  1.3110],
         [-0.0672, -0.7670,  0.2378,  ...,  1.0070, -0.0686,  0.5025],
         [-0.7922,  0.1111,  0.2079,  ...,  1.1729, -0.3677,  0.8631],
         ...,
         [-0.4691, -0.4147,  1.2252,  ...,  0.4172,  1.0044,  1.3540],
         [-0.3436, -0.7128,  0.1191,  ..., -0.0695,  0.2840,  1.1487],
         [-1.4476, -1.1025,  0.7166,  ...,  0.5140,  0.6403,  0.3491]]],
       grad_fn=<AddBackward0>)

In [101]:
decoder_output.shape

torch.Size([1, 350, 512])

In [102]:
decoder_output[:, -1].shape

torch.Size([1, 512])

## .project()

### for ALL **TOKENS** IN SEQUENCE

In [136]:
logits = model.project(
    x = decoder_output
)

logits

tensor([[[ 0.2369,  0.1613,  0.3271,  ...,  0.3588,  0.1631,  0.1601],
         [ 0.3664,  0.0593,  0.0747,  ...,  0.2508,  0.2975,  0.1092],
         [ 0.1905,  0.1513,  0.1399,  ...,  0.3924,  0.1827,  0.0325],
         ...,
         [ 0.2921,  0.0548,  0.0913,  ...,  0.3638,  0.0856,  0.0445],
         [ 0.3252, -0.0749,  0.0987,  ...,  0.3251,  0.0161,  0.1077],
         [ 0.1845, -0.0841,  0.0419,  ...,  0.3121, -0.0399,  0.0511]]],
       grad_fn=<ViewBackward0>)

In [137]:
logits.shape # Each TOKEN in the SEQUENCE has a LOGIT DISTRIBUTION over the ENTIRE VOCABULARY

torch.Size([1, 350, 30000])

### for LAST **TOKEN** IN SEQUENCE

In [138]:
last_token_logits = model.project(
    x = decoder_output[:, -1]
)

last_token_logits

tensor([[ 0.1845, -0.0841,  0.0419,  ...,  0.3121, -0.0399,  0.0511]],
       grad_fn=<AddmmBackward0>)

In [139]:
last_token_logits.shape # The LOGIT DISTRIBUTION of the LAST TOKEN in the SEQUENCE over the ENTIRE VOCABULARY

torch.Size([1, 30000])

# **greedy_decode()**

- The implementation automatically does a `model.project()` on the **LAST TOKEN** only, then it iterates until seq_len is exhausted or until [EOS]

In [107]:
from src.train import greedy_decode

In [108]:
model_out = greedy_decode(
    model = model, 
    source = encoder_input, 
    source_mask = (encoder_input != pad_token).unsqueeze(0).unsqueeze(0).int(), 
    tokenizer_src = eng_fil_src_tokenizer, 
    max_len = config["seq_len"], 
    device = device
    )

model_out

tensor([    2,  3706,  7505,  7505, 19551, 22777, 22579, 10684, 23033, 19551,
          305, 22777, 17437, 17437, 27867, 19551, 27867, 19551, 27867,  7748,
        22777, 27867, 27867, 16430, 23122, 19551,   350,  8665,   350, 19551,
        17437, 17437,  4111, 19551, 22777, 17437, 19551, 19551, 19551, 27867,
        19551, 27867, 22777, 22777, 19551, 23027, 19551, 17437, 19551, 19551,
        27867,  7505, 17437, 19551, 22777, 19551, 19551, 22777, 19551, 22777,
        27867, 22777, 23343, 23343,   350, 27867, 17437, 19551, 22777, 27867,
        23033,  7505, 27867, 19551, 19551,   278, 19551, 27867, 19551, 19551,
        19551, 20713, 19551,  7505, 19551, 11674, 27867, 17437, 22777, 19551,
          350, 17437, 19551, 17437, 23343, 19551,  8656, 19551,  7505, 19551,
        19551, 19551,   278, 23033, 22777, 17437, 22777, 23343, 17499, 27619,
          350,   350, 17437, 27867, 19551, 17437,   278,   350, 19551, 23343,
        23343,   350, 19551, 17437,   278,   278,  7505, 23343, 

In [111]:
model_out.shape

torch.Size([350])

In [112]:
" ".join([eng_fil_tgt_tokenizer.id_to_token(id = id) for id in model_out])

'[SOS] pinakamahalaga BY BY interval Copper }.") kalabang Marahan interval klima Copper Mood Mood bismuth interval bismuth interval bismuth da Copper bismuth bismuth pagkapira Owen interval print 1991 print interval Mood Mood inilarawan interval Copper Mood interval interval interval bismuth interval bismuth Copper Copper interval Mall interval Mood interval interval bismuth BY Mood interval Copper interval interval Copper interval Copper bismuth Copper X5 X5 print bismuth Mood interval Copper bismuth Marahan BY bismuth interval interval of interval bismuth interval interval interval islang interval BY interval iikot bismuth Mood Copper interval print Mood interval Mood X5 interval tutuklasin interval BY interval interval interval of Marahan Copper Mood Copper X5 Raphael Sulitin print print Mood bismuth interval Mood of print interval X5 X5 print interval Mood of of BY X5 of interval bismuth print Kinakabahan Mood of inilarawan imaheng of X5 X5 Congo 024 X5 of Mood 024 X5 024 X5 X5 Moo

# **Under the hood**

## *Loss Computation*

### **using nn.CrossEntropyLoss**

In [None]:
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss(
    ignore_index = eng_fil_src_tokenizer.token_to_id('[PAD]'), 
    label_smoothing = 0.1
).to("cpu")

loss = loss_fn(
    logits.view(-1, eng_fil_tgt_tokenizer.get_vocab_size()), 
    label.view(-1)
)

In [145]:
loss.item()

10.314189910888672

### **manual computation**

In [146]:
logits = logits.view(-1, eng_fil_tgt_tokenizer.get_vocab_size())
logits

tensor([[ 0.2369,  0.1613,  0.3271,  ...,  0.3588,  0.1631,  0.1601],
        [ 0.3664,  0.0593,  0.0747,  ...,  0.2508,  0.2975,  0.1092],
        [ 0.1905,  0.1513,  0.1399,  ...,  0.3924,  0.1827,  0.0325],
        ...,
        [ 0.2921,  0.0548,  0.0913,  ...,  0.3638,  0.0856,  0.0445],
        [ 0.3252, -0.0749,  0.0987,  ...,  0.3251,  0.0161,  0.1077],
        [ 0.1845, -0.0841,  0.0419,  ...,  0.3121, -0.0399,  0.0511]],
       grad_fn=<ViewBackward0>)

In [None]:
logits[0].sum() # NOT NORMALIZED YET (as you can see, it sums up to a value greater than 1)

tensor(52.3431, grad_fn=<SumBackward0>)

In [117]:
logits.shape

torch.Size([350, 30000])

In [118]:
label = label.view(-1)
label

tensor([ 260,   27,  256,    9, 1028,   66,   27, 4868,  100,    5, 4662, 5827,
           7,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,   

In [120]:
label.shape

torch.Size([350])

`logsumexp` is just a **numerically safe way to compute the log of a sum of exponentials**.
Without it, `-log P(y)` could **produce NaN or infinity** when logits are extreme.

In [121]:
log_probs = logits - torch.logsumexp(
    input = logits, 
    dim = 1, 
    keepdim = True
)
log_probs

tensor([[-10.0909, -10.1666, -10.0007,  ...,  -9.9691, -10.1648, -10.1678],
        [ -9.9609, -10.2680, -10.2527,  ..., -10.0766, -10.0299, -10.2181],
        [-10.1372, -10.1764, -10.1878,  ...,  -9.9353, -10.1450, -10.2952],
        ...,
        [-10.0355, -10.2728, -10.2363,  ...,  -9.9637, -10.2420, -10.2830],
        [-10.0016, -10.4017, -10.2280,  ..., -10.0017, -10.3107, -10.2190],
        [-10.1421, -10.4107, -10.2847,  ..., -10.0145, -10.3665, -10.2755]],
       grad_fn=<SubBackward0>)

Below is an example **NLL for the first token**. In our label, the `first token ID is 260`. The `logit value at index 260 in our log_probs` is below. 

In [133]:
-log_probs[0][label[0]]

tensor(10.2334, grad_fn=<NegBackward0>)

In [125]:
token_losses = -log_probs[torch.arange(350), label]
token_losses

tensor([10.2334, 10.3615, 10.2833, 10.1227, 10.3003, 10.4964, 10.2619, 10.2890,
        10.3175, 10.5633, 10.3826, 10.2347, 10.2309, 10.3024, 10.3959, 10.4434,
        10.3197, 10.3117, 10.3370, 10.2984, 10.3610, 10.4037, 10.2462, 10.2552,
        10.2944, 10.2070, 10.2536, 10.2641, 10.3730, 10.2049, 10.2859, 10.3144,
        10.2656, 10.2819, 10.2502, 10.1917, 10.5005, 10.2869, 10.2201, 10.4284,
        10.3427, 10.3197, 10.3268, 10.2733, 10.2677, 10.3346, 10.2214, 10.2879,
        10.2437, 10.3900, 10.1785, 10.3268, 10.3402, 10.3226, 10.3618, 10.3310,
        10.2656, 10.3106, 10.2731, 10.3660, 10.3764, 10.3134, 10.3557, 10.3655,
        10.2149, 10.3494, 10.2377, 10.3049, 10.3589, 10.3067, 10.1892, 10.2299,
        10.3181, 10.2342, 10.3269, 10.2758, 10.3814, 10.2784, 10.1803, 10.2245,
        10.2657, 10.3129, 10.2153, 10.2563, 10.3401, 10.2674, 10.3708, 10.2818,
        10.4569, 10.3924, 10.3834, 10.3933, 10.2323, 10.4948, 10.3923, 10.2771,
        10.2906, 10.3362, 10.3844, 10.44

In [126]:
loss = token_losses.mean()
loss

tensor(10.3190, grad_fn=<MeanBackward0>)

## *Decoding*

In [141]:
probability, next_word = torch.max(last_token_logits, 
                                   dim = 1,
                                   keepdim = False)

In [142]:
next_word

tensor([278])

In [143]:
probability

tensor([0.8016], grad_fn=<MaxBackward0>)

## *MultiHeadAttentionBlock*

In [None]:
import torch

x = torch.tensor([[1.0, 0.2],
                  [0.8, 0.5],
                  [0.1, 0.9]],
                  dtype = torch.float64,
                  requires_grad = False)

q, k, v = x, x, x

### **Create MASK**

In [30]:
T = 3

mask = torch.tril(torch.ones((T, T), dtype = torch.int64)).type(torch.int)
mask

tensor([[1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]], dtype=torch.int32)

In [49]:
from src.model import MultiHeadAttentionBlock

masked_attention_output, masked_attention_weights = multihead_attention = MultiHeadAttentionBlock(
    d_model = 2,
    h = 2,
    dropout = 0.0
).attention(
    query = q,
    key = k,
    value = v,
    mask = mask,
    dropout = nn.Dropout(0.0)
)

### **MASKED ATTENTION WEIGHTS**

In [50]:
masked_attention_weights

tensor([[1.0000, 0.0000, 0.0000],
        [0.5018, 0.4982, 0.0000],
        [0.2733, 0.3262, 0.4004]], dtype=torch.float64)

### **MASKED ATTENTION OUTPUT**

In [51]:
masked_attention_output

tensor([[1.0000, 0.2000],
        [0.9004, 0.3495],
        [0.5744, 0.5782]], dtype=torch.float64)

# **Math behind SELF-ATTENTION**

- `Setup:` A sentence with 3 tokens `T = 3`, where each token is a 2D vector (2-dimensional vector) `C = 2`

In [None]:
import torch

x = torch.tensor([[1.0, 0.2],
                  [0.8, 0.5],
                  [0.1, 0.9]],
                  dtype = torch.float64,
                  requires_grad = False)

q, k, v = x, x, x

### **UNSCALED ATTENTION SCORES - Q @ Ktransposed**

- At this point, the values are not normalized yet which means they are not percentages yet

In [3]:
unscaled_attention_scores = torch.matmul(q, k.T)
unscaled_attention_scores 

tensor([[1.0400, 0.9000, 0.2800],
        [0.9000, 0.8900, 0.5300],
        [0.2800, 0.5300, 0.8200]], dtype=torch.float64)

### **SCALED ATTENTION SCORES - Q @ Ktransposed / sqrt(d_k)**

In [4]:
import math

d_k = k.size(-1)

scaled_attention_scores = unscaled_attention_scores / math.sqrt(d_k)
scaled_attention_scores

tensor([[0.7354, 0.6364, 0.1980],
        [0.6364, 0.6293, 0.3748],
        [0.1980, 0.3748, 0.5798]], dtype=torch.float64)

### **ATTENTION WEIGHTS - Softmax((Q @ Ktransposed) / sqrt(d_k))**

- `Attention Weights` is just a term for the normalized scaled attention scores. At this point, the values are now percentages

In [5]:
import torch.nn.functional as F

attention_weights = F.softmax(scaled_attention_scores, dim = -1)
attention_weights

tensor([[0.4016, 0.3638, 0.2346],
        [0.3620, 0.3594, 0.2786],
        [0.2733, 0.3262, 0.4004]], dtype=torch.float64)

Each row now sums up to 1. For example, ***for Token 2 (row 1)***, it has decided to pay `36.2% attention` to *Token 1*, `35.9% attention` to *itself (Token 2)*, and `27.8%` to *Token 3*

### **ATTENTION OUTPUT - (Softmax((Q @ Ktransposed) / sqrt(d_k))) * V**

- At this point, we are now updating the `V` vector using the weights we have obtained. Notice below how the values were updated

In [7]:
attention_output = torch.matmul(attention_weights, v)
attention_output

tensor([[0.7161, 0.4734],
        [0.6773, 0.5029],
        [0.5744, 0.5782]], dtype=torch.float64)

In [8]:
v

tensor([[1.0000, 0.2000],
        [0.8000, 0.5000],
        [0.1000, 0.9000]], dtype=torch.float64)

### **Create MASK TENSOR**

In [41]:
T = 3

mask = torch.tril(torch.ones((T, T), dtype = torch.int64)).type(torch.int)
mask

tensor([[1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]], dtype=torch.int32)

### **MASKED SCALED ATTENTION SCORES**

- By masking the `scaled attention scores` into `-inf` before softmax, we are preventing the cheat of looking at future tokens 

In [42]:
masked_scaled_attention_scores = scaled_attention_scores.masked_fill(
    mask == 0, 
    float('-inf')
)

masked_scaled_attention_scores

tensor([[0.7354,   -inf,   -inf],
        [0.6364, 0.6293,   -inf],
        [0.1980, 0.3748, 0.5798]], dtype=torch.float64)

### **MASKED ATTENTION WEIGHTS**

- Better known as `Causal Weights` is just a term used for masked attention weights were **future tokens**, **past tokens** or **pad tokens** are 0 due to applying softmax to `-inf` values
- E.g. for the second row, it is only able to see for Tokens 1 to 2 and not to 3.

In [43]:
masked_attention_weights = F.softmax(masked_scaled_attention_scores, dim = -1)
masked_attention_weights

tensor([[1.0000, 0.0000, 0.0000],
        [0.5018, 0.4982, 0.0000],
        [0.2733, 0.3262, 0.4004]], dtype=torch.float64)

### **MASKED ATTENTION OUTPUT**

In [44]:
masked_attention_output = torch.matmul(masked_attention_weights, v)
masked_attention_output

tensor([[1.0000, 0.2000],
        [0.9004, 0.3495],
        [0.5744, 0.5782]], dtype=torch.float64)