# Loading and processing data

In [None]:
def read_file(filename):
    data = []
    with open(filename, encoding='utf-8') as fh:
        for line in fh.readlines():
            data.append(line)
    return data   

en_file_path = "Dataset//english-corpus.txt"
ur_file_path = "Dataset//urdu-corpus.txt"

english_sentences = read_file(en_file_path)
urdu_sentences = read_file(ur_file_path)


In [None]:
english_sentences = [sentence.rstrip() for sentence in english_sentences]
urdu_sentences = [sentence.rstrip() for sentence in urdu_sentences]

In [None]:
length_of_sentences = [len(line) for line in english_sentences]
print('Maximum lenght of english scentence:', max(length_of_sentences))
length_of_sentences = [len(line) for line in urdu_sentences]
print('Maximum lenght of urdu scentence:', max(length_of_sentences))

Maximum lenght of english scentence: 78
Maximum lenght of urdu scentence: 85


## Tokenization
We will use BPE as used in original paper

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [None]:
def create_tokenizer(filepath, vocab_size=10000, max_length=100):
    tokenizer=Tokenizer(BPE(unk_token='[UNK]'))
    # tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[PAD]", "[START]", "[END]", "[UNK]"], 
                         vocab_size=vocab_size) # order matters here
    tokenizer.train([filepath], trainer=trainer)
    tokenizer.save('eng_tokenizer.json')
    tokenizer.post_processor = TemplateProcessing(
        single="[START] $A [END]",
        special_tokens=[
            ("[START]", tokenizer.token_to_id("[START]")),
            ("[END]", tokenizer.token_to_id("[END]")),
        ]
    )
    tokenizer.enable_padding(
        length=max_length
    )
    tokenizer.enable_truncation(max_length)
    return tokenizer

In [None]:
eng_tokenizer = create_tokenizer("Dataset\\english-corpus.txt", 5000)
urdu_tokenizer = create_tokenizer("Dataset\\urdu-corpus.txt", 5000)

In [None]:
eng_output = eng_tokenizer.encode(english_sentences[0])
urdu_output = urdu_tokenizer.encode(urdu_sentences[0])
print(eng_output.tokens)
print(urdu_output.tokens)

['[START]', 'is zain ', 'your ', 'ne', 'p', 'he', 'w', '[END]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[START]', 'زین ', 'تمہارا ', 'بھ', 'تی', 'جا', ' ہے', '۔', '[END]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[

In [None]:
def convert_to_tokenIds(english_sentences, urdu_sentences):
    eng_tokens = []
    urdu_tokens = []
    for sentence in english_sentences:
        output = eng_tokenizer.encode_batchencode(sentence)
        eng_tokens.append(output.ids)
    for sentence in urdu_sentences:
        output = urdu_tokenizer.encode(sentence)
        urdu_tokens.append(output.ids)    
    return eng_tokens, urdu_tokens    

In [None]:
total = len(english_sentences)
train = int(0.9 * total)
eng_train, eng_test = english_sentences[:train], english_sentences[train:]
urdu_train, urdu_test = urdu_sentences[:train], urdu_sentences[train:]
print(f'Train set size : {len(eng_train)}')
print(f'Test set size : {len(eng_test)}')

Train set size : 22072
Test set size : 2453


In [None]:
enc_seq, dec_seq = eng_train[0], urdu_train[0]
print(enc_seq)
print(dec_seq)

is zain your nephew
زین تمہارا بھتیجا ہے۔


In [None]:
def add_padding(token_id, sequence, max_length):
    tokens_to_add = max_length - len(sequence)
    sequence = sequence + [token_id] * tokens_to_add
    return sequence

In [None]:
urdu_tokenizer.no_padding()
for enc_seq, dec_seq in zip(eng_train[:1], urdu_train[:1]):
    enc_seq = eng_tokenizer.encode(enc_seq)
    enc_seq = enc_seq.ids
    dec_seq = urdu_tokenizer.encode(dec_seq)
    dec_seq = dec_seq.ids
    dec_input, dec_output = dec_seq[:-1], dec_seq[1:]
    dec_input = add_padding('[PAD]', dec_input, 100) 
    dec_output = add_padding('[PAD]', dec_output, 100)
print(enc_seq)
print(dec_input)
print(dec_output)

[1, 696, 149, 126, 38, 806, 45, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 242, 2584, 263, 4503, 2465, 170, 132, '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

# Creating dataset loader

In [None]:
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader

In [None]:
class TextSequenceDataset(Dataset):
    def __init__(self, eng_sentences, urdu_sentences, 
                 eng_tokenizer, urdu_tokenizer,
                 max_length):
        self.eng_sentences = eng_sentences
        self.urdu_sentences = urdu_sentences
        self.max_length = max_length
        self.eng_tokenizer = eng_tokenizer
        self.urdu_tokenizer = urdu_tokenizer

    def __len__(self):
        return len(self.eng_sentences)

    def __getitem__(self, idx):
        pad_token_id = urdu_tokenizer.token_to_id('[PAD]')
        enc_seq, dec_seq = self.eng_sentences[idx], self.urdu_sentences[idx]
        enc_seq = self.eng_tokenizer.encode(enc_seq)
        enc_seq = enc_seq.ids
        dec_seq = self.urdu_tokenizer.encode(dec_seq)
        dec_seq = dec_seq.ids
        dec_input, dec_output = dec_seq[:-1], dec_seq[1:]
        dec_input = add_padding(pad_token_id, dec_input, self.max_length) 
        dec_output = add_padding(pad_token_id, dec_output, self.max_length)
        
        return {
            "encoder_input": torch.tensor(enc_seq),
            "decoder_input": torch.tensor(dec_input),
            "decoder_output": torch.tensor(dec_output),

        }

In [None]:
train_dataset = TextSequenceDataset(eng_train, urdu_train, 
                              eng_tokenizer, urdu_tokenizer, 100)
val_dataset = TextSequenceDataset(eng_test, urdu_test, 
                              eng_tokenizer, urdu_tokenizer, 100)

In [None]:
BATCH_SIZE = 8
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Iterate through the DataLoader
batch = next(iter(train_dataloader))
print(batch['encoder_input'].shape)
print(batch['decoder_input'].shape)
print(batch['decoder_output'].shape)

torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 100])


# Setting up training loop

In [None]:
from model import Transformer

In [None]:
src_vocab_size = eng_tokenizer.get_vocab_size()
trgt_vocab_size = urdu_tokenizer.get_vocab_size()
model = Transformer(vocab_src=src_vocab_size, 
                    vocab_trgt=trgt_vocab_size,
                    num_heads=8)

In [None]:
model

Transformer(
  (embeddings_src): Embedding(5000, 512)
  (embeddings_trgt): Embedding(5000, 512)
  (positional_enc): PositionalEncoding()
  (linear): Linear(in_features=512, out_features=5000, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (softmax): Softmax(dim=-1)
)

In [None]:
# test data
x, y = batch['encoder_input'], batch['decoder_input']
output = model(x, y)

In [None]:
actual = batch['decoder_output']

In [None]:
logits_flat = output.view(-1, output.size(-1))
target_flat = actual.view(-1)

In [None]:
# write loss function
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.9)
# loss(logits_flat, target_flat)
# initialize optimzier
adam_optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [35]:
epochs = 1
for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for i, data in enumerate(train_dataloader):
        x, y, target = data['encoder_input'], data['decoder_input'], data['decoder_output']
        adam_optimizer.zero_grad()
        output = model(x, y)
        logits_flat = output.view(-1, output.size(-1))
        target_flat = target.view(-1)
        loss = loss_fn(logits_flat, target_flat)
        loss.backward()
        adam_optimizer.step()
        train_loss += loss.item()
        if i==10:
            break   
    # avg_loss = train_loss/len(train_dataloader)
    avg_loss = train_loss / i
    print(f"Epoch: {epoch}/{epochs}, Training Loss: {avg_loss}")

    model.eval()
    val_loss = 0.0

    # with torch.no_grad():
    #     for data in iter(val_dataloader):
    #         x, y, target = data['encoder_input'], data['decoder_input'], data['decoder_output']
    #         output = model(x, y)
    #         logits_flat = output.view(-1, output.size(-1))
    #         target_flat = target.view(-1)
    #         loss = loss_fn(logits_flat, target_flat)
    #         val_loss += loss.item()

    # avg_val_loss = val_loss/len(val_dataloader)
    # print(f"Epoch: {epoch}/{epochs}, Validation Loss: {avg_val_loss}") 


Epoch: 0/1, Training Loss: 9.296722221374512
