In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import math
from google.colab import files

In [None]:
# Read the parallel corpus, one sentence per line. The encoding is given
# explicitly because the text contains non-ASCII characters (accents, the euro
# sign, narrow no-break spaces) and must not depend on the platform's default
# locale encoding.
with open("en_train.txt", 'r', encoding="utf-8") as f:
    en_lines = f.readlines()
with open("fr_train.txt", 'r', encoding="utf-8") as F:
    fr_lines = F.readlines()

In [None]:
def get_tokens(lines):
    """Split every line into a list of single-character tokens.

    Leading and trailing whitespace (including the trailing newline) is
    stripped from each line; spaces inside the sentence are kept as tokens.

    Args:
        lines: Iterable of strings, one sentence per item.

    Returns:
        A list containing one list of characters per input line. The input
        itself is not modified.
    """
    # The previous version also computed `line.replace(' ', '')`, but that
    # result was overwritten by the strip on the very next statement, so spaces
    # were never removed. The dead store is dropped; the returned tokens are
    # exactly what they were before.
    return [list(line.strip()) for line in lines]

# Character-level tokenization of both sides of the parallel corpus.
en_token = get_tokens(en_lines)
fr_token = get_tokens(fr_lines)
# Preview the first ten tokenized English lines.
en_token[:10]

[['G', 'o', '.'],
 ['H', 'i', '.'],
 ['H', 'i', '.'],
 ['R', 'u', 'n', '!'],
 ['R', 'u', 'n', '!'],
 ['W', 'h', 'o', '?'],
 ['W', 'o', 'w', '!'],
 ['F', 'i', 'r', 'e', '!'],
 ['H', 'e', 'l', 'p', '!'],
 ['J', 'u', 'm', 'p', '.']]

In [None]:
def flatten(tokens):
    """Concatenate a sequence of token lists into one flat list, keeping order."""
    flat = []
    for sentence in tokens:
        for token in sentence:
            flat.append(token)
    return flat

# One flat character stream per language, used below to collect the alphabet.
en_tokens_flat = flatten(en_token)
fr_tokens_flat = flatten(fr_token)
# Total number of characters in each corpus.
print(len(en_tokens_flat))
print(len(fr_tokens_flat))

3529268
4243940


In [None]:
def unique_char(tokens):
    """Return the distinct tokens in order of first appearance.

    Args:
        tokens: Iterable of hashable tokens (single characters in this
            notebook).

    Returns:
        A list with every distinct token exactly once, ordered by the position
        of its first occurrence.
    """
    # dict preserves insertion order, so this keeps the first-seen ordering of
    # the old loop while replacing a list-membership scan per token with a hash
    # lookup (the inputs here are several million characters long).
    return list(dict.fromkeys(tokens))

# Alphabet of each language: distinct characters in first-seen order.
uniq_en_tokens = unique_char(en_tokens_flat)
uniq_fr_tokens = unique_char(fr_tokens_flat)
# Alphabet sizes (special tokens are not included yet), then the English alphabet.
print(len(uniq_en_tokens))
print(len(uniq_fr_tokens))
print(uniq_en_tokens)

84
106
['G', 'o', '.', 'H', 'i', 'R', 'u', 'n', '!', 'W', 'h', '?', 'w', 'F', 'r', 'e', 'l', 'p', 'J', 'm', 'S', 't', 'a', ' ', 'I', 's', 'y', 'O', 'A', 'c', 'k', 'C', 'g', 'f', 'd', "'", '1', '9', 'K', 'L', 'N', 'T', 'B', 'D', 'b', 'q', 'z', 'v', 'M', ',', 'P', 'Y', 'x', 'j', 'U', 'E', '$', '5', '3', ':', '0', '8', 'V', '7', '&', '%', '-', '2', 'Q', '6', '4', '"', 'X', 'Z', 'é', '’', '€', '/', 'ç', '‘', 'а', '\xad', '–', 'ö']


In [None]:
def build_vocab(unique_tokens, trg=False):
    """Map every token to an integer id, reserving the lowest ids for specials.

    Args:
        unique_tokens: Tokens to number, in the order their ids are assigned.
        trg: When truthy, reserve ``<START>`` (1) and ``<END>`` (2) in addition
            to ``<PAD>`` (0), so regular tokens start at id 3 instead of 1.

    Returns:
        dict mapping token -> id.
    """
    specials = ["<PAD>", "<START>", "<END>"] if trg else ["<PAD>"]
    vocab = {name: idx for idx, name in enumerate(specials)}
    # Regular tokens continue right after the reserved ids.
    first_id = 3 if trg else 1
    for idx, char in enumerate(unique_tokens, start=first_id):
        vocab[char] = idx
    return vocab

# Source vocabulary only reserves <PAD>; the target vocabulary (trg=True)
# additionally reserves <START> and <END>.
en_vocab = build_vocab(uniq_en_tokens)
fr_vocab = build_vocab(uniq_fr_tokens, trg=True)
print(fr_vocab)

{'<PAD>': 0, '<START>': 1, '<END>': 2, 'V': 3, 'a': 4, ' ': 5, '!': 6, 'S': 7, 'l': 8, 'u': 9, 't': 10, '.': 11, 'C': 12, 'o': 13, 'r': 14, 's': 15, '\u202f': 16, 'e': 17, 'z': 18, 'Q': 19, 'i': 20, '?': 21, 'Ç': 22, 'A': 23, 'f': 24, 'À': 25, "'": 26, 'd': 27, 'p': 28, 'ê': 29, '-': 30, 'n': 31, 'P': 32, 'v': 33, 'B': 34, 'j': 35, 'J': 36, 'c': 37, 'm': 38, 'y': 39, 'g': 40, 'é': 41, '’': 42, 'O': 43, 'h': 44, 'q': 45, 'M': 46, 'T': 47, 'L': 48, 'è': 49, ',': 50, 'b': 51, '1': 52, '9': 53, 'É': 54, 'I': 55, 'E': 56, 'ç': 57, 'x': 58, 'H': 59, 'N': 60, 'â': 61, 'D': 62, 'à': 63, 'F': 64, 'R': 65, 'G': 66, 'î': 67, 'û': 68, '\u2009': 69, 'U': 70, 'ô': 71, 'k': 72, 'K': 73, '8': 74, '3': 75, '0': 76, 'Ê': 77, ':': 78, '«': 79, '»': 80, 'ù': 81, 'œ': 82, 'ï': 83, '5': 84, 'Y': 85, '&': 86, '%': 87, '(': 88, ')': 89, '2': 90, '$': 91, 'ë': 92, 'w': 93, '6': 94, '‘': 95, '4': 96, 'Ô': 97, '7': 98, '"': 99, 'X': 100, 'W': 101, 'Z': 102, '\u200b': 103, 'С': 104, '+': 105, '‽': 106, '…': 107, 

In [None]:
def build_numerical(tokens, vocab):
    """Translate tokenized lines into lists of vocabulary ids.

    Args:
        tokens: Sequence of token lists (one per sentence).
        vocab: dict mapping token -> id.

    Returns:
        A new list of id lists, parallel to ``tokens``.

    Raises:
        KeyError: if a token is missing from ``vocab``.
    """
    return [[vocab[char] for char in line] for line in tokens]


# Replace every character by its vocabulary id (one id list per sentence).
en_numerical = build_numerical(en_token, en_vocab)
fr_numerical = build_numerical(fr_token, fr_vocab)
# Preview the first ten encoded English lines.
en_numerical[:10]

[[1, 2, 3],
 [4, 5, 3],
 [4, 5, 3],
 [6, 7, 8, 9],
 [6, 7, 8, 9],
 [10, 11, 2, 12],
 [10, 2, 13, 9],
 [14, 5, 15, 16, 9],
 [4, 16, 17, 18, 9],
 [19, 7, 20, 18, 3]]

In [None]:
def pad_sequence(numerical, vocab):
    """Right-pad every id list to the length of the longest one.

    The lists inside ``numerical`` are extended in place with the id of
    ``"<PAD>"``; the very same ``numerical`` and ``vocab`` objects are returned.

    Args:
        numerical: List of id lists.
        vocab: dict containing a ``"<PAD>"`` entry.

    Returns:
        Tuple ``(numerical, vocab)``.
    """
    longest = max((len(seq) for seq in numerical), default=0)
    for seq in numerical:
        shortfall = longest - len(seq)
        if shortfall > 0:
            # Look the pad id up only when padding is actually needed.
            seq.extend([vocab["<PAD>"]] * shortfall)
    return numerical, vocab

# pad_sequence extends the inner lists in place and returns the same objects,
# so en_numerical / fr_numerical are padded as well after these calls.
padded_en_numerical, en_vocab = pad_sequence(en_numerical, en_vocab)
padded_fr_numerical, fr_vocab = pad_sequence(fr_numerical, fr_vocab)

In [None]:
class TranslationDataset(Dataset):
    """Paired source/target id sequences served as tensors.

    Args:
        src_data: Nested sequence of source token ids. It is converted with
            ``torch.tensor``, so all rows must have the same length.
        tgt_data: Nested sequence of target token ids, same requirement.

    Raises:
        ValueError: if the two sides do not contain the same number of
            examples (otherwise pairs would be misaligned or indexing the
            shorter side would fail midway through an epoch).
    """

    def __init__(self, src_data, tgt_data):
        self.src_data = torch.tensor(src_data)
        self.tgt_data = torch.tensor(tgt_data)
        if len(self.src_data) != len(self.tgt_data):
            raise ValueError(
                f"source and target must have the same number of examples, "
                f"got {len(self.src_data)} and {len(self.tgt_data)}"
            )

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_data)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` tensors of example ``idx``."""
        return self.src_data[idx], self.tgt_data[idx]

----- START OF TRANSFORMER -----

In [None]:
# Use the padded names explicitly. They are the same list objects as
# en_numerical / fr_numerical (pad_sequence pads in place), but this makes the
# dependency on the padding cell visible.
dataset = TranslationDataset(padded_en_numerical, padded_fr_numerical)

# Shuffle the training data every epoch. The corpus appears to be ordered from
# short to long sentences, so without shuffling each batch would contain only
# similar-length sentences in the same fixed order.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings.

    Args:
        sequence_len: Largest sequence length that can be encoded.
        d_model: Embedding width.
        dropout_prob: Dropout probability applied after the addition.
    """

    def __init__(self, sequence_len, d_model, dropout_prob):
        super().__init__()
        self.sequence_len = sequence_len
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout_prob)
        # Non-persistent buffer: follows the module across devices but is not
        # written to the state dict.
        self.register_buffer("positional_encoding", self.get_pos_encoding(d_model, sequence_len), persistent=False)

    def get_pos_encoding(self, d_model, max_len):
        """Build the encoding table.

        Returns:
            Tensor of shape ``(max_len, 1, d_model)`` with sines in the even
            columns and cosines in the odd columns.
        """
        encodings = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
        div_term = torch.exp(two_i * -(math.log(10000.0) / d_model))
        angles = position * div_term
        encodings[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one odd column fewer than even columns.
        encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return encodings.unsqueeze(1)

    def forward(self, x):
        """Add position information to ``x`` and apply dropout.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: if ``seq_len`` exceeds ``sequence_len``.
        """
        seq_len = x.shape[1]
        if seq_len > self.sequence_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum of {self.sequence_len}"
            )
        # The table is stored as (max_len, 1, d_model). Inputs are batch-first,
        # so slice by the sequence length (dim 1 of x) and move the position
        # axis to dim 1. Slicing by x.shape[0] -- as before -- indexed the table
        # by batch size and gave every position in a sentence the same encoding.
        pe = self.positional_encoding[:seq_len].transpose(0, 1)
        return self.dropout(x + pe)

In [None]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalisation.

    Computes ``LayerNorm(x + Dropout(y))`` where ``x`` is the residual input
    and ``y`` the sub-layer output.
    """

    def __init__(self, d_model, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x, y):
        # Dropout is applied to the sub-layer output only, never to the residual.
        regularised = self.dropout(y)
        summed = regularised + x
        return self.ln(summed)

In [None]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: ``d_model -> ffn_hiddens -> d_model`` with ReLU."""

    def __init__(self, ffn_hiddens, d_model):
        super().__init__()
        self.lin1 = nn.Linear(d_model, ffn_hiddens)
        self.act = nn.ReLU()
        self.lin2 = nn.Linear(ffn_hiddens, d_model)

    def forward(self, x):
        hidden = self.lin1(x)
        hidden = self.act(hidden)
        return self.lin2(hidden)

In [None]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention over ``num_heads`` parallel heads.

    Args:
        d_model: Width of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        if d_model % num_heads != 0:
            # Fail at construction time instead of with an opaque view() error
            # during the first forward pass.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.d_model = d_model
        # NOTE(review): only the value projection carries a bias; kept exactly
        # as in the existing model definition.
        self.key = nn.Linear(d_model, d_model, bias=False)
        self.query = nn.Linear(d_model, d_model, bias=False)
        self.value = nn.Linear(d_model, d_model, bias=True)
        self.output = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.scale = 1 / math.sqrt(self.d_k)

    def _split_heads(self, x, batch_size):
        """Reshape ``(batch, len, d_model)`` to ``(batch, heads, len, d_k)``."""
        return x.view(batch_size, -1, self.num_heads, self.d_k).permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Queries, ``(batch, q_len, d_model)``.
            k: Keys, ``(batch, k_len, d_model)``.
            v: Values, ``(batch, k_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, heads, q_len, k_len)``; positions where it equals 0
                are excluded from attention.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = q.shape[0]
        Q = self._split_heads(self.query(q), batch_size)
        K = self._split_heads(self.key(k), batch_size)
        V = self._split_heads(self.value(v), batch_size)

        scores = (Q @ K.transpose(-2, -1)) * self.scale
        if mask is not None:
            # Use the most negative finite value instead of -inf: masked
            # positions still get exactly zero weight next to any unmasked
            # one, but a row that is masked everywhere (e.g. an all-padding
            # sequence) yields a uniform distribution instead of NaN, which
            # would otherwise poison the loss and the gradients.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attn = torch.softmax(scores, dim=-1)
        x = self.dropout(attn) @ V
        # Merge the heads back: (batch, heads, q_len, d_k) -> (batch, q_len, d_model).
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.output(x)

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward network, each
    wrapped in a residual connection plus layer norm."""

    def __init__(self, d_model, num_heads, ffn_hiddens, dropout):
        super().__init__()
        # Self-attention sub-layer.
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm1 = AddNorm(d_model, dropout)
        # Position-wise feed-forward sub-layer.
        self.ffn = FeedForward(ffn_hiddens, d_model)
        self.addnorm2 = AddNorm(d_model, dropout)

    def forward(self, x, src_mask):
        attended = self.attention(x, x, x, mask=src_mask)
        x = self.addnorm1(x, attended)
        transformed = self.ffn(x)
        return self.addnorm2(x, transformed)

In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of encoder blocks.

    Args:
        d_model: Model width.
        vocab_size: Number of rows in the source embedding table.
        sequence_len: Maximum sequence length for the positional encoding.
        num_heads: Attention heads per block.
        num_blocks: Number of stacked encoder blocks.
        ffn_hiddens: Hidden width of each block's feed-forward network.
        dropout_prob: Dropout probability used throughout.
    """

    def __init__(self, d_model, vocab_size, sequence_len, num_heads, num_blocks, ffn_hiddens, dropout_prob):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(sequence_len, d_model, dropout_prob)
        # ModuleList rather than Sequential: each block needs (x, src_mask),
        # which Sequential's single-argument forward cannot pass, so the
        # container is only ever iterated. Parameter names are unchanged
        # ("enc_blocks.<i>....").
        self.enc_blocks = nn.ModuleList(
            [EncoderBlock(d_model, num_heads, ffn_hiddens, dropout_prob)
             for _ in range(num_blocks)]
        )

    def forward(self, x, src_mask):
        """Encode source ids.

        Args:
            x: Source token ids, ``(batch_size, seq_len)``.
            src_mask: Mask passed unchanged to every block's self-attention.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        x = self.pos_encoding(self.embedding(x) * math.sqrt(self.d_model))
        for blk in self.enc_blocks:
            x = blk(x, src_mask)
        return x

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer.

    Masked self-attention -> AddNorm -> encoder-decoder attention -> AddNorm
    -> feed-forward -> AddNorm.
    """

    def __init__(self, d_model, num_heads, dropout, ffn_hiddens):
        super().__init__()
        self.mask_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm1 = AddNorm(d_model, dropout)

        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm2 = AddNorm(d_model, dropout)

        self.ffn = FeedForward(ffn_hiddens, d_model)
        self.addnorm3 = AddNorm(d_model, dropout)

    def forward(self, dec, enc, trg_mask, src_mask):
        """Run one decoder layer.

        Args:
            dec: Decoder-side activations (queries of both attention layers).
            enc: Encoder output, used as keys and values of the second attention.
            trg_mask: Mask for the decoder self-attention.
            src_mask: Mask for the encoder-decoder attention.

        Returns:
            Tensor with the same shape as ``dec``.
        """
        self_attended = self.mask_attention(dec, dec, dec, mask=trg_mask)
        x = self.addnorm1(dec, self_attended)

        cross_attended = self.enc_dec_attn(x, enc, enc, mask=src_mask)
        x = self.addnorm2(x, cross_attended)

        # AddNorm takes (residual, sublayer_output) and applies dropout to its
        # second argument. The two used to be passed in swapped order here, so
        # during training dropout hit the residual stream instead of the
        # feed-forward output.
        x = self.addnorm3(x, self.ffn(x))
        return x

In [None]:
class TransformerDecoder(nn.Module):
    """Target embedding + positional encoding, a stack of decoder blocks, and
    a final projection to vocabulary logits.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and number
            of output logits).
        d_model: Model width.
        ffn_hiddens: Hidden width of each block's feed-forward network.
        num_blocks: Number of stacked decoder blocks.
        num_heads: Attention heads per block.
        dropout: Dropout probability used throughout.
        sequence_len: Maximum sequence length for the positional encoding.
    """

    def __init__(self, vocab_size, d_model, ffn_hiddens, num_blocks, num_heads, dropout, sequence_len):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(sequence_len, d_model, dropout)
        # ModuleList rather than Sequential: blocks take several arguments and
        # are only ever iterated; parameter names stay "dec_blocks.<i>....".
        self.dec_blocks = nn.ModuleList(
            [DecoderBlock(d_model, num_heads, dropout, ffn_hiddens)
             for _ in range(num_blocks)]
        )
        # Project to one logit per target token. This used to map back to
        # d_model, so the loss was computed over d_model "classes" instead of
        # the vocabulary.
        self.lin = nn.Linear(d_model, vocab_size)

    def forward(self, trg, enc_out, trg_mask, src_mask):
        """Decode target ids against the encoder output.

        Args:
            trg: Target token ids, ``(batch_size, trg_len)``.
            enc_out: Encoder output passed to every block.
            trg_mask: Mask for the decoder self-attention.
            src_mask: Mask for the encoder-decoder attention.

        Returns:
            Logits of shape ``(batch_size, trg_len, vocab_size)``.
        """
        x = self.pos_encoding(self.embedding(trg) * math.sqrt(self.d_model))
        for blk in self.dec_blocks:
            x = blk(x, enc_out, trg_mask, src_mask)
        return self.lin(x)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer built from a config object.

    Args:
        config: Object exposing ``d_model``, ``enc_vocab_size``,
            ``dec_vocab_size``, ``sequence_len``, ``num_heads``,
            ``num_blocks``, ``ffn_hiddens`` and ``dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config.d_model, config.enc_vocab_size, config.sequence_len,
                                         config.num_heads, config.num_blocks, config.ffn_hiddens,
                                         config.dropout_prob)
        self.decoder = TransformerDecoder(config.dec_vocab_size, config.d_model, config.ffn_hiddens, config.num_blocks,
                                         config.num_heads, config.dropout_prob, config.sequence_len)

    def encode(self, x, src_mask):
        """Run the encoder on source ids ``x``."""
        return self.encoder(x, src_mask)

    def decode(self, trg, enc_out, trg_mask, src_mask):
        """Run the decoder on target ids ``trg`` given the encoder output."""
        return self.decoder(trg, enc_out, trg_mask, src_mask)

    def make_src_mask(self, src):
        """Padding mask for the source: True where the id is not 0.

        Returns a bool tensor of shape ``(batch, 1, 1, src_len)``.
        """
        return (src != 0).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Combined padding + causal mask for the target.

        Returns a bool tensor of shape ``(batch, 1, trg_len, trg_len)`` that is
        True where query position i may attend to key position j, i.e. j <= i
        and the id at j is not 0.
        """
        trg_pad_mask = (trg != 0).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]
        # Build the triangular mask on the same device as the input; creating
        # it on the default (CPU) device makes the `&` below fail as soon as
        # the batch lives on an accelerator.
        trg_subsequent_mask = torch.tril(
            torch.ones(trg_len, trg_len, device=trg.device)
        ).bool()
        return trg_pad_mask & trg_subsequent_mask

    def forward(self, src, trg):
        """Return decoder outputs for source ids ``src`` and target ids ``trg``."""
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encode(src, src_mask)
        return self.decode(trg, enc_out, trg_mask, src_mask)

In [None]:
class TransformerConfig:
    """Hyper-parameters consumed by ``Transformer``."""

    d_model: int = 512
    # Vocabulary sizes must count the reserved ids as well as the characters,
    # otherwise the highest token ids fall outside the embedding tables:
    #   English: 84 distinct characters + <PAD>                  -> ids 0..84
    #   French : 106 distinct characters + <PAD>/<START>/<END>   -> ids 0..108
    # Keep these in sync with len(en_vocab) / len(fr_vocab) if the data changes.
    enc_vocab_size: int = 85
    dec_vocab_size: int = 109
    # NOTE(review): must be at least the padded sequence length fed to the
    # model; confirm against the padded data.
    sequence_len: int = 83
    dropout_prob: float = 0.1
    ffn_hiddens: int = 2048
    num_blocks: int = 6
    num_heads: int = 8
    

In [None]:
num_epochs = 10
config = TransformerConfig()
net = Transformer(config)
# Padding positions (id 0) do not contribute to the loss.
lossfn = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(net.parameters(), 3e-4)

# NOTE(review): fr_numerical is built from raw characters only -- no <START> /
# <END> ids are inserted anywhere before this point -- so the shift below
# trains the decoder to predict character t+1 from characters <= t without
# explicit sentence markers. Confirm this is intended.
net.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for x, y in dataloader:
        # x, y: (batch_size, seq_len)
        optimizer.zero_grad()
        # Teacher forcing: feed the target without its last position and
        # predict the target shifted left by one.
        y_hat = net(x, y[:, :-1])
        y_hat = y_hat.reshape(-1, y_hat.shape[-1])
        target = y[:, 1:].reshape(-1)
        loss = lossfn(y_hat, target)
        loss.backward()
        optimizer.step()
        # Accumulate a Python float, not the tensor: summing the tensor keeps
        # every batch's autograd graph alive until the end of the epoch.
        batch_loss = loss.item()
        print(f"Loss is: {batch_loss}")
        epoch_loss += batch_loss
    # `.item` without parentheses used to print the bound method, not a number.
    print(f"Loss on epoch {epoch} was {epoch_loss}")
