In [None]:
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from google.colab import files

In [None]:
# Prompt for the training corpus through the Colab upload helper; the next
# cell opens it from the working directory as "ptb.train.txt".
train_file = files.upload()

In [None]:
# Load the corpus as a list of raw lines (trailing newlines are kept).
with open("ptb.train.txt", 'r') as f:
    lines = list(f)

In [None]:
def get_tokens(text_lines=None):
  """Split each line of text into a list of its characters.

  Args:
    text_lines: iterable of strings to tokenize. When omitted, the
      notebook-level ``lines`` list is used, which keeps the original
      zero-argument call working.

  Returns:
    A list with one inner list of single-character strings per input line.
  """
  if text_lines is None:
    text_lines = lines
  return [list(line) for line in text_lines]

# Per-line character lists for the whole corpus.
token = get_tokens()

In [None]:
def flatten(tokens):
  """Concatenate the inner sequences of ``tokens`` into one flat list, in order."""
  flat = []
  for group in tokens:
    flat.extend(group)
  return flat

# Collapse the per-line lists into one flat character stream and report its length.
tokens = flatten(token)
print(len(tokens))

In [None]:
def unique_char(tokens):
  """Return the distinct items of ``tokens`` in first-seen order.

  Args:
    tokens: iterable of hashable items (characters in this notebook).

  Returns:
    A list containing each distinct item once, ordered by first occurrence.
  """
  # dict keys keep insertion order, so this de-duplicates in a single pass
  # instead of scanning a growing list for every token.
  return list(dict.fromkeys(tokens))


# Distinct characters in first-seen order; the printed count is the vocabulary size.
uniq_tokens = unique_char(tokens)
print(len(uniq_tokens))

In [None]:
# Map every distinct character to its position in uniq_tokens.
vocab = {char: e for e, char in enumerate(uniq_tokens)}

In [None]:
# Encode the character stream as integer ids using the vocab mapping.
numerical = [vocab[char] for char in tokens]

In [None]:
seq_length = 64
# One token is held back so that every input position still has a
# next-character target once the stream is shifted by one.
num_samples = (len(numerical) - 1) // seq_length
# `numerical` holds the encoded corpus; it is converted to a tensor here
# because the id sequence has to be reshaped into fixed-length rows.
dataset = torch.as_tensor(
    numerical[:num_samples * seq_length], dtype=torch.long
).reshape(num_samples, seq_length)
dataset.shape

In [None]:
batch_size = 32
# Whole batches only: trailing sequences that do not fill a batch are dropped.
num_batches = len(dataset) // batch_size
# Resulting layout: (num_batches, batch_size, seq_length).
train_iter = dataset[:num_batches * batch_size].reshape((num_batches, batch_size, seq_length))
train_iter.shape

In [None]:
# Next-character targets: the same id stream shifted left by one position.
# The slice is trimmed to whole batches so its length matches the
# (num_batches, batch_size, seq_length) shape exactly.
labels = torch.as_tensor(
    numerical[1:num_batches * batch_size * seq_length + 1], dtype=torch.long
).reshape(num_batches, batch_size, seq_length)
labels.shape

In [None]:
def textify(embedding, alphabet=None):
    """Decode a tensor of character ids or per-position scores into a string.

    Args:
        embedding: either a 1-D tensor of integer character ids, or a 2-D
            tensor of shape (positions, vocabulary) whose highest-scoring
            column per row is taken as the character id.
        alphabet: sequence mapping an id to its character. Defaults to the
            notebook-level ``uniq_tokens``.

    Returns:
        The decoded text, one character per position.
    """
    if alphabet is None:
        alphabet = uniq_tokens
    # Id vectors are already indices; score matrices are reduced with argmax.
    indices = embedding if embedding.dim() == 1 else torch.argmax(embedding, dim=1)
    return "".join(alphabet[int(idx)] for idx in indices)

In [None]:
# train_iter and labels are laid out as (num_batches, batch_size, seq_length),
# so [10, 3] selects sequence 3 of batch 10 and its one-step-shifted targets.
print(textify(train_iter[10, 3]))
print(textify(labels[10, 3]))

----- START OF TRANSFORMER -----

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to an input, then apply dropout.

    The encoding table has shape (sequence_len, 1, d_model) and is sliced by
    ``x.shape[0]``, so the input is treated as sequence-first:
    (seq_len, batch, d_model).

    Args:
        sequence_len: number of positions to precompute.
        d_model: size of the feature dimension (odd values are supported).
        dropout_prob: dropout probability applied after the addition.
    """

    def __init__(self, sequence_len, d_model, dropout_prob):
        super().__init__()
        self.sequence_len = sequence_len
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout_prob)
        # Non-persistent buffer: follows .to()/.cuda() with the module but is
        # left out of state_dict because it can always be recomputed.
        self.register_buffer(
            "positional_encoding",
            self.get_pos_encoding(d_model, sequence_len),
            persistent=False,
        )

    def get_pos_encoding(self, d_model, max_len):
        """Build the (max_len, 1, d_model) table of sine/cosine encodings."""
        encodings = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
        div_term = torch.exp(two_i * -(math.log(10000.0) / d_model))
        angles = position * div_term
        encodings[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # column, so the angle table is trimmed to match.
        encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return encodings.unsqueeze(1)

    def forward(self, x):
        """Return ``dropout(x + encoding)`` for the first ``x.shape[0]`` positions."""
        pe = self.positional_encoding[: x.shape[0]]
        return self.dropout(x + pe)

In [None]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Computes ``LayerNorm(x + Dropout(y))`` where ``x`` is the sub-layer input
    and ``y`` is the sub-layer output.

    Args:
        d_model: size of the last dimension, which is normalized.
        dropout: dropout probability applied to ``y`` before the addition.
    """

    def __init__(self, d_model, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x, y):
        """Return the normalized sum of ``x`` and the dropped-out ``y``."""
        # LayerNorm takes a single tensor, so the residual is added first.
        return self.ln(self.dropout(y) + x)

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        dropout: dropout probability applied after the activation.
        ffn_hiddens: width of the hidden layer. Defaults to ``4 * d_model``.
    """

    def __init__(self, d_model, dropout, ffn_hiddens=None):
        super().__init__()
        if ffn_hiddens is None:
            ffn_hiddens = 4 * d_model
        self.lin1 = nn.Linear(d_model, ffn_hiddens)
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.lin2 = nn.Linear(ffn_hiddens, d_model)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``; the shape is preserved."""
        return self.lin2(self.dropout(self.act(self.lin1(x))))

In [None]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward network, each
    followed by an AddNorm step.

    Args:
        d_model: feature size of the layer input and output.
        dropout: dropout probability passed to every sub-layer.
        num_heads: number of attention heads. Defaults to 8.

    NOTE(review): ``MultiHeadAttention`` is not defined in the visible part of
    the notebook. It is assumed to take ``(d_model, num_heads, dropout)`` and
    to return a tensor shaped like ``x`` when called as
    ``attention(query, key, value)`` — confirm before running.
    """

    def __init__(self, d_model, dropout, num_heads=8):
        super().__init__()
        # MultiheadAttention -> AddNorm -> FFN -> AddNorm
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm1 = AddNorm(d_model, dropout)
        # FeedForward is declared as (d_model, dropout); call it that way.
        self.ffn = FeedForward(d_model, dropout)
        self.addnorm2 = AddNorm(d_model, dropout)

    def forward(self, x):
        """Run self-attention and the feed-forward network, each with a residual AddNorm."""
        x = self.addnorm1(x, self.attention(x, x, x))
        x = self.addnorm2(x, self.ffn(x))
        return x

In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding, positional encoding and a stack of EncoderBlocks.

    Args:
        d_model: embedding / feature size.
        vocab_size: number of distinct token ids. Defaults to 50.
        sequence_len: number of positions to precompute encodings for.
            Defaults to 64.
        dropout_prob: dropout probability for the positional encoding and the
            encoder blocks. Defaults to 0.5.
        num_blocks: number of EncoderBlocks. Defaults to 6.

    The defaults mirror the values declared in TransformerConfig.
    """

    def __init__(self, d_model, vocab_size=50, sequence_len=64,
                 dropout_prob=0.5, num_blocks=6):
        super().__init__()
        self.d_model = d_model
        # The embedding width must equal d_model so the positional encoding
        # can be added element-wise.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(sequence_len, d_model, dropout_prob)
        self.enc_blocks = nn.ModuleList(
            EncoderBlock(d_model, dropout_prob) for _ in range(num_blocks)
        )

    def forward(self, x):
        """Embed integer token ids ``x`` and pass them through every encoder block.

        PositionalEncoding indexes positions along dim 0 of its input, so
        ``x`` is expected to be sequence-first.
        """
        # Embeddings are scaled by sqrt(d_model) before the encoding is added.
        x = self.pos_encoding(self.embedding(x) * math.sqrt(self.d_model))
        for blk in self.enc_blocks:
            x = blk(x)
        return x

In [None]:
class DecoderBlock(nn.Module):
    """Placeholder for one decoder layer; the computation is not written yet.

    Args:
        d_model: feature size of the layer input and output.
        num_heads: number of attention heads.
        dropout: dropout probability.

    TODO: implement the decoder sub-layers.
    """

    def __init__(self, d_model, num_heads, dropout):
        # nn.Module must be initialised before the block can be registered in
        # a container or have its parameters iterated.
        super().__init__()

    def forward(self, x):
        """Not implemented: fails loudly instead of handing ``None`` downstream."""
        raise NotImplementedError("DecoderBlock.forward is not implemented yet")

In [None]:
class TransformerDecoder(nn.Module):
    """Token embedding, positional encoding, a stack of DecoderBlocks and a
    projection to per-token vocabulary probabilities.

    Args:
        vocab_size: number of distinct token ids (also the output width).
        d_model: embedding / feature size.
        num_blocks: number of DecoderBlocks.
        num_heads: number of attention heads per block.
        dropout: dropout probability.
        sequence_len: number of positions to precompute encodings for.
            Defaults to 64.
    """

    def __init__(self, vocab_size, d_model, num_blocks, num_heads, dropout,
                 sequence_len=64):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(sequence_len, d_model, dropout)
        # nn.Sequential takes modules as separate arguments, so the generator
        # is unpacked.
        self.dec_blocks = nn.Sequential(
            *(DecoderBlock(d_model, num_heads, dropout) for _ in range(num_blocks))
        )
        self.lin = nn.Linear(d_model, vocab_size)
        # Normalise over the vocabulary (last) dimension.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Map integer token ids ``x`` to probabilities over the vocabulary.

        The output has the shape of ``x`` plus a trailing ``vocab_size``
        dimension. PositionalEncoding indexes positions along dim 0 of its
        input, so ``x`` is expected to be sequence-first.
        """
        x = self.pos_encoding(self.embedding(x) * math.sqrt(self.d_model))
        for blk in self.dec_blocks:
            x = blk(x)
        x = self.lin(x)
        return self.softmax(x)

In [None]:
class Transformer(nn.Module):
    """Encoder followed by decoder, both built from a config object.

    Args:
        config: object exposing ``d_model``, ``vocab_size``, ``num_blocks``,
            ``num_heads`` and ``dropout_prob`` attributes.

    NOTE(review): ``forward`` hands the encoder output to ``decode``, and
    TransformerDecoder starts with an ``nn.Embedding`` lookup, which needs
    integer ids rather than encoder activations. The wiring looks unfinished —
    confirm the intended data flow.
    """

    def __init__(self, config):
        super().__init__()
        # The sub-modules are stored as `encoder` / `decoder`. Naming them
        # `encode` / `decode` would be shadowed by the methods below, because
        # nn.Module only resolves registered sub-modules after normal
        # attribute lookup fails; each method would then call itself forever.
        # TransformerEncoder is built from d_model, its only required argument.
        # TODO: forward the remaining config fields to the encoder.
        self.encoder = TransformerEncoder(config.d_model)
        self.decoder = TransformerDecoder(
            config.vocab_size,
            config.d_model,
            config.num_blocks,
            config.num_heads,
            config.dropout_prob,
        )

    def encode(self, x):
        """Run the encoder on ``x`` and return its output."""
        return self.encoder(x)

    def decode(self, x):
        """Run the decoder on ``x`` and return its output."""
        return self.decoder(x)

    def forward(self, x):
        """Apply ``encode`` and then ``decode`` to ``x``."""
        x = self.encode(x)
        x = self.decode(x)
        return x

In [None]:
@dataclass
class TransformerConfig:
    """Hyperparameters for the Transformer model.

    As a dataclass, any field can be overridden at construction, e.g.
    ``TransformerConfig(d_model=256)``; ``TransformerConfig()`` still yields
    the defaults below.
    """

    # Embedding / feature size used throughout the model.
    d_model: int = 512
    # Number of distinct token ids.
    vocab_size: int = 50
    # Number of positions per sequence.
    sequence_len: int = 64
    # Dropout probability, presumably for the encoder — TODO confirm; no
    # visible code reads this field yet.
    enc_dropout_prob: float = 0.5
    # General dropout probability.
    dropout_prob: float = 0.2
    # Hidden width of the feed-forward sub-layer.
    ffn_hiddens: int = 48
    # Number of stacked blocks.
    num_blocks: int = 6
    # Number of attention heads.
    num_heads: int = 8
    

In [None]:
num_epochs = 10
config = TransformerConfig()
net = Transformer(config)

optimizer = torch.optim.SGD(net.parameters(), lr=1e-3)
# The decoder ends in a softmax, so the model emits probabilities; negative
# log-likelihood on their logarithm is the matching criterion.
lossfn = nn.NLLLoss()

for epoch in range(num_epochs):
    for x, y in zip(train_iter, labels):
        # Batches are stored as (batch_size, seq_length), while
        # PositionalEncoding indexes positions along dim 0, so inputs and
        # targets are transposed to sequence-first.
        # NOTE(review): confirm the attention layer uses the same layout.
        x, y = x.t(), y.t()
        optimizer.zero_grad()
        y_hat = net(x)
        # Clamp before the log so a zero probability cannot yield -inf.
        log_probs = torch.log(y_hat.clamp_min(1e-9))
        # NLLLoss takes (N, classes) scores and (N,) targets.
        loss = lossfn(log_probs.reshape(-1, log_probs.shape[-1]), y.reshape(-1))
        loss.backward()
        optimizer.step()
        