In [1]:
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
from google.colab import files

In [2]:
!ls

ptb.train.txt  sample_data


In [3]:
# Read the training text; `lines` holds one string per line of the file,
# trailing newline included. The encoding is spelled out so the result does
# not depend on the platform's default locale.
with open("ptb.train.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [4]:
def get_tokens(text_lines=None):
  """Split every line of text into a list of its characters.

  Args:
    text_lines: Iterable of strings to tokenize. When omitted, the
      module-level `lines` read from the training file is used, which keeps
      the original zero-argument call working.

  Returns:
    A list with one inner list of single-character strings per input line.
  """
  if text_lines is None:
    text_lines = lines
  return [list(line) for line in text_lines]

# Per-line character lists for the whole training file.
token = get_tokens()

In [5]:
def flatten(tokens):
  """Concatenate the inner sequences of `tokens` into one flat list.

  Args:
    tokens: Iterable whose elements are themselves iterable.

  Returns:
    A new list holding every inner item, in the original order.
  """
  flat = []
  for group in tokens:
    flat.extend(group)
  return flat

# Merge the per-line character lists into one long character sequence and
# report how many characters it contains.
tokens = flatten(token)
print(len(tokens))

5101619


In [6]:
def unique_char(tokens):
  """Return the distinct items of `tokens` in order of first appearance.

  Args:
    tokens: Iterable of hashable items (characters in this notebook).

  Returns:
    A list containing each distinct item once, ordered by where it first
    occurs in `tokens`.
  """
  # dict keys are unique and keep insertion order, so this deduplicates in a
  # single pass instead of scanning a growing list for every token.
  return list(dict.fromkeys(tokens))


# Distinct characters of the corpus; the printed length is the vocabulary size.
uniq_tokens = unique_char(tokens)
print(len(uniq_tokens))

50


In [7]:
# Character -> integer id, where the id is the character's position in
# `uniq_tokens`.
vocab = {symbol: index for index, symbol in enumerate(uniq_tokens)}

In [8]:
# The corpus as a list of integer ids, one per character of `tokens`.
numerical = list(map(vocab.__getitem__, tokens))

In [9]:
seq_length = 64
# One id is held back so that the shifted-by-one label sequence built later
# still has a target for the very last input position.
num_samples = (len(numerical) - 1) // seq_length
input_ids = torch.tensor(numerical[:num_samples * seq_length])
# One row per training sample, `seq_length` ids per row.
dataset = input_ids.reshape(num_samples, seq_length)
dataset.shape

torch.Size([79712, 64])

In [10]:
batch_size = 32
num_batches = len(dataset) // batch_size
# Drop the rows that do not fill a whole batch, then group the remaining rows
# as (num_batches, batch_size, seq_length).
full_batch_rows = dataset[:num_batches * batch_size]
train_iter = full_batch_rows.reshape((num_batches, batch_size, seq_length))
train_iter.shape

torch.Size([2491, 32, 64])

In [11]:
# Labels are the inputs shifted one character ahead. The slice is sized from
# the batched input (num_batches * batch_size rows) rather than num_samples,
# so the reshape also succeeds when the sample count is not a multiple of
# batch_size and trailing samples were dropped from `train_iter`.
num_label_tokens = num_batches * batch_size * seq_length
labels = torch.tensor(numerical[1:num_label_tokens + 1]).reshape(num_batches, batch_size, seq_length)
labels.shape

torch.Size([2491, 32, 64])

In [12]:
def textify(embedding, alphabet=None):
    """Turn a sequence of integer ids back into text.

    Args:
        embedding: Iterable of ids; each element is passed through `int()`,
            so plain ints and one-element tensors both work.
        alphabet: Sequence mapping id -> character. When omitted, the
            module-level `uniq_tokens` is used.

    Returns:
        The characters for the ids, concatenated into one string.

    Raises:
        IndexError: If an id is outside `alphabet`.
    """
    if alphabet is None:
        alphabet = uniq_tokens
    # join builds the string once instead of re-copying it on every `+=`.
    return "".join(alphabet[int(idx)] for idx in embedding)

In [13]:
# Sanity check: decode one input row and its label row; the label text should
# be the input text shifted one character ahead.
print(textify(train_iter[10, 3]))
print(textify(labels[10, 3]))

ter business appears to depend heavily on the creativity and <un
er business appears to depend heavily on the creativity and <unk


----- START OF TRANSFORMER -----

In [14]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout.

    Args:
        sequence_len: Number of positions the precomputed table covers.
        d_model: Width of each embedding vector.
        dropout_prob: Dropout probability applied after the encodings are added.
    """

    def __init__(self, sequence_len, d_model, dropout_prob):
        super().__init__()
        self.sequence_len = sequence_len
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout_prob)
        # Non-persistent buffer: it moves with the module across devices but
        # is left out of the state_dict.
        self.register_buffer(
            "positional_encoding",
            self.get_pos_encoding(d_model, sequence_len),
            persistent=False,
        )

    def get_pos_encoding(self, d_model, max_len):
        """Build the sinusoidal table.

        Returns:
            Float tensor of shape (max_len, 1, d_model); even columns hold
            sines and odd columns hold cosines.
        """
        encodings = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
        div_term = torch.exp(two_i * -(math.log(10000.0) / d_model))
        angles = position * div_term
        encodings[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd columns.
        encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return encodings.unsqueeze(1)

    def forward(self, x):
        """Add position encodings to `x` and apply dropout.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor of the same shape as `x`.

        Raises:
            ValueError: If seq_len exceeds the `sequence_len` the table was built for.
        """
        seq_len = x.shape[1]
        if seq_len > self.sequence_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum of {self.sequence_len}"
            )
        # The table is (max_len, 1, d_model). Slice by *sequence* length and
        # move positions onto axis 1 so each position gets its own encoding,
        # broadcast across the batch axis.
        pe = self.positional_encoding[:seq_len].transpose(0, 1)
        return self.dropout(x + pe)

In [45]:
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Args:
        d_model: Size of the last dimension that LayerNorm normalizes over.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, d_model, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x, y):
        """Return LayerNorm(dropout(y) + x).

        Args:
            x: Residual input (the value that skipped the sublayer).
            y: Sublayer output; dropout is applied to this term only.
        """
        residual_sum = self.dropout(y) + x
        return self.ln(residual_sum)

In [49]:
class FeedForward(nn.Module):
    """Two linear layers with a ReLU in between, applied along the last axis.

    Args:
        ffn_hiddens: Width of the hidden layer.
        d_model: Input and output width.
    """

    def __init__(self, ffn_hiddens, d_model):
        super().__init__()
        self.lin1 = nn.Linear(d_model, ffn_hiddens)
        self.act = nn.ReLU()
        self.lin2 = nn.Linear(ffn_hiddens, d_model)

    def forward(self, x):
        """Map `x` from d_model to ffn_hiddens and back to d_model."""
        hidden = self.lin1(x)
        activated = self.act(hidden)
        return self.lin2(activated)

In [42]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across several heads (batch-first).

    Args:
        d_model: Width of the input and output vectors.
        num_heads: Number of attention heads; must divide `d_model`.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If `d_model` is not divisible by `num_heads`.
    """

    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.d_model = d_model
        self.key = nn.Linear(d_model, d_model, bias=False)
        self.query = nn.Linear(d_model, d_model, bias=False)
        self.value = nn.Linear(d_model, d_model, bias=True)
        self.output = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.scale = 1 / math.sqrt(self.d_k)

    def _split_heads(self, t, batch_size):
        """Reshape (batch, len, d_model) into (batch, num_heads, len, d_k)."""
        return t.view(batch_size, -1, self.num_heads, self.d_k).permute(0, 2, 1, 3)

    def forward(self, k, q, v, mask=None):
        """Attend from queries `q` over keys `k` and values `v`.

        Note the argument order: key first, then query, then value.

        Args:
            k: Key source, shape (batch, key_len, d_model).
            q: Query source, shape (batch, query_len, d_model).
            v: Value source, shape (batch, key_len, d_model).
            mask: Optional tensor broadcastable to
                (batch, num_heads, query_len, key_len). Positions where the
                mask equals 0 are excluded from attention.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = q.shape[0]
        Q = self._split_heads(self.query(q), batch_size)
        K = self._split_heads(self.key(k), batch_size)
        V = self._split_heads(self.value(v), batch_size)

        scores = (Q @ K.transpose(-2, -1)) * self.scale
        if mask is not None:
            # Blocked positions must get -inf so softmax gives them zero
            # weight; +inf would make them dominate and produce NaNs.
            # A query row with every key blocked still yields NaN.
            scores = scores.masked_fill(mask == 0, float("-inf"))
        attn = torch.softmax(scores, dim=-1)

        x = self.dropout(attn) @ V
        # Merge the heads back: (batch, heads, len, d_k) -> (batch, len, d_model).
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.output(x)

In [18]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward network, each
    wrapped in a residual AddNorm.

    Args:
        d_model: Width of the token representations.
        num_heads: Number of attention heads.
        ffn_hiddens: Hidden width of the feed-forward network.
        dropout: Dropout probability shared by the attention and AddNorm layers.
    """

    def __init__(self, d_model, num_heads, ffn_hiddens, dropout):
        super().__init__()
        # Layer order: MultiHeadAttention -> AddNorm -> FeedForward -> AddNorm
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm1 = AddNorm(d_model, dropout)
        self.ffn = FeedForward(ffn_hiddens, d_model)
        self.addnorm2 = AddNorm(d_model, dropout)

    def forward(self, x):
        """Run unmasked self-attention and the feed-forward network over `x`."""
        attended = self.attention(x, x, x)
        x = self.addnorm1(x, attended)
        transformed = self.ffn(x)
        return self.addnorm2(x, transformed)

In [53]:
class TransformerEncoder(nn.Module):
    """Token embedding, positional encoding and a stack of encoder blocks.

    Args:
        d_model: Width of the token representations.
        vocab_size: Number of rows in the embedding table.
        sequence_len: Maximum sequence length handed to the positional encoding.
        num_heads: Attention heads per block.
        num_blocks: Number of stacked EncoderBlock layers.
        ffn_hiddens: Hidden width of each block's feed-forward network.
        dropout_prob: Dropout probability used throughout the encoder.
    """

    def __init__(self, d_model, vocab_size, sequence_len, num_heads, num_blocks, ffn_hiddens, dropout_prob):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(sequence_len, d_model, dropout_prob)
        blocks = []
        for _ in range(num_blocks):
            blocks.append(EncoderBlock(d_model, num_heads, ffn_hiddens, dropout_prob))
        self.enc_blocks = nn.Sequential(*blocks)

    def forward(self, x):
        """Encode token ids `x` of shape (batch_size, seq_len), e.g. (32, 64)."""
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        scaled = self.embedding(x) * math.sqrt(self.d_model)
        hidden = self.pos_encoding(scaled)
        # hidden: (batch_size, seq_len, d_model), e.g. (32, 64, 512)
        return self.enc_blocks(hidden)

In [20]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention and
    a feed-forward network, each followed by a residual AddNorm.

    Args:
        d_model: Width of the token representations.
        num_heads: Number of attention heads.
        dropout: Dropout probability for the attention and AddNorm layers.
        ffn_hiddens: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, dropout, ffn_hiddens):
        super().__init__()
        self.mask_attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm1 = AddNorm(d_model, dropout)

        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.addnorm2 = AddNorm(d_model, dropout)

        self.ffn = FeedForward(ffn_hiddens, d_model)
        self.addnorm3 = AddNorm(d_model, dropout)

    def forward(self, dec, enc, trg_mask=None, src_mask=None):
        """Run one decoder layer.

        Args:
            dec: Decoder-side representations, (batch, trg_len, d_model).
            enc: Encoder output attended to by the cross-attention,
                (batch, src_len, d_model).
            trg_mask: Optional mask for the self-attention (0 = blocked).
            src_mask: Optional mask for the cross-attention (0 = blocked).

        Returns:
            Tensor of shape (batch, trg_len, d_model).
        """
        self_attended = self.mask_attention(dec, dec, dec, mask=trg_mask)
        x = self.addnorm1(dec, self_attended)

        # MultiHeadAttention.forward takes (k, q, v): queries come from the
        # decoder, keys and values from the encoder. Keywords keep the roles
        # unambiguous.
        cross_attended = self.enc_dec_attn(k=enc, q=x, v=enc, mask=src_mask)
        x = self.addnorm2(x, cross_attended)

        # AddNorm(x, y) applies dropout to y, so the residual goes first and
        # the sublayer output second.
        x = self.addnorm3(x, self.ffn(x))
        return x

In [21]:
class TransformerDecoder(nn.Module):
    """Token embedding, positional encoding, a stack of decoder blocks and a
    projection onto the vocabulary.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output scores).
        d_model: Width of the token representations.
        num_blocks: Number of stacked DecoderBlock layers.
        num_heads: Attention heads per block.
        dropout: Dropout probability used throughout the decoder.
        sequence_len: Maximum sequence length handed to the positional encoding.
        ffn_hiddens: Hidden width of each block's feed-forward network.
            Defaults to 4 * d_model when omitted.
    """

    def __init__(self, vocab_size, d_model, num_blocks, num_heads, dropout, sequence_len, ffn_hiddens=None):
        super().__init__()
        if ffn_hiddens is None:
            ffn_hiddens = 4 * d_model
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(sequence_len, d_model, dropout)
        # ModuleList rather than Sequential: each block takes several inputs,
        # so the stack is iterated by hand in forward().
        self.dec_blocks = nn.ModuleList(
            [DecoderBlock(d_model, num_heads, dropout, ffn_hiddens) for _ in range(num_blocks)]
        )
        # Project onto the vocabulary so the output can be scored against
        # token ids.
        self.lin = nn.Linear(d_model, vocab_size)

    def forward(self, x, enc=None, trg_mask=None, src_mask=None):
        """Decode target token ids against the encoder output.

        Args:
            x: Target token ids, shape (batch, trg_len).
            enc: Encoder output, shape (batch, src_len, d_model). Required.
            trg_mask: Self-attention mask (0 = blocked). Defaults to a
                lower-triangular mask so position t attends only to
                positions <= t.
            src_mask: Optional cross-attention mask (0 = blocked).

        Returns:
            Unnormalized scores of shape (batch, trg_len, vocab_size). No
            softmax is applied: nn.CrossEntropyLoss expects raw logits.

        Raises:
            ValueError: If `enc` is not provided.
        """
        if enc is None:
            raise ValueError("TransformerDecoder.forward requires the encoder output `enc`")
        if trg_mask is None:
            trg_len = x.shape[1]
            trg_mask = torch.tril(torch.ones(trg_len, trg_len, device=x.device))
        x = self.pos_encoding(self.embedding(x) * math.sqrt(self.d_model))
        for blk in self.dec_blocks:
            x = blk(x, enc, trg_mask, src_mask)
        return self.lin(x)

In [51]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from a config object.

    Args:
        config: Object exposing d_model, vocab_size, sequence_len, num_heads,
            num_blocks, ffn_hiddens, enc_dropout_prob and dec_dropout_prob.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config.d_model, config.vocab_size, config.sequence_len,
                                         config.num_heads, config.num_blocks, config.ffn_hiddens,
                                         config.enc_dropout_prob)
        self.decoder = TransformerDecoder(config.vocab_size, config.d_model, config.num_blocks,
                                         config.num_heads, config.dec_dropout_prob, config.sequence_len)

    def encode(self, x):
        """Run the encoder over token ids `x`."""
        return self.encoder(x)

    def decode(self, x, enc=None):
        """Run the decoder over token ids `x` and return its output.

        Args:
            x: Token ids fed to the decoder.
            enc: Encoder output for the decoder blocks' cross-attention.
                When omitted, the decoder is called with `x` alone.
        """
        if enc is None:
            return self.decoder(x)
        return self.decoder(x, enc)

    def forward(self, x):
        """Encode `x`, then decode the same token ids against that encoding.

        NOTE(review): EncoderBlock calls its attention without a mask, so the
        encoding at each position can see later characters. When the training
        target is `x` shifted by one, that leaks the answer through
        cross-attention. Consider a separate source/target split or a
        decoder-only model.
        """
        enc = self.encode(x)
        # The decoder blocks take both a decoder input and the encoder output,
        # so the token ids and the encoding are passed together.
        return self.decode(x, enc)

In [23]:
@dataclass
class TransformerConfig:
    """Hyperparameters for the Transformer.

    As a dataclass, any field can be overridden per instance, e.g.
    `TransformerConfig(d_model=128, num_heads=4)`; `TransformerConfig()` still
    yields the defaults below.
    """

    # Width of token embeddings and of every block's input/output.
    d_model: int = 512
    # Number of distinct characters in the corpus.
    vocab_size: int = 50
    # Maximum sequence length for the positional encoding.
    sequence_len: int = 64
    # Dropout probability inside the encoder.
    enc_dropout_prob: float = 0.5
    # Dropout probability inside the decoder.
    dec_dropout_prob: float = 0.5
    # Not read by Transformer.__init__, which uses the two values above.
    dropout_prob: float = 0.2
    # Hidden width of the feed-forward networks.
    ffn_hiddens: int = 48
    # Number of stacked blocks.
    num_blocks: int = 6
    # Attention heads per block; d_model // num_heads is the per-head width.
    num_heads: int = 8
    

In [None]:
num_epochs = 10
config = TransformerConfig()
net = Transformer(config)
lossfn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), 1e-3)

net.train()  # make sure dropout is active while training
for epoch in range(num_epochs):
    running_loss = 0.0
    # train_iter and labels are both (num_batches, batch_size, seq_length), so
    # zipping them yields one (inputs, targets) pair per batch.
    for x, y in zip(train_iter, labels):
        optimizer.zero_grad()
        y_hat = net(x)
        # CrossEntropyLoss scores (N, C) logits against (N,) class indices, so
        # fold the batch and sequence axes together. Assumes net returns one
        # score per vocabulary entry at every position, i.e.
        # (batch, seq_len, vocab_size).
        loss = lossfn(y_hat.reshape(-1, y_hat.shape[-1]), y.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"epoch {epoch + 1}/{num_epochs}: mean loss {running_loss / len(train_iter):.4f}")
        