In [229]:
import torch, torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional  as F

import numpy as np

import time

## 01. Load Python Code File - Build the vocabulary.

In [1478]:
dataset_path = "./datasets/python.py"

In [1479]:
with open(dataset_path, "r") as f:
    code = f.read().lower()
    # to_rm = "#'()+,-./:;?[]!&"
    for char in to_rm: code = code.replace(char, "")

In [1480]:
len(code)

1582

In [1481]:
code = code[:10000]

In [1482]:
print(code[:300])


def greetname
    return f"hello {name}"

class person
    def __init__self name age
        selfname = name
        selfage = age
    
    def introduceself
        return f"my name is {selfname} and i am {selfage} years old"
    
    def birthdayself
        selfage = 1
        return f"happy bir


In [1483]:
unique_chars = sorted(list(set(code)))
len(unique_chars)

39

In [1484]:
unique_chars[:10]

['\n', ' ', '"', '0', '1', '2', '3', '4', '5', '8']

In [1485]:
char_to_index = {char: index for index, char in enumerate(unique_chars)}

In [1486]:
"".join(char_to_index.keys())

'\n "01234589=_abcdefghijlmnoprstuvwxyz{}'

In [1487]:
index_to_char = {index: char for index, char in enumerate(unique_chars)}

In [1488]:
index_to_char

{0: '\n',
 1: ' ',
 2: '"',
 3: '0',
 4: '1',
 5: '2',
 6: '3',
 7: '4',
 8: '5',
 9: '8',
 10: '9',
 11: '=',
 12: '_',
 13: 'a',
 14: 'b',
 15: 'c',
 16: 'd',
 17: 'e',
 18: 'f',
 19: 'g',
 20: 'h',
 21: 'i',
 22: 'j',
 23: 'l',
 24: 'm',
 25: 'n',
 26: 'o',
 27: 'p',
 28: 'r',
 29: 's',
 30: 't',
 31: 'u',
 32: 'v',
 33: 'w',
 34: 'x',
 35: 'y',
 36: 'z',
 37: '{',
 38: '}'}

In [1489]:
class Vocab:

    def __init__(self):
        
        self.chars: list = []
        self.char_to_index = {}
        self.index_to_char = {}

    def fit(self, text: str):
        self.chars = sorted(list(set(code)))
        self.char_to_index = {char: index for index, char in enumerate(unique_chars)}
        self.index_to_char = {index: char for index, char in enumerate(unique_chars)}

    def size(self) -> int:
        return len(self.chars)

    def encode(self, chars: str) -> int:
        return [self.char_to_index[char] for char in chars]

    def decode(self, indexes: int) -> str:
        return [self.index_to_char[index] for index in indexes]

In [1490]:
vocab = Vocab()

In [1491]:
vocab.fit(code)

In [1492]:
vocab.size()

39

In [1493]:
vocab.chars[:10]

['\n', ' ', '"', '0', '1', '2', '3', '4', '5', '8']

In [1494]:
vocab.encode("class")

[15, 23, 13, 29, 29]

In [1495]:
vocab.decode([6, 15, 4, 22, 22])

['3', 'c', '1', 'j', 'j']

## 02. Build The CodeDataset 

In [1496]:
len(code)

1582

In [1497]:
code[0:4]

'\ndef'

In [1498]:
code[4]

' '

In [1499]:
len(code)

1582

In [1500]:
context_size = 2

In [1501]:
inputs , target = [], []
for i in range(0, len(code) - context_size):
    inputs.append(vocab.encode(code[i:i+context_size]))
    target.append(vocab.encode(code[i+context_size]))

In [1502]:
inputs[22]

[31, 28]

In [1503]:
print(target[22])

[25]


In [1504]:
class CodeDatasetLangaugeModel(Dataset):

    def __init__(
        self,
        code: str,
        vocab: Vocab,
        context_size: int = 3
    ):
        super(CodeDatasetLangaugeModel, self).__init__()

        self.context_size: int = context_size
        self.vocab: Vocab = vocab
        
        self.inputs, self.targets = self._code_to_tensors(code=code)
        
    def _code_to_tensors(self, code):
        inputs , targets = [], []
        for i in range(0, len(code) - self.context_size):
            inputs.append(self.vocab.encode(code[i:i+self.context_size]))
            targets.append(self.vocab.encode(code[i+self.context_size]))

        inputs = torch.tensor(inputs).type(torch.long)
        targets = torch.tensor(targets).type(torch.long).flatten()
        
        return inputs, targets

    def __len__(self) -> int:
        return self.inputs.shape[0]

    def __getitem__(self, index: int):
        return self.inputs[index], self.targets[index]

In [1505]:
context_size= 3

In [1506]:
code_dataset = CodeDatasetLangaugeModel(
    code= code,
    vocab= vocab,
    context_size= context_size
)

In [1507]:
code_dataset[0:5]

(tensor([[ 0, 16, 17],
         [16, 17, 18],
         [17, 18,  1],
         [18,  1, 19],
         [ 1, 19, 28]]),
 tensor([18,  1, 19, 28, 17]))

In [1546]:
loader = DataLoader(
    dataset= code_dataset,
    batch_size= 32,
    shuffle=True
)

In [1547]:
len(iter(loader))

50

## 03. Build NGramLanguageModel

In [1548]:
class NGramLanguageModel(nn.Module):

    def __init__(
        self,
        vocab_size: int,
        emb_dim: int = 32,
        hidden_dim: int= 128,
        num_layers= 5,
    ):
        super(NGramLanguageModel, self).__init__()

        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        
        self.chars_embs = nn.Embedding(
            num_embeddings= vocab_size,
            embedding_dim= vocab_size
        )
        
        self.rnn = nn.LSTM(
            input_size= vocab_size,
            hidden_size= hidden_dim,
            num_layers=num_layers, 
            batch_first= True,
        )

        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64), 
            nn.ReLU(True),
            nn.Linear(64, vocab_size),
            nn.Softmax(dim=1)
        )

    def  forward(self, x: torch.Tensor):

        x = self.chars_embs(x)

        if x.ndim == 2:
            x = x.unsqueeze(0)
        
        batch_size = x.shape[0]
            
        hn = torch.randn(self.num_layers, batch_size, self.hidden_dim)
        cn = torch.randn(self.num_layers, batch_size, self.hidden_dim)

        x, _ = self.rnn(x, (hn, cn))

        return self.fc(x[:, -1, :])

In [1591]:
class MlpLanguageModel(nn.Module):
    
    def __init__(self, vocab_size, context_length, embedding_size, hidden_size):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, embedding_size) # token embedding table

        self.mlp = nn.Sequential(
            nn.Linear(embedding_size, hidden_size),
            nn.Tanh(),
            nn.GELU(),
            nn.Linear(hidden_size, vocab_size)
        )

    def propogate(self, x):

        _, T, C = x.shape

        w = torch.tril(
            torch.ones(T, T)
        )

        w /= w.sum(dim=1, keepdim=True)

        return w @ x

    def forward(self, idx, targets=None):
        
        if idx.ndim == 1: idx = idx.unsqueeze(0)
            
        B, T = idx.size()
        
        x = self.wte(idx) # (B, T, embedding_size)
        
        emb = x.view(B, -1) # (B, T * embedding_size)
        
        # x = self.propogate(x)
        
        logits = self.mlp(x[:, -1, :])

        return F.softmax(logits, dim=1)

In [1592]:
torch.tril(torch.zeros(4, 4))

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [1593]:
# model = NGramLanguageModel(
#     vocab_size= vocab.size()
# )

model = MlpLanguageModel(
    vocab_size= vocab.size(),
    context_length= context_size,
    embedding_size= 10,
    hidden_size= 512
)

In [1594]:
model

MlpLanguageModel(
  (wte): Embedding(39, 10)
  (mlp): Sequential(
    (0): Linear(in_features=10, out_features=512, bias=True)
    (1): Tanh()
    (2): GELU(approximate='none')
    (3): Linear(in_features=512, out_features=39, bias=True)
  )
)

In [1595]:
x, y = code_dataset[0:1]

In [1596]:
x

tensor([[ 0, 16, 17]])

In [1597]:
x.shape

torch.Size([1, 3])

In [1598]:
model(x).shape

torch.Size([1, 39])

In [1599]:
def generate_code(
    model: nn.Module,
    vocab: Vocab,
    start_chars: str,
    max_tokens: int = 100,
    wait_time: float = 0.005
):
    encode_chars = vocab.encode(start_chars)
    
    chars = encode_chars.copy()

    indexes = list(range(vocab.size()))

    print(start_chars, end="")

    for _ in range(max_tokens):
        
        x = torch.tensor(chars).type(torch.long)
        
        probs = model(x).detach()
        
        pred =  torch.multinomial(probs, num_samples=1).item()
        
        tmp = chars.copy()
        chars[:-1] = tmp[1:]
        chars[-1] = pred

        time.sleep(wait_time)
        
        print(vocab.decode([pred])[0], end="")

In [1600]:
generate_code(
    model= model, 
    vocab= vocab,
    start_chars="python",
    max_tokens= 300 
)

pythona}lr=ce_u5neij13{yeseyp  3x"_y"g{vbd2_ge9l9evt_niohdywfpcn
zojw8_94w48buuc_p{wgt 
uz1io=_r{4
x l}vduldvsh=49 p0whnvsbx8nt9odnf8j3xm
zxb{a{rljnnw
awcz{9d 
 iggaf2ip{i_0"zifo"8hbag=0gsgzg9b
{vym3"e4htwy3}j"1o}=tipni_=4a4vnp
bgih2zz nptt=2r wdb9{}9a hj4 jgyb8 u}="wis0orl4js_
ozzta"1v9fpttm_0t_b22bh8
hc

In [1601]:
criterion = nn.CrossEntropyLoss()

In [1611]:
opt = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

In [1612]:
def train(
    model: nn.Module,
    loader: DataLoader,
    epochs: int,
    criterion,
    opt
):

    for epoch in range(epochs):

        total_loss = 0

        for inputs, targets in loader:

            opt.zero_grad()
            
            pred = model(inputs)

            loss = criterion(pred, targets)

            total_loss += loss.item()
            
            loss.backward()

            opt.step()

        if epoch%10 == 0: print(f"Epoch: {epoch}/{epochs} | loss: {total_loss/len(loader):.5f}")

In [None]:
train(
    model= model,
    loader= loader,
    criterion= criterion,
    opt= opt,
    epochs= 1000
)

Epoch: 0/1000 | loss: 3.28373
Epoch: 10/1000 | loss: 3.28492
Epoch: 20/1000 | loss: 3.28134
Epoch: 30/1000 | loss: 3.28611
Epoch: 40/1000 | loss: 3.28134
Epoch: 50/1000 | loss: 3.28372
Epoch: 60/1000 | loss: 3.28134
Epoch: 70/1000 | loss: 3.28253
Epoch: 80/1000 | loss: 3.28253
Epoch: 90/1000 | loss: 3.28134
Epoch: 100/1000 | loss: 3.28492
Epoch: 110/1000 | loss: 3.28372
Epoch: 120/1000 | loss: 3.28134
Epoch: 130/1000 | loss: 3.28134
Epoch: 140/1000 | loss: 3.28134
Epoch: 150/1000 | loss: 3.28014
Epoch: 160/1000 | loss: 3.28134
Epoch: 170/1000 | loss: 3.28014
Epoch: 180/1000 | loss: 3.28253
Epoch: 190/1000 | loss: 3.28253
Epoch: 200/1000 | loss: 3.28253
Epoch: 210/1000 | loss: 3.28014
Epoch: 220/1000 | loss: 3.27776
Epoch: 230/1000 | loss: 3.27895
Epoch: 240/1000 | loss: 3.28134
Epoch: 250/1000 | loss: 3.27895
Epoch: 260/1000 | loss: 3.28134
Epoch: 270/1000 | loss: 3.27895
Epoch: 280/1000 | loss: 3.27895
Epoch: 290/1000 | loss: 3.28253
Epoch: 300/1000 | loss: 3.28253
Epoch: 310/1000 | l

In [1608]:
generate_code(
    model= model, 
    vocab= vocab,
    start_chars="d",
    max_tokens= 100
)

de                                                                                                   