## Imports and Hyerparams

In [None]:
import torch
import tiktoken as ttk
from torch.nn import functional as F


# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------


## Working with Modular Layers

In [18]:
class AttentionLayer(torch.nn.Module):
    def __init__(self, d_model):
        super(AttentionLayer, self).__init__()
        
        
    def forward():
        pass
         
        

In [19]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        

In [20]:
class EncoderLayer(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(EncoderLayer, self).__init__()  

In [21]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(DecoderLayer, self).__init__()  

In [22]:
class Transformer(torch.nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = torch.nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = torch.nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = torch.nn.ModuleList([EncoderLayer(d_model, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = torch.nn.ModuleList([DecoderLayer(d_model, d_ff, dropout) for _ in range(num_layers)])

        self.fc = torch.nn.Linear(d_model, tgt_vocab_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

## Process Data

In [None]:
def process_input(file_name):
    with open(file_name, "r") as f:
        text = f.read()

        # this might be reflective of the encoder model but for right now i dont actually know
        # vocab_size = len(set(text))

        # tokenize with byte pair encoding.
        # This gives us a shorter token array lenght becuase we arent splitting by character
        enc = ttk.get_encoding("gpt2")
        encoded = enc.encode(text)
        print(f"Text: {list(text.split())[:5]}: Length: {len(text.split())}")
        print(f"Encoded: {encoded[:5]}: Length: {len(encoded)}")
        return enc.n_vocab, torch.tensor(encoded, dtype=torch.long)
        # looking at the tokenized output will essentially give us a "one to one" translation of the text

def get_batch(split, train_data, val_data, block_size, batch_size):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


vocab_size, data = process_input("input.txt")
print(f"{vocab_size=}")
print(data)
print(len(data))


## Embedding Model

In [26]:
class EmbeddingLayer(torch.nn.Module):
    def __init__(self, vocab_size, input_token_size, d_model, block_size, data):
        super().__init__()

        # define some important vars
        self.vocab_size = vocab_size
        self.data = data
        self.d_model = d_model
        self.input_size = input_token_size
        
        
        print("Input Size: ", self.input_size)
        print("Emb Size: ", self.d_model)

        self.token_embedding_table = torch.nn.Embedding(
            vocab_size, d_model
        )
        self.position_embedding_table = torch.nn.Embedding(block_size, d_model)

    def forward(self, x, y: torch.Tensor = None):
        print("Beginning of Forward")
        print(f"{x=}")
        print(f"{y=}\n")
        
        logits: torch.Tensor = self.token_embedding_table(x) 
        print(f"{logits.shape=}")
        if y is None:
            loss = None
        else:
            # logits becomes a tensor of size (Batch size, Sequence Length (T), vocab_size)
            B, T, C = logits.shape  # (Batch size, Sequence Length (T), vocab_size)
            logits = logits.view(
                B * T, C
            )  # reshape the logits so they can be used in cross entropy loss
            print(f"{y=}")
            print(f"{y.shape=}")
            print(f"{type(y)=}")
            targets = y.view(B * T)
            print(f"{logits.shape=} {logits=}")
            print(f"{targets.shape=} {targets=}")   
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self.forward(x)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            x = torch.cat((x, idx_next), dim=1)  # (B, T+1)
        return x

In [27]:

n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
# block size and batch size can change
block_size = 10
batch_size = 5
xb, yb = get_batch("train", train_data, val_data, block_size, batch_size)

d_model = 20
embedding_layer = EmbeddingLayer(vocab_size, len(data), d_model, block_size, data)
# embedding_layer.to(device)


Input Size:  255888
Emb Size:  20


In [30]:
_, loss = embedding_layer.forward(xb, yb)
print(loss)
exit()

enc = ttk.get_encoding("gpt2")

decoded = enc.decode(
    embedding_layer.generate(
        torch.zeros(1, 1, dtype=torch.long), max_new_tokens=100
    )[0].tolist()
)
# print(f"{decoded:}")
# could do SGD but whatever
optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=1e-3)

# train the model
for steps in range(100):
    xb, yb = get_batch("train", train_data, val_data, 10, 5)
    logits, loss = embedding_layer.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # print(loss.item())

Beginning of Forward


x=tensor([[  198,   198,    49, 30194,    25,   198, 22788,  6219, 11738,   565],
        [  286,  1971,   338,    11,   198,  2504,  1560,  2042, 29770,   654],
        [ 3428,  2951,   198, 23792,   465,  1490,   496,    11,   290,   326],
        [16599,     0,   198,    49,  1191,   290,   360, 49590,    11,   345],
        [  465,  3956,    11,   290,   314,  1842,   683,   880,    13,   198]])
y=tensor([[  198,    49, 30194,    25,   198, 22788,  6219, 11738,   565,   733],
        [ 1971,   338,    11,   198,  2504,  1560,  2042, 29770,   654,    13],
        [ 2951,   198, 23792,   465,  1490,   496,    11,   290,   326,   477],
        [    0,   198,    49,  1191,   290,   360, 49590,    11,   345,   547],
        [ 3956,    11,   290,   314,  1842,   683,   880,    13,   198,  1532]])

logits.shape=torch.Size([5, 10, 20])
y=tensor([[  198,    49, 30194,    25,   198, 22788,  6219, 11738,   565,   733],
        [ 1971,   338,    11,   198,  2504,  1560,  2042, 29770,   654,   

IndexError: Target 198 is out of bounds.