## Imports and Hyerparams

In [12]:
import torch
import tiktoken as ttk
from torch.nn import functional as F
from datasets import load_dataset

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
context_length = 4 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
torch.set_default_device("cuda") if torch.cuda.is_available() else torch.set_default_device("cpu")
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------


## Working with Modular Layers

In [13]:
class AttentionLayer(torch.nn.Module):
    def __init__(self, d_model):
        super(AttentionLayer, self).__init__()
        
        
    def forward():
        pass
         
        

In [14]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        

In [15]:
class EncoderLayer(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(EncoderLayer, self).__init__()  

In [16]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(DecoderLayer, self).__init__()  

In [17]:
class Transformer(torch.nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = torch.nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = torch.nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = torch.nn.ModuleList([EncoderLayer(d_model, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = torch.nn.ModuleList([DecoderLayer(d_model, d_ff, dropout) for _ in range(num_layers)])

        self.fc = torch.nn.Linear(d_model, tgt_vocab_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

## Process Data

In [23]:
def process_input():

    ds = load_dataset("milkshake721/2.1M-wiki-STEM")    
    text = ds["text"]

    # this might be reflective of the encoder model but for right now i dont actually know
    # vocab_size = len(set(text))

    # tokenize with byte pair encoding.
    # This gives us a shorter token array lenght becuase we arent splitting by character
    enc = ttk.get_encoding("gpt2")
    encoded = enc.encode(text)
    # im also thinking that maybe the length of the encoded list is good? maybe?
    n_vocab = len(encoded)
    return n_vocab, torch.tensor(encoded, dtype=torch.long)
    # looking at the tokenized output will essentially give us a "one to one" translation of the text

def build_cbow_pairs(data, context_size=2):
    for i in range(context_size, len(data) - context_size):
        context = data[i - context_size:i] + data[i + 1:i + context_size + 1]
        center = data[i]
        yield context, center

vocab_size, data = process_input()
print(f"{vocab_size=}")
print(data)
print(len(data))


OSError: Not enough disk space. Needed: 2.84 GiB (download: 1.42 GiB, generated: 1.42 GiB, post-processed: Unknown size)

## Embedding Model

In [None]:
class EmbeddingLayer(torch.nn.Module):
    def __init__(self, vocab_size, input_token_size, d_model, context_length, data):
        super().__init__()

        # define some important vars
        self.vocab_size = vocab_size
        self.data = data
        self.d_model = d_model
        self.input_size = input_token_size
        
        
        print("Input Size: ", self.input_size)
        print("Emb Size: ", self.d_model)

        self.token_embedding_table = torch.nn.Embedding(
            vocab_size, d_model, 
        )
        self.linear_one = torch.nn.Linear(context_length * d_model, 128)
        self.linear_two = torch.nn.Linear(128, vocab_size)
        

    def forward(self, x):
        input_embeds = self.token_embedding_table(x).mean().view((1, -1))
        out = F.relu(self.linear_one(input_embeds))
        out = self.linear_two(out)
        return out

    # def generate(self, x, max_new_tokens):
    #     for _ in range(max_new_tokens):
    #         # get the predictions
    #         logits, _ = self.forward(x)
    #         # focus only on the last time step
    #         logits = logits[:, -1, :]  # becomes (B, C)
    #         # apply softmax to get probabilities
    #         probs = F.softmax(logits, dim=-1)  # (B, C)
    #         # sample from the distribution
    #         idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
    #         # append sampled index to the running sequence
    #         x = torch.cat((x, idx_next), dim=1)  # (B, T+1)
    #     return x

In [10]:

d_model = 8
embedding_layer = EmbeddingLayer(vocab_size, len(data), d_model, context_length, data)
# embedding_layer.to(device)


Input Size:  255888
Emb Size:  8


In [11]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=1e-3)

# train the model
for steps in range(1):
    total_loss = 0
    for context, target in build_cbow_pairs(data, context_length):
        # print(context, "->", target)

        optimizer.zero_grad()
        logits = embedding_layer(context)
        loss = loss_fn(logits, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(f"loss: {loss.item()}")
    

    # print(loss.item())

tensor([[ 1.3405,  0.0939,  0.2473, -1.0186,  0.2421,  1.4609,  1.2277, -0.1759,
          1.6417,  1.3647,  1.0236, -2.0002,  0.8254, -0.9711, -0.6286, -0.1855,
         -1.0792,  0.8668, -1.1851, -1.2719, -1.6562,  1.7673, -0.0830,  0.7230,
          0.2229, -0.0892, -0.4440,  0.9428,  1.5884, -1.1272, -0.0233,  0.9215]],
       device='cuda:0', grad_fn=<ViewBackward0>)


ValueError: Expected input batch_size (1) to match target batch_size (0).