## Imports and Hyerparams

In [28]:
import torch
import tiktoken as ttk
from torch.nn import functional as F
from datasets import load_dataset
from transformers import AutoTokenizer
import os
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
context_length = 4 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
torch.set_default_device("cuda") if torch.cuda.is_available() else torch.set_default_device("cpu")
print(torch.cuda.is_available())
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------


True


## Working with Modular Layers

In [29]:
class AttentionLayer(torch.nn.Module):
    def __init__(self, d_model):
        super(AttentionLayer, self).__init__()
        
        
    def forward():
        pass
         
        

In [30]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        

In [31]:
class EncoderLayer(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(EncoderLayer, self).__init__()  

In [32]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super(DecoderLayer, self).__init__()  

In [33]:
class Transformer(torch.nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = torch.nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = torch.nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = torch.nn.ModuleList([EncoderLayer(d_model, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = torch.nn.ModuleList([DecoderLayer(d_model, d_ff, dropout) for _ in range(num_layers)])

        self.fc = torch.nn.Linear(d_model, tgt_vocab_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

## Process Data

In [34]:
def process_input():

    ds = load_dataset("milkshake721/2.1M-wiki-STEM", split="train") 
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    def tokenize_batch(batch):
        return tokenizer(batch["text"], padding=False, truncation=False)

    tokenized_ds = ds.map(tokenize_batch, batched=True, num_proc=4)
    flat_encoded = [token for example in tokenized_ds["input_ids"] for token in example]
    # im also thinking that maybe the length of the encoded list is good? maybe?
    n_vocab = len(set(flat_encoded))
    return n_vocab, torch.tensor(flat_encoded, dtype=torch.long)
    # looking at the tokenized output will essentially give us a "one to one" translation of the text

def build_cbow_pairs(data, context_size=2):
    for i in range(context_size, len(data) - context_size):
        left = data[i - context_size:i]
        right = data[i + 1:i + context_size + 1]
        context = torch.cat((left, right))
        center = data[i]
        yield context, center

if os.path.exists("./data.pt"):
    print("here")
    data = torch.load("data.pt")
    vocab_size = int(torch.max(data)) + 1
else:
    vocab_size, data = process_input()
    torch.save(data, "data.pt")



here


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Embedding Model

In [None]:
class EmbeddingLayer(torch.nn.Module):
    def __init__(self, vocab_size, input_token_size, d_model, context_length, data):
        super().__init__()

        # define some important vars
        self.vocab_size = vocab_size
        self.data = data
        self.d_model = d_model
        self.input_size = input_token_size

        self.token_embedding_table = torch.nn.Embedding(
            vocab_size, d_model, 
        )
        self.linear_one = torch.nn.Linear(d_model, vocab_size)
        

    def forward(self, x):
        embeds: torch.Tensor = self.token_embedding_table(x)
        input_embeds = embeds.mean(dim=0, keepdim=True)
        
        out: torch.Tensor = self.linear_one(input_embeds)
        return out.squeeze(0)

    # def generate(self, x, max_new_tokens):
    #     for _ in range(max_new_tokens):
    #         # get the predictions
    #         logits, _ = self.forward(x)
    #         # focus only on the last time step
    #         logits = logits[:, -1, :]  # becomes (B, C)
    #         # apply softmax to get probabilities
    #         probs = F.softmax(logits, dim=-1)  # (B, C)
    #         # sample from the distribution
    #         idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
    #         # append sampled index to the running sequence
    #         x = torch.cat((x, idx_next), dim=1)  # (B, T+1)
    #     return x

In [None]:

d_model = 8
embedding_layer = EmbeddingLayer(vocab_size, vocab_size, d_model, context_length, data)
# embedding_layer.to(device)


In [11]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=1e-3)

# train the model
for epoch in range(1):
    
    step = 0
    log_loss = 0
    for context, target in build_cbow_pairs(data, context_length):
        # print(context, "->", target)

        optimizer.zero_grad()
        logits = embedding_layer(context)

        
        loss = loss_fn(logits, target)
        loss.backward()
        optimizer.step()
        log_loss += loss.item()

        if step % 1000 == 0 and step > 0:
            print(f"Step {step}, Avg loss (last {1000}): {log_loss / 1000:.4f}")
            log_loss = 0
        step += 1


Step 1000, Avg loss (last 1000): 10.5790
Step 2000, Avg loss (last 1000): 9.8687


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
