In [3]:
import torch
import tiktoken as ttk
from torch.nn import functional as F

In [65]:
class EmbeddingLayer(torch.nn.Module):
    def __init__(self, vocab_size, input_token_size, embedding_dimensions, data):
        super().__init__()

        # define some important vars
        self.vocab_size = vocab_size
        self.data = data
        self.emb_d = embedding_dimensions
        self.input_size = input_token_size
        
        print("Input Size: ", self.input_size)
        print("Emb Size: ", self.emb_d)

        self.token_embedding_table = torch.nn.Embedding(
            self.vocab_size, 2
        )

    def forward(self, x, y: torch.Tensor = None):
        logits: torch.Tensor = self.token_embedding_table(x)
        print(logits.shape)
        if y is None:
            loss = None
        else:
            # logits becomesa tensor of size (Batch size, Sequence Length (T), vocab_size)
            B, T, C = logits.shape  # (Batch size, Sequence Length (T), vocab_size)
            logits = logits.view(
                B * T, C
            )  # reshape the logits so they can be used in cross entropy loss
            print(f"{y=}")
            print(y.shape)
            targets = y.view(B * T)
            print(logits.shape)
            print(targets.shape)   
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self.forward(x)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            x = torch.cat((x, idx_next), dim=1)  # (B, T+1)
        return x

In [21]:
def process_input(file_name):
    with open(file_name, "r") as f:
        text = f.read()

        # this might be reflective of the encoder model but for right now i dont actually know
        vocab_size = len(set(text))

        # tokenize with byte pair encoding.
        # This gives us a shorter token array lenght becuase we arent splitting by character
        enc = ttk.get_encoding("gpt2")
        encoded = enc.encode(text)
        print(f"Text: {list(text.split())[:5]}: Length: {len(text.split())}")
        print(f"Encoded: {encoded[:5]}: Length: {len(encoded)}")
        return enc.n_vocab, torch.tensor(encoded, dtype=torch.long)
        # looking at the tokenized output will essentially give us a "one to one" translation of the text

In [6]:
def get_batch(split, train_data, val_data, block_size, batch_size):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y

In [36]:
vocab_size, data = process_input("input.txt")
print(f"{vocab_size=}")
print(data)
print(len(data))

Text: ['First', 'Citizen:', 'Before', 'we', 'proceed']: Length: 155183
Encoded: [5962, 22307, 25, 198, 8421]: Length: 255888
vocab_size=50257
tensor([ 5962, 22307,    25,  ...,    33,  1094, 42391])
255888


In [66]:

n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
# block size and batch size can change
xb, yb = get_batch("train", train_data, val_data, 10, 5)

embedding_layer = EmbeddingLayer(vocab_size, len(data), 10, data)
_, loss = embedding_layer.forward(xb, yb)
print(loss)

enc = ttk.get_encoding("gpt2")

decoded = enc.decode(
    embedding_layer.generate(
        torch.zeros(1, 1, dtype=torch.long), max_new_tokens=100
    )[0].tolist()
)
# print(f"{decoded:}")
# could do SGD but whatever
optimizer = torch.optim.Adam(embedding_layer.parameters(), lr=1e-3)

# train the model
for steps in range(100):
    xb, yb = get_batch("train", train_data, val_data, 10, 5)
    logits, loss = embedding_layer.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # print(loss.item())

Input Size:  255888
Emb Size:  10
torch.Size([5, 10, 2])
y=tensor([[  544,   329,   470,    26,   198, 15946,  1384,   326,    11,   618],
        [47118,  3963, 14545,  4944,    51,    25,   198,  1722,  1474,   355],
        [ 2390,  8267,    46,    25,   198, 24749,    11,   616, 15876,     0],
        [49654,   389,   345,  1111,    26,   198,  1870, 26246,   705, 48010],
        [32476,   783,  3160,   287,  8838,   379,   465, 10152,    11,   198]])
torch.Size([5, 10])
torch.Size([50, 2])
torch.Size([50])


IndexError: Target 544 is out of bounds.