Codex:
0 : "Home Page"
1 : "Explore"
2 : "Product A"
3 : "Product B"
4 : "Product C"
5 : "Add Product"
6 : "Close App"
7 : "Buy"

In [3]:
import torch
import math
import random
from torch import nn
from torch.nn import functional as F


# torch.manual_seed(208)
class VirtualShopper:
    def __init__(self):
        self.products = {0: 2, 1: 3, 2: 4}
        self.product_scores = torch.tensor([0.85, 0.2, 0.1])
        # self.product_scores = F.softmax(self.product_scores, dim=0)

        # Make an environment where if the user has two or more Product C in their cart it will more likely buy
        self.selected_products = torch.multinomial(self.product_scores, num_samples=3, replacement=True)
        self.actions = [0, 1]
        self.cart = []
        for sp in self.selected_products:
            self.actions.append(self.products[int(sp)])
            adding_to_cart = 5 if random.randint(0, 1) == 1 else 1
            self.actions.append(adding_to_cart)
            if adding_to_cart == 5:
                self.cart.append(self.products[int(sp)])

        chance_of_buying = 0.
        for item in self.cart:
            if item == 4:
                chance_of_buying += 3.33
            if item == 3:
                chance_of_buying += 1.
            if item == 1:
                chance_of_buying += 0.25

        if chance_of_buying >= random.random() * 10:
            self.actions.append(7)
        else:
            self.actions.append(6)

    def get_actions(self):
        return self.actions

    def get_labels(self):
        return self.actions[-1] == 7


In [4]:
sample_size = 1_000
shopper = [VirtualShopper().get_actions() for _ in range(sample_size)]
shopper = torch.tensor(shopper)
# print(f"train_data: {shopper}")
# print(f"train_labels: {(shopper[:, -1] == 7).view(10, 1)}")

# Split dataset
split_percentage = int(0.9 * len(shopper))
train_data = shopper[:split_percentage]
val_data = shopper[split_percentage:]

In [184]:
torch.manual_seed(28)
context_length = 8
batch_size = 4


def get_batch(split):
    data = train_data if split == 'train' else val_data
    random_offsets = torch.randint(len(data) - context_length, (batch_size,))
    xb = torch.stack([data[i:i + context_length] for i in random_offsets])
    yb = torch.stack([data[i:i + context_length, -1] == 7 for i in random_offsets])
    return xb, yb.float()


xb, yb = get_batch('train')
print(xb.shape)
print(xb)

print(yb.shape)
print(yb)

torch.Size([4, 8, 9])
tensor([[[0, 1, 3, 1, 4, 1, 2, 5, 6],
         [0, 1, 2, 5, 2, 5, 2, 1, 6],
         [0, 1, 2, 1, 2, 5, 2, 5, 6],
         [0, 1, 2, 5, 3, 1, 2, 1, 6],
         [0, 1, 2, 1, 2, 1, 2, 1, 6],
         [0, 1, 2, 5, 2, 5, 2, 5, 6],
         [0, 1, 2, 1, 2, 1, 2, 5, 6],
         [0, 1, 3, 5, 2, 5, 2, 1, 7]],

        [[0, 1, 2, 1, 2, 1, 2, 1, 6],
         [0, 1, 2, 5, 2, 1, 2, 5, 6],
         [0, 1, 2, 1, 2, 5, 2, 5, 6],
         [0, 1, 2, 1, 2, 5, 2, 1, 6],
         [0, 1, 2, 5, 2, 1, 2, 5, 6],
         [0, 1, 2, 1, 2, 5, 3, 5, 6],
         [0, 1, 2, 1, 2, 5, 2, 1, 6],
         [0, 1, 2, 1, 4, 5, 4, 5, 7]],

        [[0, 1, 2, 1, 2, 1, 2, 1, 6],
         [0, 1, 2, 1, 2, 1, 2, 1, 6],
         [0, 1, 2, 5, 2, 5, 2, 5, 6],
         [0, 1, 2, 1, 3, 1, 2, 1, 6],
         [0, 1, 2, 5, 2, 1, 3, 1, 6],
         [0, 1, 2, 5, 2, 5, 2, 5, 6],
         [0, 1, 2, 5, 3, 1, 2, 5, 6],
         [0, 1, 2, 5, 4, 1, 2, 5, 6]],

        [[0, 1, 2, 1, 2, 1, 2, 5, 6],
         [0, 1, 2, 5, 

This might be the closest one yet, try adding attention to this model see how it goes

In [394]:
n_embd = 32
category = 1


# TODO: Rewrite the whole thing to use an Embeddings follow the paper closely or repurpose this as the FFN
class GPT(nn.Module):
    def __init__(self):
        super().__init__()

        self.embedding_table = nn.Embedding(9, category)
        self.linear = nn.Linear(9, category)

    def forward(self, activities, labels=None):
        logits = self.embedding_table(activities)
        B, T, C, D = logits.shape
        logits = logits.view(B * T, C * D)
        logits = self.linear(logits)

        if labels is None:
            loss = None
        else:
            B, T = logits.shape
            # print(B, T)
            labels = labels.view(B * T, 1)  # B, T, 1
            # print(labels.shape, '=====')

            loss = F.binary_cross_entropy_with_logits(logits, labels)

        return logits, loss


model = GPT()
logits, loss = model(xb, yb)
# print(loss)

In [407]:
max_iteration = 5
block_size = 8

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for _ in range(max_iteration):
    # Get Samples
    actions, labels = get_batch('train')

    # Evaluate the loss
    logits, loss = model(actions, labels)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss)

tensor(0.7905, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [393]:
prediction = model(torch.tensor([[[0, 1, 4, 5, 3, 5, 3, 5, 7]]]))
prediction

(tensor([[0.0939]], grad_fn=<AddmmBackward0>), None)

In [1677]:
actions, labels = get_batch('train')

In [1773]:
# Version 4 Self-Attention
import math

torch.manual_seed(1337)
B, T, C = 4, 4, 32
# x = torch.randn(B, T, C)
x = actions
C = 8

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # (B, T, 16)
q = query(x)  # (B, T, 16)
v = value(x)  # (B, T, 16)
wei = q @ k.transpose(-2, -1) * (1 / math.sqrt(k.size(-1)))  # (B, T, 16) @ (B, 16, T) = (B, T, T)
# tril = torch.tril(torch.ones((T, T)))
# wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = wei @ v  # (B, T, T) @ (B, T, 16) = (B, T, 16)

out.shape

torch.Size([1, 30, 16])