Codex (action id → action name):
0 : "Home Page"
1 : "Explore"
2 : "Product A"
3 : "Product B"
4 : "Product C"
5 : "Add Product"
6 : "Close App"
7 : "Buy"

In [586]:
import torch
import math
import random
from torch import nn
from torch.nn import functional as F


# torch.manual_seed(208)
class VirtualShopper:
    """Simulate one shopping session as a list of Codex action ids.

    A session always has the same layout:

    * ``0`` (Home Page) and ``1`` (Explore),
    * three rounds of "view a product" (``2``/``3``/``4``) followed by either
      ``5`` (Add Product) or ``1`` (back to Explore),
    * a final ``7`` (Buy) or ``6`` (Close App).

    Products that were added to the cart raise the chance that the session
    ends with Buy; Product C counts the most, so a cart holding two or more
    Product C is much more likely to convert.

    Attributes set by ``__init__``: ``products``, ``product_scores``,
    ``selected_products``, ``actions`` and ``cart``.
    """

    def __init__(self):
        # Sampling index -> Codex id (Product A = 2, Product B = 3, Product C = 4).
        self.products = {0: 2, 1: 3, 2: 4}
        # Sampling weights for the three products; they are passed to
        # torch.multinomial as-is (not normalised), so Product A is viewed most often.
        self.product_scores = torch.tensor([0.85, 0.2, 0.1])

        # Three product views per session, drawn with replacement.
        self.selected_products = torch.multinomial(self.product_scores, num_samples=3, replacement=True)
        self.actions = [0, 1]
        self.cart = []
        for sp in self.selected_products:
            product = self.products[int(sp)]
            self.actions.append(product)
            # Coin flip: 5 = Add Product, 1 = go back to Explore without adding.
            adding_to_cart = 5 if random.randint(0, 1) == 1 else 1
            self.actions.append(adding_to_cart)
            if adding_to_cart == 5:
                self.cart.append(product)

        # Every carted product adds to a score that is compared against a
        # uniform draw from [0, 10) below.
        chance_of_buying = 0.
        for item in self.cart:
            if item == 4:
                chance_of_buying += 3.33
            if item == 3:
                chance_of_buying += 1.
            # Product A is Codex id 2. The cart never contains id 1 (Explore),
            # so testing for 1 here would make this bonus unreachable.
            if item == 2:
                chance_of_buying += 0.25

        if chance_of_buying >= random.random() * 10:
            self.actions.append(7)
        else:
            self.actions.append(6)

    def get_actions(self):
        """Return the 8 action ids that precede the final Buy/Close App action."""
        return self.actions[:-1]

    def get_labels(self):
        """Return True when the session ended with Buy (7), False otherwise."""
        return self.actions[-1] == 7


In [1684]:
# for _ in range(batch_size):
#     action_batch = []
#     label_batch = []
#     for _ in range(100):
#         user_activity = VirtualShopper()
#         action_batch.append(torch.tensor(user_activity.get_actions(), dtype=torch.long))
#         label_batch.append(torch.tensor([user_activity.get_labels()], dtype=torch.bool))
#     # user_actions.append([[1, 2]])
#     user_actions.append(action_batch)
#     action_labels.append(label_batch)
# 
# user_actions = torch.tensor(user_actions)
# action_labels = torch.tensor(action_labels)
# print(user_actions.shape)
# print(action_labels.shape)
# print(torch.stack(user_actions))

# print(user_actions[0][:2])
# print(action_labels[0][:2])

batch_size = 64

# Number of simulated sessions in each split.
N_TRAIN_SESSIONS = 3000
N_VAL_SESSIONS = 1000


def make_sessions(n_sessions):
    """Simulate ``n_sessions`` shoppers and stack them into two float tensors.

    Args:
        n_sessions: number of VirtualShopper sessions to generate (must be > 0,
            torch.stack rejects an empty list).

    Returns:
        ``(actions, labels)`` where ``actions`` has shape ``(n_sessions, 8)``
        (one row of action ids per session) and ``labels`` has shape
        ``(n_sessions, 1)`` with 1.0 for sessions that ended with Buy.
    """
    sessions = [VirtualShopper() for _ in range(n_sessions)]
    # Float dtype because these tensors are fed straight into nn.Linear layers
    # and into a BCE-with-logits loss.
    actions = torch.stack([torch.tensor(s.get_actions(), dtype=torch.float) for s in sessions])
    labels = torch.stack([torch.tensor([s.get_labels()], dtype=torch.float) for s in sessions])
    return actions, labels


# Training split is generated first, then validation, so the order in which
# random numbers are consumed is the same as with two back-to-back loops.
train_actions, train_labels = make_sessions(N_TRAIN_SESSIONS)  # (3000, 8), (3000, 1)
val_actions, val_labels = make_sessions(N_VAL_SESSIONS)  # (1000, 8), (1000, 1)

In [1940]:
# Seed torch's global RNG so the layers instantiated later in this cell start
# from the same initial weights on every run. Python's `random` module is not
# seeded here.
torch.manual_seed(28)

class Head(nn.Module):
    """One head of unmasked scaled dot-product self-attention.

    No causal mask and no dropout are applied: every position attends to
    every position of the same sequence.

    Args:
        head_size: width of the query/key/value projections, and therefore of
            the output's last dimension.
        n_embd: size of the input's last dimension (defaults to 32).
    """

    def __init__(self, head_size=3, n_embd=32):
        super().__init__()
        # Creation order (query, key, value) is kept so that weight
        # initialisation under a fixed seed does not change.
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: tensor of shape ``(B, T, n_embd)``.

        Returns:
            Tensor of shape ``(B, T, head_size)``.
        """
        # Project the input into queries, keys and values.
        q = self.query(x)  # (B, T, hs)
        k = self.key(x)  # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # Scaled dot-product scores. The scale is 1 / sqrt(d_k), where d_k is
        # the key width (head_size), not the width of the input embedding.
        wei = q @ k.transpose(-2, -1) * (1 / math.sqrt(k.size(-1)))  # (B, T, hs) @ (B, hs, T) = (B, T, T)
        # Softmax over the last dimension: every row of weights sums to 1.
        wei = F.softmax(wei, dim=-1)
        # Weighted sum of the values.
        out = wei @ v  # (B, T, T) @ (B, T, hs) = (B, T, hs)

        return out

# TODO: Rewrite this model to use an embedding layer and follow the paper more closely, or repurpose this class as the feed-forward network (FFN)
class SimpleMLP(nn.Module):
    """Buy / no-buy scorer: linear projection -> attention head -> small MLP.

    The input's last dimension must be 8 (the action ids of one session).
    It is projected to 32 features, passed through one attention head of
    width 32 and reduced to a single raw logit by a two-layer MLP.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are created in this order (attention, mlp, projection)
        # so that weight initialisation under a fixed seed does not change.
        self.attention_head = Head(head_size=32)
        self.mlp = nn.Sequential(
            nn.Linear(32, 32*4),
            nn.ReLU(),
            # No final activation: the output is a logit, matching the
            # BCE-with-logits loss used in forward().
            nn.Linear(32*4, 1),
        )
        self.projection = nn.Linear(8, 32, bias=False)

    def forward(self, activities, labels=None):
        """Score sessions and optionally compute the training loss.

        Args:
            activities: float tensor whose last dimension is 8.
            labels: optional float tensor with the same shape as the returned
                logits; when given, the binary cross-entropy loss is computed.

        Returns:
            ``(logits, loss)`` where ``logits`` has last dimension 1 and
            ``loss`` is ``None`` when ``labels`` is not provided.
        """
        proj = self.projection(activities)  # (..., 8) -> (..., 32)
        attention = self.attention_head(proj)  # (..., 32)
        logits = self.mlp(attention)  # (..., 1)
        # Identity check: `== None` on a tensor goes through tensor comparison
        # instead of testing whether the argument was omitted.
        if labels is None:
            loss = None
        else:
            loss = F.binary_cross_entropy_with_logits(logits, labels)
        return logits, loss


# Instantiate the classifier; the training cell builds its optimizer from this
# object's parameters.
model = SimpleMLP()

In [2051]:
def get_batch(split, n_groups=100):
    """Return one randomly chosen group of sessions together with its labels.

    The selected split is viewed as ``n_groups`` equally sized groups and one
    of them is picked uniformly at random.

    Args:
        split: ``'train'`` selects the training tensors; any other value
            selects the validation tensors.
        n_groups: number of groups the split is divided into. The number of
            sessions in the split must be divisible by it (``view`` raises
            otherwise). Defaults to 100.

    Returns:
        ``(actions, labels)`` with shapes ``(1, sessions_per_group, 8)`` and
        ``(1, sessions_per_group, 1)``.
    """
    actions, labels = (train_actions, train_labels) if split == 'train' else (val_actions, val_labels)
    actions = actions.view(n_groups, -1, 8)
    labels = labels.view(n_groups, -1, 1)
    # random.randint includes both end points, so the upper bound has to be the
    # last valid index; with that, the slice below can never be empty and no
    # retry is needed.
    group = random.randint(0, len(actions) - 1)
    return actions[group:group + 1], labels[group:group + 1]


# Number of optimisation steps; each step trains on one group from get_batch.
max_iteration = 5
# NOTE(review): block_size is not read by any live code in the visible cells.
block_size = 8

# AdamW over every parameter of the model, with a fixed learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for _ in range(max_iteration):
    # Draw one random group of training sessions and their labels.
    actions, labels = get_batch('train')

    # Forward pass; because labels are passed, the model also returns the
    # binary cross-entropy loss.
    logits, loss = model(actions, labels)
    # Clear gradients from the previous step, backpropagate, then update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Only the loss of the last step is printed.
print(loss)

tensor(0.4192, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [2055]:
# Score one hand-written session (ids per the Codex at the top of the notebook):
# Home Page, Explore, Product C + Add Product, then Product B + Add Product twice.
# No labels are passed, so the model returns (logits, None); the first element
# is a raw logit, not a probability.
prediction = model(torch.tensor([[[0., 1., 4., 5., 3., 5., 3., 5.]]]))
prediction

(tensor([[[-3.4182]]], grad_fn=<ViewBackward0>), None)

In [1677]:
# Fetch one random training group; the self-attention walkthrough cell further
# down reads `actions` as its input tensor.
actions, labels = get_batch('train')

In [1773]:
# Version 4 Self-Attention
# Step-by-step version of a single unmasked attention head, applied to one
# batch of session data so the intermediate shapes can be inspected.
import math

torch.manual_seed(1337)
x = actions
# Read the dimensions from the tensor itself instead of hard-coding them, so
# B, T and C always describe the data that is actually being used.
B, T, C = x.shape

head_size = 16
# Creation order (key, query, value) is kept so that the seeded weight
# initialisation is unchanged.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # (B, T, 16)
q = query(x)  # (B, T, 16)
v = value(x)  # (B, T, 16)
# Scores are scaled by 1 / sqrt(d_k), with d_k the key width (head_size).
wei = q @ k.transpose(-2, -1) * (1 / math.sqrt(k.size(-1)))  # (B, T, 16) @ (B, 16, T) = (B, T, T)
# A causal (decoder-style) variant would mask the upper triangle first:
# tril = torch.tril(torch.ones((T, T)))
# wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # every row sums to 1
out = wei @ v  # (B, T, T) @ (B, T, 16) = (B, T, 16)

out.shape

torch.Size([1, 30, 16])