In [1]:
import json 
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

from sklearn.preprocessing import OneHotEncoder

import torch.version
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

# Bare expression: evaluates CUDA availability, the CUDA version and the torch
# version, but the tuple is discarded (it is not the last expression of the cell).
torch.cuda.is_available(), torch.version.cuda, torch.__version__
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        
# Path of the JSON-lines training file passed to loadBatchData.
trainfile = "train.jsonl"
# Number of lines (one JSON record each) loaded per loadBatchData call.
batch_size = 100
# Zero-based index of the first line to load.
start = 0

def loadBatchData(start, batch_size, filename):
    """Load a window of records from a JSON-lines file.

    Args:
        start: zero-based index of the first line to read.
        batch_size: number of consecutive lines to consider, starting at ``start``.
        filename: path of a UTF-8 file holding one JSON document per line.

    Returns:
        A list with the decoded record of every line in
        ``[start, start + batch_size)`` that is valid JSON. Lines that fail
        to decode are reported with a "bad lines" message and skipped, so the
        list may be shorter than ``batch_size``.
    """
    trainset = []
    stop = start + batch_size
    with open(filename, "r", encoding="utf-8") as file:
        for line_no, line in enumerate(file):
            if line_no < start:
                continue
            if line_no >= stop:
                # Past the requested window: no need to read the rest of the file.
                break
            try:
                trainset.append(json.loads(line))  # JSON string -> dictionary
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated; any other error
                # (including KeyboardInterrupt) propagates to the caller.
                print("bad lines")

    return trainset

# define action space
def getItems(trainset) -> set:
    """Return the set of distinct item ids ("aid") found in the sessions' events.

    Args:
        trainset: iterable of session dicts, each with an "events" list whose
            entries carry an "aid" key.
    """
    return {event["aid"] for session in trainset for event in session["events"]}


# Encoding
- One-hot encoding wastes too much space.

In [2]:

class Encoding():
    """Item-id lookup plus construction of (item, context items) training pairs.

    Attributes:
        trainset: list of session dicts; each has an "events" list whose
            entries carry "aid" (item id) and "ts" (timestamp).
        encoding: dict mapping item id -> dense index, filled by getMapping();
            None until then.
        items_len: largest item id + 1, filled by getMapping(); used as the
            width of the vectors produced by id2Vec().
    """

    def __init__(self, trainset: list):
        self.trainset = trainset
        self.encoding = None
        self.items_len = 0

    def getMapping(self) -> dict:
        """Build the item-id -> dense-index lookup for the training set.

        Indices are assigned in ascending item-id order, so the mapping is
        deterministic for a given set of items. An empty training set yields
        an empty mapping and ``items_len == 0``.

        Output:
            key: item id
            data: dense index of that item (its one-hot position)
        """
        items = getItems(self.trainset)
        # NOTE(review): this is "max id + 1", not the number of distinct items,
        # even though the message below calls it "num items".
        self.items_len = max(items, default=-1) + 1

        print(f"num items: {self.items_len}")

        self.encoding = {item: index for index, item in enumerate(sorted(items))}
        return self.encoding

    def id2Vec(self, X: list, y: list) -> tuple[torch.Tensor, torch.Tensor]:
        """Turn item ids into one-hot / multi-hot integer tensors.

        Args:
            X: item ids, one per row.
            y: for each row, the list of context item ids.

        Returns:
            ``(X_vec, y_vecs)``, both int64 tensors of shape
            ``(min(len(X), len(y)), items_len)``. Row r of ``X_vec`` has a
            single 1 at the index of ``X[r]``; row r of ``y_vecs`` has a 1 at
            the index of every id in ``y[r]``.

        Raises:
            KeyError: if an id is missing from the mapping.
            TypeError: if getMapping() has not been called yet.
        """
        n_rows = min(len(X), len(y))
        # Allocate the tensors directly instead of building Python lists of
        # items_len zeros for every row.
        X_vec = torch.zeros((n_rows, self.items_len), dtype=torch.int64)
        y_vecs = torch.zeros((n_rows, self.items_len), dtype=torch.int64)

        for row, (item, context) in enumerate(zip(X, y)):
            X_vec[row, self.encoding[item]] = 1
            for neighbour in context:
                y_vecs[row, self.encoding[neighbour]] = 1

        return X_vec, y_vecs

    def ySync(self, y: list, pad_value: int = 0) -> list:
        """Right-pad every row of ``y`` in place to the length of the longest row.

        Args:
            y: list of lists; the inner lists are extended in place.
            pad_value: filler appended to short rows (default 0).

        Returns:
            The same ``y`` object, now rectangular.
        """
        max_len = max((len(row) for row in y), default=0)
        for row in y:
            row.extend([pad_value] * (max_len - len(row)))
        return y

    def getEmbeddingTrainSet(self, context_window: int = 60*60*24*1000) -> tuple[list, list]:
        """Build (item, context items) pairs for every event of every session.

        For each event, the context is every item of the same session whose
        timestamp lies strictly within ``context_window`` of the event's
        timestamp (the event's own item is therefore included).

        Args:
            context_window: half-width of the time window, in the same unit as
                the events' "ts" field. The default equals one day if the
                timestamps are in milliseconds.

        Returns:
            ``(X, y)`` where ``X[k]`` is an item id and ``y[k]`` is the list of
            its context item ids, right-padded with 0 so that all rows of
            ``y`` have the same length.

        Improve:
        1. y duplicate for each timestamp?
        """
        X = []  # Input
        y = []  # Near
        for session in tqdm(self.trainset):
            events = session["events"]
            # Extract the columns once per session rather than once per event.
            aids = np.array([ev["aid"] for ev in events])
            stamps = np.array([ev["ts"] for ev in events])
            for ev in events:
                X.append(ev["aid"])
                nearby = (stamps < ev["ts"] + context_window) & (stamps > ev["ts"] - context_window)
                # tolist() yields plain Python ints rather than numpy scalars.
                y.append(aids[nearby].tolist())

        y = self.ySync(y)

        return X, y
    

    

In [3]:
# Re-selects the compute device (same expression as in the first cell): GPU if CUDA is available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CustomDataset(Dataset):
    """Dataset wrapper that stores its input as a single float32 tensor."""

    def __init__(self, data):
        """Copy ``data`` (anything ``torch.tensor`` accepts) into a float32 tensor."""
        as_float = torch.tensor(data, dtype=torch.float32)
        self.data = as_float

    def __len__(self):
        """Number of samples, i.e. the size of the first dimension."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample(s) selected by ``idx`` from the stored tensor."""
        sample = self.data[idx]
        return sample
    

class Embedding(nn.Module):
    """Embedding lookup followed by a linear projection back onto the vocabulary.

    The forward pass returns log-probabilities over the vocabulary for each
    input index.

    Args:
        vocab_size: number of distinct indices; an int or anything convertible
            with ``int()`` (e.g. a 0-dim tensor).
        emb_size: dimensionality of the embedding vectors.
    """

    def __init__(self, vocab_size, emb_size):
        super().__init__()
        # Coerce to plain ints so a tensor-valued size does not leak into the
        # layer constructors.
        self.emb_size = int(emb_size)
        self.vocab_size = int(vocab_size)
        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        self.linear = nn.Linear(self.emb_size, self.vocab_size)
        # Normalise over the vocabulary axis explicitly. Without `dim` the
        # axis is chosen implicitly (deprecated) and is dim 0 for 3-D input.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Map integer indices of shape ``(*)`` to log-probabilities ``(*, vocab_size)``."""
        x = self.embedding(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x

In [4]:
# Read `batch_size` lines of the training file, starting at line `start`.
trainset = loadBatchData(start, batch_size, trainfile)

# Build the item-id lookup, then the (item, padded context items) training pairs.
encoding = Encoding(trainset)
encoding.getMapping()
X, y = encoding.getEmbeddingTrainSet()

num items: 1855501


100%|██████████| 100/100 [00:03<00:00, 29.07it/s]


In [None]:
# Move the training pairs to the compute device. as_tensor keeps the cell
# re-runnable when X / y already hold tensors from a previous execution.
X = torch.as_tensor(X).to(device)
y = [torch.as_tensor(context).to(device) for context in y]

# Largest item id + 1, reduced on the tensor instead of iterating it in Python.
vocab_size = int(X.max()) + 1
emb_size = 10
init_train = True  # False: resume from the weights stored at embed_path
embed_path = "embed.pt"

# The model is always constructed; when resuming, the saved state dict is
# loaded into it (a bare torch.load would leave `embedding` undefined).
embedding = Embedding(vocab_size, emb_size).to(device)
if not init_train:
    embedding.load_state_dict(
        torch.load(embed_path, map_location=device, weights_only=True)
    )
optimizer = optim.Adam(embedding.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)  # Reduce LR every 10 epochs

embedding.train()
for epoch in range(3):
    total_loss = []
    for row, x in enumerate(tqdm(X)):
        optimizer.zero_grad()
        embedded = embedding(x)  # log-probabilities over the vocabulary
        context = y[row]
        # Negative log-likelihood summed over the context items. Entries equal
        # to 0 are skipped as padding.
        # NOTE(review): a real item with id 0 would be skipped too - confirm.
        targets = context[context != 0]
        # An empty `targets` still gives a differentiable zero, so backward()
        # cannot fail on an all-padding row.
        loss = -embedded[targets].sum()
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())

    scheduler.step()
    print(np.mean(total_loss))
    # Checkpoint after every epoch.
    torch.save(embedding.state_dict(), embed_path)


  2%|▏         | 197/10681 [00:11<10:13, 17.09it/s]