In [1]:
import json 
import time
import os
import pandas as pd
import numpy as np

import torch
import torch.version
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F

import multiprocessing
from multiprocessing import Pool, cpu_count
import multiprocessing.managers

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

def loadStartSes(loadinfo_file):
    if os.path.exists(loadinfo_file):
        ses = []   
        with open(loadinfo_file, "r") as f:
            for line in f:
                ses.append(json.loads(line))
                
        return max(ses) + 1
    return 0   

def loadBatchData(start, batch_size, filename) -> list:
    trainset = []
    with open(filename, "r", encoding="utf-8") as file:
        for id, line in enumerate(file):
            if id < start:
                continue
            if id >= start + batch_size:
                break
            try:
                data = json.loads(line)  
                trainset.append(data)
            except: 
                print("bad lines")
                continue
            
            
    return trainset

def loadNumItems():
    with open("max_item_id.json", "r") as f:
        return json.loads(f.read())["max_item_id"] + 1

def storeNumItems(items):
    with open("max_item_id.json", "w") as f:
        f.write(json.dumps({"max_item_id": max(items)}))

def getItems(filename:  str) -> set:
    items = set()
    with open(filename, "r", encoding="utf-8") as file:
        for line in tqdm(file): 
            session = json.loads(line)  
            for ev in session["events"]:
                items.add(ev["aid"])
    
    storeNumItems(items)
    
    return items



# Embedding model
class EmbDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
class Embedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(Embedding, self).__init__()
        self.emb_size = emb_size 
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.linear = nn.Linear(emb_size, vocab_size)
        self.softmax = nn.LogSoftmax()
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.linear(x)
        x = self.softmax(x)
        return x



cuda


### Total number of sessions

In [2]:
def numTotalSessions():
    sum = 0
    with open("train.jsonl", "r", encoding="utf-8") as file:
        for line in tqdm(file):
            sum += 1
    print(sum)

### Set

In [3]:
# Load set
trainfile = "train.jsonl"
loadinfo_file = "loadinfo_file.json"
batch_size = 1000
batch_cnt = 0
start_ses = 0

# Train set  
epoch = 1
vocab_size = loadNumItems()
emb_size = 10
embedding_model_path = "embed.pt"
trainbatch_size = 128


### Main

In [None]:
model = Embedding(vocab_size, emb_size).to(device)
if os.path.exists(embedding_model_path):
    model.load_state_dict(torch.load(embedding_model_path, map_location=lambda storage, loc: storage, weights_only=True))


def taskGetNeighbor(
    session: list,
    context_window: int = 60*60*24*1000, 
    loadinfo_file = "loadinfo_file.json"
    ) -> None:
    
    session_ = pd.DataFrame(session["events"])
    for ev in session["events"]:
        X.append(ev["aid"])
        y.append(list(session_[(session_["ts"] < ev["ts"] + context_window) & (session_["ts"] > ev["ts"] - context_window)]["aid"]))
    
    with open(loadinfo_file, "a") as f:
        f.write(json.dumps(session["session"]) + "\n")
        
    return {session["session"]: [X, y]}

while True:
    
    # Load data
    start_ses = loadStartSes(loadinfo_file)
    trainset = loadBatchData(start_ses, batch_size, trainfile)
    start_ses += batch_size
    if trainset == []:
        break    
    
    # Get data for train
    with multiprocessing.Manager() as manager:
        X = manager.list([]) # SessionID
        y = manager.list([]) # Near SessionID

        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            result = pool.map(taskGetNeighbor, trainset)
            
        X = torch.tensor(list(X))
        y = [torch.tensor(lst) for lst in list(y)]
        
        pool.close()
        
    traindata = EmbDataset(X)
    trainloader = DataLoader(traindata, batch_size=trainbatch_size, shuffle=True, num_workers=4)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = StepLR(optimizer, step_size=10, gamma=0.8)  # Reduce LR every 10 epochs
    
    model.train()   
    for i in range(epoch):
        total_loss = []
        ses_cnt = 0
        
        for x in tqdm(trainloader):
            x = x.to(device)
            y = [i.to(device) for i in y]
            loss = torch.tensor(0.).to(device)
            
            optimizer.zero_grad()
            embedded = model(x)
            for i, emb in enumerate(embedded):
                loss -= torch.sum(emb[y[ses_cnt*trainbatch_size + i]])
            loss = loss / (i + 1)
            loss.backward()
            optimizer.step()
            
            total_loss.append(loss.item())
            ses_cnt += 1
            
        scheduler.step()
        torch.save(model.state_dict(), embedding_model_path)
    
        print(f"Session end: {start_ses} Epoch: {i} | Loss: {np.mean(total_loss)}")


100%|██████████| 628/628 [01:50<00:00,  5.66it/s]


Session start: 1000 Epoch: 10 | Loss: 675.912981944479


100%|██████████| 575/575 [01:31<00:00,  6.32it/s]


Session start: 2000 Epoch: 106 | Loss: 637.2207468580164


 62%|██████▏   | 328/532 [01:00<00:27,  7.37it/s]

Multithreading?