In [1]:
# Seed reused below for PYTHONHASHSEED and for the Annoy index.
GLOBAL_SEED = 42

import os
# NOTE(review): both variables are normally read by the interpreter at start-up, so
# assigning them here presumably only affects child processes, not this kernel's own
# I/O encoding or hash randomisation — confirm if hash determinism matters.
os.environ["PYTHONIOENCODING"] = "utf8"
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys
from glob import glob

import pandas as pd
import numpy as np
from numpy import random as np_rnd
import random as rnd
import shutil
import gc
import datetime
from collections import defaultdict, Counter
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import time
import pickle
import sklearn as skl
from sklearn import model_selection

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from torch.optim import AdamW, Adam, SparseAdam
from transformers import get_polynomial_decay_schedule_with_warmup

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Displayed as the cell output to record the PyTorch build in use.
torch.__version__

'2.1.2+cu118'

In [2]:
from torch_geometric.data import Data
from torch_geometric.utils import coalesce
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv

In [3]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook may use.

    Python's ``random``, NumPy's global generator and PyTorch (CPU and every CUDA
    device) are always seeded. TensorFlow and CuPy are seeded only when the notebook
    has bound them to the module-level aliases ``tf_rnd`` / ``cp``; when those names
    do not exist they are skipped.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): this is exported for child processes; the running interpreter
    # presumably fixed its hash seed at start-up already.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # python random
    rnd.seed(seed)
    # numpy random
    np_rnd.seed(seed)

    # Optional back ends are looked up by name instead of being wrapped in a bare
    # ``except``, which would also hide genuine seeding failures.
    module_names = globals()
    # tf random
    tf_random = module_names.get("tf_rnd")
    if tf_random is not None:
        tf_random.set_seed(seed)
    # RAPIDS random
    cupy = module_names.get("cp")
    if cupy is not None:
        cupy.random.seed(seed)

    # pytorch random
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
        
def pickleIO(obj, src, op="w"):
    """Write an object to, or read an object from, a pickle file.

    Args:
        obj: Object to serialise when ``op == "w"``; ignored when reading.
        src: Path of the pickle file.
        op: ``"w"`` dumps ``obj`` to ``src``; ``"r"`` loads and returns its content.

    Returns:
        The unpickled object when ``op == "r"``, otherwise ``None``.

    Raises:
        ValueError: If ``op`` is neither ``"w"`` nor ``"r"``.
    """
    if op == "w":
        with open(src, "wb") as f:
            pickle.dump(obj, f)
        return None
    if op == "r":
        # SECURITY: unpickling can execute arbitrary code; only read trusted files.
        with open(src, "rb") as f:
            return pickle.load(f)
    # Previously an unknown mode fell through and silently returned None.
    raise ValueError(f"op must be 'w' or 'r', got {op!r}")
    
def create_submission(df):
    """Reshape per-(session, type) predictions into ``session_type`` / ``labels`` rows.

    Args:
        df: DataFrame that, after ``reset_index()``, has a ``session`` column, a
            ``type`` column holding keys of ``CFG.contentType_mapper`` and a
            ``prediction`` column.

    Returns:
        DataFrame with the columns ``session_type`` (``"<session>_<type name>"``)
        and ``labels`` (the former ``prediction`` column).
    """
    df = df.reset_index()
    # CFG.contentType_mapper already yields the plural names ("clicks", "carts",
    # "orders"), so no extra "s" is appended; doing so produced keys like "1_clickss".
    type_names = df["type"].map(CFG.contentType_mapper).astype("str")
    df["session_type"] = df["session"].astype("str") + "_" + type_names
    return df[["session_type", "prediction"]].rename(columns={"prediction": "labels"})


In [4]:
class CFG:
    """Notebook-wide constants."""

    # Integer event-type code -> event-type name.
    contentType_mapper = pd.Series({0: "clicks", 1: "carts", 2: "orders"})
    # NOTE(review): presumably per-type weights in clicks/carts/orders order; not
    # read anywhere in the visible cells.
    target_weight = (0.1, 0.3, 0.6)
    # Embedding width; also passed to AnnoyIndex as the vector dimension.
    embed_dim = 28
    

In [5]:
# Read both pickled inputs (node features first, then the sampled training edges)
# and move each one to `device`.
node_feature, edge_index = (
    pickleIO(None, pickle_path, "r").to(device)
    for pickle_path in ("node_feature.pkl", "sampled_edge_train.pkl")
)

In [6]:
class GCN(nn.Module):
    """Embedding lookup followed by one GCNConv layer, a ReLU and a linear layer.

    Args:
        n_aids: Number of rows in the embedding table.
        embed_dim: Embedding width; also the input and output width of the
            graph convolution and of the final linear layer.
    """

    def __init__(self, n_aids, embed_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay as they are
        # for saved checkpoints to load.
        self.aid_factors = nn.Embedding(num_embeddings=n_aids, embedding_dim=embed_dim, sparse=False)
        self.gcn = GCNConv(in_channels=embed_dim, out_channels=embed_dim)
        self.gcn_act = nn.ReLU()
        self.lin = nn.Linear(in_features=embed_dim, out_features=embed_dim)

    def forward(self, x, edge_index):
        """Return one ``embed_dim``-wide vector per entry of ``x``.

        Args:
            x: Indices into the embedding table.
            edge_index: Graph connectivity forwarded to the GCNConv layer.
        """
        convolved = self.gcn(self.aid_factors(x), edge_index=edge_index)
        return self.lin(self.gcn_act(convolved))

In [7]:
# Rebuild the network with the shared embedding width (CFG.embed_dim) rather than a
# second hard-coded 28, so the model and the Annoy index cannot drift apart.
# NOTE(review): len(node_feature) is used as the embedding-table size — assumes
# node_feature holds one entry per item id; confirm against the training notebook.
model = GCN(len(node_feature), CFG.embed_dim)
# SECURITY: torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load("model_fold0_best.pth", map_location=torch.device('cpu'))
model.load_state_dict(checkpoint["model"])
model.eval()
model.to(device)
with torch.no_grad():
    # Under no_grad the output carries no autograd history, so no detach() is needed.
    embeddings = model(node_feature, edge_index).cpu().numpy()
del model, checkpoint, edge_index; gc.collect()

90

In [8]:
# Show the embedding matrix computed above (NumPy abbreviates the repr of large arrays).
embeddings

array([[-0.17929766, -0.07816672,  0.0561974 , ..., -0.08671962,
        -0.03234575, -0.01899198],
       [ 0.27164188,  0.28182897,  0.11662263, ...,  0.04597885,
        -0.5395058 , -0.31788978],
       [-0.23975751,  0.88707423, -0.40279135, ..., -0.03156289,
         0.06961562,  0.5065333 ],
       ...,
       [ 0.0042207 ,  0.25794527,  0.25145513, ..., -0.21140511,
        -0.02078458, -0.16550325],
       [-0.3870101 ,  0.5038369 , -0.3100036 , ...,  0.38778126,
        -0.04624266, -0.3733773 ],
       [ 0.7157803 , -0.34527478, -0.6836545 , ...,  0.71059257,
        -0.16575226,  0.46625578]], dtype=float32)

In [9]:
pip install --user annoy

Note: you may need to restart the kernel to use updated packages.


In [10]:
import annoy
from annoy import AnnoyIndex

In [11]:
# Number of trees passed to AnnoyIndex.build.
ntrees = 100

# Euclidean Annoy index over CFG.embed_dim-wide vectors, seeded with GLOBAL_SEED.
searcher = AnnoyIndex(CFG.embed_dim, 'euclidean')
searcher.set_seed(GLOBAL_SEED)

# The row position in `embeddings` is used as the Annoy item id.
for row_id in range(len(embeddings)):
    searcher.add_item(row_id, embeddings[row_id])

searcher.build(ntrees)

True

In [12]:
# node_feature is not referenced by the visible cells below; drop the reference and
# run a collection pass.
del node_feature
gc.collect()

0

In [13]:
test = pd.read_parquet('./data/test.parquet')
session_types = ['clicks', 'carts', 'orders']
# Group once and collect, per session, the list of aids and the list of event types
# in the row order of the parquet file.
events_by_session = test.reset_index(drop=True).groupby('session')
test_session_AIDs = events_by_session['aid'].apply(list)
test_session_types = events_by_session['type'].apply(list)
# The per-session lists are all that is needed from here on.
del test, events_by_session; gc.collect()

0

In [14]:
# Number of recommendations emitted per session.
n_aids = 20

# Column-wise accumulator; converted to a DataFrame once after the loop.
output = {
    "session": [],
    "type": [],
    "rec": [],
    "score": [],
}

for SESS, AIDs in tqdm(test_session_AIDs.items(), total=len(test_session_AIDs)):
    # Only the last aid in the session's list is used as the query.
    query_aid = AIDs[-1]
    # Ask for one extra neighbour because the query item is normally returned as its
    # own nearest neighbour; drop it by value so a tie at distance 0 cannot leave the
    # query itself in the list while a real neighbour is discarded.
    neighbours = searcher.get_nns_by_item(query_aid, n_aids + 1)
    candidates = [aid for aid in neighbours if aid != query_aid][:n_aids]
    # Fall back to the session's own aids when the index returns nothing else.
    if len(candidates) == 0: candidates = AIDs.copy()

    # Neighbours are distinct, so every count is 1 and most_common keeps their order;
    # the counts only differ in the fallback branch, where aids may repeat.
    candidates = Counter(candidates)
    rec, score = zip(*candidates.most_common(n_aids))

    # Plain str.join yields the same text as building a pandas Series per session,
    # without the per-iteration Series construction cost.
    rec_text = " ".join(map(str, rec))
    score_text = " ".join(map(str, score))

    # The same recommendation list is emitted for all three event types (0, 1, 2).
    output["session"].extend([SESS] * 3)
    output["type"].extend([0, 1, 2])
    output["rec"].extend([rec_text] * 3)
    output["score"].extend([score_text] * 3)

output = pd.DataFrame(output).set_index(["session", "type"])

100%|███████████████████████████████████████████████████████████████████████| 1671803/1671803 [38:57<00:00, 715.10it/s]


In [15]:
# Display the (session, type)-indexed recommendation table built above.
output

Unnamed: 0_level_0,Unnamed: 1_level_0,rec,score
session,type,Unnamed: 2_level_1,Unnamed: 3_level_1
12899779,0,1035726 1034086 553695 1010169 1723172 1679994...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
12899779,1,1035726 1034086 553695 1010169 1723172 1679994...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
12899779,2,1035726 1034086 553695 1010169 1723172 1679994...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
12899780,0,1229906 484175 1707102 903505 812732 243201 60...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
12899780,1,1229906 484175 1707102 903505 812732 243201 60...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
...,...,...,...
14571580,1,1345247 1323437 1053699 494622 1843398 282588 ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
14571580,2,1345247 1323437 1053699 494622 1843398 282588 ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
14571581,0,923052 349927 1819514 1251085 144001 1106435 7...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
14571581,1,923052 349927 1819514 1251085 144001 1106435 7...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


In [16]:
# Persist the raw recommendations; the (session, type) index is turned back into
# ordinary columns before writing.
output.reset_index().to_parquet("./raw_output.parquet")

In [17]:
# Build the "<session>_<type name>" key for every (session, type) index entry.
output["session_type"] = [
    f"{session}_{CFG.contentType_mapper[type_code]}"
    for session, type_code in output.index
]

In [18]:
# Start from the sample submission, keyed by session_type, and overwrite the labels
# of every key present in `output` with its recommendation string.
submission = pd.read_csv("./data/sample_submission.csv").set_index("session_type")
submission.loc[output["session_type"].to_numpy(), "labels"] = output["rec"].to_numpy()
# Restore session_type as a regular column before writing the CSV.
submission = submission.reset_index()
submission.to_csv("submission.csv", index=False)

In [19]:
# Display the final submission frame that was written to submission.csv.
submission

Unnamed: 0,session_type,labels
0,12899779_clicks,1035726 1034086 553695 1010169 1723172 1679994...
1,12899779_carts,1035726 1034086 553695 1010169 1723172 1679994...
2,12899779_orders,1035726 1034086 553695 1010169 1723172 1679994...
3,12899780_clicks,1229906 484175 1707102 903505 812732 243201 60...
4,12899780_carts,1229906 484175 1707102 903505 812732 243201 60...
...,...,...
5015404,14571580_carts,1345247 1323437 1053699 494622 1843398 282588 ...
5015405,14571580_orders,1345247 1323437 1053699 494622 1843398 282588 ...
5015406,14571581_clicks,923052 349927 1819514 1251085 144001 1106435 7...
5015407,14571581_carts,923052 349927 1819514 1251085 144001 1106435 7...
