In [5]:
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec
import gc
import polars as pl
import os
import sys
sys.path.append('../models')
from utils import *
from Node2Vec import *

# Select the compute device: use CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [2]:
# Read just the `session` and `aid` columns of the train and test event tables.
# NOTE(review): among the cells visible here, these frames are only consumed by
# the commented-out graph-building cell -- confirm they are still needed.
train_df = pl.read_parquet(
    '../data/train.parquet',
    columns=['session', 'aid'],
    low_memory=True,
)
test_df = pl.read_parquet(
    '../data/test.parquet',
    columns=['session', 'aid'],
)

In [3]:
# # Save graph data that includes each session's previous aid values (one-off step; the file it writes is the one loaded by torch.load below)

# train_df = lagged_df(train_df)
# test_df = lagged_df(test_df)

# df = pl.concat([
#     train_df,
#     test_df
#     ], how="vertical")

# df = generate_Graph(df)
# torch.save(edges_torch_T,"../output/all_edges_train_and_test.pt")

  df = df.with_column(pl.col("aid").shift(periods=1).over("session")


In [4]:
# Load the precomputed edge tensor from disk.
# NOTE(review): torch.load unpickles the file, so only load artifacts written by
# a trusted run of this project.
edges_tensor = torch.load("../output/all_edges_train_and_test.pt")

In [5]:
# Create the graph data object: the loaded edge tensor is passed as `edge_index`
# and no other attributes are supplied.
data = Data(edge_index=edges_tensor)
print(data)

Data(edge_index=[2, 223644219])


In [None]:
# Drop the notebook-level name for the edge tensor and run the garbage collector.
# NOTE(review): `data` was built from this tensor, so it presumably still
# references it and the memory is not released by this cell alone -- confirm.
del edges_tensor
gc.collect()

In [None]:
# Build the Node2Vec model on the graph's edge index and move it to `device`.
# The optimizer is intentionally NOT created here: the loader/optimizer cell
# below builds the SparseAdam instance that training actually uses, so a second
# copy in this cell was only a dead store.
# NOTE(review): `from Node2Vec import *` in the import cell may shadow the
# torch_geometric `Node2Vec` class if that local module exports the same name --
# confirm which class this call resolves to.
model = Node2Vec(
    data.edge_index,
    embedding_dim=32,
    walk_length=10,            # length of each random walk
    context_size=5,
    walks_per_node=10,
    num_negative_samples=2,
    p=0.2, q=0.5,              # bias parameters of the random walk
    sparse=True,               # paired with torch.optim.SparseAdam below
).to(device)

In [None]:
# Set up the data loader and the optimizer.
# The loader comes from the model itself (batches of 128, shuffled, 6 worker
# processes); SparseAdam is presumably used because the model was created with
# sparse=True.

loader = model.loader(batch_size=128, shuffle=True,
                      num_workers=6)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [None]:
# Drop the notebook-level graph object now that the model has been constructed
# from `data.edge_index`, then run the garbage collector.
# NOTE(review): whether this frees the edge tensor depends on whether the model
# keeps its own reference to it -- confirm.
del data
gc.collect()

In [None]:
# Train for 12 epochs, printing the loss returned after each one.
# NOTE(review): `Node2Vec_train` is not defined in this notebook; it presumably
# comes from one of the star imports (`utils` / `Node2Vec`). It is called with no
# arguments, so it must locate the model, loader and optimizer on its own --
# verify how it gets access to the objects created in the cells above.
for epoch in range(0, 12):
    loss = Node2Vec_train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')

In [None]:
%%time
from annoy import AnnoyIndex


def _build_annoy_index(embeddings, n_trees=10, metric='angular'):
    """Build an Annoy index holding one item per row of `embeddings`.

    Parameters
    ----------
    embeddings : 2-D array-like exposing `.shape`
        Row `i` becomes the vector of Annoy item `i`.
    n_trees : int
        Number of trees passed to `AnnoyIndex.build`.
    metric : str
        Distance metric name passed to `AnnoyIndex`.

    Returns
    -------
    AnnoyIndex
        The built index.
    """
    # Take the vector size from the data instead of hard-coding it, so the index
    # always matches the embedding_dim the model was created with.
    ann = AnnoyIndex(embeddings.shape[1], metric)
    for item_id, vector in enumerate(embeddings):
        ann.add_item(item_id, vector)
    ann.build(n_trees)
    return ann


# Copy the trained embedding table to host memory once and index it. Doing the
# loop inside a helper keeps the per-row temporaries out of the notebook
# namespace, so they do not keep the weights alive after the cleanup cell.
index = _build_annoy_index(model.state_dict()['embedding.weight'].cpu().numpy())

In [None]:
# Export the trained embedding table as a NumPy array and persist it to disk
# under the name "node2vec_embeddings".
# NOTE(review): `model.cpu()` presumably moves the model itself to the CPU as a
# side effect, not just a copy -- confirm if the model is reused on `device`.
embeddings_node2vec = model.cpu().state_dict()['embedding.weight'].numpy()
np.save("node2vec_embeddings",embeddings_node2vec)

In [None]:
# Training artifacts are no longer needed once the Annoy index and the saved
# embeddings exist: drop the names, collect garbage, and ask PyTorch to release
# its cached CUDA memory.
del model, loader, optimizer, embeddings_node2vec
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Validation / Inference
def _rank_session_aids(aids, types, type_weights, top_k=20):
    """Rank one session's aids by a recency- and type-weighted score.

    Each event contributes `weight * type_weights[event_type]` to its aid, where
    `weight` grows from 2**0.1 - 1 for the first event to 2**1 - 1 for the last.
    Returns the `top_k` aids with the highest total score; ties keep
    first-occurrence order.
    """
    weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
    scores = defaultdict(float)
    for aid, weight, event_type in zip(aids, weights, types):
        scores[aid] += weight * type_weights[event_type]
    ranked = sorted(scores, key=lambda aid: -scores[aid])
    return ranked[:top_k]


def _extend_with_neighbors(aids, ann_index, n_neighbors):
    """Return the session's unique aids (most recent first) padded with neighbours.

    Neighbours of the most recent aid are taken from `ann_index`. Anything
    already present in the session (including the query aid itself) is skipped,
    so the returned list never contains duplicates. At most `n_neighbors`
    entries are returned.
    """
    recent_first = list(dict.fromkeys(aids[::-1]))
    seen = set(recent_first)
    # n_neighbors + 1 candidates are always enough to fill the remaining slots,
    # because at most len(recent_first) of them can be filtered out.
    candidates = ann_index.get_nns_by_item(recent_first[0], n_neighbors + 1)
    fresh = [aid for aid in candidates if aid not in seen]
    return (recent_first + fresh)[:n_neighbors]


def evaluate(path, mode="validation", n_neighbors=12,
             labels_path='/kaggle/input/otto-train-and-test-data-for-local-validation/test_labels.parquet'):
    """Generate per-session recommendations and either submit or score them.

    Relies on the notebook-level Annoy `index` built in an earlier cell.

    Parameters
    ----------
    path : str
        Parquet file with at least `session`, `aid` and `type` columns. Rows are
        used in file order, so each session's events are assumed to be stored
        oldest first -- TODO confirm against the dataset.
    mode : str
        "test" writes `submission.csv` and returns the submission frame; any
        other value runs the local validation and returns the score.
    n_neighbors : int
        Maximum number of labels produced for sessions with fewer than 20 events.
    labels_path : str
        Parquet file with the validation ground truth (`session`, `type`,
        `ground_truth`); only read when `mode` is not "test".

    Returns
    -------
    pandas.DataFrame or float
        The submission (`session_type`, `labels`) in test mode, otherwise the
        weighted recall (clicks 0.1, carts 0.3, orders 0.6).
    """
    # Convert to pandas once and reuse the same groupby for both columns.
    events = pl.read_parquet(path).to_pandas()
    by_session = events.groupby('session')
    session_aids = by_session['aid'].apply(list)
    session_event_types = by_session['type'].apply(list)
    del events, by_session
    gc.collect()

    # Weights per event type code; presumably 0=clicks, 1=carts, 2=orders -- verify.
    type_weight_multipliers = {0: 1, 1: 6, 2: 3}

    labels = []
    for aids, types in zip(session_aids, session_event_types):
        if len(aids) >= 20:
            # With 20 or more events there is no need to generate candidates:
            # re-ranking the session's own aids is enough.
            labels.append(_rank_session_aids(aids, types, type_weight_multipliers))
        else:
            # Fewer than 20 events: pad the session's aids with embedding
            # neighbours of the most recent aid.
            labels.append(_extend_with_neighbors(aids, index, n_neighbors))

    predictions = pd.DataFrame(data={
        'session_type': session_aids.index,
        'labels': [' '.join(str(aid) for aid in session_labels) for session_labels in labels],
    })
    del labels, session_aids, session_event_types

    # The same label list is submitted for each of the three target types.
    session_keys = predictions['session_type'].astype('str')
    sub = pd.concat(
        [predictions.assign(session_type=session_keys + f'_{target}')
         for target in ('clicks', 'carts', 'orders')]
    ).reset_index(drop=True)
    del predictions, session_keys
    gc.collect()

    if mode == "test":
        sub.to_csv("submission.csv", index=False)
        return sub

    submission = pd.DataFrame({
        'session': sub['session_type'].apply(lambda key: int(key.split('_')[0])),
        'type': sub['session_type'].apply(lambda key: key.split('_')[1]),
        'labels': sub['labels'].apply(lambda text: [int(tok) for tok in text.split()]),
    })
    del sub

    test_labels = pd.read_parquet(labels_path)
    test_labels = test_labels.merge(submission, how='left', on=['session', 'type'])
    del submission
    gc.collect()

    # A left merge leaves NaN where a ground-truth row has no prediction; such
    # rows score zero hits instead of raising on set(NaN).
    test_labels['hits'] = [
        len(set(truth).intersection(predicted)) if isinstance(predicted, list) else 0
        for truth, predicted in zip(test_labels['ground_truth'], test_labels['labels'])
    ]
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, 20)

    by_type = test_labels.groupby('type')
    recall_per_type = by_type['hits'].sum() / by_type['gt_count'].sum()
    score = (recall_per_type * pd.Series({'clicks': 0.1, 'carts': 0.30, 'orders': 0.60})).sum()

    return score

In [None]:
# Local validation run: evaluate() returns the weighted recall score in this mode.
# n_neighbors=20 overrides the function default of 12.
path = "/kaggle/input/otto-train-and-test-data-for-local-validation/test.parquet"
validation_score = evaluate(path,mode="validation",n_neighbors=20)

In [None]:
# Test run: evaluate() writes submission.csv and returns the submission frame.
# Note that `path` is reassigned here, replacing the validation path set above.
path = "/kaggle/input/otto-full-optimized-memory-footprint/test.parquet"
test_submission = evaluate(path,mode="test",n_neighbors=20)

In [None]:
# Next step (TODO):
# Bayesian optimization