In [None]:
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec
from tqdm import tqdm
import gc
import polars as pl
import os
import sys
import pickle
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

In [None]:
# Build the training graph for node2vec: every pair of consecutive items
# (aid values) inside a session becomes one directed edge prev_aid -> aid.
train_df = pl.read_parquet('/home/conceptelling/data/train.parquet', columns=['session', 'aid'], low_memory=True)

# The shift amount is passed positionally because polars renamed this keyword
# from `periods` to `n`; the positional form is accepted under both spellings.
# NOTE(review): this pairs each row with the row stored directly before it in
# the same session, so it presumably relies on the parquet rows already being
# in chronological order within a session -- confirm.
df = (
    train_df
    .with_columns(pl.col("aid").shift(1).over("session").alias("prev_aid"))
    # The first row of every session has no predecessor, so drop it.
    .filter(pl.col('prev_aid').is_not_null())
)

# Edge index of shape (2, num_edges): row 0 holds prev_aid, row 1 holds aid.
edges_torch_T = torch.tensor(np.transpose(df[['prev_aid', 'aid']].to_numpy()), dtype=torch.long)
# Optional: persist the edge index for reuse, e.g.
# torch.save(edges_torch_T, "all_graph_train.pt")

In [None]:
# Wrap the edge index in a torch_geometric `Data` container; the training
# functions in this notebook read it back through `data.edge_index`.
data = Data(edge_index=edges_torch_T)

In [None]:
def node2vec_train(embedding_dim, walk_length, context_size, walks_per_node, p, q, lr,
                   num_epochs=2, batch_size=1024, num_workers=6):
    """Train a Node2Vec model briefly and report its mean per-batch loss.

    This is the inner training routine of the hyperparameter search. It reads
    the module-level ``data`` (for ``data.edge_index``) and ``device``.

    Args:
        embedding_dim: Forwarded to ``Node2Vec``.
        walk_length: Forwarded to ``Node2Vec``.
        context_size: Forwarded to ``Node2Vec``.
        walks_per_node: Forwarded to ``Node2Vec``.
        p: Forwarded to ``Node2Vec``.
        q: Forwarded to ``Node2Vec``.
        lr: Learning rate given to ``torch.optim.SparseAdam``.
        num_epochs: Number of passes over the loader (default 2).
        batch_size: Batch size given to ``model.loader`` (default 1024).
        num_workers: Worker count given to ``model.loader`` (default 6).

    Returns:
        A tuple ``(model, mean_loss)``. ``mean_loss`` is the sum of every batch
        loss divided by the number of batches actually processed across all
        epochs, so it does not grow with ``num_epochs``.

    Raises:
        ZeroDivisionError: If no batch was processed (empty loader or
            ``num_epochs`` of 0).
    """
    model = Node2Vec(data.edge_index, embedding_dim=embedding_dim,
                     walk_length=walk_length,
                     context_size=context_size,
                     walks_per_node=walks_per_node,
                     num_negative_samples=2,
                     p=p, q=q,
                     sparse=True).to(device)

    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # The model is built with sparse=True, hence the sparse optimizer.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=lr)

    total_loss = 0.0
    num_batches = 0
    for epoch in tqdm(range(num_epochs), desc="Epoch"):
        model.train()
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

    # Normalise by every batch seen, not by the batches of a single epoch;
    # otherwise the reported value is inflated by a factor of num_epochs.
    return model, total_loss / num_batches

# Search space for the Bayesian optimisation. `result.x` is later indexed in
# exactly this order, so keep the dimensions in this sequence.
space = [
    Integer(low=32, high=128, name='embedding_dim'),
    Integer(low=10, high=20, name='walk_length'),
    Integer(low=5, high=10, name='context_size'),
    Integer(low=10, high=20, name='walks_per_node'),
    Real(low=0.5, high=2, name='p'),
    Real(low=0.5, high=2, name='q'),
    Real(low=1e-4, high=1e-3, prior="log-uniform", name='lr'),
]

# Progress display for the search.
from tqdm.auto import tqdm as tqdm_auto
tqdm_auto_pbar = tqdm_auto(desc="Bayesian optimization", total=15)

# Receives (model, loss) pairs from `objective`.
models_and_losses = []

# Guard against evaluating the same point twice. (In practice about 15 calls
# seemed to be enough to find a good configuration.)
evaluated_points = set()
# Loss returned for each evaluated point, so that a repeated proposal gets the
# same value back instead of a fake penalty.
evaluated_losses = {}

@use_named_args(space)
def objective(embedding_dim, walk_length, context_size, walks_per_node, p, q, lr):
    """Objective for the hyperparameter search: the loss of a short training run.

    Side effects: advances ``tqdm_auto_pbar`` by one on every call, records the
    point in ``evaluated_points`` / ``evaluated_losses``, and keeps the best
    ``(model, loss)`` pair seen so far as the only entry of
    ``models_and_losses``.

    Args:
        embedding_dim, walk_length, context_size, walks_per_node, p, q, lr:
            Hyperparameters, forwarded unchanged to ``node2vec_train``.

    Returns:
        The loss reported by ``node2vec_train``; the memoised loss when the
        point was evaluated before; or ``1e9`` when
        ``walk_length < context_size``.
    """
    # Tick on every call so the bar stays in step with the optimizer's call count.
    tqdm_auto_pbar.update(1)

    point = (embedding_dim, walk_length, context_size, walks_per_node, p, q, lr)
    if point in evaluated_losses:
        # Returning a huge penalty here would tell the surrogate model that an
        # already-measured point is terrible; hand back the real value instead.
        return evaluated_losses[point]
    evaluated_points.add(point)

    if walk_length < context_size:
        # Invalid combination: penalise it without training.
        loss = 1e9
    else:
        model, loss = node2vec_train(embedding_dim, walk_length, context_size, walks_per_node, p, q, lr)
        # Keep only the best model. Holding every trial's model would keep all
        # of their embedding tables alive (on the device) for the whole search.
        # Strict `<` keeps the earliest model on ties, matching what `min` picks.
        if not models_and_losses or loss < models_and_losses[0][1]:
            models_and_losses[:] = [(model, loss)]
        del model

    evaluated_losses[point] = loss
    return loss


# Run the Bayesian search. The progress bar is closed in `finally` so that a
# failure inside the optimisation does not leave it open.
try:
    result = gp_minimize(objective, space, n_calls=15, random_state=0)
finally:
    tqdm_auto_pbar.close()

In [None]:
# Report the optimum found by the search.
print("Best hyperparameters found: ", result.x)
print("Minimum loss: ", result.fun)

# `result.x` is positional; attach the keyword name of each dimension so the
# values can be passed on as keyword arguments.
best_params = {
    name: result.x[position]
    for position, name in enumerate(
        ('embedding_dim', 'walk_length', 'context_size', 'walks_per_node', 'p', 'q', 'lr')
    )
}

# Pick the recorded (model, loss) pair with the smallest loss.
best_model, best_loss = min(models_and_losses, key=lambda entry: entry[1])

In [None]:
def node2vec(embedding_dim, walk_length, context_size, walks_per_node, p, q, lr,
             num_epochs=5, batch_size=1024, num_workers=6):
    """Train a Node2Vec model and return it.

    Reads the module-level ``data`` (for ``data.edge_index``) and ``device``.
    After each epoch, prints the sum of that epoch's batch losses.

    Args:
        embedding_dim: Forwarded to ``Node2Vec``.
        walk_length: Forwarded to ``Node2Vec``.
        context_size: Forwarded to ``Node2Vec``.
        walks_per_node: Forwarded to ``Node2Vec``.
        p: Forwarded to ``Node2Vec``.
        q: Forwarded to ``Node2Vec``.
        lr: Learning rate given to ``torch.optim.SparseAdam``.
        num_epochs: Number of passes over the loader (default 5).
        batch_size: Batch size given to ``model.loader`` (default 1024).
        num_workers: Worker count given to ``model.loader`` (default 6).

    Returns:
        The trained ``Node2Vec`` model, located on ``device``.
    """
    model = Node2Vec(data.edge_index, embedding_dim=embedding_dim,
                     walk_length=walk_length,
                     context_size=context_size,
                     walks_per_node=walks_per_node,
                     num_negative_samples=2,
                     p=p, q=q,
                     sparse=True).to(device)

    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # The model is built with sparse=True, hence the sparse optimizer.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=lr)

    for epoch in tqdm(range(num_epochs), desc="Epoch"):
        model.train()
        total_loss = 0  # reset per epoch: the printed value covers one epoch only
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}: Loss {total_loss}")

    return model

In [None]:
# Train the final model with the best hyperparameters found by the search.
trained_model = node2vec(**best_params)

In [None]:
# Pull the learned embedding table out of the trained model as a NumPy array
# and write it to disk.
embedding_save_path = "node_embeddings.npy"
node_embeddings = trained_model.embedding.weight.detach().cpu().numpy()
np.save(embedding_save_path, node_embeddings)