In [1]:
import gc

import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import StepLR
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import GAE
from torch_geometric.nn import SAGEConv
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


cpu


In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
## Prepare Input Data
### Node Features

In [None]:
# Read the pre-computed node2vec embedding matrix and the two per-aid feature tables.
node2vec_embeddings = np.load("./output/node_embeddings.npy")
aid_features = pl.read_parquet("./data/aid_features.parquet")
aid_features_agg = pl.read_parquet("./data/aid_features_agg.parquet")

# Inner-join both tables on "aid", then drop the key so only feature columns remain.
aid_features_all = (
    aid_features
    .join(aid_features_agg, on="aid", how="inner")
    .drop("aid")
    .to_numpy()
)

# Standardise the two sources independently, each with its own fitted scaler.
scaler_node2vec = StandardScaler().fit(node2vec_embeddings)
scaled_node2vec_embeddings = scaler_node2vec.transform(node2vec_embeddings)

scaler_features = StandardScaler().fit(aid_features_all)
scaled_aid_features_all = scaler_features.transform(aid_features_all)

# Place the embedding columns and the feature columns side by side, one row per node.
# NOTE(review): this assumes row i of the embeddings and row i of the joined
# features describe the same aid and that both have the same row count — confirm.
features_and_embeddings = np.concatenate(
    [scaled_node2vec_embeddings, scaled_aid_features_all],
    axis=1,
)

In [None]:
# Show the combined node matrix: scaled node2vec columns followed by scaled aid-feature columns.
features_and_embeddings

In [None]:
# The raw inputs are no longer needed once the scaled matrix exists; drop the
# references and ask the garbage collector to reclaim the memory right away.
del aid_features, aid_features_agg, aid_features_all, node2vec_embeddings
gc.collect()

### edge_index

In [None]:
# Load the training events and count how often each (session, aid) pair occurs;
# that count becomes the weight of the session -> aid edge.
data = pd.read_parquet('./data/train.parquet')
edge_weights = data.groupby(['session', 'aid']).size().reset_index(name='weight')

# Extract the edge endpoints and weights as NumPy arrays. Going through nested
# Python lists (`.values.tolist()`) would create one Python object per edge,
# which is slow and memory-hungry for a large edge table.
edge_list = edge_weights[['session', 'aid']].to_numpy()
edge_weights_list = edge_weights['weight'].to_numpy()

# Convert to tensors: edge_index is (2, num_edges), edge_attr is (num_edges, 1).
# NOTE(review): session ids and aids are written into the same edge_index, so
# they are used as indices into one shared node set, while the node features
# above are built per aid only — confirm the ids do not collide / are offset
# as intended before building the graph object.
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
edge_attr = torch.tensor(edge_weights_list, dtype=torch.float).view(-1, 1)

## Neighbor Loader

In [None]:
# Mini-batch neighbor sampler over the graph:
#   num_neighbors - how many neighbors to sample at each layer (one entry per layer)
#   batch_size    - how many nodes are processed per batch
# NOTE(review): `graph_data` is not defined in the cells visible above — confirm
# where it is built. Also, GraphSAGE.forward / GraphSAGE.inference below unpack
# (edge_index, edge_attr, size) triples and (batch_size, n_id, adj) tuples;
# verify that this loader's batches match what they expect.
gSAGE_loader = NeighborLoader(
    graph_data,
    num_neighbors=[10, 10],
    batch_size=512,
)

## GraphSAGE

In [None]:
# GraphSAGE convolution modified so that per-edge weights are actually used.

class WeightedSAGEConv(SAGEConv):
    """SAGEConv variant that scales every neighbor message by a per-edge weight.

    ``SAGEConv.forward`` has the signature ``(x, edge_index, size=None)``, so
    handing ``edge_weight`` to it positionally would be interpreted as ``size``
    and the weights would never reach the aggregation. This class therefore
    runs the propagation itself and applies the weights in ``message``.

    The weighted messages are reduced with the layer's configured aggregator;
    no extra normalisation by the sum of weights is performed.

    NOTE(review): the weights are applied in ``message`` only; if a fused
    sparse-matrix path (``message_and_aggregate``) is taken for a sparse
    ``edge_index``, they are presumably ignored — confirm for your PyG version.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__(in_channels, out_channels, **kwargs)

    def forward(self, x, edge_index, edge_weight=None, size=None):
        """Apply the convolution.

        Args:
            x: Either one node-feature tensor (same nodes act as sources and
                targets) or a ``(x_source, x_target)`` pair for bipartite graphs.
            edge_index: Edge connectivity.
            edge_weight: Optional tensor holding one scalar per edge, shaped
                ``(num_edges,)`` or ``(num_edges, 1)``. ``None`` gives the
                plain, unweighted SAGEConv behaviour.
            size: Optional ``(num_source, num_target)`` pair forwarded to
                ``propagate``.

        Returns:
            The updated target-node features.
        """
        # Indexing a bare tensor with [0]/[1] would pick its first two rows,
        # so only unpack when a pair was really given.
        if isinstance(x, torch.Tensor):
            x = (x, x)
        else:
            x = (x[0], x[1])

        if edge_weight is not None:
            # Flatten (E, 1) -> (E,) and match the feature dtype for the product.
            edge_weight = edge_weight.reshape(-1).to(x[0].dtype)

        # Mirror SAGEConv's optional source projection (absent in older releases).
        if getattr(self, 'project', False) and hasattr(self, 'lin'):
            x = (self.lin(x[0]).relu(), x[1])

        out = self.propagate(edge_index, x=x, edge_weight=edge_weight, size=size)
        out = self.lin_l(out)

        x_root = x[1]
        if self.root_weight and x_root is not None:
            out = out + self.lin_r(x_root)

        if self.normalize:
            out = F.normalize(out, p=2.0, dim=-1)

        return out

    def message(self, x_j, edge_weight=None):
        """Return the source features, scaled by ``edge_weight`` when given."""
        if edge_weight is None:
            return x_j
        return edge_weight.view(-1, 1) * x_j

    
class GraphSAGE(torch.nn.Module):
    """Stack of ``WeightedSAGEConv`` layers applied to layer-wise sampled graphs.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Width of the last layer's output.
        num_layers: Number of convolution layers; must be at least 1.
        dropout: Dropout probability used after every non-final layer while
            training.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout=0.5):
        super().__init__()

        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")

        self.num_layers = num_layers
        self.dropout = dropout

        # in -> hidden -> ... -> hidden -> out, with exactly `num_layers` convs so
        # that the layer count always agrees with the loops in forward/inference.
        # A single-layer model maps in_channels straight to out_channels.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [WeightedSAGEConv(w_in, w_out) for w_in, w_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x, adjs):
        """Run all layers over one sampled mini-batch.

        Args:
            x: Features of the sampled nodes. For each layer the first
                ``size[1]`` rows are used as that layer's target nodes.
            adjs: Iterable of ``(edge_index, edge_attr, size)`` triples, one per
                layer, consumed in order.
                NOTE(review): if these triples come from PyG's NeighborSampler,
                the middle element is an edge-id tensor rather than edge
                weights — confirm what the caller passes.

        Returns:
            Log-softmax over the last dimension of the final layer's output.
        """
        for i, (edge_index, edge_attr, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target node features
            x = self.convs[i]((x, x_target), edge_index, edge_attr)

            if i != self.num_layers - 1:
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)

        return x.log_softmax(dim=-1)

    @torch.no_grad()
    def inference(self, x_all, subgraph_loader, device):
        """Compute representations for every node, one layer at a time.

        Each layer is evaluated for all batches of ``subgraph_loader`` before
        the next layer starts; the per-batch outputs are moved to the CPU and
        concatenated to form the next layer's input. Unlike ``forward``, no
        dropout and no final log-softmax are applied. Gradients are disabled so
        no autograd graph is kept while sweeping over the whole node set.

        Args:
            x_all: Feature tensor indexed by the ``n_id`` values the loader yields.
            subgraph_loader: Iterable yielding ``(batch_size, n_id, adj)`` where
                ``adj.to(device)`` unpacks to ``(edge_index, edge_attr, size)``.
            device: Device on which each batch is computed.

        Returns:
            The concatenated output of the last layer, on the CPU.
        """
        # The context manager closes the progress bar even if a batch raises.
        with tqdm(total=x_all.size(0) * self.num_layers) as pbar:
            pbar.set_description('Evaluating')

            for i in range(self.num_layers):
                xs = []
                for batch_size, n_id, adj in subgraph_loader:
                    edge_index, edge_attr, size = adj.to(device)
                    x = x_all[n_id].to(device)
                    x_target = x[:size[1]]
                    x = self.convs[i]((x, x_target), edge_index, edge_attr)
                    if i != self.num_layers - 1:
                        x = F.relu(x)
                    xs.append(x.cpu())

                    pbar.update(batch_size)

                x_all = torch.cat(xs, dim=0)

        return x_all
    
    
# Model, optimizer and learning-rate schedule.
out_channels = 32
# `data` is bound to the pandas training frame earlier in the notebook, so
# `data.x` would raise AttributeError. Read the input width from the graph
# object that the NeighborLoader consumes instead.
# NOTE(review): assumes `graph_data.x` is the 2-D node-feature matrix — confirm.
num_features = graph_data.x.shape[1]
hidden_channels = 64
num_layers = 2
# GraphSAGE acts as the encoder of a graph auto-encoder.
model = GAE(GraphSAGE(num_features, hidden_channels, out_channels, num_layers)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
# Multiply the learning rate by 0.1 every 2 scheduler steps.
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)

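## TODO: Skim the original Kaggle code through to the end first to understand what it actually does with the GAE
After that I can decide the direction of my own training code (follow the Kaggle version, modify it freely, etc.)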

In [None]:
# TODO: Improve this with model hyperparameter-tuning code
# TODO: Re-check whether the model can be trained with y labels (I think it should be trained that way)