In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import pandas as pd
import numpy as np
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, GAE
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Load the training table from the CSV file under ./data.
train_csv_path = "./data/train.csv"
data = pd.read_csv(train_csv_path)

In [None]:
# Data preprocessing
# Turns the raw interaction table into graph tensors:
#   node_features : (num_users + num_books, 1) float tensor, users first, then books
#   edge_index    : (2, num_edges) long tensor, user node -> book node
#   target        : (num_edges,) float tensor holding the rating of each edge


def _min_max_scale(values):
    """Scale a numeric Series to [0, 1]; a constant column maps to 0 instead of dividing by zero."""
    low = values.min()
    span = values.max() - low
    return (values - low) / span if span > 0 else values - low


# Rows without a user or book identifier cannot form an edge, so drop them first.
# `data` itself is not modified; all encoded values live in `data_processed`.
interactions = data.dropna(subset=['User-ID', 'Book-ID'])

# Encode the raw identifiers as contiguous integer codes (0..n-1, sorted by raw ID).
# The SAME codes are used for the node tables and for the edge endpoints, so row i
# of `users` / `books` is exactly the node that code i refers to.
user_codes, user_ids = pd.factorize(interactions['User-ID'], sort=True)
book_codes, book_ids = pd.factorize(interactions['Book-ID'], sort=True)
num_users, num_books = len(user_ids), len(book_ids)

# Interaction table with encoded IDs, in the same row order as the retained rows of `data`.
data_processed = interactions.assign(
    **{'User-ID': user_codes, 'Book-ID': book_codes}
).reset_index(drop=True)

# One row per user / per book, ordered by code, so the positional index equals the code.
users = (
    data_processed[['User-ID', 'Age']]
    .drop_duplicates(subset='User-ID')
    .sort_values('User-ID')
    .reset_index(drop=True)
)
books = (
    data_processed[['Book-ID', 'Year-Of-Publication']]
    .drop_duplicates(subset='Book-ID')
    .sort_values('Book-ID')
    .reset_index(drop=True)
)

# Normalise age and publication year to [0, 1].
# NOTE(review): missing Age / Year-Of-Publication values stay NaN and will propagate
# into the model; confirm whether they should be imputed before training.
users['Age'] = _min_max_scale(users['Age'])
books['Year-Of-Publication'] = _min_max_scale(books['Year-Of-Publication'])

# Node features: user rows first, then book rows (one scalar feature per node).
node_features = torch.tensor(
    pd.concat([users['Age'], books['Year-Of-Publication']]).values,
    dtype=torch.float,
).unsqueeze(1)

# Edge index: users and books share one node index space, so book codes are shifted
# by `num_users` to point at the book rows of `node_features`.
edge_sources = data_processed['User-ID'].to_numpy()
edge_targets = data_processed['Book-ID'].to_numpy() + num_users
edge_index = torch.tensor(np.stack([edge_sources, edge_targets]), dtype=torch.long)

# Target values (book ratings), one per edge and in the same order as `edge_index`.
target = torch.tensor(data_processed['Book-Rating'].values, dtype=torch.float)

In [None]:
# Per-edge ratings. Read from `data_processed`, the frame `edge_index` is built from,
# so that entry k always belongs to edge k even if preprocessing dropped or reordered rows.
edge_attr = torch.tensor(data_processed['Book-Rating'].values, dtype=torch.float32)

In [None]:
# Bundle node features, connectivity and the rating targets into a single graph object.
graph_data = Data(
    x=node_features,
    edge_index=edge_index,
    y=target,
)

In [None]:
# Mini-batch loader over `graph_data`: two sampling hops with up to 10 neighbours each,
# 512 per batch.
neighbors_per_hop = [10, 10]
gSAGE_loader = NeighborLoader(graph_data, num_neighbors=neighbors_per_hop, batch_size=512)

In [None]:
class WeightedSAGEConv(SAGEConv):
    """SAGEConv variant whose neighbour messages can be scaled by a per-edge weight.

    The stock ``SAGEConv.forward`` has no ``edge_weight`` argument (its third
    parameter is ``size``), so the weight is threaded through ``propagate`` into
    ``message`` here, the same way PyG's weighted layers do it.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__(in_channels, out_channels, **kwargs)

    def forward(self, x, edge_index, edge_weight=None, size=None):
        """Apply the convolution.

        Args:
            x: Node features, either one tensor or a ``(source, target)`` pair
                for bipartite message passing.
            edge_index: Connectivity in COO format, shape ``(2, num_edges)``.
            edge_weight: Optional tensor with one scalar per edge; ``None``
                reproduces the unweighted SAGEConv behaviour.
            size: Optional ``(num_source, num_target)`` pair for bipartite graphs.

        Returns:
            Updated features for the target nodes.
        """
        if isinstance(x, torch.Tensor):
            x = (x, x)

        # Optional source projection; only present when the layer was built with project=True.
        if getattr(self, 'project', False) and hasattr(self, 'lin'):
            x = (self.lin(x[0]).relu(), x[1])

        # Neighbour aggregation; `edge_weight` is routed to `message` by name.
        out = self.propagate(edge_index, x=x, edge_weight=edge_weight, size=size)
        out = self.lin_l(out)

        # Add the transformed root (target) features unless root_weight was disabled.
        x_root = x[1]
        if self.root_weight and x_root is not None:
            out = out + self.lin_r(x_root)

        if self.normalize:
            out = F.normalize(out, p=2.0, dim=-1)
        return out

    def message(self, x_j, edge_weight):
        """Return the source features of every edge, scaled by its weight when one is given."""
        if edge_weight is None:
            return x_j
        return edge_weight.view(-1, 1) * x_j


class GraphSAGE(torch.nn.Module):
    """Stack of ``WeightedSAGEConv`` layers with ReLU + dropout between layers.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Width of the final embedding.
        num_layers: Total number of convolution layers (>= 1).

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f'num_layers must be >= 1, got {num_layers}')

        self.num_layers = num_layers

        # Layer widths: in -> hidden -> ... -> hidden -> out. A single layer maps in -> out
        # directly, so the number of modules always equals `num_layers`.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [WeightedSAGEConv(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x, adjs, edge_weight=None):
        """Compute node embeddings.

        Args:
            x: Node feature matrix.
            adjs: Either
                * a single ``edge_index`` tensor, in which case every layer runs on that
                  whole (sub)graph (this is how an encoder is called with ``x, edge_index``), or
                * a sequence of ``(edge_index, edge_attr, size)`` triples, one bipartite hop
                  per layer. ``edge_attr`` is passed to the layer as its edge weight and
                  ``size[1]`` is the number of target nodes of the hop.
                  NOTE(review): samplers that put edge *ids* in the second slot need those
                  ids converted to weights (or set to ``None``) by the caller.
            edge_weight: Optional per-edge weights; only used when ``adjs`` is a tensor.

        Returns:
            Embeddings of the nodes (tensor input) or of the last hop's target nodes.
        """
        last_layer = self.num_layers - 1

        if isinstance(adjs, torch.Tensor):
            # Full (sub)graph mode: same connectivity for every layer.
            for i, conv in enumerate(self.convs):
                x = conv(x, adjs, edge_weight=edge_weight)
                if i != last_layer:
                    x = F.dropout(F.relu(x), p=0.5, training=self.training)
            return x

        # Layer-wise bipartite mode.
        for i, (edge_index, edge_attr, size) in enumerate(adjs):
            x_target = x[:size[1]]  # target nodes come first in the hop's node ordering
            x = self.convs[i]((x, x_target), edge_index, edge_weight=edge_attr)
            if i != last_layer:
                x = F.dropout(F.relu(x), p=0.5, training=self.training)
        return x

In [None]:
# Model hyper-parameters.
num_features = graph_data.x.shape[1]  # width of the node feature matrix
hidden_channels = 32
out_channels = 16
num_layers = 4

# Graph auto-encoder wrapping the GraphSAGE encoder, moved to the selected device.
encoder = GraphSAGE(num_features, hidden_channels, out_channels, num_layers)
model = GAE(encoder).to(device)

# Adam optimiser plus a StepLR schedule (LR multiplied by 0.1 every 2 scheduler steps).
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5, lr=0.005)
scheduler = StepLR(optimizer, gamma=0.1, step_size=2)

In [None]:
def train(loader):
    """Run one training epoch over ``loader`` and return the mean reconstruction loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``.

    Args:
        loader: Iterable of sampled subgraphs exposing ``x`` and ``edge_index``;
            must support ``len()``.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches
        (``0.0`` when the loader yields no batches).
    """
    model.train()  # make sure dropout is active even if the model was put in eval mode earlier
    total_loss = 0.0
    for subgraph in tqdm(loader):
        # Move each tensor to the device once and reuse it for encoding and the loss.
        x = subgraph.x.float().to(device)
        edge_index = subgraph.edge_index.to(device)

        optimizer.zero_grad()
        z = model(x, edge_index)
        loss = model.recon_loss(z, pos_edge_index=edge_index)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError for an empty loader.
    return total_loss / max(len(loader), 1)

In [None]:
n_epochs = 30

# train() returns a single float (the mean batch loss), so it is bound to one name;
# unpacking it into two names raises TypeError.
# NOTE(review): `scheduler` is created in the setup cell but never stepped. Decide whether
# to call scheduler.step() here once per epoch (with step_size=2, gamma=0.1 the learning
# rate would shrink 10x every two epochs) or to remove the scheduler.
for epoch in range(n_epochs):
    loss = train(gSAGE_loader)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')