In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, GAE
from torch.optim.lr_scheduler import StepLR
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

In [None]:
# Select the compute device: CUDA when a GPU is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Load the raw training table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [None]:
def preprocess_data(data):
    """Integer-encode the IDs and min-max scale the numeric side features.

    Args:
        data: DataFrame containing at least the columns 'User-ID', 'Book-ID',
            'Age' and 'Year-Of-Publication'. The input frame is not modified.

    Returns:
        A tuple ``(data_processed, user_id_map, book_id_map)``:

        * ``data_processed`` - copy of ``data`` with the same rows in the same
          order, 'User-ID' / 'Book-ID' replaced by integer codes, 'Age' and
          'Year-Of-Publication' scaled to [0, 1] and placed as the last two
          columns, and every remaining NaN replaced by 0.
        * ``user_id_map`` / ``book_id_map`` - dicts mapping each original ID to
          its integer code, numbered in order of first appearance.
    """
    data = data.copy()

    # Lookup tables that turn the categorical IDs into contiguous integers.
    user_id_map = {user_id: idx for idx, user_id in enumerate(data['User-ID'].unique())}
    book_id_map = {book_id: idx for idx, book_id in enumerate(data['Book-ID'].unique())}

    data['User-ID'] = data['User-ID'].map(user_id_map)
    data['Book-ID'] = data['Book-ID'].map(book_id_map)

    # Scale directly on the rating rows. Joining per-user / per-book lookup
    # tables back onto the frame cannot work once the IDs are encoded: the
    # lookup tables would still carry the raw IDs, so the join keys never match
    # (and pandas refuses to merge int64 keys with object keys at all).
    for column in ('Age', 'Year-Of-Publication'):
        values = data.pop(column)  # pop + re-assign keeps the scaled column last
        low = values.min()
        span = values.max() - low
        if pd.isna(span) or span == 0:
            # Constant or entirely missing column: nothing to scale, use 0.
            data[column] = 0.0
        else:
            data[column] = (values - low) / span

    # Missing values (e.g. unknown ages) become 0.
    data_processed = data.fillna(0)

    return data_processed, user_id_map, book_id_map

In [None]:
# Run the preprocessing step; keep the processed frame and the two ID -> index maps it returns.
(data_processed, user_id_map, book_id_map) = preprocess_data(data)

In [None]:
# Users and books are both encoded starting from 0, so using the two ID columns
# directly as node indices would make user k and book k the same node. Book
# indices are therefore shifted by the number of users:
#   users -> [0, num_users), books -> [num_users, num_users + num_books).
num_users = len(user_id_map)
num_books = len(book_id_map)

user_idx = data_processed['User-ID'].to_numpy(dtype=np.int64)
book_idx = data_processed['Book-ID'].to_numpy(dtype=np.int64) + num_users

# One feature row per NODE (not per rating row): column 0 carries Age for user
# nodes, column 1 carries Year-Of-Publication for book nodes, zeros elsewhere.
# When an ID occurs on several rows, the value from its last row is kept.
node_features = np.zeros((num_users + num_books, 2), dtype=np.float32)
node_features[user_idx, 0] = data_processed['Age'].to_numpy(dtype=np.float32)
node_features[book_idx, 1] = data_processed['Year-Of-Publication'].to_numpy(dtype=np.float32)

# Edges are directed user -> book, one per rating row, in row order.
edge_index = torch.tensor(np.stack([user_idx, book_idx]), dtype=torch.long).contiguous()

#################################### TODO: replace with a more suitable edge attribute later
edge_attr = torch.tensor(data_processed['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)

# Regression target: one rating per edge, aligned with the columns of edge_index.
target = torch.tensor(data_processed['Book-Rating'].values, dtype=torch.float)

In [None]:
# Sanity check: show the edge list, its shape, and the number of rating rows.
print(edge_index, edge_index.shape, sep="\n")
print(f"Number of rows in data_processed: {len(data_processed)}")

# Make data

In [None]:
# Assemble the graph object from node features, connectivity and edge attributes.
graph_data = Data(
    x=torch.tensor(node_features),
    edge_index=edge_index,
    edge_attr=edge_attr,
)
# Tag every node with its own index (0 .. num_nodes - 1).
graph_data.n_id = torch.arange(graph_data.num_nodes)

In [None]:
# Inspect the node feature matrix stored on the graph.
graph_data.x

In [None]:
# Inspect the edge list stored on the graph.
graph_data.edge_index

In [None]:
# Inspect the edge attributes (built from the 'Book-Rating' column).
graph_data.edge_attr

In [None]:
# Inspect the per-node index tensor (torch.arange over num_nodes).
graph_data.n_id

In [None]:
# Build the graph again, this time with the rating targets attached as `y`.
graph_data = Data(
    x=torch.tensor(node_features),
    edge_index=edge_index,
    edge_attr=edge_attr,
    y=target,
)
# Tag every node with its own index (0 .. num_nodes - 1).
graph_data.n_id = torch.arange(graph_data.num_nodes)

class CustomGraphDataset(Dataset):
    """Dataset that serves graphs straight from a caller-supplied Python list."""

    def __init__(self, data_list, transform=None, pre_transform=None):
        # No root directory is handed to the base class (root=None).
        super().__init__(root=None, transform=transform, pre_transform=pre_transform)
        self.data_list = data_list

    def len(self):
        """Return how many graphs the dataset holds."""
        return len(self.data_list)

    def get(self, idx):
        """Return the graph stored at position ``idx``."""
        return self.data_list[idx]
    
# Create the dataset; it contains a single graph, graph_data.
data_list = [graph_data]
dataset = CustomGraphDataset(data_list=data_list)

In [None]:
### Neighbor Loader
# gSAGE_loader = NeighborLoader(
#     graph_data,
#     num_neighbors=[10,10],
#     batch_size=512)

# GraphSAGE

In [None]:
# Model dimensions.
# The first SAGEConv layer must accept exactly as many features as each node
# carries. A hard-coded 32 does not match the two-column feature matrix
# (Age, Year-Of-Publication) and fails with a shape error on the first
# forward pass, so the width is read from the graph itself.
in_channels = graph_data.num_node_features
hidden_channels = 64
out_channels = 1
####################################
# TODO: tune later with Bayesian optimization
####################################
# number of conv layers
# hidden layer width (hidden_channels)
# dropout rate
# learning rate
# weight decay
# batch size
# optimizer

In [None]:
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network.

    Args:
        in_channels: number of input features per node.
        hidden_channels: width of the hidden representation after the first layer.
        out_channels: number of output values per node.
        dropout: dropout probability applied after the first layer while the
            module is in training mode. Defaults to 0.5, the value that used to
            be hard-coded in ``forward``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return one ``out_channels``-sized output row per node."""
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        # No activation on the output layer: the result feeds a regression
        # loss, and a trailing ReLU would clamp predictions at zero and stop
        # gradients for every node whose pre-activation is non-positive.
        return self.conv2(x, edge_index)

In [None]:
# Instantiate the network on the selected device, then create the optimizer,
# the batch loader and the loss.
model = GraphSAGE(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
loss_function = nn.MSELoss()

In [None]:
def train(loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``.

    ``data.y`` holds one rating per edge, while the network returns one output
    row per node. Passing both straight to the loss makes it broadcast an
    (N, 1) prediction against an (E,) target into an (N, E) matrix, which is
    not the intended objective. Each edge is therefore scored with the inner
    product of the outputs of its two endpoint nodes, giving exactly one
    prediction per rating.

    Args:
        loader: iterable of batches exposing ``x``, ``edge_index``, ``y`` and
            ``to(device)``.

    Returns:
        Tuple ``(mean_loss, model)``; ``mean_loss`` is 0.0 for an empty loader.

    Raises:
        ValueError: if the number of edge predictions differs from the number
            of targets in a batch.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for data in tqdm(loader):
        data = data.to(device)  # moves x, edge_index and y together
        optimizer.zero_grad()
        z = model(data.x.float(), data.edge_index)
        if z.dim() == 1:
            z = z.unsqueeze(-1)

        # One prediction per edge: <z[source], z[target]>.
        src, dst = data.edge_index
        edge_pred = (z[src] * z[dst]).sum(dim=-1)

        book_ratings = data.y.view(-1).float()
        if edge_pred.shape != book_ratings.shape:
            raise ValueError(
                f"got {edge_pred.numel()} edge predictions for {book_ratings.numel()} targets"
            )

        loss = loss_function(edge_pred, book_ratings)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # max(..., 1) avoids a ZeroDivisionError when the loader yields nothing.
    return total_loss / max(num_batches, 1), model

In [None]:
# Train for a fixed number of epochs, reporting the mean loss after each one.
num_epoch = 100

for epoch in range(num_epoch):
    loss, model = train(loader)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')

# test