In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, GAE
from torch.optim.lr_scheduler import StepLR
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Raw rating table. Later cells read the columns User-ID, Age, Book-ID,
# Year-Of-Publication and Book-Rating from it.
data = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [None]:
################## Data preprocessing
################## Columns left unprocessed: Location, Book-Title, Book-Author, Publisher

# One row per user / per book. De-duplicating on the ID alone guarantees that
# row i of each table is the entity that receives index i in the ID maps below,
# even if an ID ever shows up with more than one attribute value.
users = data[['User-ID', 'Age']].drop_duplicates(subset='User-ID').reset_index(drop=True)
books = data[['Book-ID', 'Year-Of-Publication']].drop_duplicates(subset='Book-ID').reset_index(drop=True)

# Contiguous 0-based indices, handed out in order of first appearance.
user_id_map = {user_id: idx for idx, user_id in enumerate(users['User-ID'].unique())}
book_id_map = {book_id: idx for idx, book_id in enumerate(books['Book-ID'].unique())}
users['User-ID'] = users['User-ID'].map(user_id_map)
books['Book-ID'] = books['Book-ID'].map(book_id_map)

# Min-max scaling bounds, taken once so every table is scaled identically.
age_min, age_max = data['Age'].min(), data['Age'].max()
year_min, year_max = data['Year-Of-Publication'].min(), data['Year-Of-Publication'].max()

# Scale the per-entity tables; these are what the node features are built from.
users['Age'] = (users['Age'] - age_min) / (age_max - age_min)
books['Year-Of-Publication'] = (books['Year-Of-Publication'] - year_min) / (year_max - year_min)

# Scale the per-rating table from its OWN rows. `users`/`books` have one row per
# entity while `data` has one row per rating, and pandas aligns a Series to the
# target frame by row label on assignment -- copying the scaled `users`/`books`
# column across would give rating row i the value of entity i (unrelated to the
# user/book of that rating) and NaN for every row past the end of the entity table.
data['Age'] = (data['Age'] - age_min) / (age_max - age_min)
data['Year-Of-Publication'] = (data['Year-Of-Publication'] - year_min) / (year_max - year_min)

# Replace the raw IDs in the rating table with the contiguous indices.
data['User-ID'] = data['User-ID'].map(user_id_map)
data['Book-ID'] = data['Book-ID'].map(book_id_map)

In [None]:
# Work on a copy so the preprocessed frame stays available; missing values
# in the rating table are replaced with 0.
data_processed = data.copy()
data_processed = data_processed.fillna(0)

# Node table: every user first, then every book, one scalar feature per node.
# `users`/`books` are separate frames that the fillna above does not touch, so
# their missing values are filled here -- a single NaN feature would otherwise
# propagate through the network and turn the loss into NaN.
num_users = len(users)
node_values = pd.concat([users['Age'], books['Year-Of-Publication']]).fillna(0).values
node_features = torch.tensor(node_values, dtype=torch.float).unsqueeze(1)

# Edge list of shape (2, num_edges): row 0 holds the user node, row 1 the book node.
# Book indices start at 0 just like user indices, but in `node_features` the books
# sit AFTER the users, so book j is node `num_users + j`. Without this offset every
# edge would point at a user row instead of the intended book row.
user_nodes = torch.tensor(data_processed['User-ID'].values, dtype=torch.long)
book_nodes = torch.tensor(data_processed['Book-ID'].values, dtype=torch.long) + num_users
edge_index = torch.stack([user_nodes, book_nodes], dim=0)

# One rating per edge, in the same order as the columns of `edge_index`.
target = torch.tensor(data_processed['Book-Rating'].values, dtype=torch.float)

In [None]:
# Sanity check: the edge tensor, its shape, and the number of rating rows it came from.
for shown in (edge_index, edge_index.shape):
    print(shown)
row_count = len(data_processed)
print(f"Number of rows in data_processed: {row_count}")

In [None]:
import random
from torch.utils.data import Dataset, DataLoader

class BookRatingData(Dataset):
    """Edge-wise view of the rating graph: one sample per column of ``edge_index``.

    Each sample is a dict with the keys ``'x'``, ``'edge_index'`` and ``'y'``.
    ``'x'`` is the complete ``node_features`` object (the same reference for
    every sample, not a per-edge slice), ``'edge_index'`` is column ``idx`` of
    the edge tensor and ``'y'`` is entry ``idx`` of ``target``.
    """

    def __init__(self, node_features, edge_index, target):
        super().__init__()
        self.node_features, self.edge_index, self.target = node_features, edge_index, target

    def __len__(self):
        # Samples are addressed along the second dimension of edge_index.
        return self.edge_index.shape[1]

    def __getitem__(self, idx):
        sample = dict(x=self.node_features)
        sample['edge_index'] = self.edge_index[:, idx]
        sample['y'] = self.target[idx]
        return sample

# Build the dataset over the full, unsplit graph.
book_rating_data = BookRatingData(
    node_features=node_features,
    edge_index=edge_index,
    target=target,
)

In [None]:
# Confirm the dataset holds the edge tensor it was given.
stored_edges = book_rating_data.edge_index
print(stored_edges)
print(stored_edges.shape)

In [None]:
def split_edges(edge_index, split_ratio=(0.8, 0.2), edge_attr=None, seed=None):
    """Randomly partition the columns of ``edge_index`` into train and test sets.

    Args:
        edge_index: Tensor whose second dimension enumerates the edges.
        split_ratio: Sequence whose first entry is the fraction of edges that
            goes to the train split; the test split receives the remainder.
            Only the first entry is read.
        edge_attr: Optional per-edge values (e.g. the ratings), indexed along
            their first dimension in the same order as the edges. When given,
            they are split with the same permutation so that every edge keeps
            its own value.
        seed: Optional seed for a reproducible split. When ``None`` the global
            ``random`` generator is shuffled, exactly as before.

    Returns:
        ``(train_edge_index, test_edge_index)`` when ``edge_attr`` is ``None``,
        otherwise ``(train_edge_index, test_edge_index, train_edge_attr,
        test_edge_attr)``.

    Raises:
        ValueError: If the train fraction is outside ``[0, 1]`` or ``edge_attr``
            does not have exactly one entry per edge.
    """
    num_edges = edge_index.size(1)

    train_fraction = split_ratio[0]
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train fraction must be within [0, 1], got {train_fraction}")
    if edge_attr is not None and len(edge_attr) != num_edges:
        raise ValueError(
            f"edge_attr has {len(edge_attr)} entries but edge_index has {num_edges} edges"
        )

    indices = list(range(num_edges))
    # A private generator keeps a seeded split from disturbing the global RNG state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(indices)

    split_idx = int(train_fraction * num_edges)
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]

    train_edge_index = edge_index[:, train_indices]
    test_edge_index = edge_index[:, test_indices]
    if edge_attr is None:
        return train_edge_index, test_edge_index

    # Same index lists as above, so labels stay attached to their edges.
    return train_edge_index, test_edge_index, edge_attr[train_indices], edge_attr[test_indices]

# Randomly partition the edges into a train part and a test part.
edge_splits = split_edges(book_rating_data.edge_index)
train_dataset, test_dataset = edge_splits

In [None]:
# Display the size of the train edge tensor returned by split_edges.
train_dataset.size()

In [None]:
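# TODO: build `train_loader` and `test_loader` here. The training cell at the end of the
# notebook passes both to train()/test(), but neither is defined in the cells shown.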

In [None]:
class GraphSAGERatingPredictor(nn.Module):
    """Stack of ``SAGEConv`` layers whose last layer has a single output channel.

    Args:
        num_features: Number of input channels per node.
        hidden_channels: Number of channels of every intermediate layer.
        num_layers: Total number of ``SAGEConv`` layers (must be at least 1).
            With ``num_layers == 1`` the single layer maps ``num_features``
            straight to 1 channel.
        dropout: Dropout probability applied after every layer except the last.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.

    NOTE(review): the final layer is applied to ``x`` as a whole, so the output
    presumably has one row per node, whereas the rating targets built earlier in
    this notebook have one entry per edge -- confirm how the two are meant to be
    matched up before computing a loss.
    """

    def __init__(self, num_features, hidden_channels, num_layers, dropout):
        super(GraphSAGERatingPredictor, self).__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be at least 1, got {num_layers}")

        # Channel count at every layer boundary: the input width, one hidden width
        # per intermediate layer, and the single output channel. Deriving the layers
        # from this list yields exactly `num_layers` layers for every valid value
        # (building "first + (num_layers - 2) middle + last" gives two layers even
        # when only one is requested).
        widths = [num_features] + [hidden_channels] * (num_layers - 1) + [1]
        self.layers = nn.ModuleList(
            [SAGEConv(in_width, out_width) for in_width, out_width in zip(widths[:-1], widths[1:])]
        )
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Run ``x`` through every layer; ReLU and dropout follow all but the last."""
        for layer in self.layers[:-1]:
            x = layer(x, edge_index)
            x = F.relu(x)
            # Functional dropout needs the training flag passed in explicitly.
            x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.layers[-1](x, edge_index)
        return x

In [None]:
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(x, edge_index)``.
        loader: Sized iterable of batches exposing ``.x``, ``.edge_index`` and ``.y``.
        optimizer: Optimizer over the parameters of ``model``.
        criterion: Loss called as ``criterion(prediction, target)``.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    model.train()
    total_loss = 0

    # Batches are moved to wherever the model's parameters live; feeding CPU
    # tensors to a model that was moved to the GPU raises a device-mismatch error.
    anchor = next(model.parameters(), None)
    device = anchor.device if anchor is not None else None

    for data in loader:
        x, edge_index, y = data.x, data.edge_index, data.y
        if device is not None:
            x, edge_index, y = x.to(device), edge_index.to(device), y.to(device)

        optimizer.zero_grad()
        out = model(x, edge_index)
        # A trailing singleton channel (e.g. (N, 1)) against an (N,) target would be
        # broadcast to (N, N) by elementwise losses; drop it so values pair up 1:1.
        if out.dim() == y.dim() + 1 and out.size(-1) == 1:
            out = out.squeeze(-1)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


In [None]:
def test(model, loader, criterion):
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for data in loader:
            x = data.x
            edge_index = data.edge_index
            y = data.y
            
            out = model(x, edge_index)
            loss = criterion(out, y)
            total_loss += loss.item()
    
    return total_loss / len(loader)

In [None]:
# Hyperparameters for the model and the optimisation run.
num_features = node_features.shape[1]
hidden_channels = 64
num_layers = 3
dropout = 0.5
epochs = 15
lr = 0.01

# Rebinds `device` to a torch.device object (GPU when available, CPU otherwise).
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = GraphSAGERatingPredictor(num_features, hidden_channels, num_layers, dropout)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# NOTE(review): `train_loader` and `test_loader` are not defined in any cell shown
# in this notebook -- they must be created before this loop can run.
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    test_loss = test(model, test_loader, criterion)
    # One summary line per epoch; epochs are reported 1-based.
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")