In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, GAE
from torch.optim.lr_scheduler import StepLR
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

In [None]:
# Run on the GPU when PyTorch reports a usable CUDA device, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Raw interaction table. The cells below read the columns User-ID, Age, Book-ID,
# Year-Of-Publication and Book-Rating from it.
data = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [None]:
def min_max_scale(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    A column whose minimum equals its maximum is returned as zeros instead of
    the NaNs that a 0/0 division would produce. Missing values stay missing.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return (values - lowest).astype(float)
    return (values - lowest) / span


# Keep exactly one row per user and per book. Deduplicating on the ID alone
# (rather than on the (ID, attribute) pair) guarantees this even when the same
# ID shows up with more than one Age / Year-Of-Publication value; the first
# value seen is kept, and row order stays "first appearance in `data`".
users = data[['User-ID', 'Age']].drop_duplicates(subset='User-ID').reset_index(drop=True)
books = data[['Book-ID', 'Year-Of-Publication']].drop_duplicates(subset='Book-ID').reset_index(drop=True)

# Raw ID -> contiguous 0-based index, numbered in row order of `users` / `books`.
user_id_map = {user_id: idx for idx, user_id in enumerate(users['User-ID'].unique())}
book_id_map = {book_id: idx for idx, book_id in enumerate(books['Book-ID'].unique())}

users['User-ID'] = users['User-ID'].map(user_id_map)
books['Book-ID'] = books['Book-ID'].map(book_id_map)

# Scale the single numeric attribute of each entity type into [0, 1].
users['Age'] = min_max_scale(users['Age'])
books['Year-Of-Publication'] = min_max_scale(books['Year-Of-Publication'])

# Rewrite the interaction table itself to use the contiguous indices; later
# cells rely on `data` carrying these mapped IDs.
data['User-ID'] = data['User-ID'].map(user_id_map)
data['Book-ID'] = data['Book-ID'].map(book_id_map)

In [None]:
# Work on a copy so the ID-mapped `data` frame stays as the previous cell left it.
data_processed = data.copy()

# Attribute lookups keyed by the contiguous user / book indices.
users_dict = users.set_index('User-ID')['Age'].to_dict()
books_dict = books.set_index('Book-ID')['Year-Of-Publication'].to_dict()
data_processed['User-Age'] = data_processed['User-ID'].map(users_dict)
data_processed['Book-Year-Of-Publication'] = data_processed['Book-ID'].map(books_dict)

# One edge per interaction row. Users occupy node indices [0, len(user_ids));
# books are shifted by len(user_ids) so both node types share one index space.
# NOTE(review): this rebinds `user_id_map` / `book_id_map`; after this cell they
# map the contiguous indices to graph node indices, no longer raw IDs.
edges = data_processed[['User-ID', 'Book-ID', 'Book-Rating']].copy()
user_ids = edges['User-ID'].unique()
book_ids = edges['Book-ID'].unique()
user_id_map = {user_id: idx for idx, user_id in enumerate(user_ids)}
book_id_map = {book_id: idx + len(user_ids) for idx, book_id in enumerate(book_ids)}
edges['User-ID'] = edges['User-ID'].map(user_id_map)
edges['Book-ID'] = edges['Book-ID'].map(book_id_map)

# Build the feature matrix in node-index order (users first, then books) by
# looking every node up in the attribute dicts. Row i therefore always
# describes node i of `edge_index`, independent of how many rows `users` and
# `books` happen to contain or in which order they are stored.
node_features = torch.tensor(
    pd.concat([
        pd.Series(user_ids).map(users_dict),
        pd.Series(book_ids).map(books_dict),
    ]).values,
    dtype=torch.float,
).unsqueeze(1)
assert node_features.size(0) == len(user_ids) + len(book_ids), "one feature row per node expected"

# Shape (2, num_edges): row 0 holds user nodes, row 1 holds book nodes.
edge_index = torch.tensor(edges[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous()
# NOTE(review): the rating is stored as the edge attribute and is also the
# prediction target below; confirm the model does not read edge_attr for the
# edge it is asked to predict.
edge_attr = torch.tensor(edges['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)

# One rating per interaction row, in the same order as the columns of edge_index.
target = torch.tensor(data_processed['Book-Rating'].values, dtype=torch.float)

In [None]:
# Sanity check: number of interaction rows in data_processed (one graph edge is built per row).
print(f"Number of rows in data_processed: {len(data_processed)}")

In [None]:
class BookRatingData(Dataset):
    """Edge-level view of the user-book rating graph.

    Sample ``idx`` is a dict holding the complete graph tensors (the same
    objects every time, shared by reference) plus the label of edge ``idx``
    under ``'y'``.

    Args:
        node_features: node feature tensor, returned under ``'x'``.
        edge_index: connectivity tensor; its second dimension enumerates edges.
        edge_attr: edge attribute tensor, returned under ``'edge_attr'``.
        target: labels indexed along the first dimension, one per edge.

    Raises:
        ValueError: if ``target`` does not hold exactly one entry per edge.
    """

    def __init__(self, node_features, edge_index, edge_attr, target):
        super().__init__()
        num_edges = edge_index.size(1)
        if len(target) != num_edges:
            raise ValueError(
                f"target has {len(target)} entries but edge_index describes {num_edges} edges"
            )
        self.node_features = node_features
        self.edge_index = edge_index
        self.edge_attr = edge_attr
        self.target = target

    # torch_geometric's Dataset documents len()/get() as the hooks a subclass
    # implements. Providing them, with the dunder methods delegating, keeps the
    # class instantiable even if a release declares these hooks abstract.
    # NOTE(review): confirm against the installed PyG version.
    def len(self):
        """Return the number of samples, i.e. the number of edges."""
        return self.edge_index.size(1)

    def get(self, idx):
        """Return the sample dict for edge ``idx`` (full graph tensors + its label)."""
        return {
            'x': self.node_features,
            'edge_index': self.edge_index,
            'edge_attr': self.edge_attr,
            'y': self.target[idx]
        }

    def __len__(self):
        return self.len()

    def __getitem__(self, idx):
        return self.get(idx)

# Wrap the graph tensors built above into a dataset with one sample per edge.
book_rating_data = BookRatingData(
    node_features=node_features,
    edge_index=edge_index,
    edge_attr=edge_attr,
    target=target,
)

def split_edges(dataset, split_ratio=(0.8, 0.2), seed=None):
    """Randomly partition the samples of ``dataset`` into a train and a test list.

    Args:
        dataset: any object supporting ``len()`` and integer indexing.
        split_ratio: ``(train_fraction, test_fraction)``. Only the first entry
            is used; the test part is always whatever remains after the train
            part, so no sample is dropped.
        seed: optional seed for a private random generator. With a seed the
            split is reproducible and the global ``random`` state is left
            untouched; with ``None`` the global ``random`` module is used.

    Returns:
        A tuple ``(train_samples, test_samples)`` of lists holding the
        materialised items ``dataset[i]``.

    Raises:
        ValueError: if the train fraction lies outside ``[0, 1]``.
    """
    train_fraction = split_ratio[0]
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train fraction must be within [0, 1], got {train_fraction!r}")

    num_edges = len(dataset)
    indices = list(range(num_edges))
    # A dedicated generator keeps a seeded split from disturbing other users
    # of the global random state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(indices)

    split_idx = int(train_fraction * num_edges)
    train_indices, test_indices = indices[:split_idx], indices[split_idx:]

    return [dataset[i] for i in train_indices], [dataset[i] for i in test_indices]

# Partition the per-edge samples, then wrap each part in a mini-batch loader.
# NOTE(review): split_edges shuffles with the global `random` module and no
# seed is set in the cells shown, so the partition changes between kernel runs.
train_dataset, test_dataset = split_edges(dataset=book_rating_data)

# NOTE(review): every sample carries the whole graph's tensors (see
# BookRatingData.__getitem__); confirm how DataLoader collates them, since
# stacking would replicate the graph batch_size times per batch.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)
