In [None]:
import gc
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, GAE
from tqdm import tqdm

In [None]:
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Raw training interactions. The preprocessing call further down reads the frame
# as `train_data`, so it is bound under that name; `data` is kept as an alias for
# any cell that still refers to the original name.
train_data = pd.read_csv("./data/train.csv")
data = train_data

In [None]:
def _min_max_scale(values, lower, upper):
    """Scale ``values`` linearly so that ``lower`` maps to 0 and ``upper`` to 1.

    A zero-width range (``lower == upper``) yields zeros instead of the inf/NaN
    a plain division would produce; missing values stay missing.
    """
    span = upper - lower
    if span == 0:
        return values * 0.0
    return (values - lower) / span


def preprocess_data(data, user_id_map=None, book_id_map=None, age_min=None, age_max=None, year_min=None, year_max=None):
    """Min-max scale the numeric columns and replace raw IDs with dense indices.

    Args:
        data: DataFrame with at least the columns 'User-ID', 'Book-ID', 'Age'
            and 'Year-Of-Publication'. It is not modified.
        user_id_map: Optional ``{raw user id: index}`` mapping. When omitted, one
            is built from ``data`` in order of first appearance.
        book_id_map: Optional ``{raw book id: index}`` mapping, built the same
            way when omitted.
        age_min, age_max: Bounds used to scale 'Age'. Each bound that is omitted
            is taken from ``data`` independently of the other.
        year_min, year_max: Bounds used to scale 'Year-Of-Publication', handled
            like the age bounds.

    Returns:
        Tuple ``(data_processed, user_id_map, book_id_map, age_min, age_max,
        year_min, year_max)``. ``data_processed`` has one row per input row, in
        input order, with a fresh 0..n-1 index; the two scaled columns are placed
        last and every remaining NaN is replaced by 0.
    """
    data = data.copy().reset_index(drop=True)

    if user_id_map is None:
        user_id_map = {user_id: idx for idx, user_id in enumerate(data['User-ID'].unique())}
    if book_id_map is None:
        book_id_map = {book_id: idx for idx, book_id in enumerate(data['Book-ID'].unique())}

    # Resolve each bound on its own so that passing only one of a pair neither
    # crashes (missing max) nor gets silently overwritten (missing min).
    if age_min is None:
        age_min = data['Age'].min()
    if age_max is None:
        age_max = data['Age'].max()
    if year_min is None:
        year_min = data['Year-Of-Publication'].min()
    if year_max is None:
        year_max = data['Year-Of-Publication'].max()

    # Scale the columns in place rather than merging a de-duplicated lookup table
    # back on the ID: such a merge multiplies rows whenever one ID appears with
    # more than one Age / Year value.
    data['Age'] = _min_max_scale(data['Age'], age_min, age_max)
    data['Year-Of-Publication'] = _min_max_scale(data['Year-Of-Publication'], year_min, year_max)

    data['User-ID'] = data['User-ID'].map(user_id_map)
    data['Book-ID'] = data['Book-ID'].map(book_id_map)

    # Keep the established output layout: untouched columns first, scaled ones last.
    scaled_columns = ['Age', 'Year-Of-Publication']
    ordered_columns = [col for col in data.columns if col not in scaled_columns] + scaled_columns

    # NOTE(review): IDs absent from a supplied map become NaN in `.map` and are then
    # turned into index 0 here, i.e. they alias the first user/book — confirm this
    # is the intended handling of unseen IDs.
    data_processed = data[ordered_columns].fillna(0)

    return data_processed, user_id_map, book_id_map, age_min, age_max, year_min, year_max

In [None]:
# Preprocess the training frame; the returned ID maps and scaling bounds are kept
# as notebook variables alongside the processed frame.
(
    train_data_processed,
    user_id_map,
    book_id_map,
    age_min,
    age_max,
    year_min,
    year_max,
) = preprocess_data(train_data)

In [None]:
# Feature matrix with the scaled age and publication year of every row.
# NOTE(review): this has one row per row of `train_data_processed`, not one per
# node, while `edge_index` below holds user and book indices that both start at 0
# — confirm this layout is what the graph model should see.
node_features = train_data_processed[["Age", "Year-Of-Publication"]].to_numpy()
# (user index, book index) pairs, transposed so the pair dimension comes first.
edge_index = torch.tensor(
    train_data_processed[['User-ID', 'Book-ID']].to_numpy(),
    dtype=torch.long,
).t().contiguous()

#################################### TODO: replace with something more suitable later
# The rating is used both as the edge attribute and as the regression target,
# each stored as a column vector.
edge_attr = torch.tensor(train_data_processed['Book-Rating'].to_numpy(), dtype=torch.float).reshape(-1, 1)
target = torch.tensor(train_data_processed['Book-Rating'].to_numpy(), dtype=torch.float).reshape(-1, 1)

# Make data

In [None]:
# Per-row user / book indices, aligned row-for-row with `target`. They were never
# defined before being passed to `Data`; the training step feeds them to the
# user and item embedding tables, so they are the mapped ID columns.
user_ids = torch.tensor(train_data_processed['User-ID'].to_numpy(), dtype=torch.long)
item_ids = torch.tensor(train_data_processed['Book-ID'].to_numpy(), dtype=torch.long)

graph_data = Data(
    x=torch.tensor(node_features),
    edge_index=edge_index,
    edge_attr=edge_attr,
    y=target,
    user_ids=user_ids,
    item_ids=item_ids,
)
# Remember every node's original position under `n_id`.
graph_data.n_id = torch.arange(graph_data.num_nodes)

In [None]:
class CustomGraphDataset(Dataset):
    """Dataset that serves graphs straight from an in-memory Python list."""

    def __init__(self, data_list, transform=None, pre_transform=None):
        # The first positional argument of the base initializer is left as None.
        super().__init__(None, transform, pre_transform)
        self.data_list = data_list

    def len(self):
        """Return how many graphs the list holds."""
        return len(self.data_list)

    def get(self, idx):
        """Return the graph stored at position ``idx``."""
        return self.data_list[idx]


# Create the dataset; the single full graph is its only element.
data_list = [graph_data]
dataset = CustomGraphDataset(data_list)

In [None]:
### Neighbor Loader
# gSAGE_loader = NeighborLoader(
#     graph_data,
#     num_neighbors=[10,10],
#     batch_size=512)

# GraphSAGE

In [None]:
in_channels = graph_data.x.shape[1]
hidden_channels = 64
out_channels = 16
embedding_dim = 32
# Embedding table sizes. The ID maps assign indices 0..len-1, so their lengths are
# exactly the number of rows each embedding table needs. (These were previously
# self-assignments of names that are never defined on a fresh kernel.)
user_count = len(user_id_map)
item_count = len(book_id_map)
####################################
# TODO: tune later with Bayesian optimization
####################################
# number of conv layers
# number of hidden units (hidden_channels)
# dropout rate
# learning rate
# weight decay
# batch size
# optimizer

In [None]:
class GraphSAGERatingPredictor(torch.nn.Module):
    """Rating regressor built from user/item embedding tables plus two SAGEConv layers.

    Args:
        in_channels: Width of the node feature vectors fed to the first convolution.
        hidden_channels: Output width of the first convolution.
        embedding_dim: Width of the user/item embeddings and of the second convolution.
        user_count: Number of rows in the user embedding table.
        item_count: Number of rows in the item embedding table.
    """

    def __init__(self, in_channels, hidden_channels, embedding_dim, user_count, item_count):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, embedding_dim)

        self.user_embedding = nn.Embedding(user_count, embedding_dim)
        self.item_embedding = nn.Embedding(item_count, embedding_dim)
        self.regressor = nn.Linear(embedding_dim * 2, 1)

    def forward(self, x, edge_index, user_ids, item_ids):
        """Predict one rating per (user, item) pair.

        Args:
            x: Node feature tensor; cast to float before the convolutions.
            edge_index: Edge index tensor passed to both convolutions.
            user_ids: Integer tensor of user indices, one per pair.
            item_ids: Integer tensor of item indices, aligned with ``user_ids``.

        Returns:
            1-D tensor with one predicted rating per pair.
        """
        # Follow the module's own device instead of a notebook-level global, and
        # move the index tensors as well so callers need not pre-place them.
        module_device = self.regressor.weight.device
        x = x.float().to(module_device)
        edge_index = edge_index.to(module_device)
        user_ids = user_ids.to(module_device)
        item_ids = item_ids.to(module_device)

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        # NOTE(review): the convolution output `x` is not used below; predictions
        # currently depend only on the two embedding tables. Confirm whether the
        # graph representation was meant to be combined with them.

        user_embeddings = self.user_embedding(user_ids)
        item_embeddings = self.item_embedding(item_ids)

        concatenated_embeddings = torch.cat((user_embeddings, item_embeddings), dim=1)
        rating_predictions = self.regressor(concatenated_embeddings)
        # Drop only the trailing size-1 dimension; a bare squeeze() would also
        # collapse a batch holding a single pair into a 0-d tensor.
        return rating_predictions.squeeze(-1)

In [None]:
# class GraphSAGE(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels):
#         super(GraphSAGE, self).__init__()
#         self.conv1 = SAGEConv(in_channels, hidden_channels)
#         self.conv2 = SAGEConv(hidden_channels, out_channels)

#     def forward(self, x, edge_index):
#         x = self.conv1(x, edge_index)
#         x = F.relu(x)
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv2(x, edge_index)
#         x = F.relu(x)
#         return x

# model = GraphSAGE(in_channels=in_channels, hidden_channels=hidden_channels, out_channels=out_channels)
# model = model.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.005,weight_decay=1e-5)
# loader = DataLoader(dataset, batch_size=1, shuffle=True)
# loss_function = torch.nn.MSELoss()

# def train(loader):
#     model.train()
#     total_loss = 0
#     for data in tqdm(loader):
#         data = data.to(device)
#         optimizer.zero_grad()
#         z = model(data.x.float(), data.edge_index)
        
#         book_ratings = data.y.to(device)
        
#         loss = loss_function(z, book_ratings)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(loader), model

# num_epoch = 100

# for epoch in range(0,num_epoch):
#     loss,model = train(loader)
#     print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
# torch.save(model,"base_graphSage_model")

In [None]:
model = GraphSAGERatingPredictor(in_channels, hidden_channels, embedding_dim, user_count, item_count)
model = model.to(device)
# The optimizer previously reloaded its own freshly created state, which is a
# no-op round trip; that statement has been dropped.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
criterion = nn.MSELoss().to(device)
# `batch_size` counts graphs here; `dataset` holds a single graph, so every pass
# over the loader yields that one full graph.
loader = DataLoader(dataset, batch_size=512, shuffle=True)

In [None]:
def train(loader, model, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``user_ids``,
            ``item_ids`` and ``y``.
        model: Module called as ``model(x, edge_index, user_ids, item_ids)`` and
            returning one prediction per (user, item) pair.
        criterion: Loss callable taking ``(prediction, target)``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        Sum of the batch losses divided by the number of batches, or 0.0 when
        ``loader`` yields nothing.
    """
    model.train()
    # Place every batch tensor where the model's parameters live.
    model_device = next(model.parameters()).device
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(loader):
        batch_x = batch.x.to(model_device)
        batch_edge_index = batch.edge_index.to(model_device)
        # as_tensor accepts lists, arrays and tensors alike and avoids the extra
        # copy (and warning) of re-wrapping an existing tensor in torch.tensor.
        batch_user_ids = torch.as_tensor(batch.user_ids, dtype=torch.long).to(model_device)
        batch_item_ids = torch.as_tensor(batch.item_ids, dtype=torch.long).to(model_device)
        # Flatten the target: it is stored as a column vector while the model
        # returns a flat vector, and comparing the two directly broadcasts to an
        # all-pairs matrix, which is not the per-pair loss.
        batch_y = batch.y.to(model_device).float().reshape(-1)

        optimizer.zero_grad()
        out = model(batch_x, batch_edge_index, batch_user_ids, batch_item_ids).reshape(-1)
        loss = criterion(out, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

In [None]:
# Train for a fixed number of epochs, printing the mean batch loss returned by
# `train` after each one (epochs are reported 1-based).
num_epochs = 20
for epoch in range(num_epochs):
    train_loss = train(loader, model, criterion, optimizer)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss}")

In [None]:
# Sanity checks: range of the mapped user / book indices in the processed frame.
print("train User-ID min:", train_data_processed['User-ID'].min())
print("train User-ID max:", train_data_processed['User-ID'].max())
print("train Book-ID min:", train_data_processed['Book-ID'].min())
print("train Book-ID max:", train_data_processed['Book-ID'].max())

# Shapes of the feature matrix and the edge index built from that frame.
print("train_node_features shape:", node_features.shape)
print("train_edge_index shape:", edge_index.shape)

# Smallest and largest index referenced by the edges of the first (only) graph in `data_list`.
print("train Edge index min:", data_list[0].edge_index.min())
print("train Edge index max:", data_list[0].edge_index.max())

# Shape of the feature tensor stored on the graph object.
print("train_graph_data.x shape:", graph_data.x.shape)

## Encoding by mini-batch

In [None]:
# NOTE(review): this rebinds `model`, discarding the predictor trained above. The only
# visible writer of "base_graphSage_model" is a commented-out `torch.save(model, ...)`,
# so on a fresh run the file may not exist — confirm where it comes from.
# `torch.load` unpickles the file, so only load checkpoints from a trusted source.
model = torch.load("base_graphSage_model")

In [None]:
# Zero-filled array with one 32-wide row per graph node.
# NOTE(review): no visible cell writes embeddings into this array before it is
# indexed below — confirm the encoding step is not missing.
np_embeddings = np.zeros((graph_data.num_nodes,32))
np_embeddings

In [None]:
%%time
from annoy import AnnoyIndex

# Nearest-neighbour index over 32-dimensional vectors using the 'angular' metric.
index = AnnoyIndex(32, 'angular')

# Register every embedding row under its row position.
for idx,idx_embedding in enumerate(np_embeddings):
    index.add_item(idx, idx_embedding)
    
# Build the index with 10 trees.
index.build(10)

In [None]:
# Release the embedding array now that its rows have been added to the index.
# `gc` comes from the notebook's import cell; the trailing semicolon keeps the
# collected-object count out of the cell output.
del np_embeddings
gc.collect();

# Validation / Inference