In [None]:
import torch
# import torch.nn as nn
from torch import nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, DataLoader, Batch
from torch_geometric.nn import SAGEConv, GAE
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split
from torch.nn import Embedding
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api

In [None]:
# Read the preprocessed training table from the notebook's working directory.
csv_path = "./prepro_train_data.csv"
train_data = pd.read_csv(csv_path)

# Make Graph
## Category Embedding

### Age

In [None]:
# One float32 age per rating row, shaped (num_rows, 1) so it can be
# concatenated with the other per-row feature vectors later on.
age_values = train_data['Age'].values
age_tensor = torch.tensor(age_values, dtype=torch.float32)[:, None]

In [None]:
# Sanity check: report the tensor's shape first, then its contents.
age_shape = age_tensor.shape
print("age_tensor.shape >>> ", age_shape)
print(age_tensor)

### Location

In [None]:
### Location

# Integer-encode the location strings (one code per distinct location).
le = LabelEncoder()
train_data['Location_encoded'] = le.fit_transform(train_data['Location'])

# Size the lookup table from the fitted encoder instead of a hard-coded count,
# so a dataset with a different number of locations cannot index out of range.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=29)

# NOTE: the embedding table is randomly initialised and never trained here;
# after detach() these are fixed random 29-d features per location.
location_codes = torch.tensor(train_data['Location_encoded'].values, dtype=torch.long)

# Looking up a 1-D code vector yields (num_rows, 29) directly; the previous
# unsqueeze(1)/squeeze() round-trip collapsed the row axis for a single row.
location_embeddings = embedding_layer(location_codes).detach().numpy()

In [None]:
# Sanity check: report the array's shape first, then its contents.
location_shape = location_embeddings.shape
print("location_embeddings.shape >>> ", location_shape)
print(location_embeddings)

### Book-Title

In [None]:
import fasttext

# # 사전 훈련된 FastText 모델 다운로드
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# !gunzip cc.en.300.bin.gz

In [None]:
# Load the pre-trained FastText binary expected in the working directory
# (the commented-out download commands in the previous cell fetch it).
fasttext_model_path = "cc.en.300.bin"
fasttext_model = fasttext.load_model(fasttext_model_path)

In [None]:
def get_title_embedding_fasttext(title):
    """Return the mean FastText word vector of a book title.

    The title is split on whitespace, each token is looked up in the
    module-level ``fasttext_model``, and the token vectors are averaged.

    Args:
        title: Book title. Non-string values (e.g. the float NaN pandas uses
            for a missing cell, or None) are treated like an empty title.

    Returns:
        A 1-D numpy array: the element-wise mean of the token vectors, or a
        zero vector of length 300 when the title yields no tokens. The zero
        fallback assumes the loaded model produces 300-d vectors.
    """
    # Missing titles are not strings and have no .split(); treat as no tokens.
    words = title.split() if isinstance(title, str) else []
    if not words:
        return np.zeros(300)
    embeddings = [fasttext_model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)

In [None]:
# One mean-pooled FastText vector per rating row, kept as a plain list.
title_embeddings = [
    get_title_embedding_fasttext(book_title)
    for book_title in train_data['Book-Title']
]

In [None]:
# Count titles that produce no tokens when split on whitespace; those rows
# received the all-zero fallback embedding.
empty_titles_count = sum(
    1 for book_title in train_data['Book-Title'] if not book_title.split()
)

print(f"빈 문자열이거나 토큰화된 단어가 없는 책 제목의 개수: {empty_titles_count}")

In [None]:
# Report every row whose embedding is not a (300,) numpy array.
for row_pos, vector in enumerate(title_embeddings):
    looks_valid = isinstance(vector, np.ndarray) and vector.shape == (300,)
    if looks_valid:
        continue
    print(f"Index: {row_pos}, Title: {train_data['Book-Title'][row_pos]}, Embedding: {vector}")

In [None]:
from sklearn.decomposition import PCA

# Compress the title vectors to 20 components. random_state is fixed (same
# seed as the train/test split) so that, when scikit-learn selects a
# randomized SVD solver, the reduced features are reproducible across runs.
pca = PCA(n_components=20, random_state=42)
title_embeddings_array = np.array(title_embeddings)
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

In [None]:
# Sanity check: report the array's shape first, then its contents.
reduced_shape = reduced_title_embeddings.shape
print("reduced_title_embeddings.shape >>> ", reduced_shape)
print(reduced_title_embeddings)

### Publisher

In [None]:
### Publisher

# Integer-encode the publisher strings (one code per distinct publisher).
le = LabelEncoder()
train_data['Publisher_encoded'] = le.fit_transform(train_data['Publisher'])

# Size the lookup table from the fitted encoder instead of a hard-coded count,
# so a dataset with a different number of publishers cannot index out of range.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=10)

# NOTE: the embedding table is randomly initialised and never trained here;
# after detach() these are fixed random 10-d features per publisher.
publisher_codes = torch.tensor(train_data['Publisher_encoded'].values, dtype=torch.long)

# Looking up a 1-D code vector yields (num_rows, 10) directly; the previous
# unsqueeze(1)/squeeze() round-trip collapsed the row axis for a single row.
publisher_embeddings = embedding_layer(publisher_codes).detach().numpy()

In [None]:
# Sanity check on the publisher features. The variable is spelled
# `publisher_embeddings` (lower-case p); the capitalised name was undefined.
print("Publisher_embeddings.shape >>> ",publisher_embeddings.shape)
print(publisher_embeddings)

### User-ID & Book-ID

In [None]:
### User-ID & Book-ID

# Distinct raw ids, in order of first appearance in train_data
# (the original notes recorded 83256 users and 243441 books).
unique_user_ids = train_data['User-ID'].unique().tolist()
unique_book_ids = train_data['Book-ID'].unique().tolist()

# Users: sort ascending, then number the sorted rows 0 .. num_users-1.
# The first reset_index(drop=True) discards the pre-sort index so that the
# node id reflects sorted order, exactly as is done for books below.
# Column order must stay [node id, raw id]: make_ID_dict reads by position.
unique_user_ids_df = pd.DataFrame(unique_user_ids, columns=['User-ID'])
sorted_unique_user_ids_df = (
    unique_user_ids_df
    .sort_values(by='User-ID', ascending=True)
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={'index': 'UserNodeID'})
)

# Books: same procedure, numbered 0 .. num_books-1 before the shift below.
unique_book_ids_df = pd.DataFrame(unique_book_ids, columns=['Book-ID'])
sorted_unique_book_ids_df = (
    unique_book_ids_df
    .sort_values(by='Book-ID', ascending=True)
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={'index': 'BookNodeID'})
)

# Book node ids live directly after the user node ids in one shared id space.
# The offset is derived from the data rather than hard-coded.
sorted_unique_book_ids_df['BookNodeID'] += len(unique_user_ids)

# UserNodeID: 0 .. num_users-1
# BookNodeID: num_users .. num_users+num_books-1

In [None]:
def make_ID_dict(df):
    """Build a lookup from raw id to node id.

    Args:
        df: DataFrame whose FIRST column holds the node id and whose SECOND
            column holds the raw id (columns are read by position).

    Returns:
        dict mapping each second-column value to the first-column value of
        the same row. If a raw id repeats, the last row wins.
    """
    # Explicit positional column access: row[0]/row[1] on a labelled Series is
    # deprecated in pandas, and iterrows() upcasts mixed int/float rows.
    node_ids = df.iloc[:, 0].tolist()
    raw_ids = df.iloc[:, 1].tolist()
    return dict(zip(raw_ids, node_ids))

# Raw id -> node id lookup tables for users and books.
UserNodeID_dict = make_ID_dict(sorted_unique_user_ids_df)
BookNodeID_dict = make_ID_dict(sorted_unique_book_ids_df)

# Apply the mapping: overwrite the raw ids in train_data with node ids.
# (This mutates train_data in place, so the cell is not safe to run twice.)
for id_column, id_lookup in (('User-ID', UserNodeID_dict), ('Book-ID', BookNodeID_dict)):
    train_data[id_column] = train_data[id_column].map(id_lookup)

## edge_index

In [None]:
# One (user node, book node) pair per rating row, transposed to the
# (2, num_edges) layout and made contiguous in memory.
edge_pairs = train_data[['User-ID', 'Book-ID']].values
edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

In [None]:
# Display the (2, num_edges) edge index tensor as the cell output.
edge_index

## node feature

In [None]:
# Total number of user nodes and book nodes.
num_user_nodes, num_book_nodes = len(unique_user_ids), len(unique_book_ids)

# Node ids per rating row (one entry per row of train_data), as Python lists.
user_ids = train_data["User-ID"].tolist()
book_ids = train_data["Book-ID"].tolist()

In [None]:
# Feature widths: users = age (1) + location (29); books = title (20) + publisher (10).
user_feature_dim = 30
book_feature_dim = 30

user_features = np.zeros((num_user_nodes, user_feature_dim))
book_features = np.zeros((num_book_nodes, book_feature_dim))

# Walk the rating rows; a node that occurs in several rows is simply
# overwritten, so the last row seen for it determines its features.
user_rows = zip(user_ids, age_tensor, location_embeddings)
for user_node, age_value, location_vec in user_rows:
    user_features[user_node] = np.concatenate([age_value, location_vec], axis=0)

book_rows = zip(book_ids, reduced_title_embeddings, publisher_embeddings)
for book_node, title_vec, publisher_vec in book_rows:
    # Book node ids start after the user ids; shift back to a 0-based row.
    local_row = book_node - num_user_nodes
    book_features[local_row] = np.concatenate([title_vec, publisher_vec], axis=0)

# Users first, then books: the row number equals the global node id.
node_features = np.vstack((user_features, book_features))

## Weight & Target

In [None]:
# The rating of each row is both the edge weight and the regression target;
# y is an independent copy so the two can be modified separately.
rating_values = train_data['Book-Rating'].values
edge_attr = torch.tensor(rating_values, dtype=torch.float)[:, None]
y = edge_attr.clone()

# Build Graph

In [None]:
# node_features is a float64 numpy array; cast to float32 so the node features
# match the dtype of the model's Linear weights (a float64 input would raise
# a dtype-mismatch error in the first layer).
graph_data = Data(
    x=torch.tensor(node_features, dtype=torch.float),
    edge_index=edge_index,
    edge_attr=edge_attr,
    y=y,
)
# Keep an explicit node-id vector (0 .. num_nodes-1) on the graph.
graph_data.n_id = torch.arange(graph_data.num_nodes)

# Graph Split - Node base

In [None]:
# example code

num_user_nodes = len(unique_user_ids)
num_book_nodes = len(unique_book_ids)

# NOTE(review): this is a DENSE (num_users x num_books) float64 matrix. At the
# sizes recorded earlier in this notebook (83256 x 243441) that is well over
# 100 GB; a sparse structure or splitting the edge list directly is advisable.
user_item_matrix = np.zeros((num_user_nodes, num_book_nodes))

# Loop variables are deliberately NOT named edge_index / edge_attr, so the
# module-level tensors of those names are not overwritten by this cell.
edge_pairs = graph_data.edge_index.t().tolist()
edge_ratings = graph_data.edge_attr.view(-1).tolist()
for (user_node, book_node), rating in zip(edge_pairs, edge_ratings):
    # Book node ids are offset by num_user_nodes in the graph; the matrix
    # columns are 0-based book positions, so remove the offset before indexing
    # (the raw node id is out of bounds for the column axis).
    user_item_matrix[user_node, book_node - num_user_nodes] = rating


In [None]:
from sklearn.model_selection import train_test_split

# Coordinates (row, column) of every strictly positive matrix entry, and the
# rating stored at each of them. Entries equal to 0 are not treated as edges.
edges = np.argwhere(user_item_matrix > 0)
edge_rows, edge_cols = edges[:, 0], edges[:, 1]
ratings = user_item_matrix[edge_rows, edge_cols]

# 80/20 split over edge positions with a fixed seed; only the index halves of
# the returned tuple are needed (the rating halves are re-derived below).
edge_positions = np.arange(edges.shape[0])
split_parts = train_test_split(
    edge_positions, ratings, test_size=0.2, random_state=42
)
train_indices, test_indices = split_parts[0], split_parts[1]

train_edges = edges[train_indices]
test_edges = edges[test_indices]
train_ratings = ratings[train_indices]
test_ratings = ratings[test_indices]


In [None]:

# train_edges / test_edges come from np.argwhere on the (num_users, num_books)
# matrix, so their second column is a 0-based book position. graph_data.x is
# indexed by global node id (books start at num_user_nodes), so the book row
# of edge_index must be shifted back into that id space.
book_offset = torch.tensor([[0], [num_user_nodes]], dtype=torch.long)

# Ratings are float64 numpy values; cast to float32 to match the model.
train_rating_tensor = torch.tensor(train_ratings, dtype=torch.float)
test_rating_tensor = torch.tensor(test_ratings, dtype=torch.float)

# NOTE: from here on `train_data` is a PyG Data object, no longer the pandas
# DataFrame loaded at the top of the notebook.
# `y` is set because the train/test functions read the target from data.y.
train_data = Data(
    x=graph_data.x.float(),
    edge_index=torch.tensor(train_edges, dtype=torch.long).t().contiguous() + book_offset,
    edge_attr=train_rating_tensor,
    y=train_rating_tensor.clone(),
)
test_data = Data(
    x=graph_data.x.float(),
    edge_index=torch.tensor(test_edges, dtype=torch.long).t().contiguous() + book_offset,
    edge_attr=test_rating_tensor,
    y=test_rating_tensor.clone(),
)


# GraphSAGE

In [None]:
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class WeightedSAGEConv(MessagePassing):
    """Mean-aggregating graph convolution with scalar edge weights.

    Node features pass through one linear layer; each node then receives the
    mean of its neighbours' (and its own) transformed features, each scaled
    by the weight of the connecting edge.
    """

    def __init__(self, in_channels, out_channels):
        """
        Args:
            in_channels: Width of the incoming node features.
            out_channels: Width of the produced node features.
        """
        super().__init__(aggr='mean')
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index, edge_attr):
        """
        Args:
            x: Node feature matrix, one row per node.
            edge_index: Edge list with one column per edge.
            edge_attr: One scalar weight per edge (any shape holding exactly
                one value per edge).

        Returns:
            The aggregated node features, one row per node.
        """
        # One weight per edge, in the same dtype as x so that the weighted
        # messages do not get promoted to a different floating-point type.
        edge_weight = edge_attr.reshape(-1).to(x.dtype)

        # Self-loops must be added to the edge list AND the weights together;
        # adding them only to edge_index leaves edge_attr shorter than the
        # number of messages. Self-loops get a neutral weight of 1.
        edge_index, edge_weight = add_self_loops(
            edge_index, edge_weight, fill_value=1.0, num_nodes=x.size(0)
        )
        x = self.lin(x)
        return self.propagate(edge_index, x=x, edge_attr=edge_weight)

    def message(self, x_j, edge_attr):
        """Scale each source-node feature row by its edge weight."""
        return x_j * edge_attr.view(-1, 1)

    def update(self, aggr_out):
        """Return the aggregated messages unchanged."""
        return aggr_out

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class GraphSAGERegressor(torch.nn.Module):
    """Three stacked WeightedSAGEConv layers producing one output per node.

    The first two layers are each followed by ReLU, batch normalisation and
    dropout; the third layer's output has its trailing axis squeezed.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        """
        Args:
            in_channels: Width of the input node features.
            hidden_channels: Width of both hidden layers.
            out_channels: Width of the final layer's output.
            dropout: Dropout probability applied after each hidden layer.
        """
        super().__init__()
        self.sage1 = WeightedSAGEConv(in_channels, hidden_channels)
        self.sage2 = WeightedSAGEConv(hidden_channels, hidden_channels)
        self.sage3 = WeightedSAGEConv(hidden_channels, out_channels)
        self.batch_norm1 = torch.nn.BatchNorm1d(hidden_channels)
        self.batch_norm2 = torch.nn.BatchNorm1d(hidden_channels)
        self.dropout = dropout

    def _hidden_block(self, conv, norm, features, edge_index, edge_attr):
        """Apply conv -> ReLU -> batch norm -> dropout for one hidden layer."""
        activated = F.relu(conv(features, edge_index, edge_attr))
        normalised = norm(activated)
        return F.dropout(normalised, p=self.dropout, training=self.training)

    def forward(self, x, edge_index, edge_attr):
        """Run the three layers and squeeze the last axis of the result."""
        hidden = self._hidden_block(self.sage1, self.batch_norm1, x, edge_index, edge_attr)
        hidden = self._hidden_block(self.sage2, self.batch_norm2, hidden, edge_index, edge_attr)
        output = self.sage3(hidden, edge_index, edge_attr)
        return output.squeeze(-1)



In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
train_data = train_data.to(device)
test_data = test_data.to(device)

# Each split is ONE graph. DataLoader expects a dataset (here a one-element
# list) of graphs, not a bare Data object; every epoch therefore sees exactly
# one full-graph batch.
train_loader = DataLoader([train_data], batch_size=1, shuffle=True)
test_loader = DataLoader([test_data], batch_size=1, shuffle=False)

# The input width is read from the node feature matrix instead of hard-coded.
in_channels = train_data.x.size(1)
hidden_channels = 64
out_channels = 1

model = GraphSAGERegressor(in_channels, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()
# Halve the learning rate every 30 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.5)

In [None]:
def _edge_predictions(node_out, edge_index):
    """Score every edge as the dot product of its two endpoint outputs."""
    # The model squeezes a width-1 output to 1-D; restore the feature axis so
    # the same code handles any output width.
    if node_out.dim() == 1:
        node_out = node_out.unsqueeze(-1)
    source_nodes, target_nodes = edge_index[0], edge_index[1]
    return (node_out[source_nodes] * node_out[target_nodes]).sum(dim=-1)


def _edge_targets(data, predictions):
    """Return the per-edge target, flattened and cast to match predictions."""
    target = getattr(data, 'y', None)
    if target is None:
        # Graphs built without `y` carry the rating in edge_attr.
        target = data.edge_attr
    return target.reshape(-1).to(predictions.dtype)


def _batch_loss(data):
    """Run the module-level model on one batch and return the criterion loss."""
    data = data.to(device)
    out = model(data.x.float(), data.edge_index, data.edge_attr.float())
    # `out` holds one value per NODE. It cannot be indexed as out[src, dst];
    # an edge-level prediction is decoded from the two endpoint outputs.
    predictions = _edge_predictions(out, data.edge_index)
    return criterion(predictions, _edge_targets(data, predictions))


def train(train_loader):
    """Train the module-level ``model`` for one pass over ``train_loader``.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        train_loader: Iterable of graph batches.

    Returns:
        Mean loss over the batches of the loader.
    """
    model.train()
    total_loss = 0
    for data in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(data)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


def test(test_loader):
    """Evaluate the module-level ``model`` on ``test_loader`` without gradients.

    Args:
        test_loader: Iterable of graph batches.

    Returns:
        Mean loss over the batches of the loader.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in test_loader:
            total_loss += _batch_loss(data).item()
    return total_loss / len(test_loader)

In [None]:
# Stop once the test loss has failed to improve for this many epochs in a row.
early_stopping_patience = 10
best_test_loss = float("inf")
epochs_without_improvement = 0

for epoch in range(1, 201):
    train_loss, test_loss = train(train_loader), test(test_loader)
    scheduler.step()

    # Track the best test loss and the length of the current stale streak.
    improved = test_loss < best_test_loss
    if improved:
        best_test_loss = test_loss
    epochs_without_improvement = 0 if improved else epochs_without_improvement + 1

    print(f'Epoch: {epoch}, Train Loss: {train_loss}, Test Loss: {test_loss}')

    if not epochs_without_improvement < early_stopping_patience:
        print("Early stopping...")
        break

In [None]:
# 

In [None]:
# 하이퍼파라미터 튜닝을 위해 베이지안 최적화 라이브러리인 optuna를 사용

In [None]:
!pip install optuna

In [None]:
import optuna

def objective(trial):
    """Optuna objective: train one sampled configuration, return its best test loss.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest test loss seen over 200 epochs for this configuration.
    """
    # train()/test() operate on the module-level `model` and `optimizer`, so
    # the trial's own instances have to be bound to those names while it runs.
    # Without this every trial kept training the same global model and the
    # sampled hyperparameters had no effect.
    global model, optimizer

    # Sample the hyperparameters for this trial.
    hidden_channels = trial.suggest_int('hidden_channels', 32, 128)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)

    # Fresh model and optimizer for this configuration.
    trial_model = GraphSAGERegressor(in_channels, hidden_channels, out_channels, dropout).to(device)
    trial_optimizer = torch.optim.Adam(trial_model.parameters(), lr=lr, weight_decay=weight_decay)

    previous_model, previous_optimizer = model, optimizer
    model, optimizer = trial_model, trial_optimizer
    try:
        # Train/evaluate loop; remember the best test loss.
        best_test_loss = float('inf')
        for _ in range(1, 201):
            train(train_loader)
            best_test_loss = min(best_test_loss, test(test_loader))
    finally:
        # Always restore the notebook's own model/optimizer, even on failure.
        model, optimizer = previous_model, previous_optimizer

    return best_test_loss

In [None]:
# The objective returns a loss, so the study minimises it (Optuna's default,
# spelled out here). n_trials sets how many configurations are tried.
N_TRIALS = 15
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=N_TRIALS)

best_trial = study.best_trial
print(f"Best trial: {best_trial.number}, Test Loss: {best_trial.value}")
print(f"Best hyperparameters: {best_trial.params}")

In [None]:
# Inference
# If the model encounters a user or book it has never seen, node features for that user or book must be generated and added to the existing graph data!

In [None]:
def predict_ratings(new_user_book_pairs, user_id_to_index, book_id_to_index):
    """Predict a rating for each (user, book) pair with the module-level model.

    Node outputs are computed once over the module-level ``graph_data``; each
    pair is then scored as the dot product of its two node outputs.

    Args:
        new_user_book_pairs: Iterable of (user_id, book_id) pairs.
        user_id_to_index: Mapping from user id to node index.
        book_id_to_index: Mapping from book id to node index.

    Returns:
        List of (user_id, book_id, rating) tuples, in input order.

    Raises:
        KeyError: If a user or book id is missing from its mapping.
    """
    model.eval()
    predicted_ratings = []

    pairs = list(new_user_book_pairs)
    if not pairs:
        return predicted_ratings

    with torch.no_grad():
        # The node outputs do not depend on the queried pair, so the forward
        # pass runs once instead of once per pair. Inputs are cast to float32
        # to match the model's parameters.
        node_out = model(
            graph_data.x.to(device).float(),
            graph_data.edge_index.to(device),
            graph_data.edge_attr.to(device).float().reshape(-1),
        )
        # The model squeezes a width-1 output to 1-D, so node_out cannot be
        # indexed as node_out[user, book]; restore the feature axis and
        # decode a pair from its two endpoint outputs.
        if node_out.dim() == 1:
            node_out = node_out.unsqueeze(-1)

        for user_id, book_id in pairs:
            user_index = user_id_to_index[user_id]
            book_index = book_id_to_index[book_id]

            rating = (node_out[user_index] * node_out[book_index]).sum().item()
            predicted_ratings.append((user_id, book_id, rating))

    return predicted_ratings


In [None]:
# Score the requested (user, book) pairs and show the resulting tuples.
# NOTE(review): `new_user_book_pairs`, `user_id_to_index` and
# `book_id_to_index` are not defined anywhere in the visible notebook, so this
# cell would fail on a fresh kernel. Presumably the mappings correspond to
# UserNodeID_dict / BookNodeID_dict built earlier - TODO confirm and define.
predicted_ratings = predict_ratings(new_user_book_pairs, user_id_to_index, book_id_to_index)

print(predicted_ratings)
