In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, NeighborSampler, Batch, DataLoader
from torch_geometric.nn import SAGEConv, GAE, TopKPooling
from torch.optim.lr_scheduler import StepLR
from torch.nn import Embedding
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api
import optuna
from sklearn.model_selection import train_test_split
import json
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

In [None]:
# Load the preprocessed training ratings table; every later cell derives from this frame.
dataset = pd.read_csv("./prepro_train_data.csv")

In [None]:
# Count how often each value occurs in the 'Book-Rating' column.
rating_counts = dataset['Book-Rating'].value_counts()

# Show the distribution.
print(rating_counts)

# Severe class-imbalance problem.

# Make Graph
## Category Embedding

### Age

In [None]:
# 'Age' column as a float32 column vector: one row per row of `dataset`, one column.
age_tensor = torch.tensor(dataset['Age'].values, dtype=torch.float32).unsqueeze(1)

In [None]:
# Inspect the shape and contents of the age tensor.
print("age_tensor.shape >>> ",age_tensor.shape)
print(age_tensor)

### Location

In [None]:
### Location

# Integer-encode the raw location labels (one code per distinct label).
le = LabelEncoder()
dataset['Location_encoded'] = le.fit_transform(dataset['Location'])

# Size the lookup table from the fitted encoder instead of a hard-coded 151, so an
# extra location in the data cannot trigger an out-of-range index error.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=29)

# NOTE(review): the embedding table is randomly initialised, never trained, and not
# seeded, so these 29-d vectors change on every kernel restart.
location_codes = torch.tensor(dataset['Location_encoded'].values, dtype=torch.long)
# Looking up a 1-D index tensor yields (n_rows, 29) directly; the previous
# unsqueeze/squeeze round-trip collapsed the array when the frame had a single row.
location_embeddings = embedding_layer(location_codes).detach().numpy()

In [None]:
# Inspect the shape and contents of the location embedding array.
print("location_embeddings.shape >>> ",location_embeddings.shape)
print(location_embeddings)

### Book-Title

In [None]:
import fasttext

# # 사전 훈련된 FastText 모델 다운로드
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# !gunzip cc.en.300.bin.gz

In [None]:
# Load the fastText model file "cc.en.300.bin" from the working directory
# (see the commented download commands in the previous cell).
fasttext_model = fasttext.load_model("cc.en.300.bin")

In [None]:
def get_title_embedding_fasttext(title):
    """Return the mean fastText word vector of a book title.

    Parameters
    ----------
    title : str
        Raw title text; it is split on whitespace. Non-string values (for
        example the float NaN pandas produces for an empty CSV cell) are
        treated as a title without words instead of raising AttributeError.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the word vectors returned by the module-level
        `fasttext_model`, or a zero vector of length 300 when the title
        contains no words.
    """
    words = title.split() if isinstance(title, str) else []
    if not words:
        # No tokens to average: fall back to an all-zero embedding.
        return np.zeros(300)
    embeddings = [fasttext_model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)

In [None]:
# One averaged title vector per row of `dataset`, collected into a Python list.
title_embeddings = dataset['Book-Title'].apply(get_title_embedding_fasttext).tolist()

In [None]:
# Count the titles that produce no whitespace-separated tokens at all.
empty_titles_count = sum(1 for title in dataset['Book-Title'] if not title.split())

print(f"빈 문자열이거나 토큰화된 단어가 없는 책 제목의 개수: {empty_titles_count}")

In [None]:
# Sanity check: report every entry that is not a NumPy array of shape (300,).
for idx, emb in enumerate(title_embeddings):
    if not isinstance(emb, np.ndarray) or emb.shape != (300,):
        print(f"Index: {idx}, Title: {dataset['Book-Title'][idx]}, Embedding: {emb}")

In [None]:
from sklearn.decomposition import PCA

# Project the title vectors down to 20 components; `pca` keeps the fitted projection.
pca = PCA(n_components=20)
title_embeddings_array = np.array(title_embeddings)
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

In [None]:
# Inspect the shape and contents of the PCA-reduced title embeddings.
print("reduced_title_embeddings.shape >>> ",reduced_title_embeddings.shape)
print(reduced_title_embeddings)

### Publisher

In [None]:
### Publisher

# Integer-encode the raw publisher labels. NOTE: this rebinds `le`, so the encoder
# fitted on 'Location' is no longer reachable after this cell.
le = LabelEncoder()
dataset['Publisher_encoded'] = le.fit_transform(dataset['Publisher'])

# Size the lookup table from the fitted encoder instead of a hard-coded 3689, so an
# extra publisher in the data cannot trigger an out-of-range index error.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=10)

# NOTE(review): the embedding table is randomly initialised, never trained, and not
# seeded, so these 10-d vectors change on every kernel restart.
publisher_codes = torch.tensor(dataset['Publisher_encoded'].values, dtype=torch.long)
# Looking up a 1-D index tensor yields (n_rows, 10) directly; the previous
# unsqueeze/squeeze round-trip collapsed the array when the frame had a single row.
publisher_embeddings = embedding_layer(publisher_codes).detach().numpy()

In [None]:
# Inspect the publisher embedding array. The variable is `publisher_embeddings`
# (lower-case); the capitalised name used before raised NameError.
print("Publisher_embeddings.shape >>> ",publisher_embeddings.shape)
print(publisher_embeddings)

### User-ID & Book-ID

In [None]:
### User-ID & Book-ID

# Distinct raw user IDs (83256 in the original data).
unique_user_ids = dataset['User-ID'].unique().tolist()
# Distinct raw book IDs (243441 in the original data).
unique_book_ids = dataset['Book-ID'].unique().tolist()

# Table of raw user IDs.
unique_user_ids_df = pd.DataFrame(unique_user_ids, columns=['User-ID'])
# Sort ascending by raw ID.
sorted_unique_user_ids_df = unique_user_ids_df.sort_values(by='User-ID', ascending=True)
# Expose the index as the node-id column. The index is NOT renumbered after the
# sort, so UserNodeID is the position of first appearance in `dataset`, not the
# sorted rank (unlike BookNodeID below). It is still a unique 0..n_users-1 id.
sorted_unique_user_ids_df.reset_index(inplace=True, drop=False)
sorted_unique_user_ids_df.rename(columns={'index': 'UserNodeID'}, inplace=True)

# Table of raw book IDs.
unique_book_ids_df = pd.DataFrame(unique_book_ids, columns=['Book-ID'])
# Sort ascending by raw ID.
sorted_unique_book_ids_df = unique_book_ids_df.sort_values(by='Book-ID', ascending=True)
# Renumber 0..n_books-1 in sorted order, then expose that index as the node-id column.
sorted_unique_book_ids_df.reset_index(inplace=True, drop=True)
sorted_unique_book_ids_df.reset_index(inplace=True, drop=False)
sorted_unique_book_ids_df.rename(columns={'index': 'BookNodeID'}, inplace=True)

# Book node ids start right after the last user node id. The offset is derived from
# the data (it was the literal 83256) so the two ranges can never overlap.
sorted_unique_book_ids_df['BookNodeID'] += len(unique_user_ids)

# UserNodeID: 0 .. n_users-1                     (0 .. 83255 in the original data)
# BookNodeID: n_users .. n_users+n_books-1       (83256 .. 326696 in the original data)

In [None]:
def make_ID_dict(df):
    """Build a lookup from the values of the second column to those of the first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the node ids and whose second column
        holds the raw ids (the layout of the sorted id tables built above).

    Returns
    -------
    dict
        ``{second_column_value: first_column_value}``. When a key occurs more
        than once, the last row wins.
    """
    # Explicit positional column access avoids the deprecated integer-key lookup on a
    # labelled row Series, and zipping whole columns avoids a per-row iterrows() loop.
    return dict(zip(df.iloc[:, 1], df.iloc[:, 0]))

# raw id -> node id lookups for users and books.
UserNodeID_dict = make_ID_dict(sorted_unique_user_ids_df)
BookNodeID_dict = make_ID_dict(sorted_unique_book_ids_df)

####### Apply the mapping
# NOTE: this overwrites the raw id columns in place, so re-running the cell maps
# already-mapped values through the dictionaries a second time.
dataset['User-ID'] = dataset['User-ID'].map(UserNodeID_dict)
dataset['Book-ID'] = dataset['Book-ID'].map(BookNodeID_dict)

## node feature

In [None]:
# Number of distinct user / book node ids after mapping.
num_user_nodes = len(dataset['User-ID'].unique().tolist())
num_book_nodes = len(dataset['Book-ID'].unique().tolist())

# Distinct node ids, in order of first appearance in `dataset`.
user_ids = dataset['User-ID'].unique().tolist()  # de-duplicated
book_ids = dataset['Book-ID'].unique().tolist()

In [None]:
# Counts derived from the largest node ids (book ids are offset by the user count).
num_users = max(user_ids) + 1
num_books = max(book_ids) - num_user_nodes + 1

feature_dim = 30  # age (1) + location (29)  # title (20) + publisher (10)

# Zero-initialised feature tables, one row per user node / per book node.
user_features = np.zeros((num_user_nodes, feature_dim))
book_features = np.zeros((num_book_nodes, feature_dim))

In [None]:
# Build the node feature table. 'User-ID' / 'Book-ID' in `dataset` are assumed to
# already hold the integer node ids assigned by the mapping step.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Each feature group is standardised on its own. `scaler` is refit every time, so
# after this cell it only remembers the statistics of the publisher embeddings.
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)
title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)

# The scaled arrays have one row per *rating*, not per node. Zipping them against
# the de-duplicated id lists paired the i-th distinct node with the i-th rating row,
# which belongs to an unrelated node. Take, for every node, the first rating row in
# which that node actually appears.
user_node_ids, user_first_rows = np.unique(dataset['User-ID'].to_numpy(), return_index=True)
user_features[user_node_ids] = np.hstack(
    [age_tensor_scaled[user_first_rows], location_embeddings_scaled[user_first_rows]]
)

book_node_ids, book_first_rows = np.unique(dataset['Book-ID'].to_numpy(), return_index=True)
# Book node ids start at num_user_nodes; shift them to index into book_features.
book_features[book_node_ids - num_user_nodes] = np.hstack(
    [title_embeddings_scaled[book_first_rows], publisher_embeddings_scaled[book_first_rows]]
)

# Stack user rows on top of book rows so that the row index equals the node id.
node_features = np.vstack((user_features, book_features))


# Graph Split - Node base

In [None]:
# Total number of nodes (rows of node_features).
num_nodes = len(node_features)

# Split the rating rows into a train part and a test part.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Apply oversampling to the train split only.
from sklearn.utils import resample
n_samples = 60000
# One frame per rating value 0..10; each is resampled with replacement to n_samples rows.
df_list = [train_data[train_data['Book-Rating'] == i] for i in range(11)]
resampled_df_list = [resample(df, replace=True, n_samples=n_samples, random_state=42) for df in df_list]
resampled_train_data = pd.concat(resampled_df_list)

# Distinct node ids occurring in each split.
train_user_ids = np.unique(resampled_train_data['User-ID'].values)
train_book_ids = np.unique(resampled_train_data['Book-ID'].values)
test_user_ids = np.unique(test_data['User-ID'].values)
test_book_ids = np.unique(test_data['Book-ID'].values)

# Concatenate user and book ids per split.
train_node_ids = np.concatenate((train_user_ids, train_book_ids))
test_node_ids = np.concatenate((test_user_ids, test_book_ids))

# Convert to tensors.
train_idx = torch.tensor(train_node_ids, dtype=torch.long)
test_idx = torch.tensor(test_node_ids, dtype=torch.long)

# Create the masks; every entry starts as False.
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Set the entries at the extracted node ids to True, so each mask marks the nodes
# that occur in that split. A node that appears in both splits is True in both masks.
train_mask[train_node_ids] = True
test_mask[test_node_ids] = True

## Weight & Target

In [None]:
# Ratings as a float column vector, one row per row of `dataset`; `y` is an independent copy.
# NOTE(review): these follow the row order of the un-resampled `dataset`, whereas
# edge_index further down is built from `combined_ratings` (resampled train + test),
# so rows here do not line up with edge columns there — confirm/realign.
edge_attr = torch.tensor(dataset['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)
y = edge_attr.clone()

## edge_index

In [None]:
# Oversampled train rows followed by the test rows, re-indexed 0..n-1.
combined_ratings = pd.concat([resampled_train_data, test_data]).reset_index(drop=True)

In [None]:
# Edge list: row 0 holds the user node id and row 1 the book node id, with one
# column per row of `combined_ratings`.
edge_index = torch.tensor(combined_ratings[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous()

# Rebuild the edge weights / targets from the same frame so that row k of edge_attr
# and y describes column k of edge_index. The earlier versions were taken from the
# un-resampled `dataset`, which has a different length and row order.
edge_attr = torch.tensor(combined_ratings['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)
y = edge_attr.clone()

edge_index # torch.Size([2, 871393])

In [None]:
# Check the class distribution again after the split.

test_data_rating_counts = test_data['Book-Rating'].value_counts()
print(test_data_rating_counts)

resampled_train_data_rating_counts = resampled_train_data['Book-Rating'].value_counts()
print(resampled_train_data_rating_counts)

# Build Graph

In [None]:
# Build the graph object.
# `node_feature_matrix` was never defined (NameError); create it from the NumPy
# feature table, whose row index equals the node id.
node_feature_matrix = torch.tensor(node_features, dtype=torch.float)

# The masks are passed to the constructor, so no separate assignment is needed.
data = Data(x=node_feature_matrix,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=y,
            train_mask=train_mask,
            test_mask=test_mask)

In [None]:
# Neighbour samplers over the full edge list: seed nodes come from train_idx / test_idx,
# two hops with fan-outs 5 and 3, 32 seed nodes per batch; only the train loader shuffles.
train_loader = NeighborSampler(data.edge_index, node_idx=train_idx, sizes=[5, 3], batch_size=32, shuffle=True, num_nodes=data.num_nodes)
test_loader = NeighborSampler(data.edge_index, node_idx=test_idx, sizes=[5, 3], batch_size=32, shuffle=False, num_nodes=data.num_nodes)

# GraphSAGE

In [None]:
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

class WeightedSAGEConv(MessagePassing):
    """Mean-aggregating message-passing layer whose messages are scaled by an edge attribute.

    Node features pass through one linear layer, self-loops are added to the
    edge list, and every neighbour message is multiplied by its edge attribute
    (when one is supplied) before the per-node mean is taken.
    """
    def __init__(self, in_channels, out_channels):
        # aggr='mean': incoming messages are averaged per target node.
        super(WeightedSAGEConv, self).__init__(aggr='mean')
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index, edge_attr=None):
        """Transform `x` linearly and propagate it along `edge_index`.

        `edge_attr` is optional; when given it is re-indexed and extended for
        the added self-loops before being handed to `message`.
        """
        row, col = edge_index
        if edge_attr is not None:
            # NOTE(review): this indexes edge_attr with the values in `col`, i.e. with
            # node indices taken from edge_index, not with edge positions. If edge_attr
            # holds one row per edge, entry k becomes the attribute of edge number
            # col[k] rather than of edge k — confirm this is intended.
            edge_attr = edge_attr[col]  # select only the edge_attr for adjacent nodes
        # NOTE(review): presumably the self-loop entries appended to edge_attr receive
        # add_self_loops' default fill value — verify against the installed PyG version.
        edge_index, edge_attr = add_self_loops(edge_index, edge_attr, num_nodes=x.size(0))
        x = self.lin(x)
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)


    def message(self, x_j, edge_attr):
        """Return the source-node features, scaled per edge when `edge_attr` is given."""
        if edge_attr is not None:
            return x_j * edge_attr.view(-1, 1)
        else:
            return x_j

    def update(self, aggr_out):
        """Return the aggregated messages unchanged."""
        return aggr_out

In [None]:
class GraphSAGERegressor(torch.nn.Module):
    """Two parallel stacks of WeightedSAGEConv layers ("user" and "book" towers).

    Both towers receive the same input features and the same per-layer edge
    lists. Hidden layers are followed by batch norm, the chosen activation and
    dropout; the final layer of each tower is squashed to the range (0, 10)
    with ``sigmoid(.) * 10``.

    Parameters
    ----------
    in_channels : int
        Size of the input node features.
    hidden_channels : int
        Width of every hidden layer.
    out_channels : int
        Width of the final layer of each tower.
    dropout : float
        Dropout probability applied after each hidden layer.
    activation_name : str
        One of 'ReLU', 'ReLU6', 'LeakyReLU', 'PReLU', 'ELU', 'SiLU'.
    num_layers : int, default 2
        Number of conv layers per tower. A first and a last layer are always
        created, so values below 2 behave like 2.

    Raises
    ------
    ValueError
        If `activation_name` is not one of the supported names.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, dropout, activation_name, num_layers=2):
        super(GraphSAGERegressor, self).__init__()

        # Select the activation function
        activations = torch.nn.ModuleDict([
            ['ReLU', torch.nn.ReLU()],
            ['ReLU6', torch.nn.ReLU6()],
            ['LeakyReLU', torch.nn.LeakyReLU()],
            ['PReLU', torch.nn.PReLU()],
            ['ELU', torch.nn.ELU()],
            ['SiLU', torch.nn.SiLU()]
        ])
        # Check membership before the lookup: indexing with an unknown name raised
        # KeyError, so the intended ValueError below could never be reached.
        if activation_name not in activations:
            raise ValueError(f"Unsupported activation function: {activation_name}")
        self.activation = activations[activation_name]

        self.user_convs = torch.nn.ModuleList()
        self.book_convs = torch.nn.ModuleList()
        # One BatchNorm per hidden layer; the same instance is applied to both towers.
        self.batch_norms = torch.nn.ModuleList()

        # Add the first layer
        self.user_convs.append(WeightedSAGEConv(in_channels, hidden_channels))
        self.book_convs.append(WeightedSAGEConv(in_channels, hidden_channels))
        self.batch_norms.append(torch.nn.BatchNorm1d(hidden_channels))

        # Add intermediate layers
        for _ in range(num_layers - 2):
            self.user_convs.append(WeightedSAGEConv(hidden_channels, hidden_channels))
            self.book_convs.append(WeightedSAGEConv(hidden_channels, hidden_channels))
            self.batch_norms.append(torch.nn.BatchNorm1d(hidden_channels))

        # Add the last layer
        self.user_convs.append(WeightedSAGEConv(hidden_channels, out_channels))
        self.book_convs.append(WeightedSAGEConv(hidden_channels, out_channels))

        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, x, edge_index_list, edge_attr_list=None):
        """Run both towers and return ``(user_emb, book_emb)``.

        Parameters
        ----------
        x : torch.Tensor
            Node features fed to both towers.
        edge_index_list : list
            Edge lists; hidden layer ``i`` uses ``edge_index_list[i]`` and the
            final layer uses ``edge_index_list[-1]``.
        edge_attr_list : list or None
            Edge attributes parallel to `edge_index_list`; ``None`` disables
            edge weighting for every layer.

        Returns
        -------
        tuple of torch.Tensor
            Final-layer outputs of the user and the book tower, each passed
            through ``sigmoid(.) * 10``.
        """
        if edge_attr_list is None:
            edge_attr_list = [None] * len(edge_index_list)

        user_x = x
        book_x = x

        # Hidden layers: conv -> batch norm -> activation -> dropout, for each tower.
        for i in range(len(self.user_convs) - 1):
            user_x = self.user_convs[i](user_x, edge_index_list[i], edge_attr_list[i])
            user_x = self.batch_norms[i](user_x)
            user_x = self.activation(user_x)
            user_x = self.dropout(user_x)

            book_x = self.book_convs[i](book_x, edge_index_list[i], edge_attr_list[i])
            book_x = self.batch_norms[i](book_x)
            book_x = self.activation(book_x)
            book_x = self.dropout(book_x)

        # Output layers, squashed to (0, 10).
        user_emb = self.user_convs[-1](user_x, edge_index_list[-1], edge_attr_list[-1])
        user_emb = torch.sigmoid(user_emb) * 10

        book_emb = self.book_convs[-1](book_x, edge_index_list[-1], edge_attr_list[-1])
        book_emb = torch.sigmoid(book_emb) * 10

        return user_emb, book_emb

In [None]:
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Baseline hyperparameters; in_channels must equal the node feature width (feature_dim).
in_channels = 30
hidden_channels = 32
out_channels = 1
activation_name = 'ReLU'
dropout =  0.1

model = GraphSAGERegressor(in_channels, hidden_channels, out_channels, dropout, activation_name, num_layers=3)
model = model.to(device)  # Move only the model; the data stays off the device to save device memory.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.5)

# Train / Test

In [None]:
!nvidia-smi

In [None]:
def train(train_loader, optimizer):
    """Run one training epoch over `train_loader` and return the mean batch loss.

    Uses the module-level `model`, `data`, `criterion` and `device`.

    Parameters
    ----------
    train_loader : NeighborSampler
        Yields ``(batch_size, n_id, adjs)`` triples.
    optimizer : torch.optim.Optimizer
        Optimizer that is stepped once per batch; it must have been built over
        the parameters of the module-level `model` for the step to have any effect.

    Returns
    -------
    float
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    # Loop-invariant: copy the edge attributes to the device once, not once per batch.
    edge_attr_on_device = data.edge_attr.to(device)
    for batch_size, n_id, adjs in train_loader:
        torch.cuda.empty_cache()
        adjs = [adj.to(device) for adj in adjs]  # Move adjs to device
        edge_index_list = [adj.edge_index for adj in adjs]
        # NOTE(review): the full-graph edge_attr is reused for every sampled layer
        # instead of the attributes of the sampled edges — confirm.
        edge_attr_list = [edge_attr_on_device for _ in adjs]  # Reuse edge_attr

        optimizer.zero_grad()
        # The model returns a (user_emb, book_emb) tuple. Slicing the tuple itself with
        # [:batch_size] did not select the seed nodes (and failed to unpack when
        # batch_size == 1); slice each tensor instead — seed nodes come first in n_id.
        user_emb, book_emb = model(data.x[n_id].to(device), edge_index_list, edge_attr_list)
        predictions = (user_emb[:batch_size] * book_emb[:batch_size]).sum(dim=-1)  # Compute predicted ratings
        # Flatten the (batch, 1) targets so they match predictions element-for-element
        # instead of being broadcast against them inside the loss.
        # NOTE(review): data.y holds one value per rating row but is indexed with node
        # ids here — confirm this is the intended target.
        targets = data.y[n_id[:batch_size]].to(device).view(-1)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def test(test_loader):
    """Evaluate the module-level `model` on `test_loader` and return the mean batch loss.

    Uses the module-level `model`, `data`, `criterion` and `device`. No
    gradients are recorded.

    Parameters
    ----------
    test_loader : NeighborSampler
        Yields ``(batch_size, n_id, adjs)`` triples.

    Returns
    -------
    float
        Sum of the batch losses divided by the number of batches.
    """
    model.eval()
    total_loss = 0
    # Loop-invariant: copy the edge attributes to the device once, not once per batch.
    edge_attr_on_device = data.edge_attr.to(device)
    # Evaluation only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for batch_size, n_id, adjs in test_loader:
            adjs = [adj.to(device) for adj in adjs]  # Move adjs to device
            edge_index_list = [adj.edge_index for adj in adjs]
            # NOTE(review): the full-graph edge_attr is reused for every sampled layer
            # instead of the attributes of the sampled edges — confirm.
            edge_attr_list = [edge_attr_on_device for _ in adjs]

            # The model returns a (user_emb, book_emb) tuple. Slicing the tuple itself
            # with [:batch_size] did not select the seed nodes (and failed to unpack
            # when batch_size == 1); slice each tensor instead.
            user_emb, book_emb = model(data.x[n_id].to(device), edge_index_list, edge_attr_list)
            predictions = (user_emb[:batch_size] * book_emb[:batch_size]).sum(dim=-1)  # Compute predicted ratings
            # Flatten the (batch, 1) targets so they match predictions element-for-element.
            # NOTE(review): data.y holds one value per rating row but is indexed with
            # node ids here — confirm this is the intended target.
            targets = data.y[n_id[:batch_size]].to(device).view(-1)
            loss = criterion(predictions, targets)
            total_loss += loss.item()
    return total_loss / len(test_loader)


# Bayesian Hyperparameter Optimization

In [None]:
def objective(trial):
    """Optuna objective: train a fresh GraphSAGERegressor for 10 epochs.

    Samples the hyperparameters from `trial`, builds the model, optimizer and
    neighbour samplers, and alternates train()/test() for 10 epochs. The best
    state dict and the model object are stored as trial user attributes.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to store user attributes.

    Returns
    -------
    float
        Lowest test loss observed over the 10 epochs.
    """
    # `deepcopy` is not provided by the notebook's import cell (it raised NameError).
    from copy import deepcopy

    # train()/test() always run the module-level `model`. Bind the trial's network to
    # that name; otherwise the optimizer below would step a network that never sees
    # a gradient while the unrelated global model does the forward passes.
    global model

    # Sample the hyperparameters.
    in_channels = node_features.shape[1]
    hidden_channels = trial.suggest_int('hidden_channels', 32, 128)
    out_channels = 1
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128])
    activation_name = trial.suggest_categorical('activation_name', ['ReLU', 'LeakyReLU', 'PReLU', 'ELU', 'SiLU'])
    optimizer_name = trial.suggest_categorical('optimizer_name', ['AdamW', 'Adam', 'RMSprop', 'Adagrad'])

    # Build the model (default num_layers=2) and a fresh optimizer for this trial.
    model = GraphSAGERegressor(in_channels, hidden_channels, out_channels, dropout, activation_name).to(device)

    optimizer_class = getattr(torch.optim, optimizer_name)
    # A new optimizer instance per trial guarantees no state leaks between trials.
    optimizer_instance = optimizer_class(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Samplers rebuilt with the sampled batch size.
    train_loader = NeighborSampler(data.edge_index, node_idx=train_idx, sizes=[5,3], batch_size=batch_size, shuffle=True, num_nodes=data.num_nodes)
    test_loader = NeighborSampler(data.edge_index, node_idx=test_idx, sizes=[5, 3], batch_size=batch_size, shuffle=False, num_nodes=data.num_nodes)

    # Train / evaluate loop, keeping track of the best epoch.
    best_test_loss = float('inf')
    for epoch in range(1, 11):
        train_loss = train(train_loader, optimizer_instance)
        test_loss = test(test_loader)

        if test_loss < best_test_loss:
            best_test_loss = test_loss
            # Snapshot the weights; the stored model object itself keeps training afterwards.
            best_model_weights = deepcopy(model.state_dict())
            trial.set_user_attr('best_model_weights', best_model_weights)
            trial.set_user_attr('best_model', model)
    tqdm.write(f'Trial {trial.number} - Test Loss: {best_test_loss:.4f}')

    return best_test_loss

In [None]:
# Run the hyperparameter search (3 trials), minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)
best_trial = study.best_trial

print(f"Best trial: {best_trial.number}, Test Loss: {best_trial.value}")
print(f"Best hyperparameters: {best_trial.params}")

# Keep the best hyperparameters.
best_params = best_trial.params

# Fetch the model object stored by the best trial.
best_model = best_trial.user_attrs['best_model']

# Save the best trial's weight snapshot to disk.
best_model_weights = best_trial.user_attrs['best_model_weights']
torch.save(best_model_weights, 'best_model_weights.pth')

In [None]:
# Load the saved weights.
loaded_weights = torch.load('best_model_weights.pth')

# Print the parameter names and sizes of best_model's state dict ...
print("final_model's state_dict:")
for param_tensor in best_model.state_dict():
    print(param_tensor, "\t", best_model.state_dict()[param_tensor].size())

# ... and of the weights read back from disk, for a side-by-side comparison.
print("\nLoaded weights:")
for param_tensor in loaded_weights:
    print(param_tensor, "\t", loaded_weights[param_tensor].size())

# Train with Best Parameter

In [None]:
# Constructor arguments for the final model: tuned values plus the fixed channel sizes.
model_params = {key: best_params[key] for key in ['hidden_channels', 'dropout', 'activation_name']}
model_params['out_channels'] = 1
model_params['in_channels'] = 30

# Rebuild the network (default num_layers) and restore the best trial's weights.
best_model = GraphSAGERegressor(**model_params).to(device)
best_model.load_state_dict(torch.load('best_model_weights.pth'))

# Optimizer of the tuned type over best_model's parameters.
optimizer_name = best_params['optimizer_name']
optimizer_class = getattr(torch.optim, optimizer_name)
lr = best_params['lr']
weight_decay = best_params['weight_decay']
optimizer = optimizer_class(best_model.parameters(), lr=lr, weight_decay=weight_decay)

# Samplers rebuilt with the tuned batch size.
batch_size = best_params['batch_size']
train_loader = NeighborSampler(data.edge_index, node_idx=train_idx, sizes=[5, 3], batch_size=batch_size, shuffle=True, num_nodes=data.num_nodes)
test_loader = NeighborSampler(data.edge_index, node_idx=test_idx, sizes=[5, 3], batch_size=batch_size, shuffle=False, num_nodes=data.num_nodes)

In [None]:
# Load the saved weights.
loaded_weights = torch.load('best_model_weights.pth')

# Print the parameter names and sizes of best_model's state dict ...
print("best_model's state_dict:")
for param_tensor in best_model.state_dict():
    print(param_tensor, "\t", best_model.state_dict()[param_tensor].size())

# ... and of the weights read back from disk, for a side-by-side comparison.
print("\nLoaded weights:")
for param_tensor in loaded_weights:
    print(param_tensor, "\t", loaded_weights[param_tensor].size())

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

num_epochs = 100  # Number of epochs to train for.
patience = 10  # Early-stopping patience (epochs without improvement).
min_delta = 0.001  # Minimum decrease in test loss that counts as an improvement.
best_test_loss = float('inf')
epochs_no_improve = 0

# train()/test() run whatever network the module-level name `model` refers to, while
# `optimizer` was built over best_model's parameters. Point `model` at best_model so
# the network being optimised is the one that is actually trained and evaluated.
model = best_model

# Reduce the learning rate by 10x after 5 epochs without test-loss improvement.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

best_model.train()
for epoch in tqdm(range(1, num_epochs + 1)):
    train_loss = train(train_loader, optimizer)
    test_loss = test(test_loader)

    # Update the learning-rate scheduler with the latest test loss.
    scheduler.step(test_loss)

    print(f"Epoch {epoch}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

    # Early-stopping check.
    if test_loss < best_test_loss - min_delta:
        best_test_loss = test_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

In [None]:
# Persist best_model's current weights (its state when the training loop ended).
torch.save(best_model.state_dict(), 'final_trained_model_weights.pth')

# Persist the constructor arguments so the architecture can be rebuilt at inference time.
with open('final_trained_model_params.json', 'w') as f:
    json.dump(model_params, f)

# Inference
 모델이 처음 보는 유저 또는 도서가 있는 경우, 해당 유저 또는 도서의 노드 특성을 생성하고 기존 그래프 데이터에 추가해야 함!

In [None]:
# Load the preprocessed inference table.
# NOTE(review): this path is under ./data/ while the training csv is read from ./ — confirm.
test_df = pd.read_csv("./data/prepro_test_data.csv")

In [None]:
# Rebuild the raw feature groups for the inference rows (this overwrites the
# training-time variables of the same names).
age_tensor = torch.tensor(test_df['Age'].values, dtype=torch.float32).unsqueeze(1)

# NOTE(review): `le` is refit on the test labels and a new randomly initialised
# Embedding is created, so the location codes and vectors are unrelated to the ones
# the model was trained on — the training encoder/embedding would have to be kept.
test_df['Location_encoded'] = le.fit_transform(test_df['Location'])
embedding_layer = Embedding(num_embeddings=151, embedding_dim=29)
location_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(test_df['Location_encoded'].values, dtype=torch.long), dim=1))
location_embeddings = location_embeddings.detach().numpy().squeeze()

title_embeddings = test_df['Book-Title'].apply(get_title_embedding_fasttext).tolist()
title_embeddings_array = np.array(title_embeddings)
# Reuse the projection fitted on the training titles. Refitting PCA on the test
# titles produced components in a different basis from the one the model was trained on.
reduced_title_embeddings = pca.transform(title_embeddings_array)

# NOTE(review): same caveat as for Location — refit encoder and fresh random Embedding.
test_df['Publisher_encoded'] = le.fit_transform(test_df['Publisher'])
embedding_layer = Embedding(num_embeddings=3689, embedding_dim=10)
publisher_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(test_df['Publisher_encoded'].values, dtype=torch.long), dim=1))
publisher_embeddings = publisher_embeddings.detach().numpy().squeeze()

In [None]:
new_unique_user_ids = test_df['User-ID'].unique().tolist()
new_unique_book_ids = test_df['Book-ID'].unique().tolist()

# Continue numbering after the highest node index already in use. This was the
# literal 326696, which is only right for one specific training set.
last_used_index = max(max(UserNodeID_dict.values()), max(BookNodeID_dict.values()))

# Assign an index to every user id that has not been seen before.
for new_user_id in new_unique_user_ids:
    if new_user_id not in UserNodeID_dict:
        last_used_index += 1
        UserNodeID_dict[new_user_id] = last_used_index

# Assign an index to every book id that has not been seen before.
for new_book_id in new_unique_book_ids:
    if new_book_id not in BookNodeID_dict:
        last_used_index += 1
        BookNodeID_dict[new_book_id] = last_used_index

# Map the raw ids to node ids (overwrites the raw id columns in place).
test_df['User-ID'] = test_df['User-ID'].map(UserNodeID_dict)
test_df['Book-ID'] = test_df['Book-ID'].map(BookNodeID_dict)

In [None]:
# One feature row per known node (training nodes plus newly indexed test nodes);
# the row index is the node id.
new_unique_node_count = len(UserNodeID_dict) + len(BookNodeID_dict)
new_feature_matrix = np.zeros((new_unique_node_count, feature_dim))

user_ids = test_df['User-ID'].unique().tolist()  # de-duplicated
book_ids = test_df['Book-ID'].unique().tolist()

# 'User-ID' / 'Book-ID' are assumed to already hold integer node ids.
# Each feature group is standardised on its own.
# NOTE(review): the scaler is refit on the inference rows, so the scaling differs
# from the one used for training — the training-time statistics were not kept.
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)
title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)

# The scaled arrays have one row per rating row, not per node; zipping them against
# the de-duplicated id lists assigned rows of unrelated nodes. Use, for every node,
# the first row in which that node actually appears.
user_node_ids, user_first_rows = np.unique(test_df['User-ID'].to_numpy(), return_index=True)
new_feature_matrix[user_node_ids] = np.hstack(
    [age_tensor_scaled[user_first_rows], location_embeddings_scaled[user_first_rows]]
)

book_node_ids, book_first_rows = np.unique(test_df['Book-ID'].to_numpy(), return_index=True)
# Index by the book node id itself: this matrix covers all nodes, so subtracting
# num_user_nodes (as done for the separate book table during training) wrote the
# book rows over the user rows.
new_feature_matrix[book_node_ids] = np.hstack(
    [title_embeddings_scaled[book_first_rows], publisher_embeddings_scaled[book_first_rows]]
)

new_node_feature_matrix = torch.tensor(new_feature_matrix, dtype=torch.float)

In [None]:
# Read back the constructor arguments saved after training.
with open('final_trained_model_params.json', 'r') as f:
    loaded_model_params = json.load(f)
    
# Rebuild the architecture and restore the trained weights.
final_model = GraphSAGERegressor(**loaded_model_params).to(device)

final_model.load_state_dict(torch.load('final_trained_model_weights.pth'))

# Switch to evaluation mode for inference.
final_model.eval()

In [None]:
# Load the saved weights.
loaded_weights = torch.load('final_trained_model_weights.pth')

# Print the parameter names and sizes of final_model's state dict. The sizes were
# previously read from best_model, so the printout did not describe final_model.
print("final_model's state_dict:")
for param_tensor in final_model.state_dict():
    print(param_tensor, "\t", final_model.state_dict()[param_tensor].size())

# ... and of the weights read back from disk, for a side-by-side comparison.
print("\nLoaded weights:")
for param_tensor in loaded_weights:
    print(param_tensor, "\t", loaded_weights[param_tensor].size())

In [None]:
# Edge list of the inference rows: row 0 = user node id, row 1 = book node id.
new_edge_index = torch.tensor(test_df[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous()
# Graph holding the inference features and edges (no edge attributes, no targets).
new_data = Data(x=torch.tensor(new_feature_matrix, dtype=torch.float), edge_index=new_edge_index)

In [None]:
# The loader wraps a single graph, so it yields exactly one batch.
new_data_loader = DataLoader([new_data], batch_size=16, shuffle=False)

In [None]:
predictions = []

for batch in new_data_loader:
    batch = batch.to(device)
    with torch.no_grad():
        # The model returns a (user_emb, book_emb) tuple; indexing the tuple with a
        # tensor raised TypeError. Unpack it and index each embedding separately.
        # The single edge list is used by every layer of the model.
        user_emb, book_emb = final_model(batch.x, [batch.edge_index])
        edge_index_row, edge_index_col = batch.edge_index
        # Score each edge: user-tower output at the user node times book-tower output
        # at the book node, summed over the output channels.
        edge_predictions = user_emb[edge_index_row] * book_emb[edge_index_col]
        predictions.append(edge_predictions.sum(dim=-1).cpu().numpy())

# Combine the predictions
predictions = np.concatenate(predictions)