In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from torch.nn import Embedding
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
import tqdm
import networkx as nx
from torch_geometric.utils import from_networkx
from torch_geometric.loader import NeighborSampler
from torch_geometric.utils import negative_sampling
import networkx as nx
from torch_geometric.utils import to_networkx

import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling
from torch.nn import Embedding

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
dataset = pd.read_csv("./data/prepro_train_data.csv")
test_df = pd.read_csv("./data/prepro_test_data.csv")
submit = pd.read_csv("./data/sample_submission.csv")

# Make Graph Component

In [4]:
age_tensor = torch.tensor(dataset['Age'].values, dtype=torch.float32).unsqueeze(1)

### Location
# LabelEncoder를 사용해 위치 정보를 정수로 변환
le = LabelEncoder()
dataset['Location_encoded'] = le.fit_transform(dataset['Location'])

# 임베딩 레이어 초기화
embedding_layer = Embedding(num_embeddings=151, embedding_dim=79)

# 위치 정보를 10차원 벡터로 변환
location_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(dataset['Location_encoded'].values, dtype=torch.long), dim=1))

# 결과를 NumPy array로 변환
location_embeddings = location_embeddings.detach().numpy().squeeze()


In [5]:
import fasttext
# FastText 모델 로드
fasttext_model = fasttext.load_model("cc.en.300.bin")

def get_title_embedding_fasttext(title):
    if not isinstance(title, str):
        title = ""
    words = title.split()
    if len(words) == 0:
        return np.zeros(300)
    embeddings = [fasttext_model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)
title_embeddings = dataset['Book-Title'].apply(get_title_embedding_fasttext).tolist()
dataset['Book-Title_encoded'] = dataset['Book-Title'].apply(get_title_embedding_fasttext)



In [6]:
from sklearn.decomposition import PCA

# PCA 객체를 생성
pca = PCA(n_components=50)

# 평균 임베딩 벡터로 구성된 리스트를 NumPy 배열로 변환
title_embeddings_array = np.array(title_embeddings)

# 차원 축소
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

In [7]:
### Publisher
# LabelEncoder를 사용해 위치 정보를 정수로 변환
le = LabelEncoder()
dataset['Publisher_encoded'] = le.fit_transform(dataset['Publisher'])

# 임베딩 레이어 초기화
embedding_layer = Embedding(num_embeddings=3689, embedding_dim=30)

# 위치 정보를 10차원 벡터로 변환
publisher_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(dataset['Publisher_encoded'].values, dtype=torch.long), dim=1))

# 결과를 NumPy array로 변환
publisher_embeddings = publisher_embeddings.detach().numpy().squeeze()

### User-ID & Book-ID

In [8]:
combined_df = pd.concat([dataset, test_df])

unique_user_ids_num = combined_df['User-ID'].nunique()
unique_book_ids_num = combined_df['Book-ID'].nunique()

In [9]:
### User-ID & Book-ID

# User-ID 열의 unique한 값들을 리스트로 만들기  > 83256
unique_user_ids = combined_df['User-ID'].unique().tolist()
# Book-ID 열의 unique한 값들을 리스트로 만들기 > 243441
unique_book_ids = combined_df['Book-ID'].unique().tolist()

# unique_user_ids 리스트를 기반으로 DataFrame 생성
unique_user_ids_df = pd.DataFrame(unique_user_ids, columns=['User-ID'])
# 내림차순 정렬
sorted_unique_user_ids_df = unique_user_ids_df.sort_values(by='User-ID', ascending=True)
# # 인덱스를 새 column으로 추가
sorted_unique_user_ids_df.reset_index(inplace=True, drop=False)
sorted_unique_user_ids_df.rename(columns={'index': 'UserNodeID'}, inplace=True)

# unique_book_ids 리스트를 기반으로 DataFrame 생성
unique_book_ids_df = pd.DataFrame(unique_book_ids, columns=['Book-ID'])
# 내림차순 정렬
sorted_unique_book_ids_df = unique_book_ids_df.sort_values(by='Book-ID', ascending=True)
# #인덱스를 새 column으로 추가
sorted_unique_book_ids_df.reset_index(inplace=True, drop=True)
sorted_unique_book_ids_df.reset_index(inplace=True, drop=False)
sorted_unique_book_ids_df.rename(columns={'index': 'BookNodeID'}, inplace=True)


# # book_ids 인덱스 시작 번호를 83256으로 설정
sorted_unique_book_ids_df['BookNodeID'] += (unique_user_ids_num)
# UserNodeID  0 ~ 83255 ....인데 test 데이터셋의 uniuqe 숫자까지 고려하면 92102.
# BookNodeID   83256(/////92101) ~ 326696

In [10]:
def make_ID_dict(df):
    ID_dict = {}
    for index, row in df.iterrows():
        ID_dict[row[1]] = row[0]
    return ID_dict

UserNodeID_dict = make_ID_dict(sorted_unique_user_ids_df)
BookNodeID_dict = make_ID_dict(sorted_unique_book_ids_df)

####### 매핑 진행
dataset['User-ID'] = dataset['User-ID'].map(UserNodeID_dict)
dataset['Book-ID'] = dataset['Book-ID'].map(BookNodeID_dict)

## node feature


In [11]:
user_ids = dataset['User-ID'].unique().tolist()  # 유니크 처리함
book_ids = dataset['Book-ID'].unique().tolist()

feature_dim = 80  # age (1) + location (29)  # title (20) + publisher (10)
feature_matrix = np.zeros((unique_user_ids_num+unique_book_ids_num, feature_dim))

In [12]:
from sklearn.preprocessing import StandardScaler

# user_id와 book_id는 이미 정수로 매핑되어 있다고 가정
scaler = StandardScaler()

# 스케일링을 개별적으로 수행
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)

for user_id, age, location in zip(user_ids, age_tensor_scaled, location_embeddings_scaled):
    feature_matrix[user_id] = np.concatenate([age, location], axis=0)

title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)

for book_id, title, publisher in zip(book_ids, title_embeddings_scaled, publisher_embeddings_scaled):
    feature_matrix[book_id] = np.concatenate([title, publisher], axis=0)
    
# 사용자 노드 행렬과 도서 노드 행렬을 세로로 연결하여 최종 node_features 행렬 생성:    
# node_features = np.vstack((user_features, book_features))

In [13]:
from torch_geometric.data import Data

node_feature_matrix = torch.tensor(feature_matrix, dtype=torch.float)
edge_attr = torch.tensor(dataset['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)
edge_index = torch.tensor(dataset[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous()

data = Data(x=node_feature_matrix,
            edge_index=edge_index,
            edge_attr=edge_attr)

In [14]:
train_user_ids = np.unique(dataset['User-ID'].values)
train_book_ids = np.unique(dataset['Book-ID'].values)
train_node_ids = np.concatenate((train_user_ids, train_book_ids))
train_idx = torch.tensor(train_node_ids, dtype=torch.long)

# GraphSAGE

In [15]:
# import torch
# from torch.nn import functional as F

# def nce_loss(pos_out, neg_out, neg_sample_size):
#     pos_out = pos_out.view(-1).unsqueeze(0)
#     neg_out = neg_out.view(-1).unsqueeze(0)
    
#     out = torch.cat((pos_out, neg_out), 1)
#     out = F.log_softmax(out * 100, dim=1)
#     return -out[0][0]

# '''
# 네, nce_loss 함수와 현재 코드에서의 손실 계산 방식은 서로 다릅니다. 두 방법 모두 Noise Contrastive Estimation (NCE) 손실을 계산하려고 하지만, 구체적인 구현 방식과 계산 식에는 차이가 있습니다.
# nce_loss 함수: 이 함수는 positive output과 negative output을 concatenate하고, log softmax 함수를 이용하여 확률을 계산합니다. 그 후 첫 번째 값 (positive sample에 대한 예측)에 대한 negative log likelihood를 반환합니다. 이 방식은 마치 multi-class classification 문제를 푸는 것처럼 모든 negative sample을 동시에 고려합니다. 이 때, out * 100는 softmax 함수의 특성 때문에 temperature를 낮추는 역할을 합니다. 이렇게 하면 모델이 더욱 확신 있는 예측을 하도록 돕습니다.
# 현재 코드에서의 손실 계산: 현재 코드에서는 각 positive sample과 negative sample에 대해 binary cross entropy (BCE) loss를 계산합니다. positive sample에 대한 손실은 -log(sigmoid(pos_score)) 이며, negative sample에 대한 손실은 -log(1 - sigmoid(neg_score)) 입니다. 이 방식은 각 sample에 대해 binary classification 문제를 푸는 것처럼 각 negative sample을 독립적으로 고려합니다.
# 두 방법 모두 NCE 손실을 계산하는 방법이지만, 구체적인 구현 방식에 차이가 있습니다. 어떤 방식이 더 좋은지는 데이터와 문제에 따라 다를 수 있으므로, 두 방법을 모두 실험해 보고 성능을 비교하는 것이 좋습니다.
# '''

In [16]:
# # 제대로 작동하는 베이스라인

import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling
from torch.nn import Embedding

class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        return x
    
def train(data):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GraphSAGE(data.num_node_features, 128, 64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    data = data.to(device)

    model.train()
    for epoch in range(6):
        optimizer.zero_grad()
        z = model(data.x, data.edge_index)  # Obtain node embeddings

        # Negative sampling
        edge_index_pos = data.edge_index
        edge_index_neg = negative_sampling(edge_index_pos, num_nodes=data.num_nodes,
                                           num_neg_samples=edge_index_pos.size(1))

        pos_loss = (1 - torch.sigmoid((z[edge_index_pos[0]] * z[edge_index_pos[1]]).sum(dim=-1))).mean()
        neg_loss = torch.sigmoid((z[edge_index_neg[0]] * z[edge_index_neg[1]]).sum(dim=-1)).mean()
        loss = -torch.log(pos_loss) - torch.log(1 - neg_loss)

        loss.backward()
        optimizer.step()

    return model


###### baseline과 함께 쓰는 cpu 연산 함수

import numpy as np
def negative_sampling_with_centrality(edge_index, num_nodes, num_neg_samples, centrality):
    pos_edges = edge_index.cpu().numpy()
    pos_edges = pos_edges[:, pos_edges[0] < pos_edges[1]] # Ensure edges are one-way
    pos_edges_set = set(map(tuple, pos_edges.T))

    num_samples = edge_index.size(1) * num_neg_samples
    neg_edges = np.empty((2, num_samples), dtype=np.int64)
    centrality = centrality / centrality.sum()

    idx = 0
    while idx < num_samples:
        tmp_edges = np.random.choice(num_nodes, size=(2, num_samples-idx), p=centrality)
        tmp_edges = tmp_edges[:, tmp_edges[0] < tmp_edges[1]]
        for edge in tmp_edges.T:
            if tuple(edge) not in pos_edges_set:
                neg_edges[:, idx] = edge
                idx += 1

    return torch.from_numpy(neg_edges).to(edge_index.device)

In [17]:
import gc
gc.collect()

0

In [18]:
model = train(data)   #가중치 반영과 부정/긍정 반영 완료 

In [19]:
model.eval()
with torch.no_grad():
    x = data.x.to(torch.device('cuda'))
    edge_index = data.edge_index.to(torch.device('cuda'))
    embeddings = model(x, edge_index).cpu().detach().numpy()

In [21]:
embeddings

array([[ 0.14259125,  1.2785361 , -3.712126  , ..., -3.4537666 ,
         5.567064  ,  1.5775338 ],
       [ 0.14259125,  1.2785361 , -3.712126  , ..., -3.4537666 ,
         5.567064  ,  1.5775338 ],
       [ 0.14259125,  1.2785361 , -3.712126  , ..., -3.4537666 ,
         5.567064  ,  1.5775338 ],
       ...,
       [ 1.0620803 ,  1.3408318 , -1.8986593 , ..., -2.6062376 ,
         2.9764261 ,  1.2410023 ],
       [-0.14292783, -0.02004437, -0.847361  , ..., -0.89263856,
         1.2784622 , -0.04630674],
       [ 1.260303  ,  1.3715726 , -2.707858  , ..., -3.469562  ,
         3.6128674 ,  1.1688728 ]], dtype=float32)

In [None]:
# np.save("embeddings.npy", embeddings)

실험이 잘 끝나면 

1. 배치 정규화 & dropout 추가 (임베딩 잘 되는지 한번 더 확인)
2.edge_attr 추가 (임베딩 잘 되는지 한번 더 확인)
3. 하이퍼 파라미터 튜닝
3. 오버샘플링 추가 (성능 차이 있는지 확인)


# LGBM 

In [20]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [22]:
df = dataset[["Age","Location_encoded","Year-Of-Publication","Publisher_encoded"]]
df_np = df.to_numpy()
book_title_embeddings = np.stack(dataset["Book-Title_encoded"].values)
df_emb = np.concatenate((df_np, book_title_embeddings), axis=1) # features_emb는 edge_index와 완전히 같은 정렬 순서를 가짐
edge_index = edge_index.cpu().numpy()
# 이제 여기에 embeddings에서 적절한 값들을 concat하면 된다.

# edge_index[0] 리스트를 순회하며, 해당 값으로 embeddings를 인덱싱
user_embeddings_selected = embeddings[edge_index[0]]

# 인덱싱된 값을 그대로 df_emb의 오른편에 concat
df_emb = np.concatenate((df_emb, user_embeddings_selected), axis=1)

# edge_index[1] 리스트를 순회하며, 해당 값으로 embeddings를 인덱싱
book_embeddings_selected = embeddings[edge_index[1]]

# 인덱싱된 값을 그대로 df_emb의 오른편에 concat
df_emb = np.concatenate((df_emb, book_embeddings_selected), axis=1)

In [23]:
# Generate features and targets
X = df_emb
y = data.edge_attr.cpu().numpy()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [27]:
# 상위 5개 파라미터 세트 출력하는 버전 

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import lightgbm as lgb

# Define the search space for hyperparameters
space = {
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 1, 7, 1),
    'max_depth': hp.quniform('max_depth', 5, 10, 1),
    'force_col_wise': hp.choice('force_col_wise', [True]),
}

# Define the objective function
def objective(params):
    params = {
        'device': 'gpu',
        'num_leaves': int(params['num_leaves']),
        'learning_rate': params['learning_rate'],
        'feature_fraction': params['feature_fraction'],
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'max_depth': int(params['max_depth']),
        'force_col_wise': params['force_col_wise'],
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'verbose': 0
    }

    # This is a 5-fold cross-validation
    cv_results = lgb.cv(params, train_data, num_boost_round=500, nfold=5, 
                        early_stopping_rounds=50, metrics='rmse', seed=42)
    # Hyperopt will try to minimize loss (it always minimizes the objective)
    loss = min(cv_results['rmse-mean'])
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

# Run the algorithm
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

# Print the best parameters
print("Best parameters:")
print(best)

# Print top 5 hyperparameter sets
sorted_trials = sorted(trials.results, key=lambda x: x['loss'])
print("Top 5 hyperparameter sets:")
for t in sorted_trials[:5]:
    print(f"Loss: {t['loss']}, Params: {t['params']}")


  0%|                                                                   | 0/10 [00:00<?, ?trial/s, best loss=?]





 10%|████                                     | 1/10 [01:37<14:38, 97.63s/trial, best loss: 3.6349934294627273]




 20%|████████▏                                | 2/10 [02:45<10:43, 80.40s/trial, best loss: 3.6349934294627273]




 30%|████████████▎                            | 3/10 [03:57<08:54, 76.37s/trial, best loss: 3.6349934294627273]




 40%|████████████████▍                        | 4/10 [05:48<08:59, 89.85s/trial, best loss: 3.6349934294627273]




 50%|████████████████████▌                    | 5/10 [07:29<07:50, 94.12s/trial, best loss: 3.6349934294627273]




 60%|████████████████████████▌                | 6/10 [08:19<05:16, 79.16s/trial, best loss: 3.6349934294627273]




 70%|████████████████████████████▋            | 7/10 [09:32<03:51, 77.12s/trial, best loss: 3.6349934294627273]




 80%|████████████████████████████████▊        | 8/10 [10:13<02:11, 65.50s/trial, best loss: 3.6349934294627273]




 90%|████████████████████████████████████▉    | 9/10 [12:38<01:30, 90.33s/trial, best loss: 3.6349934294627273]




100%|████████████████████████████████████████| 10/10 [14:31<00:00, 87.19s/trial, best loss: 3.6349934294627273]
Best parameters:
{'bagging_fraction': 0.9817448768721541, 'bagging_freq': 4.0, 'feature_fraction': 0.5210236827419366, 'force_col_wise': 0, 'learning_rate': 0.1000239348640537, 'max_depth': 10.0, 'num_leaves': 110.0}
Top 5 hyperparameter sets:
Loss: 3.6349934294627273, Params: {'device': 'gpu', 'num_leaves': 110, 'learning_rate': 0.1000239348640537, 'feature_fraction': 0.5210236827419366, 'bagging_fraction': 0.9817448768721541, 'bagging_freq': 4, 'max_depth': 10, 'force_col_wise': True, 'objective': 'regression', 'boosting_type': 'gbdt', 'verbose': 0}
Loss: 3.641698702048835, Params: {'device': 'gpu', 'num_leaves': 90, 'learning_rate': 0.088553583751713, 'feature_fraction': 0.5108296025455152, 'bagging_fraction': 0.9013082077762015, 'bagging_freq': 6, 'max_depth': 9, 'force_col_wise': True, 'objective': 'regression', 'boosting_type': 'gbdt', 'verbose': 0}
Loss: 3.648845261379

In [28]:
import pickle

# sorted_trials 변수 저장
with open("sorted_trials.pickle", "wb") as f:
    pickle.dump(sorted_trials, f)

In [29]:
print("done")

done


# train LGBMs for Amsemble

In [None]:
hyperparameters = []
n = 5
for i in sorted_trials[:n]:
    hyperparameters.append(i[params])

In [None]:
# Train LightGBM model
models = {}
# 각각의 하이퍼파라미터 세트에 대해 모델을 학습하고 저장
for i, params in enumerate(hyperparameters):
    gbm = lgb.train(params, train_data, num_boost_round=500, valid_sets=test_data,
                early_stopping_rounds=10, verbose_eval=False)
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # Calculate and print RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse:.4f}")
    models[i] = gbm

# 각 모델을 파일로 저장
for i, model in models.items():
    model.save_model(f'model_{i}.txt')

In [None]:
# X_df = pd.DataFrame(X)

# # Compute correlation matrix
# corr_matrix = X_df.corr()

# # Display correlation matrix
# print(corr_matrix)

In [None]:
models

# Inference

In [None]:
age_tensor = torch.tensor(test_df['Age'].values, dtype=torch.float32).unsqueeze(1)

test_df['Location_encoded'] = le.fit_transform(test_df['Location'])
embedding_layer = Embedding(num_embeddings=151, embedding_dim=79)
location_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(test_df['Location_encoded'].values, dtype=torch.long), dim=1))
location_embeddings = location_embeddings.detach().numpy().squeeze()

pca = PCA(n_components=50)
title_embeddings = test_df['Book-Title'].apply(get_title_embedding_fasttext).tolist()
title_embeddings_array = np.array(title_embeddings)
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

test_df['Publisher_encoded'] = le.fit_transform(test_df['Publisher'])
embedding_layer = Embedding(num_embeddings=3689, embedding_dim=30)
publisher_embeddings = embedding_layer(torch.unsqueeze(torch.tensor(test_df['Publisher_encoded'].values, dtype=torch.long), dim=1))
publisher_embeddings = publisher_embeddings.detach().numpy().squeeze()
        
# IDs 매핑 진행
test_df['User-ID'] = test_df['User-ID'].map(UserNodeID_dict)
test_df['Book-ID'] = test_df['Book-ID'].map(BookNodeID_dict)

In [None]:
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)

for user_id, age, location in zip(user_ids, age_tensor_scaled, location_embeddings_scaled):
    feature_matrix[user_id] = np.concatenate([age, location], axis=0)

title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)

for book_id, title, publisher in zip(book_ids, title_embeddings_scaled, publisher_embeddings_scaled):
    feature_matrix[book_id] = np.concatenate([title, publisher], axis=0)

new_node_feature_matrix = torch.tensor(feature_matrix, dtype=torch.float)

In [None]:
new_node_feature_matrix.shape

In [None]:
new_edge_index = torch.tensor(test_df[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous().to(device)
new_data = Data(x=new_node_feature_matrix, edge_index=new_edge_index).to(device)

In [None]:
# 임베딩 추출
model = train(data)
model.eval()
with torch.no_grad():
    x = new_data.x.to(torch.device('cuda'))
    edge_index = new_edge_index.to(torch.device('cuda'))
    test_embeddings = model(x, new_edge_index).cpu().detach().numpy()

In [None]:
test_embeddings

In [None]:
test_embeddings.shape

### 최종 예측을 위한 feature 합치기

In [None]:
test_df['Book-Title_encoded'] = test_df['Book-Title'].apply(get_title_embedding_fasttext)

In [None]:
new_edge_index

In [None]:
df = test_df[["Age","Location_encoded","Year-Of-Publication","Publisher_encoded"]]
df_np = df.to_numpy()
book_title_embeddings = np.stack(test_df["Book-Title_encoded"].values)
df_emb = np.concatenate((df_np, book_title_embeddings), axis=1) 
new_edge_index = new_edge_index.cpu().numpy()
# features_emb는 edge_index와 완전히 같은 정렬 순서를 가진다.

# edge_index[0] 리스트를 순회하며, 해당 값으로 embeddings를 인덱싱
user_embeddings_selected = test_embeddings[new_edge_index[0]]

# 인덱싱된 값을 그대로 df_emb의 오른편에 concat
df_emb = np.concatenate((df_emb, user_embeddings_selected), axis=1)

# edge_index[1] 리스트를 순회하며, 해당 값으로 embeddings를 인덱싱
book_embeddings_selected = test_embeddings[new_edge_index[1]]

# 인덱싱된 값을 그대로 df_emb의 오른편에 concat
df_emb = np.concatenate((df_emb, book_embeddings_selected), axis=1)

In [None]:
df_emb

In [None]:
df_emb.shape

In [None]:
print(book_index_start)

In [None]:
last_used_index

In [None]:
df_emb.dtype

In [None]:
models

In [None]:
new_predictions = []

for model in models.values():
    pred = model.predict(df_emb)
    new_predictions.append(pred)
    
# 앙상블: 예측값들의 평균을 구함
final_new_predictions = np.mean(new_predictions, axis=0)

In [None]:
final_new_predictions.shape

In [None]:
final_new_predictions.min()

In [None]:
final_new_predictions.max()

In [None]:
submit

In [None]:
submit['Book-Rating'] = final_new_predictions

In [None]:
submit

In [None]:
submit.to_csv('submit.csv', index=False)