In [None]:
import torch
# import torch.nn as nn
from torch import nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset, DataLoader, Batch
from torch_geometric.nn import SAGEConv, GAE
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import random_split
from torch.nn import Embedding
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api

In [None]:
# Default to the CPU and switch to the GPU only when CUDA reports one.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

In [None]:
# Load the preprocessed training table from the notebook's working directory.
train_data = pd.read_csv(filepath_or_buffer="./prepro_train_data.csv")

# Make Graph
## Category Embedding

### Age

In [None]:
# Age as a float32 column vector: one row per row of train_data, one feature column.
age_tensor = torch.tensor(train_data['Age'].values, dtype=torch.float32)[:, None]

In [None]:
# Sanity check: report the shape first, then the tensor contents.
print("age_tensor.shape >>> ", age_tensor.size())
print(age_tensor)

### Location

In [None]:
### Location

# Encode every distinct location value as an integer id in 0..num_classes-1.
le = LabelEncoder()
train_data['Location_encoded'] = le.fit_transform(train_data['Location'])

# Size the lookup table from the fitted encoder instead of a hard-coded count,
# so an encoded id can never fall outside the table when the data changes.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=29)

# Indexing with a 1-D id tensor yields one 29-dim row per input row directly,
# so no unsqueeze/squeeze round trip is needed (a bare squeeze() would also
# collapse the row axis if the table ever had a single row).
# NOTE: the vectors are the layer's initial weights; nothing in this cell
# trains them, and no random seed is set here.
location_ids = torch.tensor(train_data['Location_encoded'].values, dtype=torch.long)
location_embeddings = embedding_layer(location_ids).detach().numpy()

In [None]:
# Sanity check: report the shape first, then the embedding values.
print("location_embeddings.shape >>> ", np.shape(location_embeddings))
print(location_embeddings)

### Book-Title

In [None]:
import fasttext

# # 사전 훈련된 FastText 모델 다운로드
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# !gunzip cc.en.300.bin.gz

In [None]:
# Load the FastText binary model "cc.en.300.bin" from the current working directory.
# NOTE(review): presumably the file produced by the commented-out wget/gunzip
# commands in the cell above — confirm it has been downloaded before running.
fasttext_model = fasttext.load_model("cc.en.300.bin")

In [None]:
def get_title_embedding_fasttext(title):
    """Return a 300-dimensional embedding for a book title.

    The title is split on whitespace and the FastText vector of every token
    (looked up in the module-level ``fasttext_model``) is averaged.

    Args:
        title: The book title. Missing values (``None`` / NaN, which is what
            ``pd.read_csv`` produces for empty cells) are treated as an empty
            title; any other non-string value is converted with ``str()``.

    Returns:
        A NumPy array of shape ``(300,)``: all zeros when the title has no
        tokens, otherwise the element-wise mean of the token vectors.
    """
    if not isinstance(title, str):
        # A missing title is a float NaN (or None) and has no .split().
        title = "" if pd.isna(title) else str(title)

    words = title.split()
    if len(words) == 0:
        return np.zeros(300)
    embeddings = [fasttext_model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)

In [None]:
# One averaged FastText vector per row of train_data, kept as a plain Python list.
title_embeddings = [
    get_title_embedding_fasttext(book_title)
    for book_title in train_data['Book-Title']
]

In [None]:
# Count the titles that produce no tokens at all.
# Empty CSV cells are read by pandas as NaN (a float), which has no .split();
# they are counted as empty here instead of raising AttributeError.
empty_titles_count = sum(
    1
    for title in train_data['Book-Title']
    if pd.isna(title) or not str(title).split()
)

print(f"빈 문자열이거나 토큰화된 단어가 없는 책 제목의 개수: {empty_titles_count}")

In [None]:
# Report every title whose embedding is not a NumPy vector of shape (300,).
expected_shape = (300,)
for idx, emb in enumerate(title_embeddings):
    looks_valid = isinstance(emb, np.ndarray) and emb.shape == expected_shape
    if looks_valid:
        continue
    print(f"Index: {idx}, Title: {train_data['Book-Title'][idx]}, Embedding: {emb}")

In [None]:
from sklearn.decomposition import PCA

# Stack the per-title vectors into one 2-D array (one row per title), then
# project them onto the first 20 principal components.
title_embeddings_array = np.array(title_embeddings)
pca = PCA(20)
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

In [None]:
# Sanity check: report the shape first, then the PCA-reduced title vectors.
print("reduced_title_embeddings.shape >>> ", np.shape(reduced_title_embeddings))
print(reduced_title_embeddings)

### Publisher

In [None]:
### Publisher

# Encode every distinct publisher value as an integer id in 0..num_classes-1.
le = LabelEncoder()
train_data['Publisher_encoded'] = le.fit_transform(train_data['Publisher'])

# Size the lookup table from the fitted encoder instead of a hard-coded count,
# so an encoded id can never fall outside the table when the data changes.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=10)

# Indexing with a 1-D id tensor yields one 10-dim row per input row directly,
# so no unsqueeze/squeeze round trip is needed (a bare squeeze() would also
# collapse the row axis if the table ever had a single row).
# NOTE: the vectors are the layer's initial weights; nothing in this cell
# trains them, and no random seed is set here.
publisher_ids = torch.tensor(train_data['Publisher_encoded'].values, dtype=torch.long)
publisher_embeddings = embedding_layer(publisher_ids).detach().numpy()

In [None]:
# Sanity check: report the shape first, then the embedding values.
# The variable is `publisher_embeddings` (lower-case p); the capitalised name
# used previously was never defined and raised NameError.
print("publisher_embeddings.shape >>> ", publisher_embeddings.shape)
print(publisher_embeddings)

### User-ID & Book-ID

In [None]:
### User-ID & Book-ID

# Distinct raw ids, in order of first appearance in train_data.
# (The original author's notes record 83256 users and 243441 books.)
unique_user_ids = train_data['User-ID'].unique().tolist()
unique_book_ids = train_data['Book-ID'].unique().tolist()

# User lookup table: one row per distinct user, sorted ascending by raw id.
# The first reset_index discards the pre-sort positions; the second exposes the
# fresh 0..N-1 positions as a leading column, which becomes the user node id.
# Resulting column order is [UserNodeID, User-ID]; make_ID_dict reads the
# columns by position, so this order must be kept.
unique_user_ids_df = pd.DataFrame(unique_user_ids, columns=['User-ID'])
sorted_unique_user_ids_df = (
    unique_user_ids_df
    .sort_values(by='User-ID', ascending=True)
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={'index': 'UserNodeID'})
)

# Book lookup table, built the same way; column order is [BookNodeID, Book-ID].
unique_book_ids_df = pd.DataFrame(unique_book_ids, columns=['Book-ID'])
sorted_unique_book_ids_df = (
    unique_book_ids_df
    .sort_values(by='Book-ID', ascending=True)
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={'index': 'BookNodeID'})
)

# Book node ids start right after the last user node id. The offset is derived
# from the data rather than hard-coded so it always equals the number of user
# nodes, whatever the dataset size.
sorted_unique_book_ids_df['BookNodeID'] += len(unique_user_ids)

# UserNodeID: 0 .. len(unique_user_ids) - 1
# BookNodeID: len(unique_user_ids) .. len(unique_user_ids) + len(unique_book_ids) - 1

In [None]:
def make_ID_dict(df):
    """Build a lookup dict from a two-column id table.

    Args:
        df: DataFrame whose first column holds the value to map to and whose
            second column holds the key to map from.

    Returns:
        dict mapping each entry of the second column to the entry of the first
        column on the same row. When a key occurs on several rows, the last
        row wins.
    """
    # Select columns by position with .iloc: plain row[0] / row[1] on a
    # label-indexed row is deprecated positional access, and iterating rows
    # would also coerce both columns to one common dtype.
    return dict(zip(df.iloc[:, 1], df.iloc[:, 0]))

# Raw id -> node id lookups for users and books.
UserNodeID_dict = make_ID_dict(sorted_unique_user_ids_df)
BookNodeID_dict = make_ID_dict(sorted_unique_book_ids_df)

####### Apply the mapping
# NOTE: this overwrites the raw id columns in place. Re-running the cell on an
# already-mapped train_data looks node ids up in dictionaries keyed by raw ids.
for id_column, id_lookup in (('User-ID', UserNodeID_dict), ('Book-ID', BookNodeID_dict)):
    train_data[id_column] = train_data[id_column].map(id_lookup)

## edge_index

In [None]:
# One (user node, book node) pair per row of train_data, transposed so the
# result has two rows: user node ids on top, book node ids below.
edge_pairs = train_data[['User-ID', 'Book-ID']].values
edge_index = torch.tensor(edge_pairs, dtype=torch.long).transpose(0, 1).contiguous()

In [None]:
# Bare expression: the notebook displays the edge tensor as this cell's output.
edge_index

## node feature

In [None]:
# Total number of user nodes and book nodes (one node per distinct raw id).
num_user_nodes, num_book_nodes = len(unique_user_ids), len(unique_book_ids)

# Per-row node ids as plain Python lists, in train_data row order.
user_ids = train_data["User-ID"].to_numpy().tolist()
book_ids = train_data["Book-ID"].to_numpy().tolist()

In [None]:
# Feature width per node type.
user_feature_dim = 30  # age (1) + location (29)
book_feature_dim = 30  # title (20) + publisher (10)

# Start from zeros; a node that is never written keeps an all-zero vector.
user_features = np.zeros(shape=(num_user_nodes, user_feature_dim))
book_features = np.zeros(shape=(num_book_nodes, book_feature_dim))

# Walk the interaction rows once. A user that appears on several rows is
# written several times and ends up with the values of its last row.
for uid, age_vec, location_vec in zip(user_ids, age_tensor, location_embeddings):
    user_features[uid] = np.concatenate((age_vec, location_vec))

# Book node ids are offset by the number of user nodes, so subtract that offset
# to get the row inside book_features.
for bid, title_vec, publisher_vec in zip(book_ids, reduced_title_embeddings, publisher_embeddings):
    book_row = bid - num_user_nodes
    book_features[book_row] = np.concatenate((title_vec, publisher_vec))

# Users first, then books, matching the node id layout.
node_features = np.concatenate((user_features, book_features), axis=0)