In [1]:
import pandas as pd
import numpy as np
import torch 
from sklearn.preprocessing import LabelEncoder
from torch.nn import Embedding
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree

In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
# Load the preprocessed train table, the preprocessed test table and the
# submission template, in that order.
dataset, test_df, submit = map(
    pd.read_csv,
    (
        "./data/prepro_train_data.csv",
        "./data/prepro_test_data.csv",
        "./data/sample_submission.csv",
    ),
)

# Make Graph Component

In [4]:
# User age as a float32 column vector with one row per row of `dataset`.
age_tensor = torch.tensor(dataset['Age'].values, dtype=torch.float32).unsqueeze(1)

### Location
# Encode every distinct location value as an integer code.
le = LabelEncoder()
dataset['Location_encoded'] = le.fit_transform(dataset['Location'])

# Size the embedding table from the data (one row per distinct location) so an
# unexpected extra category cannot cause an out-of-range lookup.
# NOTE(review): no visible cell trains this layer, so the vectors below are its
# random initial weights -- confirm that this is intended.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=79)

# Look up one 79-dimensional vector per row. Indexing with a 1-D code tensor
# returns a (num_rows, 79) result directly, so no squeeze() is required
# (squeeze() would also drop the row axis of a single-row frame).
location_codes = torch.tensor(dataset['Location_encoded'].values, dtype=torch.long)
location_embeddings = embedding_layer(location_codes).detach().numpy()


In [5]:
import fasttext
# Load the FastText model from the local binary file.
fasttext_model = fasttext.load_model("cc.en.300.bin")

def get_title_embedding_fasttext(title):
    """Return the mean FastText word vector of ``title``.

    Non-string inputs are treated as an empty title. A title without any
    whitespace-separated token yields a 300-element zero vector.
    """
    tokens = title.split() if isinstance(title, str) else []
    if not tokens:
        return np.zeros(300)
    word_vectors = []
    for token in tokens:
        word_vectors.append(fasttext_model.get_word_vector(token))
    return np.mean(word_vectors, axis=0)
# One mean word vector per training row, collected into a plain list.
title_embeddings = dataset['Book-Title'].map(get_title_embedding_fasttext).tolist()



In [6]:
from sklearn.decomposition import PCA

# Project the title vectors onto 50 principal components.
# random_state makes the projection reproducible whenever scikit-learn selects
# a randomized SVD solver for an input of this size.
pca = PCA(n_components=50, random_state=42)

# Stack the per-row mean embedding vectors into a 2-D array.
title_embeddings_array = np.array(title_embeddings)

# Fit the projection on the training titles and reduce them.
reduced_title_embeddings = pca.fit_transform(title_embeddings_array)

In [7]:
### Publisher
# Encode every distinct publisher value as an integer code.
le = LabelEncoder()
dataset['Publisher_encoded'] = le.fit_transform(dataset['Publisher'])

# Size the embedding table from the data (one row per distinct publisher) so an
# unexpected extra category cannot cause an out-of-range lookup.
# NOTE(review): no visible cell trains this layer, so the vectors below are its
# random initial weights -- confirm that this is intended.
embedding_layer = Embedding(num_embeddings=len(le.classes_), embedding_dim=30)

# Look up one 30-dimensional vector per row. Indexing with a 1-D code tensor
# returns a (num_rows, 30) result directly, so no squeeze() is required.
publisher_codes = torch.tensor(dataset['Publisher_encoded'].values, dtype=torch.long)
publisher_embeddings = embedding_layer(publisher_codes).detach().numpy()

### User-ID & Book-ID

In [None]:
# Stack the train and test rows so the ID tables cover both splits.
combined_df = pd.concat([dataset, test_df], axis=0)

# Number of distinct users / books across both splits.
unique_user_ids_num, unique_book_ids_num = (
    combined_df[id_column].nunique() for id_column in ('User-ID', 'Book-ID')
)

In [8]:
### User-ID & Book-ID

# Distinct raw IDs across train + test, in order of first appearance.
unique_user_ids = combined_df['User-ID'].unique().tolist()
unique_book_ids = combined_df['Book-ID'].unique().tolist()

# One row per raw user ID, sorted ascending. The position after sorting becomes
# the graph node ID and is stored as the first column. The pre-sort index is
# discarded first so the node ID is the sorted rank, exactly as for books below.
unique_user_ids_df = pd.DataFrame(unique_user_ids, columns=['User-ID'])
sorted_unique_user_ids_df = (
    unique_user_ids_df
    .sort_values(by='User-ID', ascending=True)
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={'index': 'UserNodeID'})
)

# Same construction for books.
unique_book_ids_df = pd.DataFrame(unique_book_ids, columns=['Book-ID'])
sorted_unique_book_ids_df = (
    unique_book_ids_df
    .sort_values(by='Book-ID', ascending=True)
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={'index': 'BookNodeID'})
)

# Shift the book node IDs past the user range so both node types share one
# contiguous ID space: users occupy [0, unique_user_ids_num) and books start at
# unique_user_ids_num.
sorted_unique_book_ids_df['BookNodeID'] += unique_user_ids_num

In [9]:
def make_ID_dict(df):
    """Build a lookup from the values of the second column to the first column.

    Args:
        df: DataFrame whose first column holds the mapped value (node ID) and
            whose second column holds the key (raw ID).

    Returns:
        dict mapping each second-column value to the first-column value of the
        same row. When a key occurs more than once the last row wins. An empty
        frame yields an empty dict.
    """
    # Select the columns by position explicitly: integer keys on a row Series
    # with string labels (row[0], row[1]) rely on a deprecated positional
    # fallback, and iterrows() is slow and upcasts mixed dtypes.
    keys = df.iloc[:, 1].tolist()
    values = df.iloc[:, 0].tolist()
    return dict(zip(keys, values))

# Raw-ID -> node-ID lookups built from the two sorted ID tables.
UserNodeID_dict = make_ID_dict(sorted_unique_user_ids_df)
BookNodeID_dict = make_ID_dict(sorted_unique_book_ids_df)

# Replace the raw IDs in the training frame with their graph node IDs.
for id_column, node_id_lookup in (('User-ID', UserNodeID_dict), ('Book-ID', BookNodeID_dict)):
    dataset[id_column] = dataset[id_column].map(node_id_lookup)

## node feature


In [10]:
# Distinct node IDs that occur in the training rows, in order of first appearance.
user_ids = pd.unique(dataset['User-ID']).tolist()
book_ids = pd.unique(dataset['Book-ID']).tolist()

In [11]:
# Width of every node feature row:
#   user nodes: age (1) + location embedding (79)
#   book nodes: reduced title embedding (50) + publisher embedding (30)
feature_dim = 80

# One zero-initialised row per user node followed by one per book node.
feature_matrix = np.zeros(
    shape=(unique_user_ids_num + unique_book_ids_num, feature_dim)
)

In [12]:
from sklearn.preprocessing import StandardScaler

# A single scaler object is refitted for each feature group in turn.
scaler = StandardScaler()

# Standardise the per-row user features.
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)
user_features = np.hstack([age_tensor_scaled, location_embeddings_scaled])

# The scaled arrays hold one row per row of `dataset`, so the node ID of each
# feature row must be read from that same row. Zipping against the
# de-duplicated `user_ids` list would pair the i-th distinct user with the i-th
# rating row and write features to the wrong nodes. Each node takes the
# features of the first row in which it appears.
user_node_ids = dataset['User-ID'].to_numpy()
first_user_row = ~dataset['User-ID'].duplicated().to_numpy()
feature_matrix[user_node_ids[first_user_row]] = user_features[first_user_row]

# Standardise the per-row book features and write them the same way.
title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)
book_features = np.hstack([title_embeddings_scaled, publisher_embeddings_scaled])

book_node_ids = dataset['Book-ID'].to_numpy()
first_book_row = ~dataset['Book-ID'].duplicated().to_numpy()
feature_matrix[book_node_ids[first_book_row]] = book_features[first_book_row]

In [13]:
from torch_geometric.data import Data

# Node features: one float row per user/book node.
node_feature_matrix = torch.tensor(feature_matrix, dtype=torch.float)

# One edge per training row, stored as a (2, num_edges) tensor: row 0 holds the
# user node ID and row 1 the book node ID.
edge_endpoints = dataset[['User-ID', 'Book-ID']].values
edge_index = torch.tensor(edge_endpoints, dtype=torch.long).t().contiguous()

# Rating of each edge as a (num_edges, 1) float column.
ratings = dataset['Book-Rating'].values
edge_attr = torch.tensor(ratings, dtype=torch.float).unsqueeze(-1)

data = Data(x=node_feature_matrix, edge_index=edge_index, edge_attr=edge_attr)

In [14]:
# Move the graph to the selected device; the returned object is shown as the cell output.
data.to(device)

Data(x=[326697, 80], edge_index=[2, 871393], edge_attr=[871393, 1])

In [15]:
# Sorted distinct user and book node IDs seen in the training rows, joined into
# a single long tensor (users first, then books).
train_user_ids = np.unique(dataset['User-ID'].to_numpy())
train_book_ids = np.unique(dataset['Book-ID'].to_numpy())
train_node_ids = np.hstack((train_user_ids, train_book_ids))
train_idx = torch.tensor(train_node_ids, dtype=torch.long)

# GraphSAGE

In [16]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling
from torch.nn import Embedding

class GraphSAGE(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between.

    Args:
        in_channels: width of the input node features.
        hidden_channels: width of the intermediate representation.
        out_channels: width of the returned node embeddings.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return one ``out_channels``-wide embedding per node."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

In [17]:
def train(data, epochs=200, lr=0.01):
    """Train a GraphSAGE encoder with a link-prediction objective.

    Observed edges are positives; an equal number of negatives is drawn with
    ``negative_sampling`` on every epoch. An edge is scored by the dot product
    of its two endpoint embeddings.

    Args:
        data: graph object providing ``x``, ``edge_index``, ``num_nodes`` and
            ``num_node_features``.
        epochs: number of full-graph optimisation steps (default 200).
        lr: Adam learning rate (default 0.01).

    Returns:
        The trained GraphSAGE model, located on the CUDA device when one is
        available and on the CPU otherwise.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GraphSAGE(data.num_node_features, 128, 64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    data = data.to(device)
    logsigmoid = torch.nn.functional.logsigmoid

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        z = model(data.x, data.edge_index)  # node embeddings

        # Negative sampling: as many non-edges as there are observed edges.
        edge_index_pos = data.edge_index
        edge_index_neg = negative_sampling(edge_index_pos, num_nodes=data.num_nodes,
                                           num_neg_samples=edge_index_pos.size(1))

        pos_score = (z[edge_index_pos[0]] * z[edge_index_pos[1]]).sum(dim=-1)
        neg_score = (z[edge_index_neg[0]] * z[edge_index_neg[1]]).sum(dim=-1)

        # Binary cross-entropy on the edge scores: raise sigmoid(score) for
        # observed edges and lower it for sampled non-edges. The previous
        # positive term, -log(mean(1 - sigmoid(pos_score))), was minimised by
        # driving the scores of observed edges DOWN, i.e. it was inverted.
        # logsigmoid(-s) equals log(1 - sigmoid(s)) and stays finite when the
        # sigmoid saturates.
        loss = -logsigmoid(pos_score).mean() - logsigmoid(-neg_score).mean()

        loss.backward()
        optimizer.step()

    return model

In [None]:
# Train the GraphSAGE encoder on the training graph.
model = train(data)

In [18]:
# Extract one embedding per node with the trained encoder.
# Use the notebook-level `device` instead of a hard-coded 'cuda' so the cell
# also runs on CPU-only machines.
model.eval()
with torch.no_grad():
    x = data.x.to(device)
    edge_index = data.edge_index.to(device)
    embeddings = model(x, edge_index).cpu().numpy()

# LGBM 

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# No earlier visible cell stores the per-row title vectors on `dataset`, so
# create the column here when it is missing instead of failing with a KeyError.
if "Book-Title_encoded" not in dataset.columns:
    dataset["Book-Title_encoded"] = dataset["Book-Title"].apply(get_title_embedding_fasttext)

# Tabular columns and the title vectors, one row per training edge.
df = dataset[["Age", "Location_encoded", "Year-Of-Publication", "Publisher_encoded"]]
df_np = df.to_numpy()
book_title_embeddings = np.stack(dataset["Book-Title_encoded"].values)

# Read the edge list from `data` rather than rebinding `edge_index` from itself,
# so the cell can be re-run. The edges and the rows of `df_np` were both built
# from `dataset`, so they share the same order.
edge_index = data.edge_index.cpu().numpy()

# Node embeddings of the user (row 0) and book (row 1) endpoint of every edge.
user_embeddings_selected = embeddings[edge_index[0]]
book_embeddings_selected = embeddings[edge_index[1]]

# Final layout: tabular columns | title vector | user embedding | book embedding.
df_emb = np.concatenate(
    (df_np, book_title_embeddings, user_embeddings_selected, book_embeddings_selected),
    axis=1,
)

In [None]:
# Features and targets. The ratings are stored as an (N, 1) column on the
# graph; LightGBM and scikit-learn expect a flat 1-D label array.
X = df_emb
y = data.edge_attr.cpu().numpy().ravel()

# Hold out 20% of the edges for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM datasets; the validation set reuses the binning of the training set.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
# 상위 5개 파라미터 세트 출력하는 버전 

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import lightgbm as lgb

# Define the search space for hyperparameters
space = {
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 1, 7, 1),
    'max_depth': hp.quniform('max_depth', 5, 10, 1),
    'force_col_wise': hp.choice('force_col_wise', [True]),
}


def objective(params):
    """Score one sampled hyperparameter set with 5-fold cross-validated RMSE.

    Args:
        params: dict sampled from ``space``; quantised values arrive as floats
            and are cast to int where LightGBM requires integers.

    Returns:
        dict with the best mean CV RMSE under ``'loss'``, the LightGBM
        parameter dict under ``'params'`` and ``STATUS_OK`` under ``'status'``.
    """
    params = {
        'device': 'gpu',
        'num_leaves': int(params['num_leaves']),
        'learning_rate': params['learning_rate'],
        'feature_fraction': params['feature_fraction'],
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'max_depth': int(params['max_depth']),
        'force_col_wise': params['force_col_wise'],
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'verbose': 0
    }

    # 5-fold cross-validation. Early stopping is passed as a callback: the
    # `early_stopping_rounds` keyword was removed from lgb.cv in newer releases.
    cv_results = lgb.cv(params, train_data, num_boost_round=500, nfold=5,
                        metrics='rmse', seed=42,
                        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
    # The result key is 'rmse-mean' in older releases and 'valid rmse-mean' in
    # newer ones; accept either.
    rmse_key = next(key for key in cv_results if key.endswith('rmse-mean'))
    # Hyperopt always minimises the returned loss.
    loss = min(cv_results[rmse_key])
    return {'loss': loss, 'params': params, 'status': STATUS_OK}


# Run the search
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# Print the best parameters
print("Best parameters:")
print(best)

# Print top 5 hyperparameter sets
sorted_trials = sorted(trials.results, key=lambda x: x['loss'])
print("Top 5 hyperparameter sets:")
for t in sorted_trials[:5]:
    print(f"Loss: {t['loss']}, Params: {t['params']}")


In [None]:
# import pickle

# # sorted_trials 변수 저장
# with open("sorted_trials.pickle", "wb") as f:
#     pickle.dump(sorted_trials, f)

# Train LGBMs for Ensemble

In [None]:
# Train one LightGBM model per selected hyperparameter set.

# The ensemble uses the five best parameter sets found by the search;
# `hyperparameters` is not defined by any other visible cell.
hyperparameters = [trial['params'] for trial in sorted_trials[:5]]

models = {}
for i, params in enumerate(hyperparameters):
    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose_eval` keywords were removed from lgb.train in newer releases.
    gbm = lgb.train(params, train_data, num_boost_round=500, valid_sets=[test_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)])
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # Calculate and print RMSE on the held-out edges
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse:.4f}")
    models[i] = gbm

# Save every booster to its own file. The loop variable is deliberately not
# called `model`, so the GraphSAGE encoder bound to that name is not overwritten.
for i, gbm_model in models.items():
    gbm_model.save_model(f'model_{i}.txt')

# Inference

In [None]:
def _encode_like_train(train_values, train_codes, new_values):
    """Encode ``new_values`` with the integer codes already used on the train split.

    Categories that do not occur in the train split receive fresh codes that
    continue after the largest train code.

    Returns:
        (encoded int64 Series, total number of codes in use). The second value
        is a valid ``num_embeddings`` for an embedding over these codes.
    """
    code_by_value = dict(zip(train_values, train_codes))
    next_code = max(code_by_value.values(), default=-1) + 1
    for value in new_values.unique():
        if value not in code_by_value:
            code_by_value[value] = next_code
            next_code += 1
    return new_values.map(code_by_value).astype('int64'), int(next_code)


# Test-split user age as a float32 column vector.
age_tensor = torch.tensor(test_df['Age'].values, dtype=torch.float32).unsqueeze(1)

# Reuse the train-time category codes. Refitting a LabelEncoder on the test
# split would assign different integers to the same category, and these code
# columns are later fed to the LightGBM models as features.
test_df['Location_encoded'], num_location_codes = _encode_like_train(
    dataset['Location'], dataset['Location_encoded'], test_df['Location'])
# NOTE(review): this is a freshly initialised layer, so its random vectors are
# unrelated to the ones used for the training rows; keeping the train-time
# layer under its own name would be needed to make them comparable.
embedding_layer = Embedding(num_embeddings=num_location_codes, embedding_dim=79)
location_embeddings = embedding_layer(
    torch.tensor(test_df['Location_encoded'].values, dtype=torch.long)).detach().numpy()

# Project the test titles with the PCA that was fitted on the training titles;
# refitting on the test split would produce unrelated component axes.
title_embeddings = test_df['Book-Title'].apply(get_title_embedding_fasttext).tolist()
title_embeddings_array = np.array(title_embeddings)
reduced_title_embeddings = pca.transform(title_embeddings_array)

test_df['Publisher_encoded'], num_publisher_codes = _encode_like_train(
    dataset['Publisher'], dataset['Publisher_encoded'], test_df['Publisher'])
# NOTE(review): same caveat as for the location layer above.
embedding_layer = Embedding(num_embeddings=num_publisher_codes, embedding_dim=30)
publisher_embeddings = embedding_layer(
    torch.tensor(test_df['Publisher_encoded'].values, dtype=torch.long)).detach().numpy()

# Map the raw IDs to graph node IDs; the lookups were built from train + test,
# so every test ID has an entry.
test_df['User-ID'] = test_df['User-ID'].map(UserNodeID_dict)
test_df['Book-ID'] = test_df['Book-ID'].map(BookNodeID_dict)

In [None]:
# Standardise the per-row test features.
# NOTE(review): the scaler is refitted on the test rows because the single
# `scaler` object does not retain the per-feature statistics of the training
# split; separate train-fitted scalers would be more consistent.
age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
location_embeddings_scaled = scaler.fit_transform(location_embeddings)
title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)

test_user_features = np.hstack([age_tensor_scaled, location_embeddings_scaled])
test_book_features = np.hstack([title_embeddings_scaled, publisher_embeddings_scaled])


def _first_rows_of_unseen_nodes(test_nodes, train_nodes):
    """Boolean row mask: first test row of every node that has no training row."""
    return (~test_nodes.isin(train_nodes) & ~test_nodes.duplicated()).to_numpy()


# Work on a copy so the training feature rows are left untouched. The feature
# arrays hold one row per row of `test_df`, so node IDs are read from the same
# rows; zipping against the train-split `user_ids` / `book_ids` lists would
# overwrite training nodes with unrelated test rows. Only nodes without a
# training row are filled in.
test_feature_matrix = feature_matrix.copy()

new_user_rows = _first_rows_of_unseen_nodes(test_df['User-ID'], dataset['User-ID'])
test_feature_matrix[test_df['User-ID'].to_numpy()[new_user_rows]] = test_user_features[new_user_rows]

new_book_rows = _first_rows_of_unseen_nodes(test_df['Book-ID'], dataset['Book-ID'])
test_feature_matrix[test_df['Book-ID'].to_numpy()[new_book_rows]] = test_book_features[new_book_rows]

new_node_feature_matrix = torch.tensor(test_feature_matrix, dtype=torch.float)

In [None]:
# Display the shape of the inference-time node feature tensor.
new_node_feature_matrix.shape

In [None]:
# One edge per test row: row 0 of the tensor holds the user node ID and row 1
# the book node ID.
test_edge_endpoints = test_df[['User-ID', 'Book-ID']].values
new_edge_index = (
    torch.tensor(test_edge_endpoints, dtype=torch.long)
    .t()
    .contiguous()
    .to(device)
)

# Graph used for inference: the inference node features plus the test edges.
new_data = Data(x=new_node_feature_matrix, edge_index=new_edge_index).to(device)

In [None]:
# Extract node embeddings for the inference graph.
# Reuse the encoder that produced the train-time embeddings whenever `model`
# still refers to it. Retraining is only a fallback for the case where the name
# has been rebound to something else (e.g. by a loop over LightGBM boosters).
if not isinstance(model, torch.nn.Module):
    # NOTE(review): a retrained encoder yields a different embedding space than
    # the one the LightGBM models were fitted on.
    model = train(data)
model.eval()
with torch.no_grad():
    x = new_data.x.to(device)
    edge_index = new_data.edge_index.to(device)
    test_embeddings = model(x, edge_index).cpu().numpy()

### 최종 예측을 위한 feature 합치기

In [None]:
# Store the full-size mean title vector of every test row on the frame.
test_df['Book-Title_encoded'] = test_df['Book-Title'].map(get_title_embedding_fasttext)

In [None]:
# Tabular columns and the title vectors, one row per test edge.
df = test_df[["Age", "Location_encoded", "Year-Of-Publication", "Publisher_encoded"]]
df_np = df.to_numpy()
book_title_embeddings = np.stack(test_df["Book-Title_encoded"].values)

# Read the edge list from `new_data` rather than rebinding `new_edge_index`
# from itself, so the cell can be re-run. The edges and the rows of `df_np`
# were both built from `test_df`, so they share the same order.
new_edge_index = new_data.edge_index.cpu().numpy()

# Node embeddings of the user (row 0) and book (row 1) endpoint of every edge.
user_embeddings_selected = test_embeddings[new_edge_index[0]]
book_embeddings_selected = test_embeddings[new_edge_index[1]]

# Same column layout as the training matrix:
# tabular columns | title vector | user embedding | book embedding.
df_emb = np.concatenate(
    (df_np, book_title_embeddings, user_embeddings_selected, book_embeddings_selected),
    axis=1,
)

In [None]:
# Predict with every saved booster. A comprehension keeps the loop variable
# local, so the notebook-level name `model` is not rebound to a booster.
new_predictions = [gbm_model.predict(df_emb) for gbm_model in models.values()]

# Ensemble: average the per-model predictions.
final_new_predictions = np.mean(new_predictions, axis=0)

In [None]:
# Write the averaged predictions into the rating column of the submission frame.
submit['Book-Rating'] = final_new_predictions