In [None]:
# import_and_load_data.py
import pandas as pd
import numpy as np
import torch 
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from torch.nn import Embedding
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
import tqdm
import networkx as nx
from torch_geometric.utils import from_networkx
from torch_geometric.loader import NeighborSampler
from torch_geometric.utils import negative_sampling
import networkx as nx
from torch_geometric.utils import to_networkx
import torch
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling
from torch.nn import Embedding

def load_data():
    """Read the preprocessed train/test tables and the sample submission.

    Returns:
        tuple: ``(dataset, test_df, submit)`` -- the DataFrames read, in that
        order, from ``prepro_train_data.csv``, ``prepro_test_data.csv`` and
        ``sample_submission.csv`` under ``./data``.
    """
    csv_paths = (
        "./data/prepro_train_data.csv",
        "./data/prepro_test_data.csv",
        "./data/sample_submission.csv",
    )
    dataset, test_df, submit = (pd.read_csv(csv_path) for csv_path in csv_paths)
    return dataset, test_df, submit


In [None]:
# data_preprocessing.py
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from torch.nn import Embedding
import fasttext
from sklearn.decomposition import PCA

def preprocess_age(dataset):
    """Return the ``Age`` column as a float32 tensor with one column.

    Args:
        dataset: DataFrame with an ``Age`` column.

    Returns:
        torch.Tensor: float32 tensor of shape ``(len(dataset), 1)``.
    """
    ages = torch.tensor(dataset['Age'].values, dtype=torch.float32)
    # Indexing with None appends the trailing axis, same as unsqueeze(1).
    return ages[:, None]

def preprocess_location(dataset):
    """Label-encode ``Location`` and look up one embedding vector per row.

    Side effect: the integer codes are written to
    ``dataset['Location_encoded']``.

    The vectors come from a freshly constructed ``Embedding`` layer that is
    never trained here, so they are random initial weights: rows sharing a
    location share a vector, but the values differ between calls unless the
    torch RNG is seeded beforehand.

    Args:
        dataset: DataFrame with a ``Location`` column.

    Returns:
        np.ndarray: array of shape ``(len(dataset), 79)``.
    """
    le = LabelEncoder()
    dataset['Location_encoded'] = le.fit_transform(dataset['Location'])

    # The table used to be fixed at 151 rows, which raises IndexError as soon
    # as the data holds more distinct locations. Grow it only when required so
    # the layer is unchanged for data that already fit.
    vocab_size = max(151, len(le.classes_))
    embedding_layer = Embedding(num_embeddings=vocab_size, embedding_dim=79)

    codes = torch.tensor(dataset['Location_encoded'].values, dtype=torch.long)
    with torch.no_grad():
        # 1-D codes give an (n_rows, 79) result directly. The previous
        # unsqueeze/squeeze round trip collapsed a one-row input to (79,).
        location_embeddings = embedding_layer(codes)
    return location_embeddings.numpy()

# Loaded fastText models keyed by path, so the model file is read only once.
_FASTTEXT_MODELS = {}


def _get_fasttext_model(model_path):
    """Return the fastText model stored at ``model_path``, loading it on first use."""
    if model_path not in _FASTTEXT_MODELS:
        _FASTTEXT_MODELS[model_path] = fasttext.load_model(model_path)
    return _FASTTEXT_MODELS[model_path]


def get_title_embedding_fasttext(title, model_path="cc.en.300.bin"):
    """Embed a book title as the mean of its words' fastText vectors.

    The model used to be reloaded from disk on every call; because this
    function is applied once per row, that reload dominated the runtime. The
    model is now cached per path and is not loaded at all for blank titles.

    Args:
        title: Title text. Any non-string value (e.g. NaN) is treated as "".
        model_path: Path of the fastText binary model to use.

    Returns:
        np.ndarray: the mean word vector, or ``np.zeros(300)`` when the title
        contains no whitespace-separated words.
    """
    if not isinstance(title, str):
        title = ""
    words = title.split()
    if not words:
        return np.zeros(300)
    model = _get_fasttext_model(model_path)
    return np.mean([model.get_word_vector(word) for word in words], axis=0)

def preprocess_book_title(dataset):
    """Embed every ``Book-Title`` and reduce the embeddings to 50 dimensions.

    Side effect: the unreduced per-row vectors are stored in
    ``dataset['Book-Title_encoded']``.

    The titles used to be embedded twice (once for the column, once for the
    PCA input); they are now embedded once and the result is reused.

    Args:
        dataset: DataFrame with a ``Book-Title`` column.

    Returns:
        np.ndarray: PCA-reduced embeddings of shape ``(len(dataset), 50)``.
        The PCA is fitted on ``dataset`` itself, and scikit-learn requires at
        least 50 rows for ``n_components=50``.
    """
    title_vectors = dataset['Book-Title'].apply(get_title_embedding_fasttext)
    dataset['Book-Title_encoded'] = title_vectors

    pca = PCA(n_components=50)
    title_embeddings_array = np.array(title_vectors.tolist())
    return pca.fit_transform(title_embeddings_array)

def preprocess_publisher(dataset):
    """Label-encode ``Publisher`` and look up one embedding vector per row.

    Side effect: the integer codes are written to
    ``dataset['Publisher_encoded']``.

    The vectors come from a freshly constructed ``Embedding`` layer that is
    never trained here, so they are random initial weights: rows sharing a
    publisher share a vector, but the values differ between calls unless the
    torch RNG is seeded beforehand.

    Args:
        dataset: DataFrame with a ``Publisher`` column.

    Returns:
        np.ndarray: array of shape ``(len(dataset), 30)``.
    """
    le = LabelEncoder()
    dataset['Publisher_encoded'] = le.fit_transform(dataset['Publisher'])

    # The table used to be fixed at 3689 rows, which raises IndexError as soon
    # as the data holds more distinct publishers. Grow it only when required.
    vocab_size = max(3689, len(le.classes_))
    embedding_layer = Embedding(num_embeddings=vocab_size, embedding_dim=30)

    codes = torch.tensor(dataset['Publisher_encoded'].values, dtype=torch.long)
    with torch.no_grad():
        # 1-D codes give an (n_rows, 30) result directly. The previous
        # unsqueeze/squeeze round trip collapsed a one-row input to (30,).
        publisher_embeddings = embedding_layer(codes)
    return publisher_embeddings.numpy()


In [None]:
# node_id_mapping.py
import pandas as pd

def make_ID_dict(df):
    """Build a ``{second-column value: first-column value}`` lookup from ``df``.

    Columns are addressed by position with ``iloc``. The previous
    ``row[1]`` / ``row[0]`` form relied on integer keys falling back to
    positions on a label-indexed Series, which pandas has deprecated, and
    ``iterrows`` built a Series object per row.

    Args:
        df: DataFrame with at least two columns; column 0 holds the values
            and column 1 the keys.

    Returns:
        dict: mapping from each column-1 entry to the column-0 entry of the
        same row. When a key repeats, the last row wins.
    """
    return dict(zip(df.iloc[:, 1], df.iloc[:, 0]))

def map_node_ids(dataset, test_df, return_mappings=False):
    """Replace raw user/book IDs in ``dataset`` with contiguous graph node IDs.

    Node IDs are assigned over the union of ``dataset`` and ``test_df``:
    users get ``0 .. n_users - 1`` and books get
    ``n_users .. n_users + n_books - 1``, both by ascending raw ID.

    Previously the user IDs were numbered by the pre-sort index (order of
    first appearance) while the book IDs were numbered by sorted rank; both
    now use the sorted rank. The mappings were also discarded even though
    later steps need them to translate the test set, so they can now be
    requested with ``return_mappings=True``.

    Side effect: ``dataset['User-ID']`` and ``dataset['Book-ID']`` are
    overwritten in place. ``test_df`` is not modified.

    Args:
        dataset: Training DataFrame with ``User-ID`` and ``Book-ID`` columns.
        test_df: Test DataFrame with the same two columns.
        return_mappings: When True, also return the two raw-ID -> node-ID dicts.

    Returns:
        ``dataset`` by default, or ``(dataset, UserNodeID_dict,
        BookNodeID_dict)`` when ``return_mappings`` is True.
    """
    combined_df = pd.concat([dataset, test_df])

    def _rank_ids(column, offset):
        # Sorted distinct raw IDs -> consecutive node IDs starting at `offset`.
        ordered = pd.Series(combined_df[column].dropna().unique()).sort_values()
        return {raw_id: offset + rank for rank, raw_id in enumerate(ordered.tolist())}

    UserNodeID_dict = _rank_ids('User-ID', 0)
    # Book node IDs start right after the last user node ID.
    BookNodeID_dict = _rank_ids('Book-ID', len(UserNodeID_dict))

    dataset['User-ID'] = dataset['User-ID'].map(UserNodeID_dict)
    dataset['Book-ID'] = dataset['Book-ID'].map(BookNodeID_dict)

    if return_mappings:
        return dataset, UserNodeID_dict, BookNodeID_dict
    return dataset


In [None]:
# create_feature_matrix.py
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data

def create_feature_matrix(dataset, age_tensor, location_embeddings, reduced_title_embeddings,
                          publisher_embeddings, num_nodes=None):
    """Assemble the node-feature matrix and the PyG ``Data`` graph.

    Fixes over the previous version:

    * The number of nodes came from ``unique_user_ids_num`` and
      ``unique_book_ids_num``, which are not defined in this function. It can
      now be passed as ``num_nodes``.
    * Unique node IDs were zipped against the per-row feature arrays, so the
      i-th distinct user received the features of the i-th row regardless of
      whose row it was. Each node now takes the features of the first row in
      which it appears.
    * The feature width is derived from the inputs instead of a literal 80.

    Args:
        dataset: DataFrame whose ``User-ID`` / ``Book-ID`` columns already
            hold integer node IDs, plus a ``Book-Rating`` column.
        age_tensor: Per-row ages, one entry per row of ``dataset``.
        location_embeddings: Per-row location vectors.
        reduced_title_embeddings: Per-row title vectors.
        publisher_embeddings: Per-row publisher vectors.
        num_nodes: Total number of graph nodes. When omitted, the legacy
            notebook globals are used if they exist, otherwise the largest
            node ID in ``dataset`` plus one.

    Returns:
        tuple: ``(data, train_idx)`` where ``data`` is a ``Data`` object with
        ``x``, ``edge_index`` (user -> book) and ``edge_attr`` (ratings), and
        ``train_idx`` is a long tensor of the sorted distinct user node IDs
        followed by the sorted distinct book node IDs.

    Raises:
        ValueError: if a feature array does not have one row per dataset row,
            or the user and book feature widths differ.
    """
    user_col = dataset['User-ID'].to_numpy()
    book_col = dataset['Book-ID'].to_numpy()
    n_rows = len(dataset)

    per_row = {
        'age_tensor': np.asarray(age_tensor, dtype=np.float64).reshape(-1, 1),
        'location_embeddings': np.asarray(location_embeddings),
        'reduced_title_embeddings': np.asarray(reduced_title_embeddings),
        'publisher_embeddings': np.asarray(publisher_embeddings),
    }
    for arg_name, values in per_row.items():
        if len(values) != n_rows:
            raise ValueError(
                f"{arg_name} has {len(values)} rows but dataset has {n_rows}")

    # Each feature group is standardised on its own, fitted over all rows.
    scaled = {name: StandardScaler().fit_transform(values) for name, values in per_row.items()}
    # Users carry age (1) + location columns; books carry title + publisher
    # columns. Both halves must be equally wide to share one matrix.
    user_block = np.hstack([scaled['age_tensor'], scaled['location_embeddings']])
    book_block = np.hstack([scaled['reduced_title_embeddings'], scaled['publisher_embeddings']])
    if user_block.shape[1] != book_block.shape[1]:
        raise ValueError(
            f"user features have {user_block.shape[1]} columns but book features "
            f"have {book_block.shape[1]}")

    if num_nodes is None:
        try:
            # Legacy behaviour: counts published as notebook globals.
            num_nodes = unique_user_ids_num + unique_book_ids_num
        except NameError:
            highest = max(int(user_col.max()), int(book_col.max())) if n_rows else -1
            num_nodes = highest + 1

    feature_matrix = np.zeros((num_nodes, user_block.shape[1]))
    # np.unique returns sorted distinct IDs and the row index of each ID's
    # first occurrence, which keeps every node paired with one of its own rows.
    user_nodes, user_first = np.unique(user_col, return_index=True)
    book_nodes, book_first = np.unique(book_col, return_index=True)
    feature_matrix[user_nodes] = user_block[user_first]
    feature_matrix[book_nodes] = book_block[book_first]

    node_feature_matrix = torch.tensor(feature_matrix, dtype=torch.float)
    edge_attr = torch.tensor(dataset['Book-Rating'].values, dtype=torch.float).unsqueeze(-1)
    edge_index = torch.tensor(dataset[['User-ID', 'Book-ID']].values, dtype=torch.long).t().contiguous()
    data = Data(x=node_feature_matrix,
                edge_index=edge_index,
                edge_attr=edge_attr)

    train_idx = torch.tensor(np.concatenate((user_nodes, book_nodes)), dtype=torch.long)
    return data, train_idx


In [None]:
# GraphSAGE_model.py
import torch
from torch_geometric.nn import SAGEConv

class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE encoder: ``SAGEConv`` -> ReLU -> ``SAGEConv``."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the two convolutions.

        Args:
            in_channels: Width of the input node features.
            hidden_channels: Output width of the first convolution.
            out_channels: Output width of the second convolution.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the output of the second convolution for every node in ``x``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


In [None]:
# train_model.py
import torch
from torch_geometric.utils import negative_sampling
try:
    from .GraphSAGE_model import GraphSAGE
except ImportError:
    # Executed as a notebook cell or top-level script there is no parent
    # package, so the relative import cannot resolve; use the GraphSAGE class
    # already defined in the session instead.
    pass


def train(data, epochs=6, lr=0.01, hidden_channels=128, out_channels=64):
    """Train a GraphSAGE encoder with a link-prediction objective.

    Every edge in ``data.edge_index`` is a positive example and an equal
    number of sampled non-edges are negatives. The score of a pair is the dot
    product of its two node embeddings.

    The previous loss was ``-log(mean(1 - sigmoid(pos))) - log(1 - mean(sigmoid(neg)))``.
    Its first term is smallest when positive edges score *low*, i.e. it
    trained the model to push connected nodes apart. It is replaced by the
    standard binary cross-entropy on the logits (label 1 for positives, 0 for
    negatives), which is also numerically stable for large scores.

    Args:
        data: Graph object providing ``x``, ``edge_index``, ``num_nodes``,
            ``num_node_features`` and ``.to(device)``.
        epochs: Number of full-graph optimisation steps.
        lr: Adam learning rate.
        hidden_channels: Hidden width of the encoder.
        out_channels: Embedding width produced by the encoder.

    Returns:
        GraphSAGE: the trained model, left on CUDA when available, else CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GraphSAGE(data.num_node_features, hidden_channels, out_channels).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    data = data.to(device)
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits

    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        z = model(data.x, data.edge_index)  # node embeddings

        edge_index_pos = data.edge_index
        edge_index_neg = negative_sampling(edge_index_pos, num_nodes=data.num_nodes,
                                           num_neg_samples=edge_index_pos.size(1))

        pos_logits = (z[edge_index_pos[0]] * z[edge_index_pos[1]]).sum(dim=-1)
        neg_logits = (z[edge_index_neg[0]] * z[edge_index_neg[1]]).sum(dim=-1)
        pos_loss = bce_with_logits(pos_logits, torch.ones_like(pos_logits))
        neg_loss = bce_with_logits(neg_logits, torch.zeros_like(neg_logits))
        loss = pos_loss + neg_loss

        loss.backward()
        optimizer.step()

    return model


In [None]:
# generate_embeddings.py
def generate_embeddings(model, data):
    """Run ``model`` over the whole graph and return the node embeddings.

    The inputs used to be moved to ``cuda`` unconditionally, which fails on a
    CPU-only host even though training falls back to CPU. They are now moved
    to whatever device the model's parameters live on.

    Args:
        model: A module called as ``model(x, edge_index)``.
        data: Object exposing ``x`` and ``edge_index`` tensors.

    Returns:
        np.ndarray: the model output, moved to the CPU.
    """
    first_param = next(model.parameters(), None)
    # A parameter-free model has no device of its own; keep the inputs where
    # they already are in that case.
    device = first_param.device if first_param is not None else data.x.device

    model.eval()
    with torch.no_grad():
        x = data.x.to(device)
        edge_index = data.edge_index.to(device)
        return model(x, edge_index).cpu().numpy()


In [None]:
# prepare_input.py
import numpy as np
from sklearn.model_selection import train_test_split

def prepare_input(dataset, embeddings, edge_index, target):
    """Build the tabular design matrix and split it 80/20.

    Each row is the concatenation, in this order, of: the four tabular
    columns, the row's stored title vector, the embedding of the edge's
    first endpoint and the embedding of its second endpoint.

    Args:
        dataset: DataFrame with ``Age``, ``Location_encoded``,
            ``Year-Of-Publication``, ``Publisher_encoded`` and
            ``Book-Title_encoded`` columns.
        embeddings: Array indexed by node ID.
        edge_index: Tensor whose row 0 / row 1 hold the two endpoints of
            every edge.
        target: Tensor of regression targets.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` from
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    tabular_columns = ["Age", "Location_encoded", "Year-Of-Publication", "Publisher_encoded"]
    tabular = dataset[tabular_columns].to_numpy()
    title_vectors = np.stack(dataset["Book-Title_encoded"].values)

    endpoints = edge_index.cpu().numpy()
    first_endpoint_emb = embeddings[endpoints[0]]
    second_endpoint_emb = embeddings[endpoints[1]]

    X = np.concatenate(
        (tabular, title_vectors, first_endpoint_emb, second_endpoint_emb), axis=1)
    y = target.cpu().numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test


In [None]:
# hyperparameter_optimization.py
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import lightgbm as lgb

def hyperparameter_optimization(train_data, max_evals=10, device='gpu'):
    """Search LightGBM hyperparameters with hyperopt (TPE) and 5-fold CV.

    Early stopping is now requested through the ``lgb.early_stopping``
    callback. The ``early_stopping_rounds`` keyword of ``lgb.cv`` was removed
    in LightGBM 4, and that release also renamed the result key from
    ``rmse-mean`` to ``valid rmse-mean``, so the key is looked up by suffix.

    Args:
        train_data: ``lgb.Dataset`` to cross-validate on.
        max_evals: Number of hyperopt trials.
        device: Value of LightGBM's ``device`` parameter.

    Returns:
        tuple: ``(best, top5_hyperparameters)`` where ``best`` is the raw
        result of ``fmin`` and ``top5_hyperparameters`` is a list of
        ``(loss, params)`` pairs for the five lowest-loss trials.
    """
    space = {
        'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
        'learning_rate': hp.loguniform('learning_rate', -5, 0),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
        'bagging_freq': hp.quniform('bagging_freq', 1, 7, 1),
        'max_depth': hp.quniform('max_depth', 5, 10, 1),
        'force_col_wise': hp.choice('force_col_wise', [True]),
    }

    def objective(params):
        # quniform yields floats; LightGBM needs ints for these three.
        params = {
            'device': device,
            'num_leaves': int(params['num_leaves']),
            'learning_rate': params['learning_rate'],
            'feature_fraction': params['feature_fraction'],
            'bagging_fraction': params['bagging_fraction'],
            'bagging_freq': int(params['bagging_freq']),
            'max_depth': int(params['max_depth']),
            'force_col_wise': params['force_col_wise'],
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'verbose': 0
        }
        cv_results = lgb.cv(params, train_data, num_boost_round=500, nfold=5,
                            metrics='rmse', seed=42,
                            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
        rmse_key = next(key for key in cv_results if key.endswith('rmse-mean'))
        # hyperopt always minimises the reported loss.
        loss = min(cv_results[rmse_key])
        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

    sorted_trials = sorted(trials.results, key=lambda x: x['loss'])
    top5_hyperparameters = [(t['loss'], t['params']) for t in sorted_trials[:5]]

    return best, top5_hyperparameters


In [None]:
# train_and_save_models.py
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

def train_and_save_models(hyperparameters, train_data, test_data, X_test, y_test):
    """Train one LightGBM model per hyperparameter set and save each to disk.

    Fixes over the previous version:

    * ``hyperparameter_optimization`` returns ``(loss, params)`` pairs, and
      each pair was passed to ``lgb.train`` as if it were the params dict.
      Both plain dicts and such pairs are accepted now.
    * ``early_stopping_rounds`` / ``verbose_eval`` were removed from
      ``lgb.train`` in LightGBM 4; the ``lgb.early_stopping`` callback is used
      instead.

    Each model is written to ``model_<i>.txt`` in the working directory and
    its RMSE on ``(X_test, y_test)`` is printed.

    NOTE(review): ``test_data`` drives early stopping and the RMSE is then
    reported on ``X_test``; if both are the same split, the printed score is
    optimistic.

    Args:
        hyperparameters: Iterable of params dicts or ``(loss, params)`` pairs.
        train_data: ``lgb.Dataset`` used for fitting.
        test_data: ``lgb.Dataset`` used as the early-stopping validation set.
        X_test: Feature matrix to predict on.
        y_test: Targets matching ``X_test``.

    Returns:
        dict: position in ``hyperparameters`` -> trained booster.
    """
    models = {}
    for i, entry in enumerate(hyperparameters):
        params = entry[1] if isinstance(entry, (tuple, list)) else entry
        gbm = lgb.train(params, train_data, num_boost_round=500, valid_sets=[test_data],
                        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)])
        y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f"RMSE: {rmse:.4f}")
        models[i] = gbm
        gbm.save_model(f'model_{i}.txt')
    return models


In [None]:
# infer_and_submit.py
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from torch.nn import Embedding
from torch_geometric.data import Data

def infer_and_submit(test_df, UserNodeID_dict, BookNodeID_dict, models,
                     gnn_model=None, submit=None, train_graph=None):
    """Predict ratings for ``test_df`` and write them into the submission frame.

    Fixes over the previous version:

    * ``user_ids``, ``book_ids``, ``device``, ``data`` and ``submit`` were read
      without being defined here. The IDs and the device are now derived
      locally; the GNN, its training graph and the submission frame can be
      passed in, with same-named notebook globals as a legacy fallback.
    * Inputs were moved to ``cuda`` unconditionally; they now follow the GNN.
    * Unique node IDs were zipped against per-row features, giving nodes the
      wrong rows. Each node now takes the features of its first row.
    * Raw IDs missing from the mappings silently became NaN; this now raises.
    * Titles were embedded twice, and the LightGBM loop overwrote ``model``.

    The per-row features are produced by the same ``preprocess_*`` helpers the
    training path uses.

    NOTE(review): the label encoders, PCA, scalers and embedding tables are
    all re-created on ``test_df``, so the encoded values are not guaranteed to
    match those the LightGBM models were trained on -- confirm whether fitted
    transformers from training should be reused instead.

    Side effects: ``test_df`` gains the ``*_encoded`` columns and its
    ``User-ID`` / ``Book-ID`` columns are replaced by node IDs; ``submit``
    gains / overwrites ``Book-Rating``.

    Args:
        test_df: Test DataFrame with the raw feature and ID columns.
        UserNodeID_dict: Raw user ID -> node ID.
        BookNodeID_dict: Raw book ID -> node ID.
        models: Dict of trained LightGBM boosters; their predictions are averaged.
        gnn_model: Trained graph encoder. When omitted, one is trained with
            ``train(train_graph)``.
        submit: Submission DataFrame with one row per row of ``test_df``.
        train_graph: Graph to train the encoder on when ``gnn_model`` is None.

    Returns:
        The submission DataFrame with the ``Book-Rating`` column filled in.

    Raises:
        ValueError: if the submission frame or a graph encoder cannot be
            resolved, or ``test_df`` holds IDs absent from the mappings.
    """
    # Resolve everything that can fail cheaply before the expensive work.
    if submit is None:
        submit = globals().get('submit')
    if submit is None:
        raise ValueError("pass the submission DataFrame via submit=")
    if gnn_model is None:
        if train_graph is None:
            train_graph = globals().get('data')
        if train_graph is None:
            raise ValueError("pass a trained encoder via gnn_model= or a graph via train_graph=")

    user_nodes = test_df['User-ID'].map(UserNodeID_dict)
    book_nodes = test_df['Book-ID'].map(BookNodeID_dict)
    if user_nodes.isna().any() or book_nodes.isna().any():
        raise ValueError("test_df contains User-ID / Book-ID values missing from the node-ID mappings")

    # Per-row features; these helpers also add the *_encoded columns.
    age_tensor = preprocess_age(test_df)
    location_embeddings = preprocess_location(test_df)
    reduced_title_embeddings = preprocess_book_title(test_df)
    publisher_embeddings = preprocess_publisher(test_df)

    test_df['User-ID'] = user_nodes.astype('int64')
    test_df['Book-ID'] = book_nodes.astype('int64')

    # Node features: users carry age + location, books carry title + publisher.
    user_block = np.hstack([
        StandardScaler().fit_transform(np.asarray(age_tensor, dtype=np.float64).reshape(-1, 1)),
        StandardScaler().fit_transform(location_embeddings),
    ])
    book_block = np.hstack([
        StandardScaler().fit_transform(reduced_title_embeddings),
        StandardScaler().fit_transform(publisher_embeddings),
    ])
    feature_matrix = np.zeros((len(UserNodeID_dict) + len(BookNodeID_dict), user_block.shape[1]))
    # Distinct node IDs plus the row index of each one's first occurrence.
    user_ids, user_first = np.unique(test_df['User-ID'].to_numpy(), return_index=True)
    book_ids, book_first = np.unique(test_df['Book-ID'].to_numpy(), return_index=True)
    feature_matrix[user_ids] = user_block[user_first]
    feature_matrix[book_ids] = book_block[book_first]

    # Graph embeddings for the test edges.
    if gnn_model is None:
        gnn_model = train(train_graph)
    first_param = next(gnn_model.parameters(), None)
    device = (first_param.device if first_param is not None
              else torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    new_x = torch.tensor(feature_matrix, dtype=torch.float).to(device)
    new_edge_index = torch.tensor(test_df[['User-ID', 'Book-ID']].values,
                                  dtype=torch.long).t().contiguous().to(device)
    gnn_model.eval()
    with torch.no_grad():
        test_embeddings = gnn_model(new_x, new_edge_index).cpu().numpy()

    # LightGBM inputs: tabular columns, title vector, then both endpoint embeddings.
    tabular = test_df[["Age", "Location_encoded", "Year-Of-Publication", "Publisher_encoded"]].to_numpy()
    title_vectors = np.stack(test_df["Book-Title_encoded"].values)
    endpoints = new_edge_index.cpu().numpy()
    df_emb = np.concatenate(
        (tabular, title_vectors, test_embeddings[endpoints[0]], test_embeddings[endpoints[1]]),
        axis=1)

    # Average the predictions of all boosters.
    new_predictions = [booster.predict(df_emb) for booster in models.values()]
    submit['Book-Rating'] = np.mean(new_predictions, axis=0)

    return submit


In [None]:
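# Asked for a more advanced version of the code above, and got the following:
#  Make every step more modular and introduce reusable functions so that common functionality is shared, improving the code's readability and maintainability.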

# Common imports and functions
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import torch
from torch.nn import Embedding
from torch_geometric.data import Data
import os
from typing import Tuple

# Function to prepare node features
def prepare_node_features(df: pd.DataFrame, age_tensor: torch.Tensor, user_ids: np.ndarray, book_ids: np.ndarray) -> Tuple[np.ndarray, StandardScaler]:
    """Encode the raw columns of ``df`` and fill a node-feature matrix.

    Fixes over the previous version: the embedding tables were fixed at 151
    and 3689 rows, which raises IndexError once ``df`` holds more distinct
    locations / publishers, and the bare ``squeeze()`` collapsed a one-row
    input to 1-D. The tables now grow when needed and the lookup keeps its
    ``(n_rows, dim)`` shape.

    Side effect: ``Location_encoded`` and ``Publisher_encoded`` columns are
    written to ``df``.

    NOTE(review): ``user_ids`` / ``book_ids`` are zipped positionally against
    per-row features. That pairs each ID with its own row only if the ID
    arrays have one entry per row of ``df``; if they hold distinct IDs, the
    features are misassigned -- confirm against the caller.

    Args:
        df: DataFrame with ``Location``, ``Book-Title`` and ``Publisher`` columns.
        age_tensor: Per-row ages.
        user_ids: Node IDs used as row indices for the user features.
        book_ids: Node IDs used as row indices for the book features.

    Returns:
        tuple: ``(feature_matrix, scaler)``. ``feature_matrix`` has
        ``len(user_ids) + len(book_ids)`` rows. ``scaler`` is the single
        ``StandardScaler`` instance reused for every group, so it holds the
        statistics of the last fit (the publisher embeddings).
    """
    le = LabelEncoder()
    scaler = StandardScaler()

    def _lookup(codes, min_rows, dim):
        # Random-initialised (untrained) embedding lookup, one vector per code.
        table = Embedding(num_embeddings=max(min_rows, int(codes.max()) + 1 if len(codes) else 0),
                          embedding_dim=dim)
        with torch.no_grad():
            return table(torch.tensor(codes, dtype=torch.long)).numpy()

    df['Location_encoded'] = le.fit_transform(df['Location'])
    location_embeddings = _lookup(df['Location_encoded'].values, 151, 79)

    pca = PCA(n_components=50)
    title_embeddings = df['Book-Title'].apply(get_title_embedding_fasttext).tolist()
    reduced_title_embeddings = pca.fit_transform(np.array(title_embeddings))

    df['Publisher_encoded'] = le.fit_transform(df['Publisher'])
    publisher_embeddings = _lookup(df['Publisher_encoded'].values, 3689, 30)

    age_tensor_scaled = scaler.fit_transform(age_tensor.reshape(-1, 1))
    location_embeddings_scaled = scaler.fit_transform(location_embeddings)

    feature_matrix = np.zeros((len(user_ids) + len(book_ids), age_tensor_scaled.shape[1] + location_embeddings_scaled.shape[1]))
    for user_id, age, location in zip(user_ids, age_tensor_scaled, location_embeddings_scaled):
        feature_matrix[user_id] = np.concatenate([age, location], axis=0)

    title_embeddings_scaled = scaler.fit_transform(reduced_title_embeddings)
    publisher_embeddings_scaled = scaler.fit_transform(publisher_embeddings)
    for book_id, title, publisher in zip(book_ids, title_embeddings_scaled, publisher_embeddings_scaled):
        feature_matrix[book_id] = np.concatenate([title, publisher], axis=0)

    return feature_matrix, scaler

# Function to perform hyperparameter optimization for LGBM
def perform_hyperopt(train_data, max_evals=10, device='gpu'):
    """Search LightGBM hyperparameters with hyperopt (TPE) and 5-fold CV.

    Early stopping is now requested through the ``lgb.early_stopping``
    callback. The ``early_stopping_rounds`` keyword of ``lgb.cv`` was removed
    in LightGBM 4, and that release also renamed the result key from
    ``rmse-mean`` to ``valid rmse-mean``, so the key is looked up by suffix.

    Args:
        train_data: ``lgb.Dataset`` to cross-validate on.
        max_evals: Number of hyperopt trials.
        device: Value of LightGBM's ``device`` parameter.

    Returns:
        list: one ``{'loss', 'params', 'status'}`` dict per trial, sorted by
        ascending loss.
    """
    space = {
        'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
        'learning_rate': hp.loguniform('learning_rate', -5, 0),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
        'bagging_freq': hp.quniform('bagging_freq', 1, 7, 1),
        'max_depth': hp.quniform('max_depth', 5, 10, 1),
        'force_col_wise': hp.choice('force_col_wise', [True]),
    }

    def objective(params):
        # quniform yields floats; LightGBM needs ints for these three.
        params = {
            'device': device,
            'num_leaves': int(params['num_leaves']),
            'learning_rate': params['learning_rate'],
            'feature_fraction': params['feature_fraction'],
            'bagging_fraction': params['bagging_fraction'],
            'bagging_freq': int(params['bagging_freq']),
            'max_depth': int(params['max_depth']),
            'force_col_wise': params['force_col_wise'],
            'objective': 'regression',
            'boosting_type': 'gbdt',
            'verbose': 0
        }
        cv_results = lgb.cv(params, train_data, num_boost_round=500, nfold=5,
                            metrics='rmse', seed=42,
                            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
        rmse_key = next(key for key in cv_results if key.endswith('rmse-mean'))
        # hyperopt always minimises the reported loss.
        loss = min(cv_results[rmse_key])
        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    trials = Trials()
    # Only the recorded trials are used; fmin's own return value is not needed.
    fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    return sorted(trials.results, key=lambda x: x['loss'])

# Function to train models
def train_models(sorted_trials, train_data, test_data, X_test, y_test, save_dir='./'):
    """Train and save one LightGBM model for each of the five best trials.

    ``early_stopping_rounds`` / ``verbose_eval`` were removed from
    ``lgb.train`` in LightGBM 4, so early stopping now goes through the
    ``lgb.early_stopping`` callback. ``save_dir`` is created when missing
    instead of failing at the first save.

    Each model is written to ``<save_dir>/model_<i>.txt`` and its RMSE on
    ``(X_test, y_test)`` is printed.

    Args:
        sorted_trials: Trial dicts with a ``'params'`` entry, best first.
        train_data: ``lgb.Dataset`` used for fitting.
        test_data: ``lgb.Dataset`` used as the early-stopping validation set.
        X_test: Feature matrix to predict on.
        y_test: Targets matching ``X_test``.
        save_dir: Directory that receives the model files.

    Returns:
        dict: rank of the trial (0 = best) -> trained booster.
    """
    os.makedirs(save_dir, exist_ok=True)
    hyperparameters = [trial['params'] for trial in sorted_trials[:5]]
    models = {}
    for i, params in enumerate(hyperparameters):
        gbm = lgb.train(params, train_data, num_boost_round=500, valid_sets=[test_data],
                        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)])
        y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f"RMSE: {rmse:.4f}")
        models[i] = gbm
        gbm.save_model(os.path.join(save_dir, f'model_{i}.txt'))
    return models

# ... Rest of your code (e.g., model definition, training functions etc.)


In [None]:
# TODO: validate this separately, and ask another chat session to review it once more.