In [1]:
from utils import get_health_scores, recall_at_k_binary, ndcg_at_k_binary, heterograph_to_tabular
from sklearn.decomposition import NMF
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import pickle
import os

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

Device: 'cuda'


In [2]:
def load_fold_data(fold_path):
    """Read one pickled cross-validation fold from disk.

    Args:
        fold_path: Path to a pickle file holding a dict with the keys
            'train_data', 'val_data', 'test_data' and 'health_dict'.

    Returns:
        A 4-tuple ``(train_data, val_data, test_data, health_dict)`` taken
        from the unpickled dict, in that order.

    Raises:
        KeyError: If any of the four expected keys is missing.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on fold files produced by this project's own preprocessing.
    with open(fold_path, 'rb') as handle:
        payload = pickle.load(handle)

    wanted_keys = ('train_data', 'val_data', 'test_data', 'health_dict')
    return tuple(payload[key] for key in wanted_keys)

# Load the data for a single cross-validation fold.
# `fold` is zero-based here, while the pickle files on disk are numbered from 1
# (hence `fold+1` in the file name). The banner below prints the zero-based value.
fold = 0
fold_path = os.path.join('../data/processed_data', f'fold_{fold+1}.pkl')
print(f"\n=========== Starting fold {fold} ===========")

# These four objects are notebook-level globals consumed by the cells that follow.
train_data, val_data, test_data, health_dict = load_fold_data(fold_path)




In [3]:
# Edge index of the labelled ('user', 'eats', 'food') test edges, and the health
# score looked up for each of them (computed on CPU and converted to a NumPy array).
# presumably one score per test edge, aligned with the rows of X_test — TODO confirm
test_edge_index = test_data['user', 'eats', 'food'].edge_label_index
test_health_scores = get_health_scores(test_edge_index, health_dict, device='cpu').cpu().numpy()

# Flatten the heterogeneous graphs into (features, label) tables for the tabular baselines.
# NOTE(review): val_data is not converted here; only the train and test splits are used.
X_train, y_train = heterograph_to_tabular(train_data)
X_test, y_test = heterograph_to_tabular(test_data)

In [4]:
import pandas as pd
from pycaret.classification import *

# PyCaret setup: build the experiment from the training features plus the target column.
# assumes y_train is named 'label' so that the concatenated frame has that column — TODO confirm
# train_size=0.8 presumably makes PyCaret hold out 20% of this training frame internally;
# the separate val_data split is not used here. session_id=42 fixes the random seed.
clf = setup(pd.concat([X_train, y_train], axis=1), target='label', 
           session_id=42, train_size=0.8, verbose=False)

In [None]:
# Train, tune, evaluate and save each PyCaret baseline model
models = ['rf', 'gbc', 'knn', 'lr', 'lightgbm', 'dt', 'svm', 'mlp', 'xgboost']  # 'rbfsvm' is left out
best_models = {}

# Make sure the output directory exists before any model is written to it.
os.makedirs('baseline_models', exist_ok=True)

for model_name in models:
    model = create_model(model_name, verbose=False)
    tuned_model = tune_model(model, verbose=False)
    finalized_model = finalize_model(tuned_model)

    # Predict on the test split. 'prediction_label' holds hard class labels, so
    # ranking metrics (AUC, NDCG@k, Recall@k) need the class-1 score instead;
    # raw_score=True asks PyCaret to return per-class score columns.
    predictions = predict_model(finalized_model, data=X_test, raw_score=True)
    y_pred = predictions['prediction_label'].to_numpy().astype(int)
    if 'prediction_score_1' in predictions.columns:
        y_pred_proba = predictions['prediction_score_1'].to_numpy()
    else:
        # No class-1 score column was returned (presumably an estimator without
        # probability output — confirm); fall back to the hard labels, which is
        # what this cell used for every model before.
        y_pred_proba = y_pred.astype(float)

    # Test edges the model recommends (predicted label 1), used for the health score.
    recommended = y_pred == 1

    # Custom metrics
    metrics = {
        'auc': roc_auc_score(y_test, y_pred_proba),
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'ndcg@10': ndcg_at_k_binary(y_test, y_pred_proba, 10),
        'recall@10': recall_at_k_binary(y_test, y_pred_proba, 10),
        'ndcg@20': ndcg_at_k_binary(y_test, y_pred_proba, 20),
        'recall@20': recall_at_k_binary(y_test, y_pred_proba, 20),
        # Mean health score over the recommended edges; 0.0 when nothing is recommended.
        'health_score': np.mean(test_health_scores[recommended]) if recommended.any() else 0.0
    }

    best_models[model_name] = {'model': finalized_model, 'metrics': metrics}

    # Save the finalized model (PyCaret appends '.pkl' to the given name)
    save_model(finalized_model, f'baseline_models/fold_{fold+1}_{model_name}')

Transformation Pipeline and Model Successfully Saved


Transformation Pipeline and Model Successfully Saved


In [19]:
from sklearn.metrics import *
from utils import ndcg_at_k_binary, recall_at_k_binary
import glob
import os

def evaluate_saved_model(model_path, X_test, y_test, test_health_scores):
    """Load a saved PyCaret model and compute test metrics for it.

    Args:
        model_path: Path to the saved model file, with or without a trailing
            '.pkl' extension.
        X_test: Feature table passed to ``predict_model``.
        y_test: Ground-truth binary labels aligned with ``X_test``.
        test_health_scores: NumPy array of per-row health scores; it is
            indexed with a boolean mask built from the predicted scores.

    Returns:
        dict with the keys 'index' (always 0) and 'test_auc', 'test_accuracy',
        'test_precision', 'test_recall', 'test_f1', 'test_ndcg@10',
        'test_recall@10', 'test_ndcg@20', 'test_recall@20',
        'test_health_score'.
    """
    # load_model is given the path without the extension. Strip only a trailing
    # '.pkl' so that a '.pkl' elsewhere in the path (e.g. in a directory name)
    # is left intact.
    suffix = '.pkl'
    model_name = model_path[:-len(suffix)] if model_path.endswith(suffix) else model_path
    model = load_model(model_name)

    predictions = predict_model(model, data=X_test, raw_score=True)

    # Use the class-1 score column when one is present; otherwise fall back to
    # the hard labels so the ranking metrics can still be computed.
    prob_cols = [col for col in predictions.columns if 'score' in col.lower() and '1' in col]
    if prob_cols:
        y_pred_proba = predictions[prob_cols[0]].values
    else:
        y_pred_proba = predictions['prediction_label'].astype(float).values

    y_pred = predictions['prediction_label'].values

    # Rows whose score exceeds 0.5 count as recommended for the health score.
    recommended = y_pred_proba > 0.5

    return {
        'index': 0,
        'test_auc': roc_auc_score(y_test, y_pred_proba),
        'test_accuracy': accuracy_score(y_test, y_pred),
        'test_precision': precision_score(y_test, y_pred),
        'test_recall': recall_score(y_test, y_pred),
        'test_f1': f1_score(y_test, y_pred),
        'test_ndcg@10': ndcg_at_k_binary(y_test, y_pred_proba, 10),
        'test_recall@10': recall_at_k_binary(y_test, y_pred_proba, 10),
        'test_ndcg@20': ndcg_at_k_binary(y_test, y_pred_proba, 20),
        'test_recall@20': recall_at_k_binary(y_test, y_pred_proba, 20),
        # 0.0 when no row is recommended, to avoid taking the mean of an empty slice.
        'test_health_score': np.mean(test_health_scores[recommended]) if np.sum(recommended) > 0 else 0.0
    }

# Evaluate every saved model that belongs to the current fold
results = {}
# X_test / y_test come from the current fold only, so restrict the search to that
# fold's model files instead of every 'fold_*' file in the directory. sorted()
# keeps the evaluation order (and the dict order) deterministic across runs.
for model_file in sorted(glob.glob(f'baseline_models/fold_{fold+1}_*.pkl')):
    # Key the results by file name without directory or extension, e.g. 'fold_1_rf'.
    model_name = os.path.splitext(os.path.basename(model_file))[0]
    results[model_name] = evaluate_saved_model(model_file, X_test, y_test, test_health_scores)

Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


Transformation Pipeline and Model Successfully Loaded


In [20]:
import pickle
# Persist the per-model metric dicts collected in `results` alongside the saved models.
# The file name does not start with 'fold_', so the 'fold_*.pkl' model glob does not match it.
with open('baseline_models/evaluation_results.pkl', 'wb') as f:
    pickle.dump(results, f)

In [None]:
def matrix_factorization(fold, train_data, test_data, health_dict,
                         n_components=20, rating_scale=5.0, random_state=42):
    """Fit an NMF baseline on the training edges and evaluate it on the test edges.

    The labelled ('user', 'eats', 'food') training edges are assembled into a
    sparse user x food matrix, factorised with NMF, and the reconstructed value
    of each test edge is scaled and clipped into [0, 1] to act as a score.

    Args:
        fold: Zero-based fold index; only used to name the output file
            (``mf_models/fold_{fold+1}_best_model.pkl``).
        train_data: Graph object exposing ``edge_label_index`` and
            ``edge_label`` for the ('user', 'eats', 'food') edge type.
        test_data: Graph object exposing ``edge_label_index`` for the same
            edge type; also passed to ``heterograph_to_tabular`` for labels.
        health_dict: Passed through to ``get_health_scores``.
        n_components: Number of latent factors for NMF.
        rating_scale: Raw reconstructions are divided by this value before
            being clipped to [0, 1].
            NOTE(review): the default of 5.0 looks like a 1-5 rating scale; if
            ``edge_label`` is binary, few scores may reach the 0.5 threshold —
            confirm, and pass 1.0 if so.
        random_state: Seed forwarded to NMF.

    Returns:
        A pair ``({'MatrixFactorization': metrics}, {'MatrixFactorization': scores})``
        where ``metrics`` is a dict of test metrics and ``scores`` is the
        NumPy array of per-test-edge scores.
    """
    from scipy.sparse import csr_matrix

    edge_type = ('user', 'eats', 'food')

    # Only the labels are needed from the tabular conversion.
    _, y_test = heterograph_to_tabular(test_data)

    train_edges = train_data[edge_type]
    user_indices = train_edges.edge_label_index[0].cpu().numpy()
    food_indices = train_edges.edge_label_index[1].cpu().numpy()
    ratings = train_edges.edge_label.cpu().numpy()

    # np.unique returns sorted ids; the inverse array is each edge's row/column
    # position, which replaces a per-edge dictionary lookup.
    unique_users, mapped_user_indices = np.unique(user_indices, return_inverse=True)
    unique_foods, mapped_food_indices = np.unique(food_indices, return_inverse=True)

    # The id -> position dictionaries are kept because they are saved with the model.
    user_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_users)}
    food_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_foods)}

    matrix_shape = (len(unique_users), len(unique_foods))
    ratings_matrix = csr_matrix((ratings, (mapped_user_indices, mapped_food_indices)), shape=matrix_shape)

    nmf_model = NMF(n_components=n_components, random_state=random_state)
    user_factors = nmf_model.fit_transform(ratings_matrix)
    item_factors = nmf_model.components_

    test_edge_index = test_data[edge_type].edge_label_index
    test_user_indices = test_edge_index[0].cpu().numpy()
    test_food_indices = test_edge_index[1].cpu().numpy()
    test_health_scores = get_health_scores(test_edge_index, health_dict, device='cpu').cpu().numpy()

    # Test edges whose user or food never appeared in training keep a neutral 0.5.
    seen = np.isin(test_user_indices, unique_users) & np.isin(test_food_indices, unique_foods)
    mf_predictions = np.full(test_user_indices.shape[0], 0.5, dtype=float)
    if seen.any():
        # The unique arrays are sorted, so searchsorted gives the same positions
        # as the id maps for ids known to be present.
        user_rows = np.searchsorted(unique_users, test_user_indices[seen])
        food_cols = np.searchsorted(unique_foods, test_food_indices[seen])
        # Row-wise dot product between each user's factors and each food's factors.
        raw_scores = np.einsum('ij,ij->i', user_factors[user_rows], item_factors[:, food_cols].T)
        mf_predictions[seen] = np.clip(raw_scores / rating_scale, 0.0, 1.0)

    mf_binary_predictions = (mf_predictions >= 0.5).astype(int)
    metrics = {
        'auc': roc_auc_score(y_test, mf_predictions),
        'accuracy': accuracy_score(y_test, mf_binary_predictions),
        'precision': precision_score(y_test, mf_binary_predictions),
        'recall': recall_score(y_test, mf_binary_predictions),
        'f1': f1_score(y_test, mf_binary_predictions),
        'recall@10' : recall_at_k_binary(y_test, mf_predictions, 10),
        'recall@20' : recall_at_k_binary(y_test, mf_predictions, 20),
        'ndcg@10': ndcg_at_k_binary(y_test, mf_predictions, 10),
        'ndcg@20': ndcg_at_k_binary(y_test, mf_predictions, 20)
    }

    # Mean health score over the edges predicted positive; 0.0 when there are none.
    mask = mf_binary_predictions == 1
    if np.sum(mask) > 0:
        metrics['health_score'] = np.mean(test_health_scores[mask])
    else:
        metrics['health_score'] = 0.0

    print(f"Matrix Factorization - Test AUC: {metrics['auc']:.4f}, F1: {metrics['f1']:.4f}, Health Score: {metrics['health_score']:.4f}")

    # Save the factors, id maps, metrics and scores for this fold.
    os.makedirs('mf_models', exist_ok=True)
    with open(f'mf_models/fold_{fold+1}_best_model.pkl', 'wb') as model_file:
        pickle.dump({
            'user_factors': user_factors,
            'item_factors': item_factors,
            'user_id_map': user_id_map,
            'food_id_map': food_id_map,
            'test_metrics': metrics,
            'predictions': mf_predictions
        }, model_file)

    return {'MatrixFactorization': metrics}, {'MatrixFactorization': mf_predictions}
