<font color='tomato'><font color="#CC3D3D"><p>
# RecSys Model Tuning

### Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install optuna
!pip install cornac==1.17
!pip install dgl

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0
Collecting cornac==1.17
  Downloading cornac-1.17.0-cp310-cp310-manylinux1_x86_64.whl (21.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
import pandas as pd
import numpy as np
import optuna

# MS recommenders API
import sys
sys.path.append('/content/drive/Othercomputers/mypc/study/10_RecSystem')  # 본인이 msr.zip 압축을 푼 위치를 확인(셀에서 pwd 명령어 실행) 후 변경해야 함.
                                                # 윈도우에서는 폴더 구분자를 // 또는 \\로 해야 함.
from msr.cornac_utils import predict_ranking
from msr.python_splitters import python_stratified_split
from msr.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

# Cornac API
import cornac
print(f"Cornac version: {cornac.__version__}")
from cornac.models import NeuMF, VAECF, EASE, UserKNN, ItemKNN, MF

Cornac version: 1.17


In [7]:
# Data column definition
DEFAULT_USER_COL = 'resume_seq'
DEFAULT_ITEM_COL = 'recruitment_seq'
DEFAULT_RATING_COL = 'rating'
DEFAULT_PREDICTION_COL = 'prediction'

# Top k items to recommend
TOP_K = 5

# 최적화 시도횟수
TRIALS = 10

# Random seed, Verbose, etc.
SEED = 202311
VERBOSE = True

### Data Preparation

In [5]:
# 데이터 로딩
data = pd.read_csv('/content/drive/Othercomputers/mypc/study/10_RecSystem/실습-20231104/apply_train.csv')
data[DEFAULT_RATING_COL] = 1  # Cornac에서 지정한 데이터형식(UIR: User, Item, Rating)에 따라

In [10]:
# 데이터 분할
train, test = python_stratified_split(
    data,
    filter_by="user",
    ratio=0.7,
    col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL,
    seed=SEED
)

# 학습데이터 => Cornac 데이터형식으로 변환
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)
test_set = cornac.data.Dataset.from_uir(test.itertuples(index=False), seed=SEED)

### Hyperparameter Optimization with Optuna

BiVAECF

In [13]:
# 조절할 하이퍼 파라미터와 그 범위를 지정하는 함수 정의
def objective(trial):
    # optuna.trial.Trial.suggest_categorical() for categorical parameters
    # optuna.trial.Trial.suggest_int() for integer parameters
    # optuna.trial.Trial.suggest_float() for floating point parameters

    space = {
        'k': trial.suggest_int('k', 10, 100),
        'encoder_structure': trial.suggest_categorical('encoder_structure', [[20], [40, 20], [60, 40, 20]]),
        'act_fn': trial.suggest_categorical('act_fn', ['sigmoid', 'tanh', 'relu']),
        'likelihood': trial.suggest_categorical('likelihood', ['bern', 'gaus', 'pois']),
        'n_epochs': trial.suggest_int('n_epochs', 50, 200),
        'batch_size': trial.suggest_int('batch_size', 16, 128),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    }

    # 모델 설정 및 학습
    model = BiVAECF(k=space['k'],
                    encoder_structure=space['encoder_structure'],
                    act_fn=space['act_fn'],
                    likelihood=space['likelihood'],
                    n_epochs=space['n_epochs'],
                    batch_size=space['batch_size'],
                    learning_rate=space['learning_rate'],
                    verbose=VERBOSE, seed=SEED)


    model.fit(train_set)
    # 모든 아이템에 대한 예측값 생성
    all_predictions = predict_ranking(model, train, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, remove_seen=True)
    # Recall@K 계산
    score = recall_at_k(test, all_predictions,
                        col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL,
                        col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL,
                        k=TOP_K)
    return score

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective, n_trials=TRIALS)

#최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2023-11-06 06:54:50,888] A new study created in memory with name: no-name-6762c2db-3135-4049-9841-9d92baf95359


  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:02:43,452] Trial 0 finished with value: 0.006053079858490406 and parameters: {'k': 59, 'encoder_structure': [60, 40, 20], 'act_fn': 'relu', 'likelihood': 'bern', 'n_epochs': 184, 'batch_size': 39, 'learning_rate': 2.3477748125422217e-05}. Best is trial 0 with value: 0.006053079858490406.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:07:53,322] Trial 1 finished with value: 0.001893878808381967 and parameters: {'k': 19, 'encoder_structure': [40, 20], 'act_fn': 'tanh', 'likelihood': 'gaus', 'n_epochs': 100, 'batch_size': 35, 'learning_rate': 5.5675495721600404e-05}. Best is trial 0 with value: 0.006053079858490406.


  0%|          | 0/198 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:19:35,188] Trial 2 finished with value: 0.01362486601581242 and parameters: {'k': 10, 'encoder_structure': [40, 20], 'act_fn': 'tanh', 'likelihood': 'pois', 'n_epochs': 198, 'batch_size': 22, 'learning_rate': 0.0006040764283363218}. Best is trial 2 with value: 0.01362486601581242.


  0%|          | 0/178 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:23:34,895] Trial 3 finished with value: 0.006101660011931989 and parameters: {'k': 62, 'encoder_structure': [20], 'act_fn': 'relu', 'likelihood': 'bern', 'n_epochs': 178, 'batch_size': 126, 'learning_rate': 0.0005884459619693207}. Best is trial 2 with value: 0.01362486601581242.


  0%|          | 0/145 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:30:07,926] Trial 4 finished with value: 0.025935330657616308 and parameters: {'k': 42, 'encoder_structure': [20], 'act_fn': 'tanh', 'likelihood': 'pois', 'n_epochs': 145, 'batch_size': 32, 'learning_rate': 0.0007373651350340403}. Best is trial 4 with value: 0.025935330657616308.


  0%|          | 0/140 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:33:49,317] Trial 5 finished with value: 0.0009746639433034152 and parameters: {'k': 96, 'encoder_structure': [20], 'act_fn': 'tanh', 'likelihood': 'gaus', 'n_epochs': 140, 'batch_size': 98, 'learning_rate': 4.7947994371435764e-05}. Best is trial 4 with value: 0.025935330657616308.


  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:37:06,454] Trial 6 finished with value: 0.002981024793857542 and parameters: {'k': 18, 'encoder_structure': [40, 20], 'act_fn': 'tanh', 'likelihood': 'gaus', 'n_epochs': 97, 'batch_size': 87, 'learning_rate': 0.00012045065073042325}. Best is trial 4 with value: 0.025935330657616308.


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:41:42,691] Trial 7 finished with value: 0.005637957945421283 and parameters: {'k': 37, 'encoder_structure': [60, 40, 20], 'act_fn': 'sigmoid', 'likelihood': 'pois', 'n_epochs': 167, 'batch_size': 84, 'learning_rate': 4.1495470864202204e-05}. Best is trial 4 with value: 0.025935330657616308.


  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:45:18,267] Trial 8 finished with value: 0.005784090964263071 and parameters: {'k': 73, 'encoder_structure': [60, 40, 20], 'act_fn': 'sigmoid', 'likelihood': 'pois', 'n_epochs': 81, 'batch_size': 56, 'learning_rate': 0.00012553086552368386}. Best is trial 4 with value: 0.025935330657616308.


  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-06 07:48:56,704] Trial 9 finished with value: 0.0005894836123555764 and parameters: {'k': 34, 'encoder_structure': [40, 20], 'act_fn': 'sigmoid', 'likelihood': 'gaus', 'n_epochs': 103, 'batch_size': 72, 'learning_rate': 7.768787774201181e-05}. Best is trial 4 with value: 0.025935330657616308.


Best score: 0.025935330657616308
Best parameters: {'k': 42, 'encoder_structure': [20], 'act_fn': 'tanh', 'likelihood': 'pois', 'n_epochs': 145, 'batch_size': 32, 'learning_rate': 0.0007373651350340403}


##### EASE

In [None]:
# 조절할 하이퍼 파라미터와 그 범위를 지정하는 함수 정의
def objective(trial):
    # optuna.trial.Trial.suggest_categorical() for categorical parameters
    # optuna.trial.Trial.suggest_int() for integer parameters
    # optuna.trial.Trial.suggest_float() for floating point parameters
    space = {
        'lamb': trial.suggest_int('lamb', 100, 600, step=100),
        'posB': trial.suggest_categorical('posB', [True, False])
    }

    # 모델 설정 및 학습
    model = EASE(**space, seed=SEED, verbose=VERBOSE)
    model.fit(train_set)
    # 모든 아이템에 대한 예측값 생성
    all_predictions = predict_ranking(model, train, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, remove_seen=True)
    # Recall@K 계산
    score = recall_at_k(test, all_predictions,
                        col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL,
                        col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL,
                        k=TOP_K)
    return score

# 최적화 실행
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
study.optimize(objective, n_trials=TRIALS)

#최적화 결과 보기
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[I 2023-11-04 06:04:25,460] A new study created in memory with name: no-name-7646e9e2-3da3-44fc-9d13-1c046eb012e8


  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-04 06:06:18,558] Trial 0 finished with value: 0.09352238979883254 and parameters: {'lamb': 400, 'posB': False}. Best is trial 0 with value: 0.09352238979883254.


  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-04 06:08:11,124] Trial 1 finished with value: 0.09356781321138634 and parameters: {'lamb': 600, 'posB': False}. Best is trial 1 with value: 0.09356781321138634.


  0%|          | 0/8482 [00:00<?, ?it/s]

[I 2023-11-04 06:10:02,224] Trial 2 finished with value: 0.09452123997535307 and parameters: {'lamb': 500, 'posB': True}. Best is trial 2 with value: 0.09452123997535307.


Best score: 0.09452123997535307
Best parameters: {'lamb': 500, 'posB': True}


##### UserKNN

In [None]:
# # 조절할 하이퍼 파라미터와 그 범위를 지정하는 함수 정의
# def objective(trial):
#     # optuna.trial.Trial.suggest_categorical() for categorical parameters
#     # optuna.trial.Trial.suggest_int() for integer parameters
#     # optuna.trial.Trial.suggest_float() for floating point parameters
#     space = {
#         'k': trial.suggest_int('k', 10, 100, step=10),
#         'similarity': trial.suggest_categorical('similarity', ['cosine', 'pearson'])
#     }

#     # 모델 설정 및 학습
#     model = UserKNN(**space, seed=SEED, verbose=VERBOSE)
#     model.fit(train_set)
#     # 모든 아이템에 대한 예측값 생성
#     all_predictions = predict_ranking(model, train, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL, remove_seen=True)
#     # Recall@K 계산
#     score = recall_at_k(test, all_predictions,
#                         col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL,
#                         col_rating=DEFAULT_RATING_COL, col_prediction=DEFAULT_PREDICTION_COL,
#                         k=TOP_K)
#     return score

# # 최적화 실행
# study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
# study.optimize(objective, n_trials=TRIALS)

# #최적화 결과 보기
# print("Best score:", study.best_value)
# print("Best parameters:", study.best_params)

<font color='tomato'><font color="#CC3D3D"><p>
# End