In [None]:
from scipy.sparse import csr_matrix
import implicit
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import sparse
import pickle
# Ranking cutoff: validate() passes this value to ndcg_calculator as the `n` of ndcg@n.
n = 25

In [None]:
def _ndcg_calculator(gt, rec, idcg):
    """Return DCG of one recommendation list divided by a precomputed ideal DCG.

    Args:
        gt: Container of relevant items; only membership (``item in gt``) is used.
        rec: Ordered recommendations, best first.
        idcg: Ideal DCG used as the normaliser.

    Returns:
        ``dcg / idcg`` where every hit at 0-based rank ``k`` adds ``1 / ln(k + 2)``.
    """
    dcg = 0.0
    for rank, item in enumerate(rec):
        if item in gt:
            # 0-based rank k is position k + 1, discounted by ln(position + 1).
            dcg += 1.0 / np.log(rank + 2)
    return dcg / idcg


def ndcg_calculator(answer, submission, n, user_col="profile_id", item_col="album_id"):
    """Mean NDCG@n over users, pairing the rows of both frames by position.

    The ideal DCG is the DCG of ``n`` consecutive hits, so a user whose ground
    truth holds fewer than ``n`` items cannot reach a score of 1.0. Natural
    logarithms are used for both DCG and IDCG; the base cancels in the ratio.

    Args:
        answer: DataFrame with one row per user; ``item_col`` holds the
            container of relevant items for that user.
        submission: DataFrame with one row per user, in the same user order as
            ``answer``; ``item_col`` holds the ordered recommendation list.
        n: Cutoff used to build the ideal DCG.
        user_col: Name of the user-identifier column in both frames. The
            default keeps the column name this function originally hard-coded.
        item_col: Name of the item-list column in both frames. The default
            keeps the column name this function originally hard-coded.

    Returns:
        The average per-user NDCG, or ``0.0`` when ``answer`` has no rows.

    Raises:
        ValueError: If the two frames do not list the same users in the same
            row order.
    """
    idcg = sum((1.0 / np.log(i + 1) for i in range(1, n + 1)))

    # Rows are paired by position below, so compare the identifiers by position
    # too (index labels are irrelevant) and fail loudly instead of relying on
    # an `assert`, which disappears under `python -O`.
    answer_users = answer[user_col].to_numpy()
    submission_users = submission[user_col].to_numpy()
    if len(answer_users) != len(submission_users) or (answer_users != submission_users).any():
        raise ValueError(
            f"answer and submission must contain the same '{user_col}' values in the same order"
        )

    if len(answer) == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    # Iterating the two columns directly yields the same pairs as iterrows()
    # without materialising a Series per row.
    ndcg_list = [
        _ndcg_calculator(gt_items, rec_items, idcg)
        for gt_items, rec_items in zip(answer[item_col], submission[item_col])
    ]

    ndcg_score = sum(ndcg_list) / len(answer)
    return ndcg_score

In [None]:
# Read the three raw CSV files first.
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
sample_sumbission = pd.read_csv("./data/sample_submission.csv")

# Derived frames: the rating column on its own, plus working copies so that
# later cells can modify them without touching the raw frames.
train_data_answer = train_data[["Book-Rating"]]
train_df = train_data.copy()
mf_sumbission = sample_sumbission.copy()

In [None]:
# Build the lookup tables from the untouched raw frame (train_data), not from
# train_df: the ID columns of train_df are overwritten at the end of this cell,
# so deriving the tables from train_df would turn them into identity maps (and
# lose the original IDs) if the cell were executed a second time.
ALL_USERS = train_data['User-ID'].unique().tolist()
ALL_ITEMS = train_data['Book-ID'].unique().tolist()

# positional index -> original ID (used to decode model output)
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# original ID -> positional index (used to encode the training frame)
user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

# Always map from the raw values so re-running the cell gives the same result.
train_df['User-ID'] = train_data['User-ID'].map(user_map)
train_df['Book-ID'] = train_data['Book-ID'].map(item_map)

In [None]:
# The remapped user / book indices are the matrix coordinates.
row = train_df['User-ID'].to_numpy()
col = train_df['Book-ID'].to_numpy()

# Every interaction row contributes a weight of 1; the rating value is not used.
# csr_matrix sums the weights of repeated (user, book) pairs.
data = np.ones(len(train_df))

csr_train = csr_matrix(
    (data, (row, col)),
    shape=(len(ALL_USERS), len(ALL_ITEMS)),
)
csr_train

In [None]:
def train(csr_train, factors=200, iterations=3, regularization=0.05, show_progress=True):
    """Fit an implicit-feedback ALS model on ``csr_train`` and return it.

    Args:
        csr_train: Sparse interaction matrix handed to ``model.fit``
            (presumably users x items; confirm against the implicit version in use).
        factors: Number of latent factors.
        iterations: Number of ALS iterations.
        regularization: Regularization strength.
        show_progress: Forwarded to ``fit`` to toggle its progress display.

    Returns:
        The fitted ``AlternatingLeastSquares`` model (seeded with ``random_state=42``).
    """
    als_options = {
        "factors": factors,
        "iterations": iterations,
        "regularization": regularization,
        "random_state": 42,
    }
    als_model = implicit.als.AlternatingLeastSquares(**als_options)
    als_model.fit(csr_train, show_progress=show_progress)
    return als_model

In [None]:
def submit(model, csr_train, mf_sumbission, top_n=25, batch_size=2000):
    """Recommend ``top_n`` books for every user and attach them to a submission template.

    Args:
        model: Fitted recommender exposing
            ``recommend(user_indices, user_items, N=..., filter_already_liked_items=...)``
            that returns ``(ids, scores)`` (presumably the model returned by
            ``train``; confirm).
        csr_train: Sparse matrix whose rows are users; row ``k`` must correspond
            to ``user_ids[k]``.
        mf_sumbission: Submission template DataFrame; must contain a 'User-ID' column.
        top_n: Number of recommendations generated per user.
        batch_size: Number of users scored per ``recommend`` call.

    Returns:
        The template merged with the predictions on 'User-ID'; the 'Book-ID'
        column holds, per user, the list of recommended original book IDs.

    Raises:
        KeyError: If ``mf_sumbission`` has no 'User-ID' column.
    """
    user_key, item_key = 'User-ID', 'Book-ID'
    if user_key not in mf_sumbission.columns:
        raise KeyError(f"submission template has no '{user_key}' column to merge predictions on")

    records = []
    # One positional index per matrix row, processed in batches.
    to_generate = np.arange(csr_train.shape[0])
    for startidx in range(0, len(to_generate), batch_size):
        batch = to_generate[startidx : startidx + batch_size]
        ids, _scores = model.recommend(
            batch, csr_train[batch], N=top_n, filter_already_liked_items=False
        )
        for user_idx, rec_idx in zip(batch, ids):
            records.append({
                # Translate positional indices back to the original IDs.
                user_key: user_ids[user_idx],
                item_key: [item_ids[item_idx] for item_idx in rec_idx],
            })

    pred_dfs = pd.DataFrame(records, columns=[user_key, item_key])

    # The predictions only carry 'User-ID' and 'Book-ID', so 'User-ID' is the
    # only usable join key (merging on 'ID' raised KeyError). Any 'Book-ID'
    # column already in the template is dropped first so the result has a single
    # 'Book-ID' column holding the recommendations, not _x/_y suffixed duplicates.
    template = mf_sumbission.drop(columns=[item_key], errors='ignore')
    sumbission = template.merge(pred_dfs, on=user_key)

    return sumbission

In [None]:
def validate(csr_train, factors=200, iterations=3, regularization=0.05, show_progress=True):
    """Train an ALS model with the given hyper-parameters and report its ndcg.

    The model is fitted through ``train`` so that training and validation always
    build the model the same way. Predictions come from ``submit`` using the
    module-level ``mf_sumbission`` template and are scored against the
    module-level ``train_data_answer`` with cutoff ``n``.

    Args:
        csr_train: Sparse interaction matrix used for fitting and for recommending.
        factors: Number of latent factors.
        iterations: Number of ALS iterations.
        regularization: Regularization strength.
        show_progress: Forwarded to the model's ``fit`` call.

    Returns:
        The ndcg score returned by ``ndcg_calculator``.
    """
    model = train(
        csr_train,
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        show_progress=show_progress,
    )
    df_preds = submit(model, csr_train, mf_sumbission)
    # NOTE(review): train_data_answer is built from the "Book-Rating" column only,
    # while ndcg_calculator reads user / item-list columns from its `answer`
    # argument. Confirm the answer frame has the expected schema. The score is
    # also computed against training interactions, not a held-out set.
    ndcg = ndcg_calculator(train_data_answer, df_preds, n)
    # An earlier variant scored test_answer_week against sample_sumbission_week;
    # it was dropped because the submission was not stored at this point.
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> ndcg@25: {ndcg:6.5f}")
    return ndcg

# df_preds = submit(model, csr_train, sample_sumbission_week)
# mf_week_ndcg = ndcg_calculator(test_answer_week, df_preds, n)

In [None]:
%%time
# Exhaustive grid search over the ALS hyper-parameters (6 * 5 * 4 = 120 fits),
# keeping the configuration with the highest score returned by validate().
best_ndcg25 = 0
# Bound up front so the name exists even if no configuration scores above 0
# (previously that case left best_params undefined).
best_params = None
for factors in [30, 50, 100, 200, 500, 1000]:
    for iterations in [3, 5, 10, 15, 20]:
        for regularization in [0.01, 0.02, 0.05, 0.1]:
            ndcg25 = validate(csr_train, factors, iterations, regularization, show_progress=False)
            if ndcg25 > best_ndcg25:
                best_ndcg25 = ndcg25
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best ndcg@25 found. Updating: {best_params}")