In [1]:
import sys
sys.path.append("../src")
from scipy.sparse import csr_matrix
import implicit
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy import sparse
import pickle
from dataLoad import testLoad
from metric import ndcg_calculator, hit_at_k

# Folder holding the input data, relative to this notebook.
path= "../dataset/"
# testLoad is imported from the project's dataLoad module (found via the "../src" sys.path entry).
# fe_test is later used as the interaction table (it must have "user" and "item_id" columns);
# final_submission and test_user_label are consumed by real_submit and the metric calls below.
# NOTE(review): exact schemas of the three returned objects are not visible here — confirm in dataLoad.
fe_test, final_submission, test_user_label = testLoad(path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): no visible cell reads `n`; the evaluation cells below pass
# their cutoffs as literals — confirm whether this constant is still needed.
n = 10

In [3]:
def user_item_maps(df):
    """Index the users and items of ``df`` and re-encode both columns in place.

    Distinct values of ``df["user"]`` and ``df["item_id"]`` are numbered
    0..n-1 in order of first appearance.

    Side effect: the ``user`` and ``item_id`` columns of ``df`` are
    overwritten with those integer indices.

    Returns:
        A 6-tuple ``(ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map)``:
        the two lists of distinct original values, the index -> original
        dictionaries, and the original -> index dictionaries.
    """
    distinct_users = df["user"].unique().tolist()
    distinct_items = df["item_id"].unique().tolist()

    # index -> original id
    idx_to_user = {pos: raw for pos, raw in enumerate(distinct_users)}
    idx_to_item = {pos: raw for pos, raw in enumerate(distinct_items)}

    # original id -> index (inverse of the dictionaries above)
    user_to_idx = {raw: pos for pos, raw in idx_to_user.items()}
    item_to_idx = {raw: pos for pos, raw in idx_to_item.items()}

    # Overwrite the caller's columns with the compact integer indices.
    for column, lookup in (("user", user_to_idx), ("item_id", item_to_idx)):
        df[column] = df[column].map(lookup)

    return (
        distinct_users,
        distinct_items,
        idx_to_user,
        idx_to_item,
        user_to_idx,
        item_to_idx,
    )

def make_csr_matrix(df):
    """Build the user x item interaction matrix from an index-encoded frame.

    Every row of ``df`` contributes a 1.0 at (``df["user"]``, ``df["item_id"]``).
    The matrix shape comes from the notebook-level globals ``ALL_USERS`` and
    ``ALL_ITEMS``, so ``user_item_maps`` must have been run first.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape
        ``(len(ALL_USERS), len(ALL_ITEMS))``.
    """
    matrix_shape = (len(ALL_USERS), len(ALL_ITEMS))
    coordinates = (df["user"].values, df["item_id"].values)
    # One unit of implicit feedback per interaction row.
    weights = np.ones(df.shape[0])
    return csr_matrix((weights, coordinates), shape=matrix_shape)


def train_mf(csr_train, factors=50, iterations=3, regularization=0.05, show_progress=True):
    """Fit an ``implicit`` AlternatingLeastSquares model on ``csr_train``.

    Args:
        csr_train: Interaction matrix handed to ``model.fit``.
        factors: Forwarded to the ALS constructor.
        iterations: Forwarded to the ALS constructor.
        regularization: Forwarded to the ALS constructor.
        show_progress: Forwarded to ``fit``.

    Returns:
        The fitted model. ``random_state`` is fixed at 42.
    """
    als_kwargs = dict(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    als = implicit.als.AlternatingLeastSquares(**als_kwargs)
    als.fit(csr_train, show_progress=show_progress)
    return als


def real_submit(model, csr_train, final_submission, N=10, batch_size=2000):
    """Recommend ``N`` items for every user and join them onto the submission frame.

    Relies on the notebook-level globals ``ALL_USERS``, ``user_ids`` and
    ``item_ids`` created by ``user_item_maps``.

    Args:
        model: Fitted recommender whose
            ``recommend(user_indices, user_items, N=..., filter_already_liked_items=...)``
            returns ``(ids, scores)`` with one row of item indices per user.
        csr_train: User x item matrix; it is row-sliced with each batch of
            user indices.
        final_submission: DataFrame with a ``user`` column of original user ids.
        N: Number of items recommended per user.
        batch_size: Number of users sent to ``model.recommend`` per call.

    Returns:
        ``final_submission`` merged with the predictions on ``user`` (pandas'
        default inner merge). If ``final_submission`` already has a
        ``predicted_list`` column, pandas suffixes the two columns as
        ``predicted_list_x`` (submission) and ``predicted_list_y`` (model).
    """
    user_indices = np.arange(len(ALL_USERS))
    records = []
    for start in range(0, len(user_indices), batch_size):
        batch = user_indices[start : start + batch_size]
        # Already-seen items are deliberately left in the candidate set.
        ids, _ = model.recommend(
            batch, csr_train[batch], N=N, filter_already_liked_items=False
        )
        for row, user_idx in enumerate(batch):
            records.append(
                {
                    # Translate matrix indices back to the original ids.
                    "user": user_ids[user_idx],
                    "predicted_list": [item_ids[item_idx] for item_idx in ids[row]],
                }
            )
    # Explicit columns keep the frame mergeable even when there are no users.
    pred_dfs = pd.DataFrame(records, columns=["user", "predicted_list"])
    # The previous version returned `df` with this merge commented out, which
    # raised NameError on a fresh kernel.
    return final_submission.merge(pred_dfs, on="user")

In [4]:
# Build the id <-> index maps. user_item_maps also overwrites fe_test["user"] and
# fe_test["item_id"] with integer indices, so re-running this cell on the
# already-encoded frame would map the indices a second time.
ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map = user_item_maps(fe_test)
# make_csr_matrix reads ALL_USERS / ALL_ITEMS as globals for the matrix shape.
mf_csr = make_csr_matrix(fe_test)
# Fit ALS with train_mf's defaults (factors=50, iterations=3, regularization=0.05).
mf_model = train_mf(mf_csr)
# real_submit reads ALL_USERS / user_ids / item_ids as globals.
mf_preds = real_submit(mf_model, mf_csr, final_submission)

100%|██████████| 3/3 [01:50<00:00, 36.85s/it]


In [11]:
# mf_preds is expected to carry pandas' merge-suffixed columns: presumably
# predicted_list_x from final_submission and predicted_list_y from the ALS
# predictions. Keep the ALS column under the plain name.
# errors="ignore" makes the cell safe to re-run: on a second run
# predicted_list_x is already gone, and rename skips missing labels by default.
mf_preds = (
    mf_preds
    .drop(columns="predicted_list_x", errors="ignore")
    .rename(columns={"predicted_list_y": "predicted_list"})
)

In [15]:
# Write the cleaned ALS predictions to disk as the submission file.
mf_preds.to_parquet("../output/test_final_als_submit.parquet")

# Evaluate

In [23]:
%%time
# nDCG at cutoff 10 for the ALS predictions, computed by the project's
# metric.ndcg_calculator against test_user_label.
# NOTE(review): test_user_label is presumably the per-user ground truth — confirm in dataLoad.
mf_ndcg = ndcg_calculator(test_user_label, mf_preds, 10)

print("performance")
print(f"nDCG@10(mf_ALS): {mf_ndcg:.4f}")

performance
nDCG@10(mf_ALS): 0.0078
CPU times: total: 1min 13s
Wall time: 1min 14s


In [19]:
%%time
# Hit rate at cutoff 10 for the ALS predictions, computed by the project's metric.hit_at_k.
mf_hit = hit_at_k(test_user_label, mf_preds, 10)

print("performance")
print(f"hit@10(mf_ALS): {mf_hit:.4f}")

performance
hit@10(mf_ALS): 0.0029
CPU times: total: 36.3 s
Wall time: 36.9 s


In [24]:
%%time
# Hit rate at cutoff 1. This reuses the name mf_hit, so the hit@10 value from
# the other evaluation cell is overwritten.
# NOTE(review): the recorded outputs show the same value (0.0029) for hit@1
# and hit@10 — verify that hit_at_k actually honours its cutoff argument.
mf_hit = hit_at_k(test_user_label, mf_preds, 1)

print("performance")
print(f"hit@1(mf_ALS): {mf_hit:.4f}")

performance
hit@1(mf_ALS): 0.0029
CPU times: total: 40.2 s
Wall time: 40.9 s


# Save model, factors, etc.

In [25]:
# Persist the fitted ALS model. The context manager closes (and flushes) the
# file handle, which the bare open() call left dangling.
with open("../output/test_als_model.pkl", "wb") as model_file:
    pickle.dump(mf_model, model_file)
# Persist the interaction matrix the model was trained on.
sparse.save_npz("../output/test_als_csr.npz", mf_csr)

In [26]:
# Extract user factors: one row per user index, one column per latent dimension.
user_factors = pd.DataFrame(mf_model.user_factors)
# Take the latent dimension from the factor matrix rather than hard-coding 50,
# so the column naming keeps working when train_mf is called with another `factors`.
k = user_factors.shape[1]
user_factors.columns = [f"user_{i}" for i in range(k)]
# Swap the positional row index for the original user id, then expose it as a column.
user_factors.index = user_factors.index.map(user_ids)
user_factors = user_factors.reset_index().rename(columns={"index":"user"})
user_factors.to_parquet("../output/test_user_factor_als.parquet")


# Extract item factors: same layout, keyed by the original item id.
item_factors = pd.DataFrame(mf_model.item_factors)
item_factors.columns = [f"item_id_{i}" for i in range(item_factors.shape[1])]
item_factors.index = item_factors.index.map(item_ids)
item_factors = item_factors.reset_index().rename(columns={"index":"item_id"})

item_factors.to_parquet("../output/test_item_factor_als.parquet")