In [1]:
import sys
sys.path.append("../src")
from scipy.sparse import csr_matrix
import implicit
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy import sparse
import pickle
from dataLoad import trainValidLoad
from metric import ndcg_calculator, hit_at_k

# Directory handed to the project-local loader (a relative path).
path= "../dataset/"
# trainValidLoad (imported from dataLoad) returns three objects that are unpacked here.
# NOTE(review): presumably training data, hold-out validation data and a submission
# template (inferred from the names — confirm in dataLoad). Later cells read the
# "user" and "item_id" columns of `train` and merge `sample_sumbission` on "user".
# The spelling `sample_sumbission` is referenced by later cells, so it is left as is.
train, train_valid, sample_sumbission = trainValidLoad(path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): `n` is not read by any code visible in this notebook; the metric
# cells further down pass the literal 10 rather than this variable.
n = 10

In [3]:
def user_item_maps(df):
    """Build index <-> raw-id lookups for users and items and re-index ``df`` in place.

    The distinct values of ``df["user"]`` and ``df["item_id"]`` are numbered in
    order of first appearance. Both columns of ``df`` are then overwritten with
    those integer positions, so the caller's frame is mutated.

    Returns a 6-tuple:
        (list of raw users, list of raw items,
         {index: raw user}, {index: raw item},
         {raw user: index}, {raw item: index})
    """
    raw_users = df["user"].unique().tolist()
    raw_items = df["item_id"].unique().tolist()

    # Forward lookups: contiguous position -> original identifier.
    index_to_user = {}
    for position, raw_user in enumerate(raw_users):
        index_to_user[position] = raw_user
    index_to_item = {}
    for position, raw_item in enumerate(raw_items):
        index_to_item[position] = raw_item

    # Reverse lookups: original identifier -> contiguous position.
    user_to_index = {raw_user: position for position, raw_user in enumerate(raw_users)}
    item_to_index = {raw_item: position for position, raw_item in enumerate(raw_items)}

    # Overwrite the id columns of the caller's frame with the integer positions.
    df["user"] = df["user"].map(user_to_index)
    df["item_id"] = df["item_id"].map(item_to_index)

    return raw_users, raw_items, index_to_user, index_to_item, user_to_index, item_to_index

def make_csr_matrix(df, n_users=None, n_items=None):
    """Build a user-by-item CSR matrix with a 1.0 for every row of ``df``.

    Args:
        df: frame whose "user" and "item_id" columns already hold integer
            row/column indices (as written by ``user_item_maps``).
        n_users: number of matrix rows. When omitted, falls back to
            ``len(ALL_USERS)`` from the notebook's global namespace.
        n_items: number of matrix columns. When omitted, falls back to
            ``len(ALL_ITEMS)`` from the notebook's global namespace.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(n_users, n_items)``. Rows of
        ``df`` that repeat the same (user, item) pair are summed by the
        constructor, so such cells hold a count rather than 1.
    """
    # Passing explicit sizes removes the dependency on globals that are only
    # defined once a later cell has been executed.
    if n_users is None:
        n_users = len(ALL_USERS)
    if n_items is None:
        n_items = len(ALL_ITEMS)

    row = df["user"].values
    col = df["item_id"].values
    data = np.ones(df.shape[0])
    csr_train = csr_matrix((data, (row, col)), shape=(n_users, n_items))
    return csr_train


def train_mf(csr_train, factors=200, iterations=3, regularization=0.05, show_progress=True, random_state=42):
    """Fit an ``implicit`` Alternating Least Squares model on an interaction matrix.

    Args:
        csr_train: sparse interaction matrix passed straight to ``model.fit``
            (in this notebook, the user-by-item matrix from ``make_csr_matrix``).
        factors: number of latent factors.
        iterations: number of ALS iterations.
        regularization: regularization strength forwarded to the model.
        show_progress: whether ``fit`` displays a progress bar.
        random_state: seed forwarded to the model; defaults to the previously
            hard-coded value 42 so existing calls produce the same model.

    Returns:
        The fitted ``AlternatingLeastSquares`` model.
    """
    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=random_state,
    )
    model.fit(csr_train, show_progress=show_progress)
    return model


def real_submit(model, csr_train, sample_sumbission, top_k=10, batch_size=2000):
    """Generate top-k recommendations for every known user and attach them to the submission frame.

    Reads the notebook globals ``ALL_USERS``, ``user_ids`` and ``item_ids`` to
    enumerate users and to translate matrix indices back to raw identifiers.

    Args:
        model: fitted recommender exposing
            ``recommend(userids, user_items, N=..., filter_already_liked_items=...)``
            and returning ``(ids, scores)`` with one row per requested user.
        csr_train: user-by-item CSR matrix; the rows for each batch are passed
            to ``model.recommend``.
        sample_sumbission: frame with a "user" column holding raw user ids.
        top_k: number of items to recommend per user (previously the literal 10).
        batch_size: number of users scored per ``recommend`` call
            (previously the literal 2000).

    Returns:
        ``sample_sumbission`` merged with a "predicted_list" column on "user".
        The merge uses pandas' default inner join, so rows whose user has no
        prediction are dropped.
    """
    user_indices = np.arange(len(ALL_USERS))
    records = []
    for start in range(0, len(user_indices), batch_size):
        batch = user_indices[start : start + batch_size]
        # filter_already_liked_items=False: items present in csr_train for a
        # user are not excluded from that user's recommendations.
        ids, _scores = model.recommend(
            batch, csr_train[batch], N=top_k, filter_already_liked_items=False
        )
        for row, user_idx in enumerate(batch):
            # Translate matrix indices back to the raw user / item identifiers.
            records.append(
                {
                    "user": user_ids[user_idx],
                    "predicted_list": [item_ids[item_idx] for item_idx in ids[row]],
                }
            )
    pred_dfs = pd.DataFrame(records)
    return sample_sumbission.merge(pred_dfs, on="user")

In [4]:
# End-to-end run: build the id maps, the interaction matrix, fit ALS, then predict.
# NOTE: user_item_maps overwrites train["user"] and train["item_id"] in place with
# integer indices, so re-running this cell without reloading `train` re-maps ids
# that were already mapped. Reload the data before running it a second time.
ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map = user_item_maps(train)
# make_csr_matrix and real_submit read the globals assigned on the line above.
mf_csr = make_csr_matrix(train)
mf_model = train_mf(mf_csr)
mf_preds = real_submit(mf_model, mf_csr, sample_sumbission)

# Recorded run time (presumably wall time of this whole cell): 65m 57.3s

100%|██████████| 3/3 [03:28<00:00, 69.52s/it]


# Evaluate

In [37]:
%%time
# Score the ALS predictions against the validation data with the project-local
# ndcg_calculator; the third argument is presumably the cutoff k (the label
# printed below says nDCG@10) — confirm in the metric module.
mf_ndcg = ndcg_calculator(train_valid, mf_preds, 10)

print("performance")
print(f"nDCG@10(mf_ALS): {mf_ndcg:.4f}")

performance
nDCG@10(mf_ALS): 0.0399
CPU times: total: 11.5 s
Wall time: 11.7 s


In [38]:
%%time
# Hit metric from the project-local metric module; the third argument is
# presumably the cutoff k (the label printed below says hit@10) — confirm there.
mf_hit = hit_at_k(train_valid, mf_preds, 10)

print("performance")
print(f"hit@10(mf_ALS): {mf_hit:.4f}")

performance
hit@10(mf_ALS): 0.0279
CPU times: total: 5.67 s
Wall time: 5.88 s


In [39]:
%%time
# Same metric with 1 as the third argument. This reassigns `mf_hit`, so the value
# computed with 10 in the previous cell is no longer available afterwards.
# NOTE(review): the recorded output of this cell (0.0279) is identical to the
# recorded hit@10 value — verify that hit_at_k actually honours its k argument.
mf_hit = hit_at_k(train_valid, mf_preds, 1)

print("performance")
print(f"hit@1(mf_ALS): {mf_hit:.4f}")

performance
hit@1(mf_ALS): 0.0279
CPU times: total: 5.47 s
Wall time: 5.52 s


# Save model, factors, etc.

In [15]:
# Persist the fitted ALS model and the interaction matrix it was trained on.
# The context manager closes (and therefore flushes) the pickle file explicitly
# instead of leaving the handle open until garbage collection.
with open("../output/als_model.pkl", "wb") as model_file:
    pickle.dump(mf_model, model_file)
sparse.save_npz("../output/als_csr.npz", mf_csr)

In [29]:
# Extract user factors
user_factors = pd.DataFrame(mf_model.user_factors)
# Take the latent dimension from the fitted matrix rather than hard-coding 200,
# so the column naming keeps working if train_mf is called with another `factors`.
k = user_factors.shape[1]
user_factors.columns = [f"user_{i}" for i in range(k)]
# Row positions are matrix indices; translate them back to raw user ids.
user_factors.index = user_factors.index.map(user_ids)
user_factors = user_factors.reset_index().rename(columns={"index":"user"})
user_factors.to_parquet("../output/user_factor_als.parquet")


# Extract item factors
item_factors = pd.DataFrame(mf_model.item_factors)
item_factors.columns = [f"item_id_{i}" for i in range(item_factors.shape[1])]
# Row positions are matrix indices; translate them back to raw item ids.
item_factors.index = item_factors.index.map(item_ids)
item_factors = item_factors.reset_index().rename(columns={"index":"item_id"})

item_factors.to_parquet("../output/item_factor_als.parquet")