In [2]:
import sys
sys.path.append("../src")
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
import lightgbm
from dataLoad import trainValidLoad
from metric import ndcg_calculator, hit_at_k

path= "../dataset/"
train, train_valid, sample_sumbission = trainValidLoad(path)

In [4]:
train = train.sort_values(by ="timestamp", ascending = True)\
        .reset_index(drop=True).drop(columns=["timestamp","click_count_normalized","user_click_count_normalized"])

In [5]:
# 현 데이터셋에서는 train에서 또다시 valid를 나눌 수가 없다.
# 이미 train에 단 1건의 데이터만 가진 유저가 존재하기 때문.
# 그러니 우리는 train_valid의 기록을 가지고 label_df를 만들어야 한다. 

train_user_click_count =train.user.value_counts()
print(f"유저 별 아이템 클릭수 최소: {train_user_click_count.min()}")

유저 별 아이템 클릭수 최소: 1


In [6]:
def make_label(train_valid):
    label_df = train_valid[["user","item_id"]]
    label_df.drop_duplicates(subset=["user","item_id"],inplace=True)
    label_df["click_num"] = 1
    return label_df

def label_create_df(train, label_df):
    train = pd.merge(train, label_df, how="left", on=["user","item_id"])
    train["click_num"] = train["click_num"].fillna(0)
    return train

In [7]:
label_df = make_label(train_valid)
train = label_create_df(train, label_df)

In [8]:
# print(train[train["click_num"]==1].count()) #91997
# print(train[train["click_num"]==0].count()) #16298892

# cadidate 생성 필요
# train with basic featuresExtract

In [95]:
def _ndcg_calculator(gt, rec, idcg):
    if not isinstance(rec, list):
        rec = [rec]
    dcg = 0.0
    for i, r in enumerate(rec):
        if r in set(gt):
            dcg += 1.0 / np.log(i + 2)
    return dcg / idcg

def ndcg_calculator(answer, submission, n=10):
    idcg = sum((1.0 / np.log(i + 1) for i in range(1, n + 1)))
    assert (answer.user != submission.user).sum() == 0
    ndcg_list = []
    for (_, row_answer), (_, row_submit) in zip(answer.iterrows(), submission.iterrows()):
        ndcg_list.append(_ndcg_calculator(row_answer.item_id, row_submit.predicted_list, idcg))
    ndcg_score = sum(ndcg_list) / len(answer)
    return ndcg_score


def hit_at_k(answer, submission, n=10):
    assert (answer.user != submission.user).sum() == 0
    if n > len(answer):
        n = len(answer)
    hit = 0
    for i in range(len(answer)):
        answer_ids = answer["item_id"].loc[i]
        submission_ids = submission["predicted_list"].loc[i]
        hits = np.isin(submission_ids, answer_ids)
        hit += np.sum(hits) / len(hits)
    hit /= len(answer)
    return hit


In [91]:
# 시간이 된다면 class 리팩토링
def lgbm_preprocess(train):
    X_train = train.drop(columns=["click_num"])
    y_train = train["click_num"]
#     group_dict = train.groupby("user")["user"].count().to_dict()
    item_idx = X_train["item_id"].copy()
    user_idx = X_train["user"].copy()
    del X_train["item_id"], X_train["user"]
    return X_train, y_train, item_idx, user_idx

def make_groups(X_train):
    group_dict = []
    batchsize = len(X_train)
    while True:
        if batchsize >= 2000:
            group_dict.append(2000)
            batchsize = batchsize - 2000
        else:
            group_dict.append(batchsize)
            break
            
def train_LgbmRanker_batch(X_train, y_train, model_params, batch_size):
    model = lightgbm.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        boosting_type="dart",
        num_leaves= model_params["num_leaves"],
        learning_rate=model_params["learning_rate"],
        n_estimators= model_params["n_estimators"],
        importance_type="gain",
        verbose= -1,
        random_state= 42
        )
    num_batches = X_train.shape[0] // batch_size
    for i in tqdm(range(0, X_train.shape[0], batch_size), total=num_batches):
        X_batch = X_train[i:i + batch_size]
        y_batch = y_train[i:i + batch_size]
#         user_array = X_batch.user.to_numpy()
#         group_vectorized = np.vectorize(lambda x: group_dict.get(x, 0))
#         train_group_batch = group_vectorized(user_array) 
        group_dict = []
        batchsize = len(X_batch)
        while batchsize !=0 :
            if batchsize >= 1000:
                group_dict.append(1000)
                batchsize = batchsize - 1000
            else:
                group_dict.append(batchsize)
                batchsize -= batchsize 
        model.fit(
        X=X_batch,
        y=y_batch,
        group=group_dict,
        ) 
    feature_importances_df = pd.DataFrame(dict(zip(X_train.columns, model.feature_importances_)), index=["feature_importances"]).T
    return model, feature_importances_df

    
def valid_evaluation(X_train, train_valid, sample_sumbission, model, feature_importances_df, item_idx, user_idx): 
    print(feature_importances_df)
    pred = model.predict(X_train)
    X_train["pred"] = pred
    X_train["item_id"] = item_idx
    X_train["user"] = user_idx
    
    print("performance")
    # each user pred 25 items
    lgbm_sub_df = X_train.sort_values(by="pred", ascending=False).groupby("user").head(25)
    lgbm_user_items_dict = lgbm_sub_df.groupby("user")["item_id"].unique().to_dict()
    sample_sumbission["predicted_list"] = sample_sumbission["user"].apply(lambda x: lgbm_user_items_dict.get(x, []))

    print("lgbm ndcg:", ndcg_calculator(sample_sumbission, train_valid, 10))
        
    return X_train, sample_sumbission

In [93]:
import gc
gc.collect()

46

In [23]:
model_params={"num_leaves":150 ,"learning_rate":0.005,"n_estimators":35}

In [46]:
X_train, y_train, item_idx, user_idx = lgbm_preprocess(train)

In [16]:
model, feature_importances_df = train_LgbmRanker_batch(X_train, y_train, model_params, 10000)

3147it [12:44,  4.12it/s]                          


In [98]:
sample_sumbission = sample_sumbission.drop(columns=["predicted_list"])
sample_sumbission= sample_sumbission.rename(columns={"item_id":"predicted_list"})

In [99]:
sample_sumbission
# 부족한 row는 앙상블해서 채우는 작업 필요 

Unnamed: 0,user,predicted_list
0,43249,"[598030, 175484, 752807, 113984, 344194, 80814..."
1,167215,"[430431, 884977, 498215, 124182, 779861, 14314..."
2,574608,"[647443, 834767, 278264, 810252, 563373, 90209..."
3,121721,"[642023, 898865, 8359, 721438, 437904, 150199,..."
4,283623,"[226711, 785029, 128078, 462582, 399089, 10893..."
...,...,...
85670,65,"[830264, 282343, 774189, 457558, 106842]"
85671,795,"[622313, 539585]"
85672,654,"[127216, 899825, 860471]"
85673,357,"[918732, 471615]"


In [70]:
X_train, sample_sumbission = \
valid_evaluation(X_train, train_valid, sample_sumbission, model, feature_importances_df, item_idx, user_idx, 10)
#  nDCG@10

                  feature_importances
day_of_week                  0.000000
days                         0.000000
hour                         0.907654
weeks                        0.000000
cumcount                   756.874029
click_count                371.160690
user_click_count           549.257184
performance
lgbm ndcg: 0.1669152321587026


In [84]:
%%time
mf_hit = hit_at_k(train_valid, sample_sumbission, 10)

print("performance")
print(f"hit@10(mf_ALS): {mf_hit:.4f}")

performance
hit@10(mf_ALS): 0.1189
CPU times: user 4.5 s, sys: 1.39 ms, total: 4.5 s
Wall time: 4.51 s


In [85]:
%%time
mf_hit = hit_at_k(train_valid, sample_sumbission, 1)

print("performance")
print(f"hit@1(mf_ALS): {mf_hit:.4f}")

performance
hit@1(mf_ALS): 0.1189
CPU times: user 4.56 s, sys: 3.43 ms, total: 4.56 s
Wall time: 4.57 s


In [19]:
X_train.to_parquet("../output/batch1000_lgbm_X_train.parquet", index=False)
pickle.dump(model, open("../output/batch1000_lgbm_model.pkl", "wb"))