In [1]:
import sys
sys.path.append("../src")
import pandas as pd
import surprise
import matplotlib.pyplot as plt
from tqdm import tqdm
from dataLoad import trainValidLoad
from metric import ndcg_calculator, hit_at_k

# Directory that trainValidLoad reads from, relative to this notebook.
path = "../dataset/"
(
    train,
    train_valid,
    sample_sumbission,
) = trainValidLoad(path)

In [2]:
# Number of items placed in every recommendation list; the same value is
# passed as the last argument to ndcg_calculator and hit_at_k below.
n = 10

In [3]:
# Top-n items by number of occurrences in train.
mp = train.item_id.value_counts().head(n).index
# Top-n items by number of distinct values in the `user` column.
mp_unique = train.groupby("item_id").user.nunique().nlargest(n).index

n_rows = len(sample_sumbission)

# Most-viewed items (the items that appear most often in train).
# Every row gets its own list object instead of n_rows references to a single
# shared list, so an in-place edit of one row cannot leak into the others.
# Bracket assignment is used because attribute assignment would silently
# create a plain attribute (not a column) if "item_id" were ever missing.
mp_items = mp.values.tolist()
mp_submission = sample_sumbission.copy()
mp_submission["item_id"] = [list(mp_items) for _ in range(n_rows)]

# Items viewed by the largest number of unique users.
mp_unique_items = mp_unique.values.tolist()
mp_unique_submission = sample_sumbission.copy()
mp_unique_submission["item_id"] = [list(mp_unique_items) for _ in range(n_rows)]

In [4]:
## Random
# Random baseline: n distinct items drawn from the set of items seen in train.
# random_state is fixed so that Restart & Run All reproduces the same draw,
# and therefore the same baseline scores.
random_item = train.item_id.drop_duplicates().sample(n, random_state=42).values
random_submission = sample_sumbission.copy()
# One independent list per row (no shared list object), assigned through
# bracket indexing so the target is always the "item_id" column.
random_submission["item_id"] = [
    random_item.tolist() for _ in range(len(sample_sumbission))
]

## Evaluate

In [5]:
%%time
random_ndcg = ndcg_calculator(train_valid, random_submission, n)
mp_ndcg = ndcg_calculator(train_valid, mp_submission, n)
mp_unique_ndcg = ndcg_calculator(train_valid, mp_unique_submission, n)

print("performance")
print(f"nDCG(random): {random_ndcg:.4f}")
print(f"nDCG(mp): {mp_ndcg:.4f}")
print(f"nDCG(mp unique): {mp_unique_ndcg:.4f}")

performance
nDCG(random): 0.0000
nDCG(mp): 0.0005
nDCG(mp unique): 0.0007
CPU times: total: 30.8 s
Wall time: 31.1 s


In [6]:
%%time
random_hit = hit_at_k(train_valid, random_submission, n)
mp_hit = hit_at_k(train_valid, mp_submission, n)
mp_unique_hit = hit_at_k(train_valid, mp_unique_submission, n)

print("performance")
print(f"hit(random): {random_hit:.4f}")
print(f"hit(mp): {mp_hit:.4f}")
print(f"hit(mp unique): {mp_unique_hit:.4f}")

performance
hit(random): 0.0000
hit(mp): 0.0006
hit(mp unique): 0.0006
CPU times: total: 15.7 s
Wall time: 16 s


# Last 30 Days MP (Most Popular)

In [9]:
# For home-shopping products, recent best-sellers are expected to matter most,
# so only the last 30 days of train are used for the most-popular (MP) lists.

recent_days = 30
# Window start = newest timestamp in train minus recent_days.
window_start = train["timestamp"].max() - pd.Timedelta(days=recent_days)
is_recent = train["timestamp"] >= window_start
train_recent30 = train[is_recent]

In [10]:
# NOTE: this cell rebinds mp, mp_unique, mp_submission and mp_unique_submission,
# replacing the all-time versions built earlier. Re-running an earlier
# evaluation cell after this one will score the 30-day lists instead.

# Top-n items by number of occurrences in the recent window.
mp = train_recent30.item_id.value_counts().head(n).index
# Top-n items by number of distinct values in the `user` column, recent window.
mp_unique = train_recent30.groupby("item_id").user.nunique().nlargest(n).index

n_rows = len(sample_sumbission)

# Most-viewed items (the items that appear most often in train_recent30).
# Every row gets its own list object instead of n_rows references to a single
# shared list, so an in-place edit of one row cannot leak into the others.
# Bracket assignment is used because attribute assignment would silently
# create a plain attribute (not a column) if "item_id" were ever missing.
mp_items = mp.values.tolist()
mp_submission = sample_sumbission.copy()
mp_submission["item_id"] = [list(mp_items) for _ in range(n_rows)]

# Items viewed by the largest number of unique users.
mp_unique_items = mp_unique.values.tolist()
mp_unique_submission = sample_sumbission.copy()
mp_unique_submission["item_id"] = [list(mp_unique_items) for _ in range(n_rows)]

In [11]:
%%time
random_ndcg = ndcg_calculator(train_valid, random_submission, n)
mp_ndcg = ndcg_calculator(train_valid, mp_submission, n)
mp_unique_ndcg = ndcg_calculator(train_valid, mp_unique_submission, n)

print("performance")
print(f"nDCG(random): {random_ndcg:.4f}")
print(f"nDCG(mp): {mp_ndcg:.4f}")
print(f"nDCG(mp unique): {mp_unique_ndcg:.4f}")

performance
nDCG(random): 0.0000
nDCG(mp): 0.0008
nDCG(mp unique): 0.0007
CPU times: total: 32.1 s
Wall time: 32.6 s


In [12]:
%%time
random_hit = hit_at_k(train_valid, random_submission, n)
mp_hit = hit_at_k(train_valid, mp_submission, n)
mp_unique_hit = hit_at_k(train_valid, mp_unique_submission, n)

print("performance")
print(f"hit(random): {random_hit:.4f}")
print(f"hit(mp): {mp_hit:.4f}")
print(f"hit(mp unique): {mp_unique_hit:.4f}")

performance
hit(random): 0.0000
hit(mp): 0.0007
hit(mp unique): 0.0006
CPU times: total: 16.5 s
Wall time: 16.6 s


- "매우 근소한 차이"지만, 최근 30일의 log만을 반영했을 때 MP의 결과가 미미하게 낫다 (nDCG 0.0005 → 0.0008, hit 0.0006 → 0.0007). 반면 MP unique는 변화가 없다 (nDCG 0.0007, hit 0.0006).