In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
import datetime
import gensim
from gensim.models import Word2Vec, KeyedVectors

import os
import sys
import time

In [3]:
print(f"python version : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",)

print("np version : ", np.__version__)
print("pd version : ",pd.__version__)

python version : 3.8.10
np version :  1.24.3
pd version :  2.0.3


In [30]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

Goal: predict article for each customer for next 7 days

dataset : https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data

In [None]:
interaction_df = pd.read_csv("data/h&m/transactions_train.csv")
user_df = pd.read_parquet("data/h&m/customers.parquet")
item_df = pd.read_parquet("data/h&m/articles.parquet")

# configs

In [4]:
date_col = 't_dat'
user_col = 'customer_id'
item_col = 'product_code'

In [None]:
interaction_df[date_col] = pd.to_datetime(interaction_df[date_col], format='%Y-%m-%d')
print("shape : ", interaction_df.shape)
interaction_df[['customer_id', 'article_id', 'sales_channel_id']].nunique()

In [None]:
interaction_df.describe()

In [None]:
print(user_df.shape)
user_df.head()

In [None]:
print(item_df.shape)
item_df.head()

In [None]:
# Some of items have same product_code but different product name. BUT good things is all other columns seems to be same.
# We will ignore them for now and just use latest product name for item_cd2nm_map
item_df.groupby(['product_code'])[['prod_name']].nunique().sort_values("prod_name", ascending=False).iloc[:6]

In [None]:
item_cd2nm_map = dict(zip(item_df[item_col],  item_df["prod_name"]))

In [None]:
len(item_cd2nm_map)

In [None]:
with open('data/h&m/item_cd2nm_map.pickle', 'wb') as f:
    pickle.dump(item_cd2nm_map, f)

# Item2Vec

## Train

In [None]:
# simply add prdt_cd
interaction_df = interaction_df.merge(item_df[['article_id', item_col]], how='left', on='article_id')

In [None]:
train_df = interaction_df.loc[interaction_df['t_dat'] < '2020-01-01'].copy()
test_df = interaction_df.loc[interaction_df['t_dat'] >= '2020-01-01'].copy()

In [None]:
print(f"trainset date range : {train_df[date_col].min()} ~ {train_df[date_col].max()}")
print(f"testset date range : {test_df[date_col].min()} ~ {test_df[date_col].max()}")

In [None]:
train_df.head()

In [None]:
# Sequence for each user on all train dataset.
user_seq = train_df.groupby(user_col)[item_col].apply(list)
user_seq_df = pd.DataFrame(user_seq).rename(columns={item_col:"seq_list"})
tr_seq_df = train_df.drop_duplicates(user_col, keep='first')
tr_seq_df = tr_seq_df.merge(user_seq_df, on=user_col)
tr_seq_df['n_seq'] = tr_seq_df['seq_list'].str.len()

In [None]:
tr_seq_df.head()

In [None]:
tr_seq_df[['n_seq']].hist(bins=100, figsize=(4, 3))

In [None]:
tr_sequences = tr_seq_df['seq_list'].tolist()

In [None]:
tr_sequences[:2]

In [None]:
len(tr_sequences)

In [None]:
model = Word2Vec(vector_size = 100, #
                 workers = 4, # s
                 sg = 1, # 1 = Skip-gram, else = CBOW
                 hs = 0, # Hierachical softmax = 1, else Negative sampling
                 negative = 3, 
                 window = 9999)

In [None]:
model

In [None]:
st_time = time.perf_counter()
model.build_vocab(tr_sequences, progress_per=10_000)
model.train(tr_sequences, total_examples=model.corpus_count,
           epochs=10)
print(time.perf_counter() - st_time)

In [None]:
"""
Store trained word vectors in `KeyedVectors` instance
No need to save model (containing curr state) if no additional training is required.

Keeping only word embedding space makes it more memory efficient
"""
item_vectors = model.wv
item_vectors[663713001] # get item_id's item vector


item_vectors.save("trained_model/item2vec_emb_space.wordvectors")

## Inference

In [5]:
learned_item_embeddings = KeyedVectors.load("trained_model/item2vec_emb_space.wordvectors"
                           , mmap='r' # Load with memory-mapping which is read-only, shared across processes thus faster.
                           )

In [None]:
import umap
cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
                              n_components=2, random_state=42).fit_transform(X)
plt.figure(figsize=(10,9))
plt.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3, cmap='Spectral')

In [None]:
user_seq = test_df.groupby(user_col)[item_col].apply(list)
user_seq_df = pd.DataFrame(user_seq).rename(columns={item_col:"seq_list"})
test_seq_df = test_df.drop_duplicates(user_col, keep='first')
test_seq_df = test_seq_df.merge(user_seq_df, on=user_col)
test_seq_df['n_seq'] = test_seq_df['seq_list'].str.len()

# # filter : n_seq > 3 to use both: item2item, seq(user)2item  
# test_seq_df = test_seq_df.loc[test_seq_df['n_seq'] >= 3].copy()
test_seq_df['true_y'] = test_seq_df['seq_list'].apply(lambda x: x[-1])

In [6]:
# test_seq_df.to_parquet("data/h&m/test_seq_df.parquet")
test_seq_df = pd.read_parquet("data/h&m/test_seq_df.parquet")

In [None]:
test_seq_df.groupby(['n_seq']).count().iloc[:10]

In [None]:
print(test_seq_df.shape)
test_seq_df.head()

In [7]:
from cart import get_recc, get_user_vector

In [23]:
topk = 10 # so we can recomend at least 10 items with 1 item vector to represent that user.

test_seq_df = test_seq_df.loc[test_seq_df['n_seq'] > min_n_seq].copy()
test_seq_df['input_items'] = test_seq_df['seq_list'].apply(lambda row: row[:-topk])
test_seq_df['target_items'] = test_seq_df['seq_list'].apply(lambda row: row[-topk:])

In [28]:
st_time = time.perf_counter()
rec_result_dfs = []
column_order = [user_col, item_col, 'item_nm', 'score', 'rank']
topk = 10
i = 0
for row in test_seq_df.iloc[:1000].itertuples():
    i += 1
    if i % 5000 == 0:
        print(f"on {i}th user")
    user_id = getattr(row, user_col)
    input_item_seq = getattr(row, 'input_items')
    
    # 1) Combine item vectors to create user vector
    user_vector, cold_items = get_user_vector(learned_item_embeddings, input_item_seq, method='sum')
    
    # all cold-items, cannot make rec
    if not np.any(user_vector):
        continue

    # 2)Get rec for each user
    # hm... How should I leverage arithmetics ?
    rec_df = get_recc(learned_item_embeddings, item_col, user_vector, negatives=None, topn=topk)
    rec_df[user_col] = user_id
    rec_df['n_cold_items'] = len(cold_items)
    rec_result_dfs.append(rec_df[column_order])

print(f"recc on {len(test_seq_df)}users, {topk} recc each took : {time.perf_counter() - st_time}s")

recc on 315563users, 10 recc each took : 45.95136070000001s


In [77]:
rec_result_df = pd.concat(rec_result_dfs)

In [81]:
df = pd.DataFrame(rec_result_df.groupby(user_col)[item_col].apply(lambda x: list(x))).rename(columns={item_col:"topk_rec"})
perm_metric_df = test_seq_df.merge(df, left_on='customer_id', right_index=True
                                   , how='inner'
                                  )

In [82]:
print(perm_metric_df.shape)
perm_metric_df.isna().sum()

(1000, 12)


t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
product_code        0
seq_list            0
n_seq               0
true_y              0
input_items         0
target_items        0
topk_rec            0
dtype: int64

In [83]:
def prec_at_k(topk_recs, target_items):
    rec_items_set = set(topk_recs[:topk])
    target_items_set = set(target_items)
    
    return len(rec_items_set&target_items_set) / topk

def recall_at_k(topk_recs, target_items):
    rec_items_set = set(topk_recs[:topk])
    target_items_set = set(target_items)
    denom = n_of_relevant_items # ...? But in item2vec aren't all items considered relevant?
    
    return len(rec_items_set&target_items_set) / denom

In [85]:
perm_metric_df[f'prec@{topk}']=perm_metric_df.apply(lambda row: prec_at_k(row.topk_rec, row.target_items), axis=1)

In [90]:
print(f"prec@{topk} = ", np.sum(perm_metric_df['prec@10']) / len(perm_metric_df) )

prec@10 =  0.0253


In [None]:
rec_result_df = pd.concat(rec_result_dfs)
rec_result_df.to_parquet("data/h%m/rec_result_df.parquet")

In [None]:
# Evaluation
 # ??? How to measure other rec/classification metrics?, is hit@K only possible one? 
     # if not is it fair to compare with deepfm metrics?
for csno in csno_lst:
    csno_df = rec_result_df.loc[rec_result_df['csno'] == csno].copy()
    rec_set = set(csno_df.iloc[:topk]['recommended_item_id'])
    target = {csno_df['target_item_id'][0]}
    # hit@K
    if len(rec_set.intersection(target)) > 0:
        users_hit.append(1)
    else:
        users_hit.append(0)

hit_at_k = np.sum(users_hit) / len(csno_lst) 

In [None]:
test_seq_df.iloc[0]['seq_list']

In [None]:
test_seq_df.iloc[0]['item2item_input']

In [None]:
test_seq_df.head()

### Seq2Vec

- combine sequence of item vectors to generate user vector, find similar items in trained item embedding for recommendation.
- Only difference between Seq2Vec and Item2Vec is during inference, number of items combined. Item2Vec simply use one item's embedding to find similar items. 

1. t-1 의 item 만 주고 추천
2. 0~t-1 의 모든 item 들을 합하여 user vector 생성후 추천.

In [None]:
test_seq_df.head()

In [None]:
# 1. Lookup embedding vector from item_id
vec = item_vectors[806973001]

# 2. Search nearest K-items


In [None]:
vec

In [None]:

for row in test_seq_df.iter_tuple():
    row = getattr()

# DeepFM

# Performance Metrics
- Precision@K
- Recall@K
- NDCG@K
- MAP@K
- Coverage

In [None]:
def _compute_precision_recall(targets, predictions, k):
    """
    targets : 실제 관심 있는 item list 
    predictions : prob sorted in descending order.
    """
    pred = predictions[:k]
    num_hit = len(set(pred).intersection(set(targets))) #  hit는 k개 추천 아이템 중에 사용자가 실제로 관심 있는 상품이 존재하는 경우를 의미하며, = TP
    precision = float(num_hit) / len(pred)
    recall = float(num_hit) / len(targets)
    return precision, recall

def ap(rel_items, recc_items, k=5):
    precisions = []
    for i in range(k):
        k_recc = recc_items[:i+1]
        n_hits = set(rel_items) & set(recc_items)
        rel = recc_items[i] in rel_item #1/0
        precisions.append(len(n_hits)/len(rel_items)*rel))
    ap_k = sum(precisions) / len(precisions)
    return ap_k


# References
1. [Item2Vec_tutorial_with_recsys_application](https://github.com/bwange/Item2vec_Tutorial_with_Recommender_System_Application/blob/master/Making_Your_Own_Recommender_System_with_Item2Vec.ipynb)
2. [Word2Vec Tutorial Part I SkipGram - Chris McCormick](https://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model)
3. [Word2Vec Tutorial Part II Negative Sampling - Chris McCormick](https://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/)
4. [Word2Vec to Recsys and Advertising - Chris McCormick](https://mccormickml.com/2018/06/15/applying-word2vec-to-recommenders-and-advertising/)