In [1]:
import pandas as pd
import warnings
# Ignore every warning of category FutureWarning from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# NOTE(review): the working directory is moved up one level before the star
# import below -- presumably so that `candidate_generation` can be found;
# confirm against the repository layout.
os.chdir('../')
# Star import: later cells use names that are never imported explicitly in this
# notebook (e.g. `generate_item_similarities`, `tqdm`) -- presumably they are
# provided by this module; verify.
from candidate_generation import *
# Every relative path used by later cells is resolved against this directory.
# Both chdir calls are relative to the current working directory, so re-running
# this cell without restarting the kernel moves the working directory again.
os.chdir('../../data/')

In [2]:
# Load the pickled transaction, customer and article tables.
transactions = pd.read_pickle('../data/compressed_data/transactions_train.pkl')
customers = pd.read_pickle('../data/compressed_data/customers.pkl')
articles = pd.read_pickle('../data/compressed_data/articles.pkl')

# Week index counted backwards from the most recent transaction date: the
# 7 days ending on that date get week 104, each earlier 7-day window one less.
days_before_latest = (transactions['t_dat'].max() - transactions['t_dat']).dt.days
transactions['week'] = 104 - days_before_latest // 7

first_week = transactions['week'].min()
last_week = transactions['week'].max()
print('First week num: ', first_week, '\nLast week num: ', last_week, '\n')

# The last week present in the data is held out as the test week; the
# 10 weeks immediately before it form the training window.
test_week = last_week
train_weeks = range(test_week - 10, test_week)

in_train_window = transactions['week'].isin(train_weeks)
in_test_week = transactions['week'] == test_week

transactions_train = transactions[in_train_window]
transaction_test = transactions[in_test_week]

First week num:  0 
Last week num:  104 



# Generate `sim_not_bought` candidates (items similar to a customer's purchases that they have not bought), k=200, with ranks

In [3]:
# Item similarities (cosine, per the original note) are computed over every
# article seen in the train window plus the test week, i.e. the last 10 weeks
# of training data together with the held-out week.
recent_transactions = pd.concat([transactions_train, transaction_test])
sims = generate_item_similarities(recent_transactions, articles)

39521it [03:38, 180.66it/s]


In [19]:
# Inputs for candidate generation. Test-week transactions are included on
# purpose: the final prediction uses them as well.
k = 200  # number of candidates requested per (customer, week)
similarities_dict = sims.copy()  # separate copy of `sims`
t = pd.concat([transactions_train, transaction_test])

In [20]:
# One list of purchased article ids per (customer_id, week) pair, covering
# both the train window and the test week.
articles_by_customer_week = t.groupby(['customer_id', 'week'])['article_id']
user_purchases = articles_by_customer_week.apply(list)
user_purchases

customer_id  week
0            102                                      [568601043]
1            94                                       [826211002]
2            103                                      [794321007]
4            99      [896152002, 730683050, 927530004, 791587015]
6            103                           [719530003, 448509014]
                                         ...                     
1371976      98      [853654012, 624486088, 853654002, 759871030]
             99                                       [762846031]
1371977      95                 [636323002, 835561003, 795975009]
             102                [689365050, 884081001, 794819001]
             103                                      [762846027]
Name: article_id, Length: 831706, dtype: object

In [22]:
#
# For every (customer, week) basket, collect the k items most similar to the
# articles bought in that basket, leaving out the articles that were bought.
#
# Each entry of `similarities_dict` is used as a DataFrame with at least the
# columns `article_id` and `similarity`.
#

total_candidates = []

# Iterate over (index, value) pairs directly instead of doing one `.loc`
# lookup per (customer, week) key; `total` keeps the progress bar sized.
for (cid, week), items in tqdm(user_purchases.items(), total=len(user_purchases)):
    # One similarity table per purchased article, stacked into a single frame.
    sim_df = pd.concat([similarities_dict[item] for item in items])

    candidates = (
        sim_df[~sim_df.article_id.isin(items)]
        # Sort BEFORE de-duplicating: an article that is similar to several
        # purchased items appears once per item, and the row that must survive
        # is the one with the highest similarity. The stable sort keeps tied
        # rows in a deterministic order, so `head(k)` is reproducible.
        .sort_values(by='similarity', ascending=False, kind='mergesort')
        .drop_duplicates(subset=['article_id'], keep='first')
        .rename(columns={'similarity': 'similarity_score'})
        .head(k)
        # `assign` returns a new frame, so nothing is written onto a slice.
        .assign(customer_id=cid, week=week)
    )

    total_candidates.append(candidates)

100%|██████████| 831706/831706 [13:42<00:00, 1011.42it/s] 


In [24]:
# Stack the per-(customer, week) candidate frames, tag every row with the
# strategy name and write the table to disk.
output_path = '../data/candidates_200_ranks/sim_not_bought.csv'

result = pd.concat(total_candidates)
result['strategy'] = 'sim_not_bought'

# `to_csv` does not create missing parent directories; make sure the target
# folder exists so the long candidate-generation run is not lost at the end.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result.to_csv(output_path, index=False)

In [26]:
# Spot-check ten randomly drawn candidate rows.
result.sample(n=10)

Unnamed: 0,article_id,similarity_score,customer_id,week,strategy
470,815434005,0.990989,138112,100,sim_not_bought
494,751664001,1.0,588242,103,sim_not_bought
493,783707046,1.0,476944,97,sim_not_bought
429,841260002,0.994158,181285,100,sim_not_bought
442,810836001,0.99514,997864,102,sim_not_bought
307,266873006,0.87469,85192,95,sim_not_bought
315,724378001,0.922175,866800,94,sim_not_bought
442,935548001,0.976015,1176214,100,sim_not_bought
379,557048002,0.771031,251688,100,sim_not_bought
397,803454001,0.997391,501694,94,sim_not_bought
