In [21]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import sys

# Make the parent directory importable instead of chdir-ing into it and back.
# The chdir round-trip hard-coded this notebook's folder name ('lecture_4') and,
# if the import raised, left the kernel one level up so that every relative
# data path used afterwards pointed at the wrong place.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    # Front of the path so the parent's module is the one that gets imported.
    sys.path.insert(0, _parent_dir)
from eval_helpers import recall_at_k

In [2]:
transactions = pd.read_pickle('../../../data/compressed_data/transactions_train.pkl')
customers = pd.read_pickle('../../../data/compressed_data/customers.pkl')
articles = pd.read_pickle('../../../data/compressed_data/articles.pkl')

# Number weeks backwards from the most recent transaction date: the 7 days
# ending on that date are week 104, the 7 days before that week 103, and so on.
last_day = transactions['t_dat'].max()
days_before_last = (last_day - transactions['t_dat']).dt.days
transactions['week'] = 104 - days_before_last // 7

print('First week num: ', transactions['week'].min(), '\nLast week num: ', transactions['week'].max(), '\n')

# Hold out the highest week number for evaluation and keep the 10 weeks
# immediately before it as training data.
test_week = transactions['week'].max()
train_weeks = range(test_week - 10, test_week)

is_train_week = transactions['week'].isin(train_weeks)
is_test_week = transactions['week'] == test_week
transactions_train = transactions[is_train_week]
transaction_test = transactions[is_test_week]

First week num:  0 
Last week num:  104 



In [3]:
# NOTE: superseded by the `calculate_score` defined in the next cell; kept for reference only.
# This older variant passed the `y_true` and `y_pred` columns to `recall_at_k` separately.
# def calculate_score(test_week_transactions, predictions_df, k=100):
#     y_true = test_week_transactions.groupby('customer_id')['article_id'].apply(list).reset_index()
#     y_true.columns = ['customer_id', 'y_true']
#     predictions_df.columns = ['customer_id', 'y_pred']
#     eval_df = pd.merge(y_true, predictions_df, on='customer_id')
#     return recall_at_k(eval_df['y_true'], eval_df['y_pred'], k)

In [4]:
def calculate_score(test_week_transactions, predictions_df, k=100):
    """Score per-customer candidate lists against the articles actually bought.

    Parameters
    ----------
    test_week_transactions : pd.DataFrame
        Transactions used as ground truth; the `customer_id` and `article_id`
        columns are read.
    predictions_df : pd.DataFrame
        Exactly two columns, in this order: the customer id and that
        customer's list of candidate article ids. Column names are ignored
        (they are relabelled positionally). The frame is not modified.
    k : int, default 100
        Cut-off forwarded unchanged to `recall_at_k`.

    Returns
    -------
    The value returned by `recall_at_k(eval_df, k)`, where `eval_df` has the
    columns `customer_id`, `y_true` and `y_pred`.

    Notes
    -----
    The two inputs are inner-joined on `customer_id`, so customers that appear
    in only one of them are left out of the score.
    """
    y_true = (
        test_week_transactions
        .groupby('customer_id')['article_id']
        .apply(list)
        .reset_index()
    )
    y_true.columns = ['customer_id', 'y_true']

    # Relabel a copy: assigning to `predictions_df.columns` would silently
    # rename the caller's DataFrame as a side effect of scoring it.
    y_pred = predictions_df.set_axis(['customer_id', 'y_pred'], axis=1)

    eval_df = pd.merge(y_true, y_pred, on='customer_id')
    return recall_at_k(eval_df, k)

In [5]:
# Independent copy of the hold-out week; it is passed as the ground truth to
# every calculate_score call below.
test_week_transactions = transaction_test.copy()

## Evaluate Radek's candidates

In [35]:
candidates_last_purchase_test = pd.read_csv('../../../data/candidates/radek_last_purchase.csv')
bestsellers_previous_week = pd.read_csv('../../../data/candidates_100/radek_bestsellers_previous_week.csv')
candidates_bestsellers_test_week = pd.read_csv('../../../data/candidates_100/radek_bestsellers.csv')

# Article ids from the rows carrying the highest `week` value in the file.
latest_week = bestsellers_previous_week['week'].max()
in_latest_week = bestsellers_previous_week['week'] == latest_week
bestsellers_last_week = bestsellers_previous_week.loc[in_latest_week, 'article_id'].tolist()

In [34]:
# Collapse the bestseller candidates into one list of article ids per customer.
predictions_df = (
    candidates_bestsellers_test_week
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.11905203020399299

In [36]:
# Collapse the last-purchase candidates into one list of article ids per customer.
predictions_df = (
    candidates_last_purchase_test
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.04666869968353053

# Evaluate my candidates

### Seasonal candidates
#### Seasonal previous baskets

In [6]:
# index_col=0 turns the first CSV column into the row index instead of a data
# column (presumably an index that was written out with the file — TODO confirm).
seasonal_candidates = pd.read_csv('../../../data/candidates_100/baskets_seasonal.csv', index_col=0)

In [8]:
# Score only the seasonal candidate rows whose `year` is 2018.
from_2018 = seasonal_candidates['year'] == 2018
predictions_df = (
    seasonal_candidates[from_2018]
    .drop(columns=['year'])
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)


calculate_score(test_week_transactions, predictions_df)

0.0005885792960200041

In [10]:
# Score only the seasonal candidate rows whose `year` is 2019.
from_2019 = seasonal_candidates['year'] == 2019
predictions_df = (
    seasonal_candidates[from_2019]
    .drop(columns=['year'])
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.003252009003428331

#### Best seasonal candidates

In [11]:
# index_col=0 turns the first CSV column into the row index instead of a data
# column (presumably an index that was written out with the file — TODO confirm).
best_seasonal_items = pd.read_csv('../../../data/candidates/best_seasonal.csv', index_col=0)

In [12]:
# Every customer seen in the test week gets the same candidate list: all
# `best_seasonal_items` rows whose `year` is 2018 (joined via a constant column).
test_customers = test_week_transactions['customer_id'].unique()
predictions_df = pd.DataFrame(test_customers, columns=['customer_id'])
predictions_df['year'] = 2018
predictions_df = (
    pd.merge(predictions_df, best_seasonal_items, on='year')
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.009176737674634968

In [13]:
# Every customer seen in the test week gets the same candidate list: all
# `best_seasonal_items` rows whose `year` is 2019 (joined via a constant column).
test_customers = test_week_transactions['customer_id'].unique()
predictions_df = pd.DataFrame(test_customers, columns=['customer_id'])
predictions_df['year'] = 2019
predictions_df = (
    pd.merge(predictions_df, best_seasonal_items, on='year')
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.027426012836377116

### Similar not bought candidates

In [20]:
similar_not_bought_candidates = pd.read_csv('../../../data/candidates_100/similar_not_bought.csv', index_col=0)

# Gather each customer's `sim_not_bought` values into a single candidate list.
predictions_df = (
    similar_not_bought_candidates
    .groupby('customer_id')['sim_not_bought']
    .apply(list)
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.01041905066245714

### Items from categories the user has not interacted with

In [22]:
def not_interacted_with_candidates_v2(t, a, articles_col, k=10):
    """Suggest popular articles from groups a customer has never bought from.

    Parameters
    ----------
    t : pd.DataFrame
        Transactions; must contain `customer_id` and `article_id`.
    a : pd.DataFrame
        Article metadata; must contain `article_id` and `articles_col`.
    articles_col : str
        Column of `a` that defines the groups (e.g. a colour or section name).
    k : int, default 10
        Used twice: as the number of most-purchased articles kept per group,
        and as the maximum number of candidates returned per customer.

    Returns
    -------
    pd.DataFrame
        Columns `customer_id`, `article_id`, `not_interacted_rank`, with at
        most `k` rows per customer, ordered by rank within each customer.
        `not_interacted_rank` is the dense rank of the article's purchase
        count among all retained articles (1 = most purchased). Customers who
        bought from every group contribute no rows. The row index is not
        reset, so index labels repeat across customers. An empty frame with
        these columns is returned when `t` and `a` have no matching rows.
    """
    result_columns = ['customer_id', 'article_id', 'not_interacted_rank']

    # Every value the grouping column takes in the article table.
    group_unique_values = a[articles_col].unique()
    # pd.merge without `on` joins on the columns the two frames share.
    group_df = pd.merge(t, a[['article_id', articles_col]])

    # Nothing to rank and nobody to recommend to: return an empty result
    # instead of failing further down (pd.concat rejects an empty list).
    if group_df.empty:
        return pd.DataFrame(columns=result_columns)

    # For each customer: the groups they have no purchase in.
    not_interacted_with = (
        group_df
        .groupby('customer_id')[articles_col]
        .apply(lambda x: np.array(list(set(x))))
        .apply(lambda x: np.setdiff1d(group_unique_values, x))
    )

    # The k most purchased articles of every group. The count column is named
    # explicitly so the code does not depend on the default name that
    # value_counts() gives its result.
    items_popularity = (
        group_df
        .groupby(articles_col)['article_id']
        .value_counts()
        .groupby(articles_col)
        .head(k)
        .reset_index(name='count')
    )

    # Rank the retained articles by purchase count across all groups.
    items_popularity['not_interacted_rank'] = (
        items_popularity['count']
        .rank(method='dense', ascending=False)
        .astype('int16')
    )

    items_popularity = (
        items_popularity
        .filter(items=['article_id', articles_col, 'not_interacted_rank'])
        .sort_values(by=['not_interacted_rank'])
    )

    candidates = []
    # Customers with the same set of untouched groups get the same articles,
    # so the filtered table is built once per distinct set and reused.
    candidates_by_groups = {}

    for cid, groups in tqdm(not_interacted_with.items(), total=len(not_interacted_with)):
        groups_key = tuple(groups)
        shared = candidates_by_groups.get(groups_key)
        if shared is None:
            shared = (
                items_popularity[items_popularity[articles_col].isin(groups)]
                .head(k)
                .drop(columns=[articles_col])
            )
            candidates_by_groups[groups_key] = shared

        # assign() returns a new frame, so the cached table is never modified.
        candidates.append(shared.assign(customer_id=cid))

    if not candidates:
        return pd.DataFrame(columns=result_columns)

    return pd.concat(candidates)[result_columns]

In [26]:
# Article attributes used, one at a time, to define the groups.
article_groups_cols = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name'
]
k = 100
for col_name in article_groups_cols:
    candidates = not_interacted_with_candidates_v2(transactions_train, articles, col_name, k)

    # Save the long-format candidates before collapsing them for scoring.
    candidates.to_csv(f'../../../data/candidates_100/not_interacted_with_{col_name}_k{k}.csv', index=False)

    predictions_df = (
        candidates
        .groupby('customer_id')['article_id']
        .apply(list)
        .reset_index()
    )

    score = calculate_score(test_week_transactions, predictions_df)

    print(f'\nPrediction using column {col_name}: \n\t Score: {score:.5f}.')

100%|██████████| 439368/439368 [02:29<00:00, 2935.76it/s]



Prediction using column product_group_name: 
	 Score: 0.03020.


100%|██████████| 439368/439368 [02:33<00:00, 2855.70it/s]



Prediction using column graphical_appearance_name: 
	 Score: 0.03476.


100%|██████████| 439368/439368 [02:42<00:00, 2709.00it/s]



Prediction using column colour_group_name: 
	 Score: 0.04318.


100%|██████████| 439368/439368 [02:05<00:00, 3512.70it/s]



Prediction using column perceived_colour_value_name: 
	 Score: 0.03111.


100%|██████████| 439368/439368 [02:12<00:00, 3319.83it/s]



Prediction using column perceived_colour_master_name: 
	 Score: 0.03950.


100%|██████████| 439368/439368 [02:05<00:00, 3492.60it/s]



Prediction using column index_name: 
	 Score: 0.03206.


100%|██████████| 439368/439368 [02:03<00:00, 3555.18it/s]



Prediction using column index_group_name: 
	 Score: 0.02880.


100%|██████████| 439368/439368 [02:54<00:00, 2517.51it/s]



Prediction using column section_name: 
	 Score: 0.04312.


100%|██████████| 439368/439368 [02:14<00:00, 3256.77it/s]



Prediction using column garment_group_name: 
	 Score: 0.04813.


In [28]:
# Article attributes used, one at a time, to define the groups.
article_groups_cols = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name'
]
k = 50
for col_name in article_groups_cols:
    candidates = not_interacted_with_candidates_v2(transactions_train, articles, col_name, k)

    # Save the long-format candidates before collapsing them for scoring.
    candidates.to_csv(f'../../../data/candidates_50/not_interacted_with_{col_name}_k{k}.csv', index=False)

    predictions_df = (
        candidates
        .groupby('customer_id')['article_id']
        .apply(list)
        .reset_index()
    )

    score = calculate_score(test_week_transactions, predictions_df)

    print(f'\nPrediction using column {col_name}: \n\t Score: {score:.5f}.')

100%|██████████| 439368/439368 [02:06<00:00, 3475.66it/s]



Prediction using column product_group_name: 
	 Score: 0.01907.


100%|██████████| 439368/439368 [02:09<00:00, 3388.90it/s]



Prediction using column graphical_appearance_name: 
	 Score: 0.02319.


100%|██████████| 439368/439368 [13:35<00:00, 538.45it/s]  



Prediction using column colour_group_name: 
	 Score: 0.02694.


100%|██████████| 439368/439368 [02:01<00:00, 3627.63it/s]



Prediction using column perceived_colour_value_name: 
	 Score: 0.01942.


100%|██████████| 439368/439368 [02:03<00:00, 3549.70it/s]



Prediction using column perceived_colour_master_name: 
	 Score: 0.02419.


100%|██████████| 439368/439368 [02:00<00:00, 3654.71it/s]



Prediction using column index_name: 
	 Score: 0.02106.


100%|██████████| 439368/439368 [01:55<00:00, 3813.64it/s]



Prediction using column index_group_name: 
	 Score: 0.01933.


100%|██████████| 439368/439368 [02:17<00:00, 3190.56it/s]



Prediction using column section_name: 
	 Score: 0.02731.


100%|██████████| 439368/439368 [02:01<00:00, 3615.08it/s]



Prediction using column garment_group_name: 
	 Score: 0.02949.
