In [1]:
import pandas as pd
import numpy as np
from eval_helpers import mean_average_precision
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Load the pre-compressed tables.
# NOTE(review): read_pickle can execute arbitrary code; only use on trusted files.
transactions = pd.read_pickle('../../data/compressed_data/transactions_train.pkl')
customers = pd.read_pickle('../../data/compressed_data/customers.pkl')
articles = pd.read_pickle('../../data/compressed_data/articles.pkl')

# Week index counted back from the most recent transaction date:
# the latest 7-day bucket gets 104, the one before it 103, and so on.
transactions['week'] = (
    104 - (transactions['t_dat'].max() - transactions['t_dat']).dt.days // 7
)

print(
    'First week num: ', transactions['week'].min(),
    '\nLast week num: ', transactions['week'].max(),
    '\n',
)

# The test week is the last week present in the data; the training window is
# the 10 weeks immediately before it (the range end is exclusive).
test_week = transactions['week'].max()
train_weeks = range(test_week - 10, test_week)

# Split the transactions into the training window and the held-out test week.
transactions_train = transactions.loc[transactions['week'].isin(train_weeks)]
transaction_test = transactions.loc[transactions['week'] == test_week]

First week num:  0 
Last week num:  104 



In [34]:
def calculate_score(test_week_transactions, predictions_df, k=12):
    """Score per-customer predictions against the purchases of the test week.

    Parameters
    ----------
    test_week_transactions : pd.DataFrame
        Transactions with at least `customer_id` and `article_id` columns.
    predictions_df : pd.DataFrame
        Two-column frame; the columns are interpreted positionally as
        (customer id, list of predicted article ids), whatever they are named.
        The frame is not modified.
    k : int, default 12
        Cut-off forwarded to `mean_average_precision`.

    Returns
    -------
    Whatever `mean_average_precision(y_true, y_pred, k)` returns.

    Notes
    -----
    The two frames are combined with pandas' default (inner) merge, so only
    customers present in both frames contribute to the score.
    """
    y_true = (
        test_week_transactions
        .groupby('customer_id')['article_id']
        .apply(list)
        .reset_index()
    )
    y_true.columns = ['customer_id', 'y_true']

    # Rename on a copy: assigning `.columns` on the argument itself would
    # silently rename the caller's DataFrame as a side effect.
    y_pred = predictions_df.copy()
    y_pred.columns = ['customer_id', 'y_pred']

    eval_df = pd.merge(y_true, y_pred, on='customer_id')
    return mean_average_precision(eval_df['y_true'], eval_df['y_pred'], k)

# Generate and evaluate Radek's candidates 

In [4]:
# Distinct purchase weeks per customer, in order of first appearance in
# transactions_train.
# NOTE(review): the shifting below assumes that order is chronological,
# i.e. that transactions_train is sorted by date — confirm.
c2weeks = transactions_train.groupby('customer_id')['week'].unique()

# customer_id -> {purchase week -> that customer's next purchase week}.
# The customer's final purchase week in the window maps to test_week.
c2weeks2shifted_weeks = {}

for c_id, customer_weeks in c2weeks.items():
    shifted_weeks = dict(zip(customer_weeks[:-1], customer_weeks[1:]))
    shifted_weeks[customer_weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = shifted_weeks

# Re-label every training transaction with the shifted week, so the items of
# one purchase week become candidates for the customer's next purchase week.
candidates_last_purchase = transactions_train.copy()

candidates_last_purchase['week'] = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions_train['customer_id'], transactions_train['week'])
]

# Bestsellers: per week, the articles with the best dense sales ranks
# (first 12 rows of each week), together with their mean price that week.
mean_price = transactions_train.groupby(['week', 'article_id'])['price'].mean()

weekly_article_counts = transactions_train.groupby('week')['article_id'].value_counts()
sales = (
    weekly_article_counts
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
# Shift forward by one week so rows labelled week W hold the bestsellers of week W-1.
bestsellers_previous_week['week'] += 1

# One row per (week, customer): the first transaction of each pair, with the
# article-specific columns removed.
unique_transactions = (
    transactions_train
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

# Attach the previous week's bestsellers to every (week, customer) row.
candidates_bestsellers = unique_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

# One row per customer seen in the training window, relabelled as the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week

# Bestseller candidates for the test week for each of those customers.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

# # Concatenate data with test data
# candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
# candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [5]:
# Keep only the rows whose shifted week is the test week.
candidates_last_purchase_test = candidates_last_purchase.loc[
    candidates_last_purchase['week'] == test_week
]

In [6]:
# Article ids of the bestsellers attached to the latest week label; used to
# pad every prediction list below.
latest_bestseller_week = bestsellers_previous_week['week'].max()
bestsellers_last_week = (
    bestsellers_previous_week
    .loc[bestsellers_previous_week['week'] == latest_bestseller_week, 'article_id']
    .tolist()
)
test_week_transactions = transaction_test.copy()

In [7]:
# Bestseller candidates per customer, padded with last week's bestsellers and
# truncated to 12 items.
predictions_df = (
    candidates_bestsellers_test_week
    .groupby('customer_id')['article_id']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.00852672068485865

In [8]:
# Last-purchase candidates per customer, padded with last week's bestsellers
# and truncated to 12 items.
predictions_df = (
    candidates_last_purchase_test
    .groupby('customer_id')['article_id']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.03255156197770692

# Evaluate my candidates

### Seasonal candidates
#### Seasonal previous baskets

In [9]:
# Seasonal basket candidates; the first CSV column is used as the index.
seasonal_candidates = pd.read_csv(
    '../../data/candidates/baskets_seasonal.csv',
    index_col=0,
)

In [41]:
# 2018 candidates: seasonal baskets from 2018, padded with last week's
# bestsellers and truncated to 12 items per customer.
predictions_df = (
    seasonal_candidates
    .loc[seasonal_candidates['year'] == 2018]
    .drop(columns=['year'])
    .groupby('customer_id')['article_id']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.005022726347485874

In [14]:
# 2019 candidates: seasonal baskets from 2019, padded with last week's
# bestsellers and truncated to 12 items per customer.
predictions_df = (
    seasonal_candidates
    .loc[seasonal_candidates['year'] == 2019]
    .drop(columns=['year'])
    .groupby('customer_id')['article_id']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.0054099095872320885

#### Best seasonal candidates

In [26]:
# Best seasonal items; the first CSV column is used as the index.
best_seasonal_items = pd.read_csv(
    '../../data/candidates/best_seasonal.csv',
    index_col=0,
)

In [28]:
# Give every test-week customer the 2018 best seasonal items (joined on
# `year`), padded with last week's bestsellers and truncated to 12 items.
predictions_df = (
    pd.DataFrame(
        test_week_transactions['customer_id'].unique(),
        columns=['customer_id'],
    )
    .assign(year=2018)
    .merge(best_seasonal_items, on='year')
    .groupby('customer_id')['article_id']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)
calculate_score(test_week_transactions, predictions_df)

9.460632124859302e-05

In [29]:
# Give every test-week customer the 2019 best seasonal items (joined on
# `year`), padded with last week's bestsellers and truncated to 12 items.
predictions_df = (
    pd.DataFrame(
        test_week_transactions['customer_id'].unique(),
        columns=['customer_id'],
    )
    .assign(year=2019)
    .merge(best_seasonal_items, on='year')
    .groupby('customer_id')['article_id']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)
calculate_score(test_week_transactions, predictions_df)

0.002140628852461899

### Similar not bought candidates

In [53]:
# Similar-but-not-bought candidates; the first CSV column is used as the index.
similar_not_bought_candidates = pd.read_csv(
    '../../data/candidates/similar_not_bought.csv',
    index_col=0,
)

# Pad each customer's list with last week's bestsellers (for completeness)
# and truncate to 12 items.
predictions_df = (
    similar_not_bought_candidates
    .groupby('customer_id')['sim_not_bought']
    .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
    .reset_index()
)

calculate_score(test_week_transactions, predictions_df)

0.001300274631099207

### Items from categories the user has not interacted with

In [54]:
def not_interacted_with_candidates(t, a, articles_col, k=2):
    """Recommend popular articles from categories a customer has not bought from.

    Parameters
    ----------
    t : pd.DataFrame
        Transactions; must contain `customer_id` and `article_id`.
    a : pd.DataFrame
        Articles; must contain `article_id` and the column named by
        `articles_col`.
    articles_col : str
        Name of the article attribute that defines the categories.
    k : int, default 2
        Number of most-purchased articles (counted within `t`) kept per category.

    Returns
    -------
    pd.DataFrame
        Columns `customer_id` and `article_id`: for each customer, the top-k
        articles of every category value that does not occur among that
        customer's transactions in `t`.
    """
    # Every category value known to the articles table.
    group_unique_values = a[articles_col].unique()
    # No explicit key is given, so pandas joins on the columns both frames share.
    group_df = pd.merge(t, a[['article_id', articles_col]])

    # k most purchased articles per category. The counts column is named
    # explicitly: on pandas < 2.0 the value_counts() result is itself called
    # 'article_id', which makes a bare reset_index() fail with a name clash.
    popular_by_group = (
        group_df.groupby(articles_col)['article_id'].value_counts()
        .groupby(articles_col).head(k)
        .reset_index(name='n_purchases')
    )
    popular_by_group = popular_by_group[['article_id', articles_col]]

    # Category values each customer has never bought from, one row per
    # (customer, category). A customer who covered every category yields a
    # single NaN row from explode().
    not_interacted_with = (
        group_df.groupby('customer_id')[articles_col].unique()
        .apply(lambda seen: np.setdiff1d(group_unique_values, seen))
        .explode()
        .reset_index()
    )

    # Pair each missing category with that category's popular articles.
    candidates = pd.merge(not_interacted_with, popular_by_group, on=articles_col)

    return candidates[['customer_id', 'article_id']]

In [70]:
# Article attributes used to define "categories" for the not-interacted-with candidates.
article_groups_cols = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# For each attribute: build candidates with the 5 most popular articles per
# category, save them, and score them after padding with bestsellers.
for col_name in article_groups_cols:
    candidates = not_interacted_with_candidates(transactions_train, articles, col_name, 5)

    output_path = f'../../data/candidates/not_interacted_with_{col_name}.csv'
    candidates.to_csv(output_path, index=False)

    predictions_df = (
        candidates
        .groupby('customer_id')['article_id']
        .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
        .reset_index()
    )

    score = calculate_score(test_week_transactions, predictions_df)

    print(f'\nPrediction using column {col_name}: \n\t Score: {score:.5f}.')


Prediction using column product_group_name: 
	 Score: 0.00098.

Prediction using column graphical_appearance_name: 
	 Score: 0.00061.

Prediction using column colour_group_name: 
	 Score: 0.00162.

Prediction using column perceived_colour_value_name: 
	 Score: 0.00089.

Prediction using column perceived_colour_master_name: 
	 Score: 0.00180.

Prediction using column index_name: 
	 Score: 0.00027.

Prediction using column index_group_name: 
	 Score: 0.00084.

Prediction using column section_name: 
	 Score: 0.00016.

Prediction using column garment_group_name: 
	 Score: 0.00165.


In [71]:
# Article attributes used to define "categories" for the not-interacted-with candidates.
article_groups_cols = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]
# Number of popular articles kept per category; also part of the output directory name.
k = 2

# For each attribute: build candidates, save them under not_interacted_{k}/,
# and score them after padding with bestsellers.
for col_name in article_groups_cols:
    candidates = not_interacted_with_candidates(transactions_train, articles, col_name, k)

    output_path = f'../../data/candidates/not_interacted_{k}/not_interacted_with_{col_name}.csv'
    candidates.to_csv(output_path, index=False)

    predictions_df = (
        candidates
        .groupby('customer_id')['article_id']
        .apply(lambda items: (list(items) + bestsellers_last_week)[:12])
        .reset_index()
    )

    score = calculate_score(test_week_transactions, predictions_df)

    print(f'\nPrediction using column {col_name}: \n\t Score: {score:.5f}.')


Prediction using column product_group_name: 
	 Score: 0.00091.

Prediction using column graphical_appearance_name: 
	 Score: 0.00074.

Prediction using column colour_group_name: 
	 Score: 0.00149.

Prediction using column perceived_colour_value_name: 
	 Score: 0.00175.

Prediction using column perceived_colour_master_name: 
	 Score: 0.00152.

Prediction using column index_name: 
	 Score: 0.00048.

Prediction using column index_group_name: 
	 Score: 0.00255.

Prediction using column section_name: 
	 Score: 0.00011.

Prediction using column garment_group_name: 
	 Score: 0.00176.
