In [1]:
import numpy as np
import pandas as pd

In [828]:
BASE_PATH = '../../data/parquet/'

transactions_original = pd.read_parquet(BASE_PATH + 'transactions_train.parquet')
customers_original = pd.read_parquet(BASE_PATH + 'customers.parquet')
articles_original = pd.read_parquet(BASE_PATH + 'articles.parquet')

transactions = transactions_original.copy()
customers = customers_original.copy()
articles = articles_original.copy()

In [780]:
def recall(predictions, test_data):
    joined = pd.merge(test_data, predictions, how='inner').drop_duplicates()
    relevant_selected = joined.groupby('customer_id').count()
    relevant_total = test_data.groupby('customer_id').count()

    recall = relevant_selected.divide(relevant_total, fill_value=0)
    return recall.mean().values[0]
    # return recall[~recall.article_id.isna()].mean().values[0]
    # return recall.loc[test_data.customer_id.unique()].mean().values[0]
    # return recall.loc[predictions.customer_id.unique()].mean().values[0]

In [703]:
recall(test_data.iloc[:1], test_data)

0.125

In [987]:
# clean slate
transactions = transactions_original.copy()
customers = customers_original.copy()
articles = articles_original.copy()

# split data into test en training
test_weeks = [103, 104]
train_weeks = [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]
test_data = transactions[transactions.week.isin(test_weeks)][['customer_id', 'article_id']].drop_duplicates()
test_X = pd.Series(test_data['customer_id'].unique(), name='customer_id')
train_data = transactions[transactions.week.isin(train_weeks)].copy()

# add bin/clip features
transactions['clipped_price'] = transactions['price'].clip(0, 150)
transactions['binned_price'] = pd.cut(transactions.price, bins=[0, 4, 5, 6, 7, 8, 9, 10, 13, 15, 18, 20, 25, 30, 40, 60, np.inf])
customers['age_group'] = pd.cut(customers.age, bins=[0, 40, 60, 100], labels=["Young Adult", "Middle Aged", "Old"])

# add purchase behaviour features
article_max_price = transactions.groupby('article_id')['binned_price'].max().to_frame('article_price')
articles = pd.merge(articles, article_max_price, on='article_id', how='left')

In [833]:
v = len(test_X) / len(customers)
print(f'test customers: {len(test_X)} ({v:.2%} of total)')
v = len(set(test_X) & set(train_data.customer_id.unique())) / len(test_X)
print(f'test customers in training data: {v:.2%}')
v = len(pd.merge(train_data, test_data, on=['customer_id', 'article_id'])) / len(test_data)
print(f'repurchase rate: {v:.2%}')

test customers: 128333 (9.35% of total)
test customers in training data: 70.67%
repurchase rate: 2.69%


In [1037]:
def merge_candidates(cs):
    c = pd.concat(cs)
    c.drop_duplicates(['customer_id', 'article_id'], inplace=True)
    return c[['customer_id', 'article_id']]

def popular_global(t, c, k=10):
    popular_articles = pd.Series(t.article_id.value_counts().head(k).index, name='article_id')
    return pd.merge(c, popular_articles, how='cross')

def repurchase(t, c):
    return t[t.customer_id.isin(c)][['customer_id', 'article_id']]

# get `k` most popular items among users who match on user `feature`
def popular_by_feature(t, c, feature, k=10):
    cc = pd.merge(c, customers, on='customer_id', how='left')
    tt = pd.merge(t, customers, on='customer_id', how='left')
    tt = tt.groupby(feature, observed=False, as_index=False).article_id.value_counts()
    tt = tt.groupby(feature, observed=False).head(k)
    return pd.merge(cc, tt, on=feature)[['customer_id', 'article_id']]

# get top `k2` items in user history (with at least `threshold` occurences in that history)
# for each of those items, get `k1` most popular articles which match on item `feature`
def popular_similar_items(t, c, feature, k1=10, k2=1, threshold=2):
    tt = pd.merge(t, articles[['article_id', feature]], on='article_id', how='left')

    # k1 popular representatives per group
    a = tt \
        .groupby(feature, observed=False, as_index=False).article_id.value_counts() \
        .groupby(feature, observed=False).head(k1)[[feature, 'article_id']]

    # k2 common items in user history
    b = tt[tt.customer_id.isin(c)] \
        .groupby('customer_id', as_index=False)[feature].value_counts()
    b = b[b['count'] >= threshold].groupby('customer_id').head(k2)[['customer_id', feature]]

    return pd.merge(a, b, on=feature, how='inner')[['customer_id', 'article_id']]

In [1023]:
# radek
def baseline(t):
    # repurchase
    c2weeks = t.groupby('customer_id')['week'].unique()

    c2weeks2shifted_weeks = {}
    
    for c_id, weeks in c2weeks.items():
        c2weeks2shifted_weeks[c_id] = {}
        for i in range(weeks.shape[0]-1):
            c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
        c2weeks2shifted_weeks[c_id][weeks[-1]] = test_weeks[0]

    candidates_last_purchase = t.copy()

    weeks = []
    for i, (c_id, week) in enumerate(zip(t['customer_id'], t['week'])):
        weeks.append(c2weeks2shifted_weeks[c_id][week])
        
    candidates_last_purchase.week=weeks

    # bestseller
    mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
    sales = transactions \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1

    unique_transactions = t \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()

    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )

    data = pd.concat([t, candidates_last_purchase, candidates_bestsellers])
    data.drop_duplicates(['customer_id', 'article_id'], inplace=True)
    return data
candidates_radek = baseline(train_data)
print_scheme(candidates_radek)

28.08	3.58%


In [1003]:
candidates = dict()
count = 10

In [1004]:
candidates['repurchase'] = repurchase(train_data, test_X)
candidates['pop (global)'] = popular_global(train_data, test_X, count)
candidates['pop (per age group)'] = popular_by_feature(train_data, test_X, 'age_group', count)
candidates['pop (per age)'] = popular_by_feature(train_data, test_X, 'age', count)
candidates['pop (per postal)'] = popular_by_feature(train_data, test_X, 'postal_code', count)
candidates['pop (per active)'] = popular_by_feature(train_data, test_X, 'Active', count)

In [1021]:
for f in ['product_code','prod_name',
 'product_type_name',
 'product_group_name',
 'graphical_appearance_name',
 'colour_group_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',
 'department_name',
 'index_name',
 'index_group_name',
 'section_name',
 'garment_group_name',
 'article_price']:
    # print(f, len(pd.merge(train_data, articles)[f].unique()))
    candidates[f'items ({f})'] = merge_candidates([
        popular_similar_items(train_data, test_X, f, 3, 3, 1),
        popular_similar_items(train_data, test_X, f, 10, 5, 2)
    ])

In [1020]:
c_all = merge_candidates(candidates.values())
recall_all = recall(c_all, test_data)
print(f'{"Scheme":>40} {"Count":>8} {"Recall":>8} {"(per slot)":>7}\n{"":->80}')
for k, v in candidates.items():
    candidates_per_user = len(v) / len(v.customer_id.unique())
    recall1 = recall(v, test_data)
    recall2 =  recall1 / candidates_per_user
    fract = 0
    print(f'{k:>40} {candidates_per_user:>8.2f} {recall1:>8.2%} ({recall2:.4%})')


candidates_per_user = len(c_all) / len(c_all.customer_id.unique())
print(f'\n{"all":>40} {candidates_per_user:>8.2f} {recall_all:>8.2%}')

                                  Scheme    Count   Recall (per slot)
--------------------------------------------------------------------------------
                              repurchase    11.63    2.63% (0.2260%)
                            pop (global)    10.00    0.89% (0.0886%)
                     pop (per age group)    10.00    1.03% (0.1028%)
                           pop (per age)    10.00    1.14% (0.1138%)
                        pop (per postal)     8.62    1.59% (0.1846%)
                        pop (per active)    10.00    0.83% (0.0828%)
                       items (prod_name)    11.11    2.27% (0.2046%)
               items (product_type_name)    23.57    1.16% (0.0491%)
              items (product_group_name)    22.22    0.94% (0.0421%)
       items (graphical_appearance_name)    19.52    0.85% (0.0437%)
               items (colour_group_name)    22.92    0.92% (0.0401%)
     items (perceived_colour_value_name)    22.97    0.96% (0.0416%)
    items (perceived_

In [1022]:
print_scheme(merge_candidates([
    repurchase(train_data, test_X)
]))
print_scheme(merge_candidates([
    repurchase(train_data, test_X),
    popular_similar_items(train_data, test_X, 'prod_name', 10, 3)
]))
print_scheme(merge_candidates([
    repurchase(train_data, test_X),
    popular_similar_items(train_data, test_X, 'prod_name', 10, 3),
    popular_by_feature(train_data, test_X, 'postal_code', 10),
    popular_by_feature(train_data, test_X, 'age', 10),
]))
print_scheme(merge_candidates([
    repurchase(train_data, test_X),
    popular_similar_items(train_data, test_X, 'prod_name', 20, 6, 2),
    popular_by_feature(train_data, test_X, 'postal_code', 15),
    popular_by_feature(train_data, test_X, 'age', 15),
]))

10.13	2.63%
14.22	3.04%
24.89	4.23%
33.48	4.71%


In [770]:
def print_scheme(v):
    cpu = len(v) / len(v.customer_id.unique())
    r = recall(v, test_data)
    print(f'{cpu:.2f}\t{r:.2%}')

In [1039]:
cand = merge_candidates([
    repurchase(train_data, customers.customer_id),
    popular_similar_items(train_data, customers.customer_id, 'prod_name', 20, 6, 2),
    popular_by_feature(train_data, customers.customer_id, 'postal_code', 15),
    popular_by_feature(train_data, customers.customer_id, 'age', 15),
])

In [1042]:
print_scheme(cand)

26.76	4.70%


In [1043]:
cand.to_parquet(BASE_PATH + 'candidates.parquet', index=False)