In [1]:
from lightgbm.sklearn import LGBMRanker

import numpy as np
import pandas as pd

In [2]:
%%time

transactions = pd.read_parquet('data/parquet/transactions_train.parquet')
customers = pd.read_parquet('data/parquet/customers.parquet')
articles = pd.read_parquet('data/parquet/articles.parquet')

all_data = transactions.merge(customers, on='customer_id', how='left')
all_data = all_data.merge(articles, on='article_id', how='left')

CPU times: total: 20.3 s
Wall time: 18.4 s


In [3]:
TEST_WEEK = 105
TRAINING_WEEKS = 10
POPULARITY_WEEKS = 3

transactions = transactions[transactions.week > transactions.week.max() - TRAINING_WEEKS]
train_weeks = range(TEST_WEEK - TRAINING_WEEKS, TEST_WEEK)
train = transactions[transactions.week.isin(train_weeks)]
test = transactions[transactions.week == TEST_WEEK]

In [4]:
mean_price = transactions.groupby(['article_id'])['price'].mean()
common_sales_channel = transactions.groupby(['article_id'])['sales_channel_id'].agg(lambda x: x.value_counts().index[0])

customers['age_group'] = pd.cut(customers['age'], bins=[0, 25, 40, 60, 100], labels=[0, 1, 2, 3])

avg_price_spent = train.groupby('customer_id')['price'].mean()
max_price_spent = train.groupby('customer_id')['price'].max()

customers['avg_price_spent'] = customers['customer_id'].map(avg_price_spent).fillna(0)
customers['max_price_spent'] = customers['customer_id'].map(max_price_spent).fillna(0)

customers['avg_price_group'] = pd.cut(customers['avg_price_spent'], bins=[-1, 0.02, 0.04, 0.1, 0.3, 1], labels=[0, 1, 2, 3, 4])
customers['max_price_group'] = pd.cut(customers['max_price_spent'], bins=[-1, 0.02, 0.04, 0.1, 0.3, 1], labels=[0, 1, 2, 3, 4])

transactions_with_age = pd.merge(transactions, customers[['customer_id', 'age']], on='customer_id', how='left')
avg_purchaser_age = transactions_with_age.groupby('article_id')['age'].mean().reset_index()
avg_purchaser_age.rename(columns={'age': 'avg_purchaser_age'}, inplace=True)
avg_purchaser_age['avg_purchaser_age'].fillna(-1, inplace=True)
articles = pd.merge(articles, avg_purchaser_age, on='article_id', how='left')

In [5]:
pivot_table = pd.pivot_table(
    all_data[all_data.week < TEST_WEEK],
    index='customer_id',
    columns='index_code',
    values='article_id',
    aggfunc='count',
    fill_value=0
)

pivot_table['total_purchases'] = pivot_table.sum(axis=1)
pivot_table['percentage_women_purchases'] = ((pivot_table[0] + pivot_table[7] + pivot_table[6]) / pivot_table['total_purchases'])
pivot_table['percentage_children_purchases'] = ((pivot_table[5] + pivot_table[3] + pivot_table[4] + pivot_table[8]) / pivot_table['total_purchases'])
pivot_table['percentage_men_purchases'] = (pivot_table[2] / pivot_table['total_purchases'])
pivot_table.reset_index(inplace=True)
pivot_table['most_bought_gender'] = pivot_table[['percentage_women_purchases', 'percentage_children_purchases', 'percentage_men_purchases']].idxmax(axis=1)

customers['most_bought_gender'] = customers['customer_id'].map(pivot_table.set_index('customer_id')['most_bought_gender'])
customers['percentage_women_purchases'] = customers['customer_id'].map(pivot_table.set_index('customer_id')['percentage_women_purchases'])
customers['percentage_children_purchases'] = customers['customer_id'].map(pivot_table.set_index('customer_id')['percentage_children_purchases'])
customers['percentage_men_purchases'] = customers['customer_id'].map(pivot_table.set_index('customer_id')['percentage_men_purchases'])
customers['total_purchases'] = customers['customer_id'].map(pivot_table.set_index('customer_id')['total_purchases'])

In [6]:
unique_customers = pd.DataFrame(train['customer_id'].unique(), columns=['customer_id']).merge(customers, on='customer_id', how='left')
train_customers = train.merge(customers, on='customer_id', how='left')
train_customers = train_customers[train_customers['customer_id'].isin(unique_customers['customer_id'])]

In [7]:
def candidates_user_feature(feature, count=12):
    candidates = pd.DataFrame()
    for week in range(TEST_WEEK - TRAINING_WEEKS + POPULARITY_WEEKS, TEST_WEEK):
        relevant_weeks = train_customers[(week - POPULARITY_WEEKS) < train_customers.week][train_customers.week <= week]
        recent_article_counts = relevant_weeks.groupby([feature, 'article_id']).size().reset_index(name='count')
        article_counts_sorted = recent_article_counts.sort_values([feature, 'count'], ascending=[True, False])
        top_articles_feature = article_counts_sorted.groupby(feature).head(count)
        curr_candidates = unique_customers.merge(top_articles_feature, on=[feature], how='left')[['customer_id', 'article_id']]
        curr_candidates['week'] = week + 1
        curr_candidates = pd.merge(curr_candidates, mean_price, on=['article_id'])
        curr_candidates = pd.merge(curr_candidates, common_sales_channel, on=['article_id'])
        curr_candidates['t_dat'] = transactions[transactions['week'] == week]['t_dat'].sample(n=len(curr_candidates), random_state=1, replace=True).values
        candidates = pd.concat([candidates, curr_candidates])
    return candidates

In [8]:
def candidates_radek_repurchase():
    c2weeks = transactions.groupby('customer_id')['week'].unique()
    c2weeks2shifted_weeks = {}

    for c_id, weeks in c2weeks.items():
        c2weeks2shifted_weeks[c_id] = {}
        for i in range(weeks.shape[0]-1):
            c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
        c2weeks2shifted_weeks[c_id][weeks[-1]] = TEST_WEEK
        
    weeks = []
    for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
        weeks.append(c2weeks2shifted_weeks[c_id][week])
        
    candidates_last_purchase = transactions.copy()
    candidates_last_purchase.week=weeks
    return candidates_last_purchase

In [9]:
def candidates_radek_bestseller(count=12):
    mean_price = transactions \
        .groupby(['week', 'article_id'])['price'].mean()
    sales = transactions \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(count).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
    bestsellers_previous_week.week += 1    
    unique_transactions = transactions \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions.week = TEST_WEEK
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
    return candidates_bestsellers

In [10]:
candidates_age_group = candidates_user_feature('age_group')
candidates_avg_price = candidates_user_feature('avg_price_group')
candidates_max_price = candidates_user_feature('max_price_group')
candidates_gender = candidates_user_feature('most_bought_gender')
candidates_repurchase = candidates_radek_repurchase()
candidates_bestseller = candidates_radek_bestseller()


  relevant_weeks = train_customers[(week - POPULARITY_WEEKS) < train_customers.week][train_customers.week <= week]
  recent_article_counts = relevant_weeks.groupby([feature, 'article_id']).size().reset_index(name='count')
  top_articles_feature = article_counts_sorted.groupby(feature).head(count)
  relevant_weeks = train_customers[(week - POPULARITY_WEEKS) < train_customers.week][train_customers.week <= week]
  recent_article_counts = relevant_weeks.groupby([feature, 'article_id']).size().reset_index(name='count')
  top_articles_feature = article_counts_sorted.groupby(feature).head(count)
  relevant_weeks = train_customers[(week - POPULARITY_WEEKS) < train_customers.week][train_customers.week <= week]
  recent_article_counts = relevant_weeks.groupby([feature, 'article_id']).size().reset_index(name='count')
  top_articles_feature = article_counts_sorted.groupby(feature).head(count)
  relevant_weeks = train_customers[(week - POPULARITY_WEEKS) < train_customers.week][train_customers.week 

In [11]:
all_candidate_methods = {
    "Popularity (age group)": candidates_age_group, 
    "Popularity (avg price group)": candidates_avg_price, 
    "Popularity (max price group)": candidates_max_price, 
    "Popularity (gender)": candidates_gender, 
    "Repurchase (radek)": candidates_repurchase, 
    "Bestsellers (radek)": candidates_bestseller
    }
merged_candidates = pd.concat(all_candidate_methods.values()).drop_duplicates(["customer_id", "week", "article_id"])

In [12]:
data = transactions
data['purchased'] = 1
data = pd.concat([transactions, merged_candidates]).drop_duplicates(["customer_id", "week", "article_id"])
data.purchased.fillna(0, inplace=True)

In [13]:
for method, candidates in all_candidate_methods.items():
    candidates[method] = 1
    data = data.merge(candidates[['customer_id', 'week', 'article_id', method]], on=['customer_id', 'week', 'article_id'], how='left')
    data[method].fillna(0, inplace=True)

In [14]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [16]:
data['num_methods'] = data[list(all_candidate_methods.keys())].sum(axis=1)

In [82]:
filtered_data = data[(data.purchased == 1) | (data['Bestsellers (radek)'] == 1) | (data['Repurchase (radek)'] == 1)]

In [83]:
train = data[data.week.isin(train_weeks)][(data.purchased == 1) | data.num_methods > 2]
test = data[data.week == TEST_WEEK][(data.purchased == 1) | data.num_methods > 2 | (data['Repurchase (radek)'] == 1)]

train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [108]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code',
'age_group', 'avg_purchaser_age', 'percentage_children_purchases', 'percentage_men_purchases', 'percentage_women_purchases', 'total_purchases']

# columns_to_use.extend(list(all_candidate_methods.keys()))

In [109]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: total: 1.36 s
Wall time: 1.4 s


In [110]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [111]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.804273
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.187844
[LightGBM] [Debug] init for col-wise cost 0.461211 seconds, init for row-wise cost 0.991661 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.691792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2345
[LightGBM] [Info] Number of data points in the train set: 11772260, number of used features: 23
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
CPU times: total: 35.7 s
Wall time: 9.42 s


In [112]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

article_id 0.2835955491092478
section_no 0.14733112349459293
avg_purchaser_age 0.141388567797081
product_type_no 0.1204890737084944
department_no 0.08371039396337257
garment_group_no 0.06802719390678798
colour_group_code 0.04309015634553394
perceived_colour_value_id 0.035974477753172626
graphical_appearance_no 0.032383585821524154
index_code 0.02980630461794214
perceived_colour_master_id 0.01420357348225047
index_group_no 0.0
total_purchases 0.0
percentage_women_purchases 0.0
Active 0.0
club_member_status 0.0
fashion_news_frequency 0.0
age 0.0
postal_code 0.0
age_group 0.0
percentage_children_purchases 0.0
percentage_men_purchases 0.0
FN 0.0


In [113]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

CPU times: total: 0 ns
Wall time: 0 ns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['preds'] = ranker.predict(test_X)


In [114]:
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

In [115]:
sub = pd.read_csv('data/original/sample_submission.csv')

In [116]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [117]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: total: 4.27 s
Wall time: 4.34 s


In [118]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub_name = 'candidate_generation_model'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

In [119]:
for method, cands in all_candidate_methods.items():
    print(method, len(cands[cands.week == TEST_WEEK]))

Popularity (age group) 5220420
Popularity (avg price group) 5248380
Popularity (max price group) 5248380
Popularity (gender) 5248380
Repurchase (radek) 1527904
Bestsellers (radek) 5248380
