In [None]:
from lightgbm.sklearn import LGBMRanker

import numpy as np
import pandas as pd

In [None]:
%%time

# Load the three tables from radek's parquet exports
transactions = pd.read_parquet('data/parquet/transactions_train.parquet')
customers = pd.read_parquet('data/parquet/customers.parquet')
articles = pd.read_parquet('data/parquet/articles.parquet')

# Denormalised view: every transaction enriched with its customer and article attributes
all_data = (
    transactions
    .merge(customers, how='left', on='customer_id')
    .merge(articles, how='left', on='article_id')
)

In [None]:
# Week to predict for
TEST_WEEK = 105
# How many weeks before the test week are used for training
TRAINING_WEEKS = 10
# Size (in weeks) of the sliding window used by the popularity methods
POPULARITY_WEEKS = 3
# Optional cap on the number of unique customers (437365 in the 10-week training set)
# NUM_CUSTOMERS = 437365

# Keep only the most recent TRAINING_WEEKS weeks of transactions (and optionally a subset of customers)
cutoff_week = transactions.week.max() - TRAINING_WEEKS
transactions = transactions[transactions.week > cutoff_week]
# transactions = transactions[transactions['customer_id'].isin(transactions['customer_id'].unique()[:NUM_CUSTOMERS])]

# Train on the weeks leading up to TEST_WEEK, test on TEST_WEEK itself
train_weeks = range(TEST_WEEK - TRAINING_WEEKS, TEST_WEEK)
is_train_week = transactions.week.isin(train_weeks)
is_test_week = transactions.week == TEST_WEEK
train = transactions[is_train_week]
test = transactions[is_test_week]

In [None]:
# Determine mean price and most common sales channel for each item to be used in candidate generation
mean_price = transactions.groupby(['article_id'])['price'].mean()
common_sales_channel = transactions.groupby(['article_id'])['sales_channel_id'].agg(lambda x: x.value_counts().index[0])

# Add custom features age_group, avg_price_spent, max_price_spent, avg_price_group, max_price_group
customers['age_group'] = pd.cut(customers['age'], bins=[0, 25, 40, 60, 100], labels=[0, 1, 2, 3])

avg_price_spent = train.groupby('customer_id')['price'].mean()
max_price_spent = train.groupby('customer_id')['price'].max()

# Customers without any training purchase get 0, which falls in the lowest price bin below
customers['avg_price_spent'] = customers['customer_id'].map(avg_price_spent).fillna(0)
customers['max_price_spent'] = customers['customer_id'].map(max_price_spent).fillna(0)

price_bins = [-1, 0.02, 0.04, 0.1, 0.3, 1]
price_labels = [0, 1, 2, 3, 4]
customers['avg_price_group'] = pd.cut(customers['avg_price_spent'], bins=price_bins, labels=price_labels)
customers['max_price_group'] = pd.cut(customers['max_price_spent'], bins=price_bins, labels=price_labels)

# Add custom feature avg_purchaser_age, NaN values are filled with -1
transactions_with_age = pd.merge(transactions, customers[['customer_id', 'age']], on='customer_id', how='left')
avg_purchaser_age = (
    transactions_with_age
    .groupby('article_id')['age'].mean()
    .rename('avg_purchaser_age')
    .reset_index()
)
# Drop a previously merged column first so re-running this cell does not create _x/_y duplicates
articles = articles.drop(columns='avg_purchaser_age', errors='ignore') \
    .merge(avg_purchaser_age, on='article_id', how='left')
# Fill AFTER the left merge: articles without any transaction only become NaN at this point,
# so filling before the merge (as was done previously) left them unfilled
articles['avg_purchaser_age'] = articles['avg_purchaser_age'].fillna(-1)

In [None]:
# Count purchases per customer (rows) and per index_code (columns) for everything before the test week
history = all_data[all_data.week < TEST_WEEK]
pivot_table = pd.pivot_table(
    history,
    index='customer_id',
    columns='index_code',
    values='article_id',
    aggfunc='count',
    fill_value=0
)

# Total purchases over all index codes, plus the share going to women's, children's and men's index codes
women_codes, children_codes, men_codes = [0, 7, 6], [5, 3, 4, 8], [2]
pivot_table['total_purchases'] = pivot_table.sum(axis=1)
pivot_table['percentage_women_purchases'] = pivot_table[women_codes].sum(axis=1) / pivot_table['total_purchases']
pivot_table['percentage_children_purchases'] = pivot_table[children_codes].sum(axis=1) / pivot_table['total_purchases']
pivot_table['percentage_men_purchases'] = pivot_table[men_codes].sum(axis=1) / pivot_table['total_purchases']

# The category with the largest share; the value is the name of the winning percentage column
share_columns = ['percentage_women_purchases', 'percentage_children_purchases', 'percentage_men_purchases']
pivot_table['most_bought_gender'] = pivot_table[share_columns].idxmax(axis=1)
pivot_table = pivot_table.reset_index()

# Copy the derived features onto the customers dataframe (customers without history get NaN)
stats_by_customer = pivot_table.set_index('customer_id')
for feature_name in ['most_bought_gender'] + share_columns + ['total_purchases']:
    customers[feature_name] = customers['customer_id'].map(stats_by_customer[feature_name])

In [None]:
# unique_customers: one row per customer seen in train, with all customer features attached
# train_customers: every train transaction with the customer features attached (used in candidate generation)
train_customer_ids = pd.DataFrame({'customer_id': train['customer_id'].unique()})
unique_customers = train_customer_ids.merge(customers, how='left', on='customer_id')

train_customers = train.merge(customers, how='left', on='customer_id')
known_customer = train_customers['customer_id'].isin(unique_customers['customer_id'])
train_customers = train_customers[known_customer]

In [None]:
# Generic function for creating candidates based on popularity within a group of customers matching a given feature
def candidates_user_feature(feature, count=12):
    """Create popularity candidates within groups of customers sharing a feature value.

    For each week in the sliding range, the `count` most purchased articles of the
    preceding POPULARITY_WEEKS weeks are determined per value of `feature`, and
    every customer with that feature value receives them as candidates for the
    following week.

    Reads the notebook globals `train_customers`, `unique_customers`, `mean_price`,
    `common_sales_channel`, `transactions`, `TEST_WEEK`, `TRAINING_WEEKS` and
    `POPULARITY_WEEKS`.

    Parameters
    ----------
    feature : str
        Name of a customer column present in both `train_customers` and `unique_customers`.
    count : int, default 12
        Number of top articles to keep per feature value and week.

    Returns
    -------
    pandas.DataFrame
        Candidate rows with customer_id (uint64), article_id (int32), week, the
        article's mean price, its most common sales channel and a sampled t_dat.
    """
    weekly_candidates = []
    # For each week in the test period (with enough prior weeks to determine popularity)
    for week in range(TEST_WEEK - TRAINING_WEEKS + POPULARITY_WEEKS, TEST_WEEK):
        # Single aligned mask; chaining two boolean selections made pandas reindex the second mask
        in_window = (train_customers.week > week - POPULARITY_WEEKS) & (train_customers.week <= week)
        relevant_weeks = train_customers[in_window]
        # observed=True: for categorical features only count combinations that actually occur,
        # instead of padding every category with zero-count articles
        recent_article_counts = relevant_weeks \
            .groupby([feature, 'article_id'], observed=True).size().reset_index(name='count')
        article_counts_sorted = recent_article_counts.sort_values([feature, 'count'], ascending=[True, False])
        top_articles_feature = article_counts_sorted.groupby(feature, observed=True).head(count)
        # Inner merge: customers whose feature value has no top articles (e.g. a missing value) get no
        # candidates. A left merge produced NaN article ids here, which turned the id column into floats;
        # those rows were dropped by the price merge below anyway.
        curr_candidates = unique_customers \
            .merge(top_articles_feature, on=[feature], how='inner')[['customer_id', 'article_id']] \
            .assign(week=week + 1)
        # Add the mean price and most common sales channel of each candidate article
        curr_candidates = pd.merge(curr_candidates, mean_price, on=['article_id'])
        curr_candidates = pd.merge(curr_candidates, common_sales_channel, on=['article_id'])
        # Give each candidate a t_dat sampled (with replacement, fixed seed) from the transactions of `week`
        week_dates = transactions.loc[transactions['week'] == week, 't_dat']
        curr_candidates['t_dat'] = week_dates.sample(n=len(curr_candidates), random_state=1, replace=True).values
        weekly_candidates.append(curr_candidates)
    # Concatenate once instead of growing a dataframe inside the loop
    candidates = pd.concat(weekly_candidates)
    # Enforce the id dtypes the rest of the notebook expects
    candidates['article_id'] = candidates['article_id'].astype('int32')
    candidates['customer_id'] = candidates['customer_id'].astype('uint64')
    return candidates

In [None]:
# Generate radek's repurchase candidates
def candidates_radek_repurchase():
    """Turn every transaction into a repurchase candidate for the customer's next active week.

    Each transaction keeps all of its columns, but its `week` is replaced by the
    next week (in order of appearance in `transactions`) in which the same
    customer bought something; transactions from a customer's last week are
    moved to TEST_WEEK. Reads the globals `transactions` and `TEST_WEEK`.
    """
    weeks_per_customer = transactions.groupby('customer_id')['week'].unique()

    # customer -> {week: following purchase week}, the last week maps to the test week
    next_week_lookup = {
        customer: dict(zip(seen_weeks, list(seen_weeks[1:]) + [TEST_WEEK]))
        for customer, seen_weeks in weeks_per_customer.items()
    }

    shifted_weeks = [
        next_week_lookup[customer][week]
        for customer, week in zip(transactions['customer_id'], transactions['week'])
    ]

    candidates_last_purchase = transactions.copy()
    candidates_last_purchase['week'] = shifted_weeks
    return candidates_last_purchase

In [None]:
# Generate radek's bestseller candidates
def candidates_radek_bestseller(count=12):
    """Generate radek's bestseller candidates.

    For every week the `count` best selling articles are determined; each customer
    that purchased something in week w+1 receives the bestsellers of week w as
    candidates for week w+1. In addition, every customer present in `transactions`
    receives the bestsellers of the last week as candidates for TEST_WEEK.

    Reads the globals `transactions` and `TEST_WEEK`.

    Parameters
    ----------
    count : int, default 12
        Number of bestsellers to keep per week.

    Returns
    -------
    pandas.DataFrame
        Candidate rows: the non-article columns of a customer's first transaction in
        the week, plus the article_id and weekly mean price of the bestseller.
    """
    # Local name differs from the notebook-level per-article `mean_price` to avoid confusion
    weekly_mean_price = transactions \
        .groupby(['week', 'article_id'])['price'].mean()
    # value_counts sorts descending within each week, so head(count) keeps the top sellers
    sales = transactions \
        .groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(count).rename('bestseller_rank').astype('int8')
    bestsellers_previous_week = pd.merge(sales, weekly_mean_price, on=['week', 'article_id']).reset_index()
    # Shift by one week so the bestsellers of week w become candidates for week w + 1
    bestsellers_previous_week['week'] += 1
    # One row per (week, customer): the customer's first transaction of that week, without article info
    unique_transactions = transactions \
        .groupby(['week', 'customer_id']) \
        .head(1) \
        .drop(columns=['article_id', 'price']) \
        .copy()
    candidates_bestsellers = pd.merge(
        unique_transactions,
        bestsellers_previous_week,
        on='week',
    )
    # One row per customer, relabelled as the test week, to create the test week candidates
    test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
    test_set_transactions['week'] = TEST_WEEK
    candidates_bestsellers_test_week = pd.merge(
        test_set_transactions,
        bestsellers_previous_week,
        on='week'
    )
    candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    return candidates_bestsellers.drop(columns='bestseller_rank')

In [None]:
# Popularity candidates for each customer feature (generated in this order) ...
(
    candidates_age_group,
    candidates_avg_price,
    candidates_max_price,
    candidates_gender,
) = (
    candidates_user_feature(feature_name)
    for feature_name in ('age_group', 'avg_price_group', 'max_price_group', 'most_bought_gender')
)
# ... followed by radek's repurchase and bestseller candidates
candidates_repurchase = candidates_radek_repurchase()
candidates_bestseller = candidates_radek_bestseller()


In [None]:
# Map a display name to the candidates of each method, and build one dataframe holding the
# union of all candidates (one row per customer / week / article)
method_names = [
    "Popularity (age group)",
    "Popularity (avg price group)",
    "Popularity (max price group)",
    "Popularity (gender)",
    "Repurchase (radek)",
    "Bestsellers (radek)",
]
method_candidates = [
    candidates_age_group,
    candidates_avg_price,
    candidates_max_price,
    candidates_gender,
    candidates_repurchase,
    candidates_bestseller,
]
all_candidate_methods = dict(zip(method_names, method_candidates))

stacked_candidates = pd.concat(all_candidate_methods.values())
merged_candidates = stacked_candidates.drop_duplicates(subset=["customer_id", "week", "article_id"])

In [None]:
# Ensure all the article ids are of type int32, otherwise the recommendations will not be counted
for method, candidates in all_candidate_methods.items():
    article_dtype = candidates['article_id'].dtype
    # Name the offending method so a failure can be traced back to its generator
    assert article_dtype == 'int32', f"{method}: article_id has dtype {article_dtype}, expected int32"

In [None]:
# Add purchased column to distinguish between candidates and real transactions.
# Work on a labelled copy: writing the column into `transactions` itself would leak
# purchased=1 into any candidates generated from `transactions` when cells are re-run.
purchased_transactions = transactions.assign(purchased=1)
# Real transactions come first, so on duplicate keys the purchased row is the one that is kept
data = pd.concat([purchased_transactions, merged_candidates]).drop_duplicates(["customer_id", "week", "article_id"])

# Rows that only exist as candidates have no label yet: mark them as not purchased
data['purchased'] = data['purchased'].fillna(0)

In [None]:
# Add a one hot encoding for each method, showing for each candidate which methods generated it
candidate_keys = ['customer_id', 'week', 'article_id']
for method, candidates in all_candidate_methods.items():
    # De-duplicate the keys first: a method may list the same (customer, week, article) several
    # times, and a left merge against such a frame would multiply the rows of `data`.
    # assign() builds a new frame, so the candidate dataframes themselves stay untouched.
    method_flags = candidates[candidate_keys].drop_duplicates().assign(**{method: 1})
    # Dropping an existing flag column keeps this cell re-runnable
    data = data.drop(columns=method, errors='ignore').merge(method_flags, on=candidate_keys, how='left')
    data[method] = data[method].fillna(0)

In [None]:
# Attach the article and customer attributes, then order the rows by week and customer
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
    .sort_values(['week', 'customer_id'])
    .reset_index(drop=True)
)

In [None]:
# Count, per row, how many candidate methods produced it (sum of the one hot method columns)
method_columns = list(all_candidate_methods)
data['num_methods'] = data[method_columns].sum(axis=1)

In [None]:
# Add bestseller_rank feature
# NOTE: named weekly_mean_price on purpose. The notebook-level `mean_price` is the per-article
# series that candidates_user_feature merges on; rebinding it here to a per-(week, article)
# series would break that function when cells are re-run.
weekly_mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
# Dense rank of each article's sales count within its week; the 12 best sellers per week are kept
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
# Generate the bestsellers of previous week to fill in missing recommendations
bestsellers_previous_week = pd.merge(sales, weekly_mean_price, on=['week', 'article_id']).reset_index()
# Shift by one week: the bestsellers of week w are features / fallbacks for week w + 1
bestsellers_previous_week['week'] += 1
# Article ids of the most recent bestsellers, i.e. those assigned to the latest (shifted) week
is_latest_week = bestsellers_previous_week.week == bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week[is_latest_week]['article_id'].tolist()

In [None]:
# Attach the bestseller rank of each (week, article); rows that are not a bestseller get NaN
rank_lookup = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(rank_lookup, how='left', on=['week', 'article_id'])

In [None]:
# Fill in missing bestseller ranks with 999 (done before filtering, so no slice is modified in place)
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)
# Drop the first week of the window. The reference is the first week of `transactions`
# rather than data.week.min(), so re-running this cell does not drop one more week each time.
data = data[data.week != transactions.week.min()]

In [None]:
# Filter the data for various tests involving different methods
# Exactly one `filtered_data = ...` line should be active; the others are kept as a record of
# the experiments. The numbers after each description are the scores noted for that selection
# ("-->" separates successive variants of the experiment, e.g. with the one hot encoded method columns).

# Only radek candidates, 0.01959 - 0.02018 --> One hot encoded: 0.02062 - 0.02124
# filtered_data = data[(data.purchased == 1) | (data['Bestsellers (radek)'] == 1) | (data['Repurchase (radek)'] == 1)]

# Only radek's bestseller (popularity) candidates, 0.00695 - 0.00703 --> One hot encoded: 0.00680 - 0.00659
# filtered_data = data[(data.purchased == 1) | (data['Bestsellers (radek)'] == 1)]

# Only repurchase candidates, 0.02030 - 0.02077 --> 0.02049 - 0.02116
# filtered_data = data[(data.purchased == 1) | (data['Repurchase (radek)'] == 1)]

# Only age group bestsellers, 0.00674 - 0.00655 --> 0.00725 - 0.00698
# filtered_data = data[(data.purchased == 1) | (data['Popularity (age group)'] == 1)]

# Only avg price group bestsellers, 0.00552 - 0.00540 --> 0.00643 - 0.00645
# filtered_data = data[(data.purchased == 1) | (data['Popularity (avg price group)'] == 1)]

# Only max price group bestsellers, 0.00526 - 0.00537 --> 0.00665 - 0.00641
# filtered_data = data[(data.purchased == 1) | (data['Popularity (max price group)'] == 1)]

# Only gender based bestsellers, 0.00685 - 0.00676 --> 0.00658 - 0.00625
# filtered_data = data[(data.purchased == 1) | (data['Popularity (gender)'] == 1)]

# All custom methods, 0.00524 - 0.00504 --> 0.00662 - 0.00660
# filtered_data = data[(data.purchased == 1) | (data['Popularity (age group)'] == 1) | (data['Popularity (avg price group)'] == 1) | (data['Popularity (max price group)'] == 1) | (data['Popularity (gender)'] == 1)]

# All methods, 0.01261 - 0.01385 --> 0.1966 - 0.2050 --> 0.1949 - 0.2039 (num_methods)
# NOTE(review): the last four scores are ten times larger than every other score in this cell;
# presumably a leading zero is missing (0.01966 etc.) - TODO confirm
filtered_data = data

# Filter based on num_methods --> >1: 0.02011 - 0.02058 --> >2: 0.02019 - 0.02083 --> >3: 0.01918 - 0.01965
# filtered_data = data[(data.purchased == 1) | (data['Repurchase (radek)'] == 1) | (data['num_methods'] > 3)]

In [None]:
# Rebuild train and test from the filtered data
in_train_weeks = filtered_data['week'].isin(train_weeks)
in_test_week = filtered_data['week'] == TEST_WEEK
train = filtered_data[in_train_weeks]
test = filtered_data[in_test_week]

# Basket sizes for training: the number of rows of every (week, customer) combination
basket_sizes = train.groupby(['week', 'customer_id'])['article_id'].count()
train_baskets = basket_sizes.to_numpy()

In [None]:
# Feature columns for the ranker, grouped by where they come from
article_columns = [
    'article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no', 'index_code',
    'index_group_no', 'section_no', 'garment_group_no',
]
customer_columns = [
    'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age', 'postal_code',
]
custom_columns = [
    'age_group', 'avg_purchaser_age', 'percentage_children_purchases', 'percentage_men_purchases',
    'percentage_women_purchases', 'total_purchases',
]
# Optional extra features, currently not used: 'num_methods', 'bestseller_rank'

# The one hot encoded candidate method columns are appended at the end
columns_to_use = article_columns + customer_columns + custom_columns + list(all_candidate_methods.keys())

In [None]:
%%time

# Feature matrix and label for training, feature matrix for the test week
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

In [None]:
# LambdaRank model evaluated with NDCG; note that n_estimators is set to 1
ranker_params = {
    'objective': "lambdarank",
    'metric': "ndcg",
    'boosting_type': "dart",
    'n_estimators': 1,
    'importance_type': 'gain',
    'verbose': 10,
}
ranker = LGBMRanker(**ranker_params)

In [None]:
%%time

# Train the ranker; `group` is the array of (week, customer) basket sizes built from `train`
ranker = ranker.fit(X=train_X, y=train_y, group=train_baskets)

In [None]:
# Print every feature with its share of the total importance, most important first
importances = ranker.feature_importances_
total_importance = importances.sum()
for feature_idx in importances.argsort()[::-1]:
    print(columns_to_use[feature_idx], importances[feature_idx] / total_importance)

In [None]:
%time

# Use the ranker to creat predictions
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

In [None]:
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer id strings to integers.

    Only the last 16 hex characters of every id are used.
    """
    id_tails = series.str.slice(start=-16)
    return id_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of the hexadecimal string `str` as an integer."""
    id_tail = str[-16:]
    return int(id_tail, base=16)

In [None]:
# Load the sample submission; its customer_id column is iterated below to build the predictions,
# and its prediction column is overwritten before the file is saved
sub = pd.read_csv('data/original/sample_submission.csv')

In [None]:
%%time
preds = []
# Store predictions for each customer
for c_id in customer_hex_id_to_int(sub.customer_id):
    # Ranked candidates of this customer (none if the customer has no candidates in the test week)
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Pad with last week's bestsellers. dict.fromkeys removes repeated article ids while keeping
    # the ranking order, so none of the 12 slots is spent on an article that is already listed.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])

In [None]:
# Create predictions file for kaggle
# Article ids are written with a leading '0' and separated by spaces.
# `preds` is left untouched (list of article id lists) so this cell can safely be re-run.
prediction_strings = [
    ' '.join('0' + str(article_id) for article_id in customer_preds)
    for customer_preds in preds
]
sub['prediction'] = prediction_strings
sub_name = 'candidate_generation_model'
sub.to_csv(f'{sub_name}.csv.gz', index=False)