In [1]:
%run helper_functions.ipynb

In [2]:
DRY_RUN = False

In [3]:
import pandas as pd

In [4]:
%%time

if not DRY_RUN:
    transactions = pd.read_parquet('../data/transactions_train/transactions_train.parquet')
    customers = pd.read_parquet('../data/customers/customers.parquet')
    articles = pd.read_parquet('../data/articles/articles.parquet')
else:
    sample = 0.05
    transactions = pd.read_parquet(f'../data/transactions_train/transactions_train_sample_{sample}.parquet')
    customers = pd.read_parquet(f'../data/customers/customers_sample_{sample}.parquet')
    articles = pd.read_parquet(f'../data/articles/articles_train_sample_{sample}.parquet')

CPU times: total: 5.25 s
Wall time: 5.21 s


In [5]:
test_week = transactions.week.max()

In [6]:
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

The records I generate below can both be used as candidates for evaluation by our ranker (for the test week) and as negative examples (for training data).

I am going with the name candidates for both of these use cases, but that is certainly not ideal. Feel free to refactor the naming to increase the clarity of your code.

### Last purchase candidates

In [7]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: total: 23.1 s
Wall time: 23.3 s


In [8]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: total: 641 ms
Wall time: 631 ms


In [9]:
candidates_last_purchase = transactions.copy()

In [10]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: total: 21.2 s
Wall time: 21.3 s


### Bestsellers candidates

In [12]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank')

In [13]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [14]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [15]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [16]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [17]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [18]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combining transactions and candidates / negative examples

In [19]:
transactions['purchased'] = 1

In [20]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

data.purchased.mean()

0.14606530579445656

### Add bestseller information

In [21]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [22]:
data = data[data.week != data.week.min()].copy()
data.bestseller_rank.fillna(999, inplace=True)

In [23]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [24]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [25]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [26]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [27]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [28]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: total: 438 ms
Wall time: 432 ms


# Model training

In [30]:
from lightgbm.sklearn import LGBMRanker

In [31]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [32]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.847990
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.152231
[LightGBM] [Debug] init for col-wise cost 0.204296 seconds, init for row-wise cost 0.584408 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.448383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1077
[LightGBM] [Info] Number of data points in the train set: 10617433, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
CPU times: total: 22 s
Wall time: 8.52 s


In [33]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9988221008359978
article_id 0.00032280530876886384
age 0.000292525452583308
garment_group_no 0.0001832666983498687
postal_code 8.221541378404916e-05
product_type_no 7.728522095350483e-05
club_member_status 7.102321444967734e-05
department_no 6.243632294211845e-05
Active 4.033095364965426e-05
colour_group_code 2.739073526846399e-05
perceived_colour_value_id 1.8619843252697374e-05
fashion_news_frequency 0.0
FN 0.0
section_no 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


# Calculate predictions

In [34]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = ['0' + str(a_id) for a_id in \
                             bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()]

CPU times: total: 0 ns
Wall time: 0 ns


# Create a submission

In [35]:
sub = pd.read_csv('data/sample_submission.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/sample_submission.csv'

In [35]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    if c_id in c_id2predicted_article_ids:
        ps = ['0' + str(p) for p in c_id2predicted_article_ids[c_id]]
        preds.append(ps)
    else:
        preds.append(bestsellers_last_week)

CPU times: user 3.6 s, sys: 179 ms, total: 3.78 s
Wall time: 3.78 s


In [36]:
preds = [' '.join(ps) for ps in preds]
sub.prediction = preds

In [37]:
sub_name = 'basic_with_bestseller_information'
sub.to_csv(f'data/subs/{sub_name}.csv.gz', index=False)

In [38]:
!kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f 'data/subs/{sub_name}.csv.gz' -m {sub_name}

100%|██████████████████████████████████████| 58.4M/58.4M [00:31<00:00, 1.92MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations

# Where to go from here

This notebook has all the components of a valid solution.

We generate negative examples.
We generate candidates for the test set.
We also enrich transaction records with calculated statistics (bestseller information from previous week).

From there on we train a model and use that model to output predictions.

But all of this is intentionally very basic. Look how many components we had to get in place to get to this spot!

Now the fun part begins.

Further work on this should fall into three categories IMHO:
* use functionality from this and the `Solution Warmup` notebook to work out a good way of validating the LGBM ranker and end-to-end predictions (all the components for this have been developed, they just need to be piece together)
* develop useful features that would describe (Kaggle kernels for this competition already have a lot of code you can reuse, everything by Chris Deotte is gold)
    * the customer
    * the article
    * customer tendencies
* train a couple of varied models and figure out how to blend them