In [1]:
import pandas as pd
import warnings
import json
warnings.simplefilter(action='ignore', category=FutureWarning)

import os

# Remember the directory the kernel started in. The chdir calls below are
# relative, so without this guard re-running the cell would climb further up the
# directory tree each time and break both the import and every relative path
# used by later cells.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(_NOTEBOOK_START_DIR)

# `model` is imported while the working directory is one level above the
# starting directory.
# NOTE(review): the star import hides where names come from; this notebook uses
# prepare_data, train_model and make_submission, presumably provided by `model`
# -- consider importing them explicitly once confirmed.
os.chdir('../')
from model import *

# All relative paths in the following cells are resolved against this directory.
os.chdir('../../data/')

In [2]:
# Load the three pickled tables: transactions, customers and articles.
transactions = pd.read_pickle('../data/compressed_data/transactions_train.pkl')
customers = pd.read_pickle('../data/compressed_data/customers.pkl')
articles = pd.read_pickle('../data/compressed_data/articles.pkl')

# Number the weeks counting back from the most recent transaction date:
# rows within the last 7 days get week 104, the 7 days before that 103, etc.
days_before_latest = (transactions['t_dat'].max() - transactions['t_dat']).dt.days
transactions['week'] = 104 - days_before_latest // 7

# Sanity check of the resulting week range.
print(
    'First week num: ', transactions['week'].min(),
    '\nLast week num: ', transactions['week'].max(),
    '\n',
)

First week num:  0 
Last week num:  104 



In [3]:
# The week to predict is the one immediately after the newest training week.
latest_train_week = transactions.week.max()
test_week = latest_train_week + 1

# Keep only the 10 most recent weeks of transactions.
recent_mask = transactions.week > latest_train_week - 10
transactions = transactions[recent_mask]

In [4]:
# Show the transaction columns, including the derived 'week' column added above.
transactions.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week'],
      dtype='object')

In [5]:
# Radek's candidate tables, each read from its own CSV file.
candidates_last_purchase, candidates_bestsellers, bestsellers_previous_week = (
    pd.read_csv(csv_path)
    for csv_path in (
        'candidates/radek_last_purchase.csv',
        'candidates/radek_bestsellers.csv',
        'candidates/radek_bestsellers_previous_week.csv',
    )
)

# Load my candidates
## Similar not bought (negative cases only)

In [6]:
# Load my candidates: the "similar not bought" candidate table.
# Columns referenced later in this notebook: week, customer_id, strategy,
# similarity_score.
candidates_similar_not_bought = pd.read_csv('candidates_200_ranks/sim_not_bought.csv')

## Submission loop -- find the best k on Kaggle

In [7]:
# Feature columns passed to prepare_data and train_model. The order is kept
# exactly as originally listed.
columns_to_use = [
    'article_id', 'product_type_no', 'graphical_appearance_no',
    'colour_group_code', 'perceived_colour_value_id',
    'perceived_colour_master_id', 'department_no', 'index_code',
    'index_group_no', 'section_no', 'garment_group_no',
    'FN', 'Active', 'club_member_status', 'fashion_news_frequency',
    'age', 'postal_code',
    'bestseller_rank',
]

# Ranker settings passed to train_model.
# NOTE(review): n_estimators=1 looks like a deliberately minimal setting for
# this experiment -- confirm before reusing these parameters elsewhere.
model_params = dict(
    objective='lambdarank',
    metric='ndcg',
    boosting_type='dart',
    n_estimators=1,
    importance_type='gain',
)

In [8]:
# Article ids belonging to the most recent week in the previous-week
# bestsellers table.
latest_week_mask = bestsellers_previous_week['week'] == bestsellers_previous_week['week'].max()
bestsellers_last_week = bestsellers_previous_week.loc[latest_week_mask, 'article_id'].tolist()

# Earlier variants of this loop: range(20, 201, 10) (crashed at 170),
# range(170, 201, 10) and range(190, 201, 10). Only k = 200 is run here.
for k in range(200, 201, 10):
    print('k = ', k)

    # First k rows of every (week, customer_id) group, in existing row order.
    # NOTE(review): this is only a "top k" if the CSV is already sorted by rank
    # within each group -- confirm against the candidate generation step.
    snb_head = candidates_similar_not_bought.groupby(['week', 'customer_id']).head(k)

    # Drop the bookkeeping columns and add constant transaction-style fields.
    # NOTE(review): presumably these placeholders let the candidates share the
    # transactions schema inside prepare_data -- confirm.
    top_k_snb_weekly = snb_head.drop(columns=['strategy', 'similarity_score']).assign(
        t_dat='2020-07-15',
        price=0,
        sales_channel_id=2,
    )

    # Build the training and test inputs from transactions plus all candidate sets.
    candidate_sets = [candidates_last_purchase, candidates_bestsellers, top_k_snb_weekly]
    feature_tables = [customers, articles]
    train_X, train_y, test_X, test, train_baskets = prepare_data(
        transactions,
        bestsellers_previous_week,
        candidates=candidate_sets,
        features=feature_tables,
        cols_to_use=columns_to_use,
    )

    # Fit the ranker and show the 10 most important features.
    ranker = train_model(
        train_X,
        train_y,
        train_baskets,
        model_params,
        columns_to_use,
        show_importance=10,
    )

    # Write the submission named after k; per the recorded cell output this
    # call also submits the file to Kaggle.
    submission_name = f'submission_snb_{k}'
    make_submission(customers, test, test_X, ranker, bestsellers_last_week, submission_name)



k =  200
Percentage of real transactions:  0.013467521769342657
Mergining features...
Done.
Sorting data...
Done.
Preparing for training...
Done.
Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.137583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 162626513, number of used features: 18
Feature importance:
colour_group_code 0.22660346193184802
product_type_no 0.14202720914045072
section_no 0.14186738798970644
article_id 0.12038481379958879
index_code 0.0813696011667323
bestseller_rank 0.0744456911619956
perceived_colour_master_id 0.07294693109851845
department_no 0.049874748996315034
graphical_appearance_no 0.04420028283109395
garment_group_no 0.030675638617336694
Starting submission process...
Calculating predictions...
Done.
Creating submission...
Done.
S

100%|██████████| 58.5M/58.5M [00:27<00:00, 2.22MB/s]


Successfully submitted to H&M Personalized Fashion RecommendationsSubmission saved and submitted to Kaggle.


In [11]:
# List this competition's submissions via the Kaggle CLI, keeping only the
# first 21 lines of its output.
!kaggle competitions submissions -c h-and-m-personalized-fashion-recommendations | head -n 21

fileName                                                                                                             date                 description                                                                                                   status    publicScore  privateScore  
-------------------------------------------------------------------------------------------------------------------  -------------------  ------------------------------------------------------------------------------------------------------------  --------  -----------  ------------  
submission_snb_200.csv.gz                                                                                            2023-12-12 23:02:39  submission_snb_200                                                                                            complete  0.01085      0.01105       
submission_snb_190.csv.gz                                                                                            2023-12-12 22:41:31  subm