# Notebook for generating the Kaggle submission file
(The individual steps are laid out more clearly in EnsembleOfEnsembles_OtherMetrics.ipynb.)

In [1]:
import pandas as pd
from PrepareData import prepare_data
from lightgbm.sklearn import LGBMRanker
from AdaBoostRanker import AdaBoostRanker

In [2]:
# Load everything the notebook needs from the project's PrepareData module (not shown here).
# As used further down: `train`/`test` are DataFrames holding the feature columns,
# `train_baskets` is passed as `group=` when fitting the LightGBM ranker, and
# `bestsellers_previous_week` is a table with `week` and `article_id` columns.
# NOTE(review): the exact schema is defined in PrepareData.py - confirm there.
train, test, train_baskets, bestsellers_previous_week = prepare_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions['purchased'] = 1


In [3]:
# Feature columns fed to both models. The order matters: the feature-importance
# cells map importance positions back to names by indexing into this list.
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    'bestseller_rank',
]

In [4]:
# Train and test feature matrices use exactly the same columns, in the same order.
train_X, test_X = (frame[columns_to_use] for frame in (train, test))

# Label column used as the target when fitting both models.
train_y = train['purchased']

In [5]:
# First ensemble member: a LightGBM learning-to-rank model.
lgbm_ranker = LGBMRanker(
    objective="lambdarank",  # ranking objective; needs per-query group sizes at fit time
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,  # a single boosting round, i.e. one tree
    importance_type='gain',  # feature_importances_ reports total split gain, not split counts
    verbose=10  # high verbosity: LightGBM also prints its [Debug] lines during fit
)

In [6]:
# Second ensemble member, built with its default settings. AdaBoostRanker is a
# project-local class (AdaBoostRanker module); its defaults are not visible here.
adaboost_ranker = AdaBoostRanker()

In [7]:
# Fit the LightGBM ranker; `group` tells it how many consecutive training rows
# belong to each query.
lgbm_ranker = lgbm_ranker.fit(X=train_X, y=train_y, group=train_baskets)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153099
[LightGBM] [Debug] init for col-wise cost 0.182110 seconds, init for row-wise cost 0.949143 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.580947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [8]:
# Print each feature with its share of the total importance, most important first.
lgbm_importances = lgbm_ranker.feature_importances_
lgbm_shares = lgbm_importances / lgbm_importances.sum()
for col_idx in lgbm_importances.argsort()[::-1]:
    print(columns_to_use[col_idx], lgbm_shares[col_idx])

bestseller_rank 0.9989805519216203
age 0.00024136038957903926
article_id 0.00017160828400263902
garment_group_no 0.0001448188543340445
department_no 9.637421875769266e-05
product_type_no 9.014783292439592e-05
section_no 7.067204716548531e-05
postal_code 6.792197441369627e-05
club_member_status 6.519780240033951e-05
colour_group_code 5.358754121027148e-05
perceived_colour_value_id 1.775913359216025e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


In [9]:
# Fit the AdaBoost model on the same features and labels as the LightGBM ranker.
# Unlike the LightGBM fit, no group/basket information is passed here.
adaboost_ranker = adaboost_ranker.fit(
    train_X,
    train_y,
)

In [10]:
# Print each feature with its share of the total importance, most important first.
ada_importances = adaboost_ranker.feature_importances_
ada_shares = ada_importances / ada_importances.sum()
for col_idx in ada_importances.argsort()[::-1]:
    print(columns_to_use[col_idx], ada_shares[col_idx])

article_id 0.30000000000000004
department_no 0.16000000000000003
bestseller_rank 0.12000000000000001
garment_group_no 0.10000000000000002
product_type_no 0.10000000000000002
perceived_colour_master_id 0.060000000000000005
colour_group_code 0.04000000000000001
fashion_news_frequency 0.020000000000000004
club_member_status 0.020000000000000004
section_no 0.020000000000000004
postal_code 0.020000000000000004
age 0.020000000000000004
graphical_appearance_no 0.020000000000000004
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_value_id 0.0
index_group_no 0.0


In [11]:
# Score every test row with the LightGBM ranker and store the score on `test`,
# where the interleaving cell reads it back.
test['lgbm_preds'] = lgbm_ranker.predict(test_X)

In [12]:
# Score every test row with the AdaBoost model. predict() is indexed as a 2-D array
# and only column 1 is kept.
# NOTE(review): column 1 is presumably the positive ("purchased") class score -
# confirm against AdaBoostRanker.predict.
test['adaboost_preds'] = adaboost_ranker.predict(test_X)[:,1]

In [13]:
# Alternative ensembling strategy (disabled): rank each model's predictions and sum the two ranks
#test['preds'] = (test['lgbm_preds'].rank() + test['adaboost_preds'].rank())

In [14]:
# Alternative ensembling strategy (disabled): rank each model's predictions and multiply the two ranks
#test['preds'] = (test['lgbm_preds'].rank() * test['adaboost_preds'].rank())

In [15]:
# Interleaving: combine rankings A (AdaBoost) and B (LightGBM) per customer by
# reordering as A[0], B[0], A[1], B[1], A[2], ...
# If the next item from B was already taken from A (or vice versa), it is skipped
# and the next unseen item is used instead.

# Order each model's candidates per customer, best score first. Both frames are
# built from the same rows and sorted by customer_id first, so position i in `ada`
# and position i in `lgbm` always belong to the same customer. reset_index() gives
# both frames a fresh positional index 0..n-1 (the old index is kept as a column).
lgbm = test[['article_id', 'customer_id', 'lgbm_preds']].sort_values(by=['customer_id', 'lgbm_preds'], ascending=False).reset_index()
ada = test[['article_id', 'customer_id', 'adaboost_preds']].sort_values(by=['customer_id', 'adaboost_preds'], ascending=False).reset_index()

# After concatenation every positional index value occurs twice (ada row first,
# lgbm row second). The sort must be stable so that ada[i] reliably precedes
# lgbm[i]; the default quicksort gives no guarantee for equal keys.
interleaved = pd.concat([ada[['article_id', 'customer_id']], lgbm[['article_id', 'customer_id']]]).sort_index(kind='mergesort').reset_index(drop=True)

# Keep only the first (best-placed) occurrence of each customer/article pair.
# reset_index() without drop exposes the interleaved position as column 'index'.
interleaved = interleaved.drop_duplicates(subset=['customer_id', 'article_id'], keep='first')\
                .reset_index()

# Turn positions into scores where larger means better, so that sorting by
# 'preds' in descending order reproduces the interleaved order.
interleaved['preds'] = interleaved.groupby('customer_id')['index'].rank(method="dense", ascending=False)

# Drop any 'preds' column left over from a previous run of this cell; otherwise the
# merge would produce 'preds_x'/'preds_y' and later lookups of 'preds' would fail.
test = pd.merge(test.drop(columns=['preds'], errors='ignore'), interleaved[['customer_id', 'article_id', 'preds']], on=['customer_id', 'article_id'], how='left')

In [16]:
# Per customer: candidate article ids ordered from highest to lowest 'preds'.
c_id2predicted_article_ids = (
    test
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Article ids from the most recent week present in the bestseller table.
bestsellers_last_week = bestsellers_previous_week.loc[
    bestsellers_previous_week.week.eq(bestsellers_previous_week.week.max()),
    'article_id',
].tolist()

# Create submission

In [17]:
def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal id string as a base-16 integer.

    The parameter name shadows the builtin ``str``; it is kept unchanged so that
    existing keyword callers keep working.
    """
    hex_tail = str[-16:]
    return int(hex_tail, 16)


def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal id strings to integers.

    Only the last 16 characters of every string are used; each is converted
    with ``hex_id_to_int``.
    """
    return series.str.slice(start=-16).apply(hex_id_to_int)

In [18]:
# Load the sample submission. Its `customer_id` column drives the prediction loop,
# its `prediction` column is overwritten, and the frame is then written out as the submission.
sub = pd.read_csv('../../../Data/sample_submission.csv')

In [19]:
%%time
# Build the final list of (at most) 12 article ids for every customer in the
# submission template.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    # Customers without any ranked candidates get bestsellers only.
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Pad with last week's bestsellers, then remove repeats while keeping the first
    # (best-ranked) occurrence - dict.fromkeys preserves insertion order. Without
    # this, an article that is both a ranked candidate and a bestseller (or that
    # appears twice among the candidates) would take up more than one of the 12 slots.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])

CPU times: user 3.91 s, sys: 160 ms, total: 4.07 s
Wall time: 4.07 s


In [20]:
# Render each customer's article ids as one space-separated string.
# The result is stored under a new name instead of rebinding `preds`: re-running
# this cell on already-formatted strings would iterate over single characters and
# silently corrupt the submission.
# NOTE(review): the '0' prefix presumably restores a leading zero lost when
# article_id was parsed as an integer - confirm against the data preparation.
prediction_strings = [' '.join('0' + str(p) for p in ps) for ps in preds]

# Item assignment always writes a column; attribute assignment (`sub.prediction = ...`)
# only does so when the column already exists.
sub['prediction'] = prediction_strings

In [21]:
# Write the submission without the DataFrame index. The '.gz' suffix makes pandas
# infer gzip compression from the file name.
sub_name = '../../Submissions_EnsembleOfEnsembles/LGBMAdaBoostInterleaved'
sub.to_csv(f'{sub_name}.csv.gz', index=False)