This code is from [https://github.com/radekosmulski/personalized_fashion_recs](https://github.com/radekosmulski/personalized_fashion_recs) with extra options and some improvements.
Comments explaining the original notebook code were added by me and Arno Troch

First code cell contains hardcoded variables that you can change.

Added functionality:
 1. Optionally train on past X weeks and past X weeks of last year (not recommended)
2. If a sample is generated by multiple methods, count by how many methods it was generated
3. Scale data

Results:
1. 0.02087 -> 0.01397
2. 0.02087 -> 0.02114
3. No difference


In [176]:
LGBMBoostingType = 'dart'
preprocess = '-1'  # If you get an error saying that the dataset is not found, make sure to first run part 1 with the same type of preprocessing as indicated here
bestsellerFiller = 999  # If item not in top 12 bestsellers of a week, fill with this value
transactionBackXWeeks = 10  # Size of train+test set
prevYear = ''  # if SkipYear, train on transactionBackXWeeks and transactionBackXWeeks but last year. Very poor results, not recommended.
assert LGBMBoostingType in ['gbdt','dart','goss','rf']
assert preprocess in ['-1','edited']
assert prevYear in ["","SkipYear"]

In [150]:
%run helper_functions.ipynb

In [151]:
import pandas as pd

In [152]:
%%time

transactions = pd.read_parquet(f'../data/transactions_train_{preprocess}.parquet')
customers = pd.read_parquet(f'../data/customers_{preprocess}.parquet')
articles = pd.read_parquet(f'../data/articles_{preprocess}.parquet')
# sample = 0.05
# transactions = pd.read_parquet(f'../data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'../data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'../data/articles_train_sample_{sample}.parquet')

CPU times: total: 1.28 s
Wall time: 1.1 s


In [153]:
test_week = transactions.week.max() + 1
if prevYear == 'SkipYear':
    transactions3 = transactions[transactions.week > transactions.week.max() - transactionBackXWeeks]
    transactions2 = transactions[(transactions.week.max()-52>=transactions.week) & (transactions.week >transactions.week.max() - transactionBackXWeeks-52)] # EDITED
    print(transactions3['week'].unique())
    print(transactions2['week'].unique())
    transactions = pd.concat([transactions3,transactions2])
else:
    transactions = transactions[transactions.week > transactions.week.max() - transactionBackXWeeks]


[ 95  96  97  98  99 100 101 102 103 104]
[43 44 45 46 47 48 49 50 51 52]


# Generating candidates

### Last purchase candidates

In [154]:

c2weeks = transactions.groupby('customer_id')['week'].unique()

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

candidates_last_purchase = transactions.copy()

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])

# Candidate for week X: item bought in previous purchase week
candidates_last_purchase.week=weeks

In [155]:
print(candidates_last_purchase)

              t_dat           customer_id  article_id     price  \
29030503 2020-07-15       272412481300040   778064028  0.008458   
29030504 2020-07-15       272412481300040   816592008  0.016932   
29030505 2020-07-15       272412481300040   621381021  0.033881   
29030506 2020-07-15       272412481300040   817477003  0.025407   
29030507 2020-07-15       272412481300040   899088002  0.025407   
...             ...                   ...         ...       ...   
16957605 2019-09-24  18443039671924470908   792521006  0.016932   
16943168 2019-09-24  18445164350380731040   733099001  0.016932   
16928374 2019-09-24  18445187566593112488   787028001  0.101678   
16928375 2019-09-24  18445187566593112488   787028001  0.101678   
16928376 2019-09-24  18445187566593112488   786818001  0.101678   

          sales_channel_id  week  
29030503                 1    96  
29030504                 1    96  
29030505                 1    96  
29030506                 1    96  
29030507            

### Bestsellers candidates

In [156]:
# mean price PER ITEM PER WEEK
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

# bestseller rank doet niets: ranking is belangrijk om de bestsellers te vinden, maar de kolom zelf mag weg
# For each week, list of ranked 12 bestsellers
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

# Voor elke week, zegt ge koop het best verkochte item in de vorige week
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
# Per week lijst van customers die IETS gekocht hebben
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

# Per week lijst van customers die IETS gekocht hebben
# MERGE
# Voor elke week, zegt ge koop het best verkochte item in de vorige week

# Per week, per customer die iets gekocht heeft, de 12 bestverkochte uit DE (algemeen) vorige week
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

# unique_transactions = Per week lijst van customers die IETS gekocht hebben
# Voor elke customer waar we iets over weten en dus een voorspelling van willen doen, houden we 1 keer de customer id over en zetten we de week op test_week, want dat is wanneer we willen voorspellen wat hij koopt
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week


# Voor elke customer waar we iets over weten en dus een voorspelling van willen doen, houden we 1 keer de customer id over en zetten we de week op test_week, want dat is wanneer we willen voorspellen wat hij koopt
# MERGE
# Voor elke week, zegt ge koop het best verkochte item in de vorige week

# Resultaat: voor elke customer waarvoor we iets kunnen voorspellen, geven we de 12 bestseller van testweek-1 als candidate voor testweek
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

# bestseller rank doet niets: ranking is belangrijk om de bestsellers te vinden, maar de kolom zelf mag weg

# Per week, per customer die iets gekocht heeft, de 12 bestverkochte uit DE (algemeen) vorige week
# Resultaat: voor elke customer waarvoor we iets kunnen voorspellen, geven we de 12 bestseller van testweek-1 als candidate voor testweek

candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combining transactions and candidates / negative examples

transactions['purchased'] = 1


# candidates_last_purchase: Candidate for week X: item bought in previous purchase week
# candidates_bestsellers: voor elke customer waarvoor we iets kunnen voorspellen, geven we de 12 bestseller van testweek-1 als candidate voor testweek
# transactions: letterlijk gewoon transactions
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

# Voor elke week: kijk alle keren dat customer het artikel koopt OF voorgesteld krijgt, en hou indien gekocht enkel de rij met purchased 1
# Opmerking: candidates voor week 105 zijn allemaal purchased==0
brak = data.groupby(['customer_id', 'article_id', 'week']).size().reset_index(name="importance")
print(brak)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

data = pd.merge(
    data,
    brak,
    on=['customer_id', 'article_id', 'week']
)

data.purchased.mean()
print(data["importance"].isna().sum())
print(data["importance"].max())
print(data["importance"].mean())
print(data["importance"].min())

data.head()

                   customer_id  article_id  week  importance
0               23962613628581   448509014   105           1
1               23962613628581   484398001    45           1
2               23962613628581   554811008    45           1
3               23962613628581   564786001    45           1
4               23962613628581   594264006    43           1
...                        ...         ...   ...         ...
33686305  18446737527580148316   923758001   104           1
33686306  18446737527580148316   923758001   105           1
33686307  18446737527580148316   924243001   104           1
33686308  18446737527580148316   924243001   105           1
33686309  18446737527580148316   924243002   105           1

[33686310 rows x 4 columns]
0
74
1.0424481636605494
1


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,importance
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0,1
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0,1
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0,1
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0,1
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0,1


### Add bestseller information

In [157]:
# van echte transacties: bestseller onbekend, check candidates om te kijken of er toen wel bestseller rank was. Zo nee, vul later met fillna
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [158]:
# Verwijder eerste week omdat er voor eerste week geen bestsellers_previous_week is
data = data[data.week != data.week.min()]  # Presumably to make sure no data of an incomplete week is included?
# Indien geen bestseller: keislecht verkocht
data.bestseller_rank.fillna(bestsellerFiller, inplace=True)  # EDITED

In [159]:
# per customer per week alle transacties en/of candidates

# Steek bij elke aankoop alle info over gekocht article erbij
data = pd.merge(data, articles, on='article_id', how='left')
# Steek bij elke aankoop alle info over customer erbij
data = pd.merge(data, customers, on='customer_id', how='left')

In [160]:
# Sorteer eerst op week, dan per week op customer
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [161]:
data.head()

columns_to_use = ['product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank','importance']

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
data[columns_to_use] = scaler.fit_transform(data[columns_to_use])

columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank','importance']

In [162]:
# Niet trainen op laatste week want anders hebben we geen test set
train = data[data.week != test_week]
# Laatste week, indien item in beide candidate sets, drop duplicates.
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
test.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,importance,bestseller_rank,product_code,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
23285066,2019-08-03,23962613628581,732842001,0.06778,1,105,0.0,-0.157588,1.569887,732842,...,37,0.992958,11,780,-0.877122,-0.865376,-0.119615,-0.852173,-0.108947,-0.67849
23285067,2019-07-18,23962613628581,924243001,0.041535,2,105,0.0,-0.157588,-0.649132,924243,...,0,-0.989949,3,13007,-0.877122,-0.865376,-0.119615,-0.852173,-0.108947,-0.67849
23285068,2019-07-18,23962613628581,924243002,0.041877,2,105,0.0,-0.157588,-0.646908,924243,...,0,-0.989949,3,13007,-0.877122,-0.865376,-0.119615,-0.852173,-0.108947,-0.67849
23285069,2019-07-18,23962613628581,918522001,0.041435,2,105,0.0,-0.157588,-0.644685,918522,...,0,-0.989949,3,28633,-0.877122,-0.865376,-0.119615,-0.852173,-0.108947,-0.67849
23285070,2019-07-18,23962613628581,923758001,0.033462,2,105,0.0,-0.157588,-0.642462,923758,...,0,0.07777,6,27869,-0.877122,-0.865376,-0.119615,-0.852173,-0.108947,-0.67849


In [163]:
print(train.groupby(['week', 'customer_id']).head())
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
print(train_baskets)
print(train_baskets.min())
print(train_baskets.max())
print(len(train_baskets))

              t_dat           customer_id  article_id     price  \
0        2019-07-27        77117344919861   471714008  0.007983   
1        2019-07-27        77117344919861   763037001  0.040034   
2        2019-07-22        77117344919861   780918001  0.013203   
3        2019-07-22        77117344919861   663498003  0.033034   
4        2019-07-22        77117344919861   755458008  0.041288   
...             ...                   ...         ...       ...   
23285050 2020-09-21  18446737527580148316   547780001  0.023712   
23285051 2020-09-21  18446737527580148316   763988001  0.023712   
23285052 2020-09-21  18446737527580148316   763988003  0.023712   
23285053 2020-09-21  18446737527580148316   547780040  0.023712   
23285054 2020-09-21  18446737527580148316   909370001  0.032947   

          sales_channel_id  week  purchased  importance  bestseller_rank  \
0                        2    44        1.0    3.663422         1.569887   
1                        2    44        1.0

In [164]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: total: 359 ms
Wall time: 978 ms


# Model training

In [165]:
from lightgbm.sklearn import LGBMRanker

In [166]:
ranker=LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type=LGBMBoostingType,
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [167]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.972335
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.102351
[LightGBM] [Debug] init for col-wise cost 0.127235 seconds, init for row-wise cost 0.334982 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 23285066, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 15
CPU times: total: 36.5 s
Wall time: 9.2 s


In [168]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9607124084765173
importance 0.03801438353073763
age 0.00034996033614452917
product_type_no 0.00030221364492572135
article_id 0.00021790279307392274
department_no 0.00013071359975942156
postal_code 0.00010661712071155978
club_member_status 0.00010304844956410496
garment_group_no 6.275204856586346e-05
Active 0.0
FN 0.0
fashion_news_frequency 0.0
index_group_no 0.0
index_code 0.0
perceived_colour_master_id 0.0
perceived_colour_value_id 0.0
colour_group_code 0.0
graphical_appearance_no 0.0
section_no 0.0


# Calculate predictions

In [169]:
%%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: total: 6.62 s
Wall time: 9.24 s


# Create submission

In [170]:
sub = pd.read_csv('../data/sample_submission.csv')

In [171]:
%%time
preds = []

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)


for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: total: 2.8 s
Wall time: 8.27 s


In [172]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [177]:
sub_name = 'basic_model_submission_' +  str(LGBMBoostingType) + '_fillna' + str(preprocess) + 'bestsellerFiller' + str(bestsellerFiller) + "_weeks" + str(transactionBackXWeeks) + "_importance" + str(prevYear)
sub.to_csv(f'../data/subs/{sub_name}.csv.gz', index=False)
sub.to_csv(f'../data/subs/{sub_name}.csv', index=False)
print("Done")
print(sub_name)

Done
basic_model_submission_dart_fillna-1bestsellerFiller999_weeks10_importanceSkipYear


In [174]:
# !kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f 'data/subs/{sub_name}.csv.gz' -m {sub_name}