This code is from [https://github.com/radekosmulski/personalized_fashion_recs](https://github.com/radekosmulski/personalized_fashion_recs) with extra options and some improvements.
Comments explaining the original notebook's code were added by me and Arno Troch.

In [1]:
import time

# --- Experiment configuration (validated by the asserts below) ---
# Boosting algorithm passed to LGBMRanker(boosting_type=...).
LGBMBoostingType = 'dart'
# Suffix selecting which preprocessed parquet files are loaded
# (../data/transactions_train_{preprocess}.parquet, customers_..., articles_...).
preprocess = '-1'
# Value written into bestseller_rank for rows that have no bestseller rank.
bestsellerFiller = 999
# Number of most recent weeks of transactions that are kept.
transactionBackXWeeks = 10
# '' keeps only the recent window; 'SkipYear' additionally keeps the same window 52 weeks earlier.
prevYear = ''
# Fail fast on unsupported option values.
assert LGBMBoostingType in ['gbdt','dart','goss','rf']
assert preprocess in ['-1','edited']
assert prevYear in ["","SkipYear"]

In [2]:
%run helper_functions.ipynb

In [3]:
import pandas as pd

In [4]:
%%time

# Load the preprocessed tables selected by `preprocess`.
transactions = pd.read_parquet(f'../data/transactions_train_{preprocess}.parquet')
# Independent copy of the complete history: `transactions` is narrowed to the most
# recent weeks further down, while the purchase-count features use every week.
# Copying in memory gives the same frame without reading the parquet file twice.
transactions_full = transactions.copy()
customers = pd.read_parquet(f'../data/customers_{preprocess}.parquet')
articles = pd.read_parquet(f'../data/articles_{preprocess}.parquet')
# Optional: work on a sample instead of the full data set.
# sample = 0.05
# transactions = pd.read_parquet(f'../data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'../data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'../data/articles_train_sample_{sample}.parquet')


CPU times: total: 2.47 s
Wall time: 2.26 s


In [5]:
# The week to predict is the one right after the last week present in the data.
last_week = transactions.week.max()
test_week = last_week + 1

# Rows belonging to the most recent `transactionBackXWeeks` weeks.
recent_mask = transactions.week > last_week - transactionBackXWeeks

if prevYear != 'SkipYear':
    transactions = transactions[recent_mask]
else:
    transactions3 = transactions[recent_mask]
    # Same window, shifted 52 weeks into the past.
    year_ago_mask = (transactions.week <= last_week - 52) & (transactions.week > last_week - transactionBackXWeeks - 52)
    transactions2 = transactions[year_ago_mask] # EDITED
    print(transactions3['week'].unique())
    print(transactions2['week'].unique())
    # Recent rows first, then the rows from one year earlier.
    transactions = pd.concat([transactions3, transactions2])


In [32]:
import copy
# Given transactions and arbitrary article features (except article id), returns a df with one row per
# combination of customer_id and feature values, holding how many purchases fall in that combination.
def get_purchase_count_df_of_attributes(transactions,articles,attributes_columns_names,feature_name):
    """Count purchases per customer and per combination of article attributes.

    Parameters
    ----------
    transactions : DataFrame with at least ``customer_id`` and ``article_id`` columns.
    articles : DataFrame with ``article_id`` and every column named in
        ``attributes_columns_names``.
    attributes_columns_names : column name, or iterable of column names (list,
        tuple, ...), of the article attributes to group by. It is not modified.
    feature_name : name of the resulting count column.

    Returns
    -------
    DataFrame with columns ``customer_id``, the attribute columns and
    ``feature_name``. Only combinations that occur in ``transactions`` appear.
    """
    # Accept a bare column name as well as any iterable of names.
    if isinstance(attributes_columns_names, str):
        attribute_columns = [attributes_columns_names]
    else:
        attribute_columns = list(attributes_columns_names)

    articles_selected = articles[["article_id"] + attribute_columns]
    # Inner join: attach the attribute values of the bought article to every transaction.
    big_df = pd.merge(articles_selected, transactions[["customer_id", "article_id"]], on=["article_id"])
    # One row per transaction, so the group size is the number of purchases.
    return big_df.groupby(["customer_id"] + attribute_columns).size().reset_index(name=feature_name)

def get_purchase_rank_df_of_attributes(transactions,articles,attributes_columns_names,feature_name):
    """Count purchases per customer and attribute combination, and rank them per customer.

    Parameters
    ----------
    transactions : DataFrame with at least ``customer_id`` and ``article_id`` columns.
    articles : DataFrame with ``article_id`` and every column named in
        ``attributes_columns_names``.
    attributes_columns_names : column name, or iterable of column names (list,
        tuple, ...), of the article attributes to group by. It is not modified.
    feature_name : name of the count column; the rank column is named
        ``feature_name + "_rank"``.

    Returns
    -------
    DataFrame with columns ``customer_id``, the attribute columns,
    ``feature_name`` (purchase count) and ``feature_name + "_rank"``.
    The rank is a dense rank within each customer, 1.0 being the highest count.
    """
    # Accept a bare column name as well as any iterable of names.
    if isinstance(attributes_columns_names, str):
        attribute_columns = [attributes_columns_names]
    else:
        attribute_columns = list(attributes_columns_names)

    articles_selected = articles[["article_id"] + attribute_columns]
    # Inner join: attach the attribute values of the bought article to every transaction.
    big_df = pd.merge(articles_selected, transactions[["customer_id", "article_id"]], on=["article_id"])
    big_df = big_df.groupby(["customer_id"] + attribute_columns).size().reset_index(name=feature_name)
    # Dense, descending rank: equal counts share a rank and no rank numbers are skipped.
    big_df[feature_name + "_rank"] = big_df.groupby("customer_id")[feature_name].rank(method="dense", ascending=False)
    return big_df

In [9]:
# Quick check of the count helper: number of purchases per customer and garment group.
# NOTE(review): the count column is named "colour_code_amount" although the grouping
# attribute is garment_group_no — presumably a leftover name; confirm before reusing it.
temp_1 = get_purchase_count_df_of_attributes(transactions,articles,["garment_group_no"],"colour_code_amount")
temp_1.head()

Unnamed: 0,customer_id,garment_group_no,colour_code_amount
0,28847241659200,1005,1
1,28847241659200,1007,1
2,28847241659200,1009,1
3,28847241659200,1010,2
4,41318098387474,1013,1


In [34]:
# Quick check of the rank helper: purchase count per customer and garment group,
# plus the dense rank of that count within each customer (1.0 = most bought).
temp2 = get_purchase_rank_df_of_attributes(transactions,articles,["garment_group_no"],"amount_of_garment_group_no")
temp2.head()

Unnamed: 0,customer_id,garment_group_no,amount_of_garment_group_no,amount_of_garment_group_no_rank
0,28847241659200,1005,1,2.0
1,28847241659200,1007,1,2.0
2,28847241659200,1009,1,2.0
3,28847241659200,1010,2,1.0
4,41318098387474,1013,1,1.0


# Generating candidates

### Last purchase candidates

In [68]:

# For every customer: the distinct weeks in which they bought something.
c2weeks = transactions.groupby('customer_id')['week'].unique()

# customer_id -> {purchase week -> next purchase week}; the customer's last
# purchase week maps to test_week.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    # unique() returns weeks in order of first appearance, which is chronological
    # only if the rows are sorted by date. That does not hold when
    # prevYear == 'SkipYear' (recent rows are concatenated before the year-old
    # rows), so sort explicitly before pairing each week with its successor.
    ordered_weeks = sorted(weeks)
    shifted = dict(zip(ordered_weeks, ordered_weeks[1:]))
    shifted[ordered_weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = shifted

candidates_last_purchase = transactions.copy()

# Row-aligned with `transactions`: the week for which each purchase becomes a candidate.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

# Candidate for week X: item bought in previous purchase week
candidates_last_purchase['week'] = weeks

In [69]:
# Inspect the last-purchase candidates: same rows as `transactions`, with `week`
# replaced by the week for which the purchase serves as a candidate.
print(candidates_last_purchase)

              t_dat           customer_id  article_id     price  \
29030503 2020-07-15       272412481300040   778064028  0.008458   
29030504 2020-07-15       272412481300040   816592008  0.016932   
29030505 2020-07-15       272412481300040   621381021  0.033881   
29030506 2020-07-15       272412481300040   817477003  0.025407   
29030507 2020-07-15       272412481300040   899088002  0.025407   
...             ...                   ...         ...       ...   
31774722 2020-09-22  18439937050817258297   891591003  0.084729   
31774723 2020-09-22  18439937050817258297   869706005  0.084729   
31779097 2020-09-22  18440902715633436014   918894002  0.016932   
31779098 2020-09-22  18440902715633436014   761269001  0.016932   
31780475 2020-09-22  18443633011701112574   914868002  0.033881   

          sales_channel_id  week  
29030503                 1    96  
29030504                 1    96  
29030505                 1    96  
29030506                 1    96  
29030507            

### Bestsellers candidates

In [70]:
# Mean price per article per week.
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

# For each week: the 12 best-selling articles with their dense rank by number of sales.
# (value_counts orders each week's articles by descending count by default, so
# head(12) keeps the top of that ordering.)
# The rank is only needed to find the bestsellers; the column is dropped from the
# candidate frame at the end of this block.
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

# Shift by one week: the rows labelled week W hold the bestsellers of week W-1,
# i.e. "for week W, suggest what sold best the week before".
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
# One row per (week, customer) that bought anything in that week
# (first transaction of the group, without article_id and price).
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

# Join "customers active in week W" with "bestsellers of week W-1":
# per week and per customer that bought something, the 12 overall bestsellers
# of the previous week become candidates.
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

# Every customer that appears in `unique_transactions` gets exactly one row with
# the week set to test_week, because that is the week we want to predict for.
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week


# Join those customers with the bestsellers labelled test_week
# (i.e. the bestsellers of test_week - 1).
# Result: for every customer we can predict for, the 12 bestsellers of
# test_week - 1 are candidates for test_week.
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

# Stack the candidates of the training weeks and of the test week, then drop the
# rank column from the candidate frame (the rank itself stays available in
# `bestsellers_previous_week`).

candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combining transactions and candidates / negative examples

# Real purchases are the positive examples.
transactions['purchased'] = 1


# candidates_last_purchase: candidate for week X = item bought in the previous purchase week
# candidates_bestsellers: per customer and week, the 12 bestsellers of the week before
# transactions: the real transactions themselves
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Candidate rows have no `purchased` value yet: mark them as negatives.
# Assign the result back instead of calling fillna(inplace=True) on the attribute:
# the chained in-place form modifies a temporary under pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)

# For each (customer, article, week): count how often the pair occurs as a purchase
# OR as a candidate. That count is kept as the `importance` feature.
# Note: candidates for the test week (105 in this run) are all purchased == 0.
brak = data.groupby(['customer_id', 'article_id', 'week']).size().reset_index(name="importance")
print(brak)
# Keep the first row of each key. Real transactions come first in the concat above,
# so a purchased == 1 row wins over candidate rows for the same key.
data = data.drop_duplicates(['customer_id', 'article_id', 'week'])

data = pd.merge(
    data,
    brak,
    on=['customer_id', 'article_id', 'week']
)

# Sanity checks on the new feature: missing values, max, mean, min.
print(data["importance"].isna().sum())
print(data["importance"].max())
print(data["importance"].mean())
print(data["importance"].min())

data.head()

                   customer_id  article_id  week  importance
0               28847241659200   372860002    96           1
1               28847241659200   448509014   105           1
2               28847241659200   547780003    96           1
3               28847241659200   600886001    96           1
4               28847241659200   610776002    96           1
...                        ...         ...   ...         ...
18253744  18446737527580148316   923758001   104           1
18253745  18446737527580148316   923758001   105           1
18253746  18446737527580148316   924243001   104           1
18253747  18446737527580148316   924243001   105           1
18253748  18446737527580148316   924243002   105           1

[18253749 rows x 4 columns]
0
74
1.0362430205433415
1


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,importance
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0,1
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0,1
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0,1
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0,1
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0,1


### Add bestseller information

In [71]:
# Rows coming from real transactions have no bestseller rank: look it up by
# (week, article_id) among the previous-week bestsellers. The left join keeps
# every row; rows without a match get NaN, which is filled afterwards.
data = data.merge(
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    how='left',
    on=['week', 'article_id'],
)

In [72]:
# Drop the first week because there is no bestsellers_previous_week for it.
# (Presumably also to make sure no data of an incomplete week is included?)
# NOTE: re-running this cell removes one more week each time.
# .copy() makes the filtered frame independent, so the column assignment below
# acts on `data` itself and not on a view of the unfiltered frame.
data = data[data.week != data.week.min()].copy()
# No bestseller rank means the article was not among the bestsellers: treat it as
# selling very poorly. Assign the result back instead of using the chained
# `data.bestseller_rank.fillna(..., inplace=True)`, which modifies a temporary
# under pandas Copy-on-Write.
data['bestseller_rank'] = data['bestseller_rank'].fillna(bestsellerFiller)  # EDITED

In [73]:
# `data` holds, per customer and week, all transactions and/or candidates.
# Attach to every row all article columns (by article_id) and then all customer
# columns (by customer_id). Left joins keep every row of `data`.
data = (
    data
    .merge(articles, how='left', on='article_id')
    .merge(customers, how='left', on='customer_id')
)

In [74]:
# Sort by week first and by customer within each week, then renumber the rows 0..n-1.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [75]:
# Columns handed to the ranker. The purchase-count features are appended below.
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank','importance']

import itertools
# feature name -> list of article attribute columns the purchase count is grouped by
# article_features = ['product_type_no','graphical_appearance_no','colour_group_code','perceived_colour_value_id','perceived_colour_master_id', 'department_no', 'index_code','index_group_no', 'section_no', 'garment_group_no']
article_features = ['index_group_no','graphical_appearance_no','perceived_colour_value_id','garment_group_no']
new_features = {
    "amount_of_(" + feature_column + ")": [feature_column]
    for feature_column in article_features
}
# for double_features in itertools.combinations(article_features,2):
#     new_features["amount_of_(" + double_features[0] + "_" + double_features[1] + ")"] = [double_features[0],double_features[1]]

# The history the counts are computed on is the same for every feature, so filter
# the full transaction table once instead of once per loop iteration.
transactions_history = transactions_full[transactions_full.week != test_week]

for feature_name,partial_columns in new_features.items():
    time_start = time.time()
    columns_to_use.append(feature_name)
    # Number of purchases per customer and attribute value over the whole history.
    df_with_customer_id_and_features_and_count = get_purchase_count_df_of_attributes(transactions_history,articles,partial_columns,feature_name)
    # Left join: rows whose (customer, attribute value) never occurs in the history get NaN.
    data = pd.merge(data,df_with_customer_id_and_features_and_count,on=(["customer_id"] + partial_columns),how="left")
    # Feature name directly followed by the elapsed seconds.
    print(feature_name +  str(time.time() - time_start))


amount_of_(index_group_no)17.836069345474243
amount_of_(graphical_appearance_no)20.19042658805847
amount_of_(perceived_colour_value_id)20.18234348297119
amount_of_(garment_group_no)22.850852727890015


In [76]:
is_test_week = data.week == test_week
# Do not train on the last week, otherwise there would be no test set.
train = data[~is_test_week]
# Last week: an item can come from both candidate sets, so drop the duplicates.
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)
test.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,importance,bestseller_rank,product_code,...,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,amount_of_(index_group_no),amount_of_(graphical_appearance_no),amount_of_(perceived_colour_value_id),amount_of_(garment_group_no)
11381612,2020-09-03,28847241659200,925246001,0.128797,2,105,0.0,1,999.0,925246,...,1,1,0,1,21,57896,54.0,59.0,36.0,2.0
11381613,2020-07-18,28847241659200,924243001,0.041535,1,105,0.0,1,1.0,924243,...,1,1,0,1,21,57896,54.0,59.0,19.0,9.0
11381614,2020-07-18,28847241659200,924243002,0.041877,1,105,0.0,1,2.0,924243,...,1,1,0,1,21,57896,54.0,59.0,36.0,9.0
11381615,2020-07-18,28847241659200,918522001,0.041435,1,105,0.0,1,3.0,918522,...,1,1,0,1,21,57896,54.0,59.0,10.0,9.0
11381616,2020-07-18,28847241659200,923758001,0.033462,1,105,0.0,1,4.0,923758,...,1,1,0,1,21,57896,54.0,59.0,10.0,11.0


In [77]:
# One group per (week, customer): these are the query groups for the ranker.
basket_groups = train.groupby(['week', 'customer_id'])
print(basket_groups.head())
# Number of rows in each group, in group-key order.
train_baskets = basket_groups['article_id'].count().values
# Group sizes, then smallest size, largest size and number of groups.
for basket_stat in (train_baskets, train_baskets.min(), train_baskets.max(), len(train_baskets)):
    print(basket_stat)

              t_dat           customer_id  article_id     price  \
0        2020-07-26        28847241659200   887770001  0.016932   
1        2020-07-18        28847241659200   762846001  0.025407   
2        2020-07-18        28847241659200   829308001  0.033881   
3        2020-07-26        28847241659200   760084003  0.025094   
4        2020-07-26        28847241659200   866731001  0.024919   
...             ...                   ...         ...       ...   
11381596 2020-09-21  18446737527580148316   547780001  0.023712   
11381597 2020-09-21  18446737527580148316   763988001  0.023712   
11381598 2020-09-21  18446737527580148316   763988003  0.023712   
11381599 2020-09-21  18446737527580148316   547780040  0.023712   
11381600 2020-09-21  18446737527580148316   909370001  0.032947   

          sales_channel_id  week  purchased  importance  bestseller_rank  \
0                        1    96        1.0           1            999.0   
1                        1    96        0.0

In [78]:
%%time

# Feature matrix and label for training; feature matrix for the week to predict.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

CPU times: total: 93.8 ms
Wall time: 416 ms


# Model training

In [79]:
from lightgbm.sklearn import LGBMRanker

In [80]:
# LambdaRank model evaluated with NDCG; the boosting algorithm comes from the config cell.
ranker_params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting_type": LGBMBoostingType,
    # A single boosting round, i.e. one tree.
    # NOTE(review): presumably inherited from the original baseline — confirm it is intended.
    "n_estimators": 1,
    # feature_importances_ then reports gain instead of split counts.
    "importance_type": 'gain',
    "verbose": 10,
}
ranker = LGBMRanker(**ranker_params)

In [81]:
%%time

# Train the ranker; `group` gives the number of consecutive rows that belong to
# each (week, customer) query, so the row order of train_X must match train_baskets.
ranker = ranker.fit(X=train_X, y=train_y, group=train_baskets)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.887558
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.161772
[LightGBM] [Debug] init for col-wise cost 0.141665 seconds, init for row-wise cost 0.493081 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2025
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 23
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
CPU times: total: 19.2 s
Wall time: 7.19 s


In [82]:
# Features from most to least important, each with its share of the total importance.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i] / total_importance)

bestseller_rank 0.9590945642075946
importance 0.03671050672288266
amount_of_(index_group_no) 0.0028664852965649586
amount_of_(garment_group_no) 0.0007547670312425131
amount_of_(graphical_appearance_no) 0.00021948505918151887
article_id 0.00018722873669037863
index_group_no 5.932510318931642e-05
amount_of_(perceived_colour_value_id) 5.675434037113316e-05
department_no 5.088350228289172e-05
index_code 0.0
product_type_no 0.0
graphical_appearance_no 0.0
colour_group_code 0.0
perceived_colour_value_id 0.0
perceived_colour_master_id 0.0
postal_code 0.0
age 0.0
section_no 0.0
garment_group_no 0.0
Active 0.0
club_member_status 0.0
fashion_news_frequency 0.0
FN 0.0


# Calculate predictions

In [83]:
%%time

# Score every candidate of the test week.
test['preds'] = ranker.predict(test_X)

# customer_id -> candidate article ids, best score first.
ranked_test = test.sort_values(['customer_id', 'preds'], ascending=False)
c_id2predicted_article_ids = (
    ranked_test
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Fallback list: the articles of the latest week present in bestsellers_previous_week.
latest_week_mask = bestsellers_previous_week.week == bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week[latest_week_mask]['article_id'].tolist()

CPU times: total: 8.05 s
Wall time: 10.3 s


# Create submission

In [84]:
# Sample submission: its customer_id column lists the customers to predict for,
# and its prediction column is overwritten further down.
sub = pd.read_csv('../data/sample_submission.csv')

In [85]:
%%time
# One list of article ids per row of `sub`, filled by the loop at the end of this cell.
preds = []

def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer ids to integers.

    Only the last 16 characters of each id are used; the conversion of each
    value is delegated to ``hex_id_to_int``.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Interpret the last 16 characters of a hexadecimal string as an integer."""
    last_16_hex_digits = str[-16:]
    return int(last_16_hex_digits, base=16)


# Build the 12 predictions for every customer of the submission file, in file order.
for c_id in customer_hex_id_to_int(sub.customer_id):
    # Ranked candidates of this customer; customers without candidates get an empty list.
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Pad with last week's bestsellers so every customer ends up with predictions.
    pred = pred + bestsellers_last_week
    # The combined list can contain the same article more than once (the ranked
    # list and the fallback can overlap). Drop repeats while keeping the first,
    # best-ranked occurrence so duplicates do not use up any of the 12 slots.
    pred = list(dict.fromkeys(pred))
    preds.append(pred[:12])

CPU times: total: 1.17 s
Wall time: 2.6 s


In [86]:
# Turn every list of article ids into one space-separated string, with a '0'
# prefixed to each id.
# NOTE: `preds` is rebound from lists to strings, so this cell must run exactly
# once after the prediction loop.
preds = [' '.join(f'0{p}' for p in ps) for ps in preds]
# Item assignment always writes the column; `sub.prediction = ...` would only set
# a plain attribute if the column did not already exist.
sub['prediction'] = preds

In [87]:
# File name encodes the configuration of this run.
sub_name = (
    f'basic_model_submission_{LGBMBoostingType}'
    f'_fillna{preprocess}'
    f'bestsellerFiller{bestsellerFiller}'
    f'_weeks{transactionBackXWeeks}'
    f'_importance{prevYear}'
)
# Write the submission twice: gzip-compressed and plain CSV.
for extension in ('.csv.gz', '.csv'):
    sub.to_csv(f'../data/subs/{sub_name}{extension}', index=False)
print("Done")
print(sub_name)

Done
basic_model_submission_dart_fillna-1bestsellerFiller999_weeks10_importance


In [88]:
# !kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f 'data/subs/{sub_name}.csv.gz' -m {sub_name}