This code is based on [https://github.com/radekosmulski/personalized_fashion_recs](https://github.com/radekosmulski/personalized_fashion_recs), with extra options and some improvements.
Comments explaining the original notebook's code were added by me and Arno Troch.

In [3]:
import time

# Boosting algorithm handed to the LightGBM ranker.
LGBMBoostingType = 'dart'
assert LGBMBoostingType in ('gbdt', 'dart', 'goss', 'rf')

# Which preprocessed parquet files to load: '-1' is the preprocessing of the original notebook,
# 'edited' is a slightly different preprocessing. You should probably use '-1'.
preprocess = '-1'
assert preprocess in ('-1', 'edited')

# Value used for bestseller_rank when an article did not appear in a bestseller list
# (the NaN is replaced by this). Normal values are between 1-12.
bestsellerFiller = 999

# Size of training+test sets: this many weeks before the test set are kept.
transactionBackXWeeks = 10

# "SkipYear": additionally train on the same weeks of the previous year. Not recommended.
prevYear = ''
assert prevYear in ('', 'SkipYear')

In [4]:
%run helper_functions.ipynb

In [5]:
import pandas as pd

In [6]:
%%time

# Transaction log for the selected preprocessing variant.
transactions = pd.read_parquet(f'../data/transactions_train_{preprocess}.parquet')
# Backup is made because some features use the full dataset for calculations.
# Copy the frame that was just loaded instead of decoding the same parquet file a second time.
transactions_full = transactions.copy()
# Customer and article tables for the same preprocessing variant.
customers = pd.read_parquet(f'../data/customers_{preprocess}.parquet')
articles = pd.read_parquet(f'../data/articles_{preprocess}.parquet')

CPU times: total: 3.3 s
Wall time: 2.55 s


In [7]:
# Average price of every article within every week (Series indexed by week, article_id).
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()
mean_price.reset_index().head()

Unnamed: 0,week,article_id,price
0,0,108775015,0.008373
1,0,108775044,0.008374
2,0,108775051,0.005023
3,0,110065001,0.024983
4,0,110065002,0.02465


In [8]:
# The week to predict is the one right after the last week present in the data.
last_week = transactions.week.max()
test_week = last_week + 1

# Unless you really want to train on transactionBackXWeeks of this year plus the same weeks of last year,
# only the else branch matters.
if prevYear == 'SkipYear':
    # The most recent transactionBackXWeeks weeks of the dataset.
    transactions3 = transactions[transactions.week > last_week - transactionBackXWeeks]
    # The same window, shifted 52 weeks into the past.
    transactions2 = transactions[
        (transactions.week <= last_week - 52)
        & (transactions.week > last_week - transactionBackXWeeks - 52)
    ]  # EDITED
    print(transactions3['week'].unique())
    print(transactions2['week'].unique())
    # Training data now consists of transactionBackXWeeks of the current year and of last year.
    transactions = pd.concat([transactions3, transactions2])
else:
    # Keep only the most recent transactionBackXWeeks weeks as training data.
    transactions = transactions[transactions.week > last_week - transactionBackXWeeks]


In [9]:

import copy
def get_purchase_rank_df_of_attributes(transactions,articles,attributes_columns_names,feature_name):
    """
    Count, per customer, how often each combination of article attribute values was bought, and rank those combinations per customer.
    For example: if attributes_columns_names contains ["garment_group_name"], then the final dataframe will contain for each customer how often he bought a garment with each value of garment_group_name that he actually bought, and a rank of which ones are his favourites.
    :param transactions: pandas dataframe: Transactions on which to calculate these features (needs columns "customer_id" and "article_id"), can be full transactions dataset even if training data is a subset
    :param articles: pandas dataframe: Articles table, should be full table. Transactions whose article_id is missing from it are ignored (inner merge)
    :param attributes_columns_names: Article column name(s) for which to calculate these features: a list/tuple of strings or a single string. Should not contain "article_id". The argument is not modified
    :param feature_name: string: Name for the new count column; the rank column is called feature_name + "_rank"
    :return: pandas dataframe: one row per customer_id and purchased attribute combination, with columns customer_id, the attribute columns, feature_name (purchase count) and feature_name + "_rank" (dense rank within the customer, 1.0 = bought most often)
    """

    # Accept a single column name as well as any sequence of names, without touching the caller's object
    if isinstance(attributes_columns_names, str):
        attribute_columns = [attributes_columns_names]
    else:
        attribute_columns = list(attributes_columns_names)

    # From articles, select only relevant columns. If we want to calculate what a users favourite colour is, we do not need the garment type.
    articles_selected = articles[["article_id", *attribute_columns]]

    # One row per transaction, holding the customer_id, article_id and the requested article attributes
    purchases = pd.merge(articles_selected,transactions[["customer_id","article_id"]],on=["article_id"])

    # Collapse to one row per customer and attribute combination, counting how many purchases fall into it
    big_df = purchases.groupby(["customer_id", *attribute_columns]).size().reset_index(name=feature_name)

    # Rank the combinations within each customer by that count: if the customer bought mostly blue things, blue gets rank 1,
    # the second favourite gets rank 2, etc. Ties share a rank (dense ranking)
    big_df[feature_name + "_rank"] = big_df.groupby("customer_id")[feature_name].rank(method="dense",ascending=False)
    return big_df


# # I want: t_dat (fillna with something), customer id, article_id, price (merge with mean_price), sales_channel_id (?), week (from sales_nohead), purchased (fillna 0)
# # Sales_nohead: week, article_id, bestseller_rank, everything from articles
# def bestsellers_with_attribute(sales_nohead,unique_transactions,attributes_columns_names,rank_df,rank_column_name):
#     to_return = unique_transactions.copy(deep=True)
#     sales_nohead_filtered = sales_nohead.copy(deep=True)
#     sales_nohead_filtered = sales_nohead_filtered[attributes_columns_names] ==
#     to_return["article_id"] = sales_nohead[(sales_nohead.week == to_return.week) & (sales_nohead.bestseller_rank == sales_nohead[sales_nohead[attributes_columns_names] == ].bestseller_rank.max())]

# sales_nohead = sales_nohead[(week == test_week-1)]

# merge sales_nohead, mean_price on week, article_id -> adds the price; total: week, article_id, price, t_dat, customer_id, sales_channel_id, purchased
# Merge on customer_id, week: unique_transactions, previous result

In [10]:
# Result of this cell: for each week in transactions (training data, not the full dataset), every article
# that was sold, ranked by how well it sold, together with its average price in that week and all columns
# of the articles table.

# Per week: count sales per article and turn the counts into a dense rank (1 = best seller).
# The very large head() limit keeps every article of every week.
sales_nohead = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(9999999999999)
    .rename('bestseller_rank')
    .astype('int64')
    .reset_index()
)

# Add article columns, e.g. garment_type_name
sales_nohead = sales_nohead.merge(articles, how="left", on=["article_id"])
# Add average price of the product in that week
sales_nohead = sales_nohead.merge(mean_price, how="left", on=["week", "article_id"])
sales_nohead.head(100)

Unnamed: 0,week,article_id,bestseller_rank,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,price
0,95,760084003,1,760084,1134,272,0,1,1010016,0,...,1,1,2,2,53,1,1009,5,847,0.025094
1,95,866731001,2,866731,3609,273,15,1,1010016,0,...,9,9,26,4,5,21,1005,0,3130,0.024919
2,95,600886001,3,600886,1424,59,20,6,1010016,0,...,7,7,1,0,60,22,1018,12,420,0.022980
3,95,706016001,4,706016,172,272,0,1,1010016,0,...,1,1,2,2,53,1,1009,5,30,0.033197
4,95,372860002,5,372860,19652,302,14,7,1010016,0,...,7,7,1,0,62,31,1021,13,157,0.013193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,878013001,75,878013,1011,265,1,2,1010001,1,...,0,0,1,0,15,0,1013,8,3544,0.049460
96,95,720125040,76,720125,99,273,15,1,1010005,8,...,9,9,26,4,5,21,1005,0,313,0.023239
97,95,610776071,77,610776,46,255,3,0,1010001,1,...,0,0,1,0,16,30,1002,2,60,0.008110
98,95,852174003,77,852174,3280,306,13,4,1010016,0,...,9,9,26,4,5,21,1005,0,3945,0.024849


In [None]:
# def get_top_garment(sales_nohead,feature_name,feature_value):
#     for week in sales_nohead["week"].unique():
#         filtered_on_article_value = sales_nohead[(sales_nohead[feature_name] == feature_value) & (sales_nohead["week"] == week)]
#         filtered_on_article_value.iloc[0]
# get_top_garment(1,sales_nohead,"product_group_name",1)

In [12]:
# Example application of get_purchase_rank_df_of_attributes.
# Assuming the output is deterministic, you can see that customer 28847241659200 bought 2 articles from garment_group_no 1010 (as seen in amount_of_garment_group_no),
# making it their favourite garment_group_no (as seen in column amount_of_garment_group_no_rank).
temp2 = get_purchase_rank_df_of_attributes(transactions,articles,["garment_group_no"],"amount_of_garment_group_no")
temp2.head()

Unnamed: 0,customer_id,garment_group_no,amount_of_garment_group_no,amount_of_garment_group_no_rank
0,28847241659200,1005,1,2.0
1,28847241659200,1007,1,2.0
2,28847241659200,1009,1,2.0
3,28847241659200,1010,2,1.0
4,41318098387474,1013,1,1.0


In [13]:
# Preview the transactions that remain after restricting to the training window.
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95


# Generating candidates

### Last purchase candidates

In [None]:

# For every customer: the distinct weeks in which they bought something.
c2weeks = transactions.groupby('customer_id')['week'].unique()

# customer_id -> {purchase week -> the next week in that customer's list};
# the last week in the list maps to test_week.
c2weeks2shifted_weeks = {}
for c_id, purchase_weeks in c2weeks.items():
    next_week_of = dict(zip(purchase_weeks[:-1], purchase_weeks[1:]))
    next_week_of[purchase_weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = next_week_of

candidates_last_purchase = transactions.copy()

# Candidate for week X: item bought in previous purchase week
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]
candidates_last_purchase.week = weeks

In [None]:
# Show the last-purchase candidates (transactions with their week moved to the customer's next purchase week).
print(candidates_last_purchase)

### Bestsellers candidates

In [None]:
# Preview the last-purchase candidates for comparison with the raw transactions.
candidates_last_purchase.head()

In [None]:
# Preview the training-window transactions the bestseller candidates are derived from.
transactions.head()

In [None]:

# Original note: bestseller_rank itself does nothing here; the ranking is needed to find the bestsellers,
# but the column itself may be dropped.
# For each week, the ranked list of the 12 bestsellers.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)
sales.head()

In [None]:

# For every week the suggestion is: buy what sold best in the previous week.
# The weekly bestsellers (with their mean price) are therefore relabelled with the following week.
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week['week'] = bestsellers_previous_week['week'] + 1

# Per week, the list of customers who bought SOMETHING: the first transaction row of every
# (week, customer) pair, without the article-specific columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)
unique_transactions.head()

In [None]:

# unique_transactions: per week, the customers who bought SOMETHING.
# bestsellers_previous_week: per week, the best-selling articles of the week before.
# Merged on week: every customer who bought something in a week gets the bestsellers of the
# (overall) previous week as candidates for that week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

# For every customer we know something about (and so want to predict for), keep the customer id once
# and set the week to test_week, because that is the week we want to predict purchases for.
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions['week'] = test_week

# Result: every customer we can predict for gets the bestsellers of test_week - 1 as candidates for test_week.
candidates_bestsellers_test_week = test_set_transactions.merge(bestsellers_previous_week, on='week')

# Training-week candidates and test-week candidates together.
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
# Original note: the ranking was needed to find the bestsellers, but the column itself may be dropped here.
candidates_bestsellers = candidates_bestsellers.drop(columns='bestseller_rank')

# Combining transactions and candidates / negative examples

# Real transactions are the positive examples.
# candidates_last_purchase was copied from transactions in an earlier cell, before this column was added,
# so (assuming the cells ran in order) the candidate rows have no 'purchased' value yet.
transactions['purchased'] = 1


# candidates_last_purchase: candidate for week X: item bought in the previous purchase week
# candidates_bestsellers: per customer and week, the bestsellers of the previous week (including test_week)
# transactions: literally just the transactions
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Rows that came from a candidate set are negatives. Assign the filled column back instead of calling
# fillna(..., inplace=True) on data.purchased: that chained in-place call does not update `data`
# when pandas Copy-on-Write is active.
data['purchased'] = data['purchased'].fillna(0)

In [None]:
# Preview the combined positives and candidates.
data.head()

In [None]:

# For each week: look at every time a customer bought an article OR had it proposed as a candidate, and, if it was bought, keep only the row with purchased 1
# Note: candidates for week 105 all have purchased==0
# "importance" = how many rows (purchases + candidates) exist for each (customer, article, week) combination
brak = data.groupby(['customer_id', 'article_id', 'week']).size().reset_index(name="importance")
print(brak)
# Keeps one row per combination. With the default keep-first behaviour the real transaction wins,
# because transactions were placed first in the concat that built `data`.
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

# Attach the importance count to the de-duplicated rows
data = pd.merge(
    data,
    brak,
    on=['customer_id', 'article_id', 'week']
)

# Not the last expression of the cell, so this value is computed but not displayed
data.purchased.mean()
# Sanity checks on the importance column
print(data["importance"].isna().sum())
print(data["importance"].max())
print(data["importance"].mean())
print(data["importance"].min())

data.head()

In [None]:
# Preview the weekly top-12 bestseller ranks.
sales.head()

In [None]:
# Preview the bestsellers after their week was shifted forward by one.
bestsellers_previous_week.head()

### Add bestseller information

In [None]:
# For real transactions the bestseller rank is unknown: look it up in the bestseller candidates of the
# same week. Rows without a match stay NaN and are filled later with fillna.
data = data.merge(
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    how='left',
    on=['week', 'article_id'],
)

In [None]:
# Preview the data after bestseller_rank was merged in.
data.head()

In [None]:
# Remove the first week because there is no bestsellers_previous_week for the first week
data = data[data.week != data.week.min()]  # Presumably to make sure no data of an incomplete week is included?
# Not a bestseller: treated as having sold very poorly.
# fillna with a column mapping returns a new frame; the previous data.bestseller_rank.fillna(..., inplace=True)
# was a chained in-place call on a filtered frame, which does not update `data` under pandas Copy-on-Write.
data = data.fillna({'bestseller_rank': bestsellerFiller})  # EDITED

In [None]:
# data: per customer, per week, all transactions and/or candidates.

# Attach all information about the article to every row.
data = data.merge(articles, how='left', on='article_id')
# Attach all information about the customer to every row.
data = data.merge(customers, how='left', on='customer_id')

In [None]:
# Sort by week first, then by customer within each week, and renumber the rows.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [None]:
# Columns handed to the model; the per-customer preference ranks are appended below.
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank','importance']

import itertools

# Article attributes for which customer preference features are built.
# Larger alternative that was tried:
# article_features = ['product_type_no','graphical_appearance_no','colour_group_code','perceived_colour_value_id','perceived_colour_master_id', 'department_no', 'index_code','index_group_no', 'section_no', 'garment_group_no']
article_features = ['index_group_no','graphical_appearance_no','perceived_colour_value_id','garment_group_no']

# feature name -> attribute columns it is computed over: every single attribute, then every pair of attributes
new_features = {"amount_of_(" + column + ")": [column] for column in article_features}
for first, second in itertools.combinations(article_features, 2):
    new_features["amount_of_(" + first + "_" + second + ")"] = [first, second]

# The purchase history is the same for every feature, so filter the (large) full transaction table once
# instead of on every loop iteration. The features use the full history rather than the training window.
# NOTE(review): test_week was derived as max week + 1 of the same file, so this filter presumably removes nothing.
purchase_history = transactions_full[transactions_full.week != test_week]

for feature_name,partial_columns in new_features.items():
    time_start = time.time()
    # Only the rank is used as a model input; the raw count column is merged in but not added to columns_to_use
    columns_to_use.append(feature_name+"_rank")
    customer_attribute_ranks = get_purchase_rank_df_of_attributes(purchase_history,articles,partial_columns,feature_name)
    # Rows whose (customer, attribute values) combination was never bought get NaN for both new columns
    data = pd.merge(data,customer_attribute_ranks,on=(["customer_id"] + partial_columns),how="left")
    print(feature_name +  str(time.time() - time_start))


In [None]:
is_test_week = data.week == test_week
# Do not train on the last week, otherwise there is no test set left.
train = data[~is_test_week]
# Last week: if an item is in both candidate sets, drop the duplicates.
test = data[is_test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
test.head()

In [None]:
# Debug output: the first rows of every (week, customer) group.
print(train.groupby(['week', 'customer_id']).head())
# Number of rows (purchases + candidates) per (week, customer) group; passed to the ranker as its group sizes.
# NOTE(review): presumably relies on train still being sorted by week, then customer_id, so that the sizes
# line up with the row order — confirm.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
# Debug output: the group sizes, their extremes and the number of groups.
print(train_baskets)
print(train_baskets.min())
print(train_baskets.max())
print(len(train_baskets))

In [None]:
%%time

# Model inputs and labels for training.
train_X = train[columns_to_use]
train_y = train['purchased']

# Model inputs for the week to predict.
test_X = test[columns_to_use]

# Model training

In [None]:
from lightgbm.sklearn import LGBMRanker

In [None]:
# LightGBM ranking model; the boosting variant comes from the LGBMBoostingType setting.
# NOTE(review): n_estimators=1 trains a single boosting round — confirm this is intended and not a leftover.
ranker=LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type=LGBMBoostingType,
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [None]:
%%time

# Fit the ranker; train_baskets supplies the size of each (week, customer) query group.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

In [None]:
# Print every feature with its share of the total importance, most important first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i] / total_importance)

# Calculate predictions

In [None]:
%%time

# Score every candidate of the test week.
test['preds'] = ranker.predict(test_X)

# customer_id -> that customer's candidate article ids, highest score first.
c_id2predicted_article_ids = (
    test
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Bestsellers attached to the latest week of bestsellers_previous_week; used to pad predictions.
is_latest_week = bestsellers_previous_week.week == bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week[is_latest_week]['article_id'].tolist()

# Create submission

In [None]:
# Load the sample submission; its customer_id column determines the order of the predictions built below.
sub = pd.read_csv('../data/sample_submission.csv')

In [None]:
%%time
preds = []

def customer_hex_id_to_int(series):
    """
    Convert a Series of hexadecimal customer id strings to integers, using only the last 16 characters of each id.
    :param series: pandas Series of hexadecimal strings
    :return: pandas Series with hex_id_to_int applied to the last 16 characters of every element
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """
    Interpret the last 16 characters of a hexadecimal string as an integer.
    :param str: hexadecimal string; if it is shorter than 16 characters the whole string is used
    :return: int value of those characters in base 16
    """
    hex_tail = str[-16:]
    return int(hex_tail, base=16)


# For every customer of the sample submission: the model's ranked articles (if any), padded with
# last week's bestsellers and cut to 12 items.
preds.extend(
    (c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week)[:12]
    for c_id in customer_hex_id_to_int(sub.customer_id)
)

In [None]:
# Each prediction becomes one space-separated string of article ids, every id prefixed with a '0'.
preds = [' '.join(f'0{article_id}' for article_id in ps) for ps in preds]
sub.prediction = preds

In [None]:
# The file name encodes the settings this submission was produced with.
sub_name = (
    f'basic_model_submission_{LGBMBoostingType}'
    f'_fillna{preprocess}'
    f'bestsellerFiller{bestsellerFiller}'
    f'_weeks{transactionBackXWeeks}'
    f'_importance{prevYear}'
)
# Write the submission both gzip-compressed and as plain CSV.
sub.to_csv(f'../data/subs/{sub_name}.csv.gz', index=False)
sub.to_csv(f'../data/subs/{sub_name}.csv', index=False)
print("Done")
print(sub_name)

In [None]:
# !kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f 'data/subs/{sub_name}.csv.gz' -m {sub_name}