This code is from [https://github.com/radekosmulski/personalized_fashion_recs](https://github.com/radekosmulski/personalized_fashion_recs) with extra options and some improvements.
Comments explaining the original notebooks code were added by me and Arno Troch

In [313]:
import time

LGBMBoostingType = 'dart'
preprocess = '-1'  # '-1' uses preprocessing from original notebook, 'edited' uses slightly different preprocessing. You should probably use '-1'
bestsellerFiller = 999  # If a negative sample did not appear in a bestseller list, this is what the NaN is filled with. Normal values are between 1-12. If None, will use actual bestseller rank even beyond 12
transactionBackXWeeks = 10  # Size of training+test sets: this many weeks before test set
featuresBackXWeeks = 52  # How much data to use to calculate features based on user history. If set to more weeks than available in dataset, uses entire dataset
prevYear = ''  # if "SkipYear": uses training data as explained in transactionBackXWeeks + the same weeks of the previous year. Not recommended.
assert LGBMBoostingType in ['gbdt','dart','goss','rf']
assert preprocess in ['-1','edited','colours']
assert prevYear in ["","SkipYear"]

In [314]:
%run helper_functions.ipynb

In [315]:
import pandas as pd

In [316]:


transactions = pd.read_parquet(f'../data/transactions_train_{preprocess}.parquet')
transactions=transactions.drop(columns="t_dat")
# Backup is made because some features use the full dataset for calculations
transactions_full = pd.read_parquet(f'../data/transactions_train_{preprocess}.parquet')
transactions_full=transactions_full.drop(columns="t_dat")
# TODO temporary please remove next line
transactions_full = transactions_full[transactions_full.week > transactions_full.week.max() - transactionBackXWeeks]
customers = pd.read_parquet(f'../data/customers_{preprocess}.parquet')
articles = pd.read_parquet(f'../data/articles_{preprocess}.parquet')

In [317]:
# mean price PER ITEM PER WEEK
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
mean_price.reset_index().head()

Unnamed: 0,week,article_id,price
0,0,108775015,0.008373
1,0,108775044,0.008374
2,0,108775051,0.005023
3,0,110065001,0.024983
4,0,110065002,0.02465


In [318]:
test_week = transactions.week.max() + 1
# Unless you really want to test training on transactionBackXWeeks and transactionBackXWeeks of last year, just read the else
if prevYear == 'SkipYear':
    # Starting from final week in dataset, select past transactionBackXWeeks weeks
    transactions3 = transactions[transactions.week > transactions.week.max() - transactionBackXWeeks]
    # Starting from final week in dataset but a year earlier, select past transactionBackXWeeks weeks
    transactions2 = transactions[(transactions.week.max()-52>=transactions.week) & (transactions.week >transactions.week.max() - transactionBackXWeeks-52)] # EDITED
    print(transactions3['week'].unique())
    print(transactions2['week'].unique())
    # training data now consists of transactionBackXWeeks of current year and last year
    transactions = pd.concat([transactions3,transactions2])
else:
    # Starting from final week in dataset, select past transactionBackXWeeks weeks as training data
    transactions = transactions[transactions.week > transactions.week.max() - transactionBackXWeeks]

min_week = transactions["week"].min()

In [319]:

import copy
def get_purchase_rank_df_of_attributes(transactions,articles,attributes_columns_names,feature_name):
    """
    Given customer ids and arbitrary article features (except article id), returns a df with rows containing each combination of customer_id and combination of
    For example: if attributes_columns_names contains ["garment_group_name"], then the final dataframe will contain for each customer how often he bought a garment with each possible value in garment_group_name, and a rank of which ones are his favourites.
    :param transactions: pandas dataframe: Transactions on which to calculate these features, can be full transactions dataset even if training data is a subset
    :param articles: pandas dataframe: Articles table, should be full table
    :param attributes_columns_names: List of strings: Article Column names for which to calculate these features. Should not contain "article_id"
    :param feature_name: string: Name for the new feature
    :return: pandas dataframe: columns customer_id, attributes_columns_names, feature_name, str(feature_name)+"_rank"
    """

    # To make merges later on easier, this variable contains the article columns asked form in the function argument plus article_id
    attributes_columns_names_plus_article_id = copy.deepcopy(attributes_columns_names)
    attributes_columns_names_plus_article_id.insert(0,"article_id")

    # To make merges later on easier, this variable contains the article columns asked form in the function argument plus customer_id
    attributes_columns_names_plus_customer_id = copy.deepcopy(attributes_columns_names)
    attributes_columns_names_plus_customer_id.insert(0,"customer_id")

    # From articles, select only relevant columns. If we want to calculate what a users favourite colour is, we do not need the garment type.
    articles_selected = articles[attributes_columns_names_plus_article_id]

    # This merge results in a dataframe containing for each transaction from the function argument the customer_id, article_id and article features as given in the attributes_columns_names argument
    big_df = pd.merge(articles_selected,transactions[["customer_id","article_id"]],on=["article_id"])

    # Adds a column containing for each transaction how often the customer has already bought clothing with the same attributes_columns_names as the article_id from the transaction
    big_df = big_df.groupby(attributes_columns_names_plus_customer_id).size().reset_index(name=feature_name)

    # Adds a column containing for each transaction the rank that the user gives to clothing with the same attributes_columns_names as the article_id in the transaction.
    # In this case, rank means that if the article id is blue and the user bought lots of blue things, it will be one. If the article is red and red is the users second favourite, it will be 2 etc
    big_df[feature_name + "_rank"] =  big_df.groupby("customer_id")[feature_name].rank(method="dense",ascending=False)
    return big_df

def get_purchase_count_df_of_attributes(transactions,articles,attributes_columns_names,feature_name):
    """
    Given customer ids and arbitrary article features (except article id), returns a df with rows containing each combination of customer_id and combination of
    For example: if attributes_columns_names contains ["garment_group_name"], then the final dataframe will contain for each customer how often he bought a garment with each possible value in garment_group_name, and a rank of which ones are his favourites.
    :param transactions: pandas dataframe: Transactions on which to calculate these features, can be full transactions dataset even if training data is a subset
    :param articles: pandas dataframe: Articles table, should be full table
    :param attributes_columns_names: List of strings: Article Column names for which to calculate these features. Should not contain "article_id"
    :param feature_name: string: Name for the new feature
    :return: pandas dataframe: columns customer_id, attributes_columns_names, feature_name, str(feature_name)+"_rank"
    """

    # To make merges later on easier, this variable contains the article columns asked form in the function argument plus article_id
    attributes_columns_names_plus_article_id = copy.deepcopy(attributes_columns_names)
    attributes_columns_names_plus_article_id.insert(0,"article_id")

    # To make merges later on easier, this variable contains the article columns asked form in the function argument plus customer_id
    attributes_columns_names_plus_customer_id = copy.deepcopy(attributes_columns_names)
    attributes_columns_names_plus_customer_id.insert(0,"customer_id")

    # From articles, select only relevant columns. If we want to calculate what a users favourite colour is, we do not need the garment type.
    articles_selected = articles[attributes_columns_names_plus_article_id]

    # This merge results in a dataframe containing for each transaction from the function argument the customer_id, article_id and article features as given in the attributes_columns_names argument
    big_df = pd.merge(articles_selected,transactions[["customer_id","article_id"]],on=["article_id"])

    # Adds a column containing for each transaction how often the customer has already bought clothing with the same attributes_columns_names as the article_id from the transaction
    big_df = big_df.groupby(attributes_columns_names_plus_customer_id).size().reset_index(name=feature_name)

    # Adds a column containing for each transaction the rank that the user gives to clothing with the same attributes_columns_names as the article_id in the transaction.
    # In this case, rank means that if the article id is blue and the user bought lots of blue things, it will be one. If the article is red and red is the users second favourite, it will be 2 etc
    # big_df[feature_name + "_rank"] =  big_df.groupby("customer_id")[feature_name].rank(method="dense",ascending=False)
    return big_df


In [320]:
# Final result of this cell contains for each week in transactions (training data, not full dataset) all articles that were sold, ranked by which ones sold best, their average price in that week, and all data normally included in the articles table.

# Ranks for each week which items sold best
sales_nohead = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(9999999999999).rename('bestseller_rank').astype('int64').reset_index()

# Add article columns, e.g. garment_type_name
sales_nohead = pd.merge(sales_nohead,articles,how="left",on=["article_id"])
# Add average price of product in week
sales_nohead = pd.merge(sales_nohead,mean_price,how="left",on=["week","article_id"])
sales_nohead.head(100)

Unnamed: 0,week,article_id,bestseller_rank,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,price
0,95,760084003,1,760084,1134,272,0,1,1010016,0,...,1,1,2,2,53,1,1009,5,847,0.025094
1,95,866731001,2,866731,3609,273,15,1,1010016,0,...,9,9,26,4,5,21,1005,0,3130,0.024919
2,95,600886001,3,600886,1424,59,20,6,1010016,0,...,7,7,1,0,60,22,1018,12,420,0.022980
3,95,706016001,4,706016,172,272,0,1,1010016,0,...,1,1,2,2,53,1,1009,5,30,0.033197
4,95,372860002,5,372860,19652,302,14,7,1010016,0,...,7,7,1,0,62,31,1021,13,157,0.013193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,878013001,75,878013,1011,265,1,2,1010001,1,...,0,0,1,0,15,0,1013,8,3544,0.049460
96,95,720125040,76,720125,99,273,15,1,1010005,8,...,9,9,26,4,5,21,1005,0,313,0.023239
97,95,610776071,77,610776,46,255,3,0,1010001,1,...,0,0,1,0,16,30,1002,2,60,0.008110
98,95,852174003,77,852174,3280,306,13,4,1010016,0,...,9,9,26,4,5,21,1005,0,3945,0.024849


In [321]:
# Columns to use for training
# Useful because including garment_type_name and garment_type_no would be redundant
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank','importance']

import itertools
# For features generated by get_purchase_rank_df_of_attributes
# Key: feature name
# Value: list of strings: columns of article_ids the feature is based on
new_features = dict()
# Generate new features based on these article_id columns
article_features = ['index_group_no','graphical_appearance_no','perceived_colour_value_id','garment_group_no']
# For each column listed in article_features: say that we want to make a new feature out of it later on
for feature_column in article_features:
    new_features["amount_of_(" + feature_column + ")"] = [feature_column]
# For each combination of 2 columns listed in article_features: say that we want to make a new feature out of it later on
for double_features in itertools.combinations(article_features,2):
    new_features["amount_of_(" + double_features[0] + "_" + double_features[1] + ")"] = [double_features[0],double_features[1]]

all_new_features = []

# For everything I said I would make a new feature of:
for feature_name,partial_columns in new_features.items():
    tempname = str(feature_name)+"_temp"
    time_start = time.time()
    # Tell ranker to use new features
    # Can be commented out to only use either count/rank
    columns_to_use.append(feature_name)
    columns_to_use.append(feature_name+"_rank")

    current_week = min_week+1
    feature_all_weeks = pd.DataFrame()
    while current_week < test_week:
        # Features are calculated on purchase history: taki into account not to incorporate future data
        if feature_all_weeks.empty:
            # See definition of get_purchase_rank_df_of_attributes for comments
            # Get purchase count of articles with certain attributes in past X weeks
            df_with_customer_id_and_features_and_count = get_purchase_count_df_of_attributes(transactions_full[(transactions_full.week < current_week) & (transactions_full.week > current_week-featuresBackXWeeks)],articles,partial_columns,feature_name)
            df_with_customer_id_and_features_and_count[feature_name] = df_with_customer_id_and_features_and_count[feature_name].fillna(0)  # If user did not buy anything yet, set purchase count to 0
            df_with_customer_id_and_features_and_count["week"] = current_week
        else:
            # Objective of this section: calculate purchase counts for one week, and add those of last week to prevent calculating the entirety of get_purchase_count_df_of_attributes for the (near) full dataset every week
            # Get purchase counts for this week
            df_with_customer_id_and_features_and_count = get_purchase_count_df_of_attributes(transactions_full[(transactions_full.week == current_week)],articles,partial_columns,feature_name)
            df_with_customer_id_and_features_and_count[feature_name] = df_with_customer_id_and_features_and_count[feature_name].fillna(0)  # If user did not buy anything this week, set purchase count to 0
            # get purchase counts of last week
            previous_week_purchase_counts = feature_all_weeks[(feature_all_weeks.week == (current_week - 1))]
            # Rename purchase counts of this week
            df_with_customer_id_and_features_and_count = df_with_customer_id_and_features_and_count.rename(columns={feature_name:tempname})
            # For each customer and selected article feature, get purchase count of this one week and purchase counts up to and including previous week
            df_with_customer_id_and_features_and_count = pd.merge(df_with_customer_id_and_features_and_count[["customer_id",tempname]+partial_columns],previous_week_purchase_counts[["customer_id",feature_name]+partial_columns],how="outer",on=[["customer_id"]+partial_columns][0])
            # If customer had not purchased items with certain feature in either this week or before this week, set purchase count to 0
            df_with_customer_id_and_features_and_count.fillna(0)
            # Add purchase counts of this week and before this week to get purchase counts up to and including this week
            df_with_customer_id_and_features_and_count[feature_name] = df_with_customer_id_and_features_and_count[feature_name] + df_with_customer_id_and_features_and_count[tempname]
            # Remove temporary column used for calculation above
            df_with_customer_id_and_features_and_count.drop(columns=[tempname],inplace=True)
            # All purchase counts are up to and including this week
            df_with_customer_id_and_features_and_count["week"] = current_week

        # Store all weeks in one dataframe
        if feature_all_weeks.empty:
            feature_all_weeks = df_with_customer_id_and_features_and_count.copy()
        else:
            feature_all_weeks = pd.concat([feature_all_weeks,df_with_customer_id_and_features_and_count])
        current_week += 1

    # Include ranking of feature: if blue was the users most bought garment color, each transaction where the customer buys blue things will be 1
    feature_all_weeks[feature_name+"_rank"] = feature_all_weeks.groupby(["customer_id","week"])[feature_name].rank(method="dense",ascending=False)

    # Keep list of all new feature dataframes + column names to merge them later
    all_new_features.append([feature_all_weeks,partial_columns])

    # Print time it took to generate feature
    print(feature_name +  str(time.time() - time_start))

amount_of_(index_group_no)2.4210541248321533
amount_of_(graphical_appearance_no)2.817498207092285
amount_of_(perceived_colour_value_id)3.0411486625671387
amount_of_(garment_group_no)3.244259834289551
amount_of_(index_group_no_graphical_appearance_no)3.4993338584899902
amount_of_(index_group_no_perceived_colour_value_id)3.7298989295959473
amount_of_(index_group_no_garment_group_no)3.738874673843384
amount_of_(graphical_appearance_no_perceived_colour_value_id)4.020076513290405
amount_of_(graphical_appearance_no_garment_group_no)4.051626920700073
amount_of_(perceived_colour_value_id_garment_group_no)4.507635831832886


In [356]:
all_new_features[0][0].head()

Unnamed: 0,customer_id,index_group_no,amount_of_(index_group_no),week,amount_of_(index_group_no)_rank
0,28847241659200,1,1.0,96,1.0
1,28847241659200,26,1.0,96,1.0
2,200292573348128,1,8.0,96,1.0
3,200292573348128,3,1.0,96,2.0
4,272412481300040,1,2.0,96,2.0


In [322]:
# Example application of get_purchase_rank_df_of_attributes
# Assuming output is deterministic, you can see that customer 28847241659200 bought 2 articles from garment_group_no 1010 (as seen in amount_of_garment_group_no),
# making it his favourite garment_group_no (as seen inn column amount_of_garment_group_no_rank)
temp2 = get_purchase_count_df_of_attributes(transactions,articles,["garment_group_no"],"amount_of_garment_group_no")
temp2.head()

Unnamed: 0,customer_id,garment_group_no,amount_of_garment_group_no
0,28847241659200,1005,1
1,28847241659200,1007,1
2,28847241659200,1009,1
3,28847241659200,1010,2
4,41318098387474,1013,1


In [323]:
# Example application of get_purchase_rank_df_of_attributes
# Assuming output is deterministic, you can see that customer 28847241659200 bought 2 articles from garment_group_no 1010 (as seen in amount_of_garment_group_no),
# making it his favourite garment_group_no (as seen inn column amount_of_garment_group_no_rank)
temp2 = get_purchase_rank_df_of_attributes(transactions,articles,["garment_group_no"],"amount_of_garment_group_no")
temp2.head()

Unnamed: 0,customer_id,garment_group_no,amount_of_garment_group_no,amount_of_garment_group_no_rank
0,28847241659200,1005,1,2.0
1,28847241659200,1007,1,2.0
2,28847241659200,1009,1,2.0
3,28847241659200,1010,2,1.0
4,41318098387474,1013,1,1.0


In [324]:
transactions.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week
29030503,272412481300040,778064028,0.008458,1,95
29030504,272412481300040,816592008,0.016932,1,95
29030505,272412481300040,621381021,0.033881,1,95
29030506,272412481300040,817477003,0.025407,1,95
29030507,272412481300040,899088002,0.025407,1,95


# Generating candidates

### Last purchase candidates

In [325]:
# Final result of cell:
# Candidate for week X: item bought in previous purchase week

c2weeks = transactions.groupby('customer_id')['week'].unique()

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

candidates_last_purchase = transactions.copy()

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])

# Candidate for week X: item bought in previous purchase week
candidates_last_purchase.week=weeks

In [326]:
print(candidates_last_purchase)

                   customer_id  article_id     price  sales_channel_id  week
29030503       272412481300040   778064028  0.008458                 1    96
29030504       272412481300040   816592008  0.016932                 1    96
29030505       272412481300040   621381021  0.033881                 1    96
29030506       272412481300040   817477003  0.025407                 1    96
29030507       272412481300040   899088002  0.025407                 1    96
...                        ...         ...       ...               ...   ...
31774722  18439937050817258297   891591003  0.084729                 2   105
31774723  18439937050817258297   869706005  0.084729                 2   105
31779097  18440902715633436014   918894002  0.016932                 1   105
31779098  18440902715633436014   761269001  0.016932                 1   105
31780475  18443633011701112574   914868002  0.033881                 1   105

[2762872 rows x 5 columns]


### Bestsellers candidates

In [327]:
candidates_last_purchase.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week
29030503,272412481300040,778064028,0.008458,1,96
29030504,272412481300040,816592008,0.016932,1,96
29030505,272412481300040,621381021,0.033881,1,96
29030506,272412481300040,817477003,0.025407,1,96
29030507,272412481300040,899088002,0.025407,1,96


In [328]:
transactions.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week
29030503,272412481300040,778064028,0.008458,1,95
29030504,272412481300040,816592008,0.016932,1,95
29030505,272412481300040,621381021,0.033881,1,95
29030506,272412481300040,817477003,0.025407,1,95
29030507,272412481300040,899088002,0.025407,1,95


In [329]:

# bestseller rank doet niets: ranking is belangrijk om de bestsellers te vinden, maar de kolom zelf mag weg
# For each week, list of ranked 12 bestsellers
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
sales.head()

week  article_id
95    760084003     1
      866731001     2
      600886001     3
      706016001     4
      372860002     5
Name: bestseller_rank, dtype: int8

In [330]:

# Voor elke week, zegt ge koop het best verkochte item in de vorige week
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
# Per week lijst van customers die IETS gekocht hebben
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()
unique_transactions.head()

Unnamed: 0,customer_id,sales_channel_id,week
29030503,272412481300040,1,95
29064059,1456826891333599,1,95
29067103,2133687643102426,2,95
29027487,6010692573790711,1,95
29046403,6171059100114610,2,95


In [331]:

# Per week lijst van customers die IETS gekocht hebben
# MERGE
# Voor elke week, zegt ge koop het best verkochte item in de vorige week

# Per week, per customer die iets gekocht heeft, de 12 bestverkochte uit DE (algemeen, niet per customer) vorige week
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

# unique_transactions = Per week lijst van customers die IETS gekocht hebben
# Voor elke customer waar we iets over weten en dus een voorspelling van willen doen, houden we 1 keer de customer id over en zetten we de week op test_week, want dat is wanneer we willen voorspellen wat hij koopt
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week


# Voor elke customer waar we iets over weten en dus een voorspelling van willen doen, houden we 1 keer de customer id over en zetten we de week op test_week, want dat is wanneer we willen voorspellen wat hij koopt
# MERGE
# Voor elke week, zegt ge koop het best verkochte item in de vorige week

# Resultaat: voor elke customer waarvoor we iets kunnen voorspellen, geven we de 12 bestseller van testweek-1 als candidate voor testweek
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

# Per week, per customer die iets gekocht heeft, de 12 bestverkochte uit DE (algemeen) vorige week
# Resultaat: voor elke customer waarvoor we iets kunnen voorspellen, geven we de 12 bestseller van testweek-1 als candidate voor testweek
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combining transactions and candidates / negative examples
transactions['purchased'] = 1


# candidates_last_purchase: Candidate for week X: item bought in previous purchase week, negative samples
# candidates_bestsellers: voor elke customer waarvoor we iets kunnen voorspellen, geven we de 12 bestseller van testweek-1 als candidate voor testweek, negative samples
# transactions: letterlijk gewoon transactions, positive samples
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# For real transactions, purchased was 1 (positive sample). This sets the value to 0 for negative samples
data.purchased.fillna(0, inplace=True)

In [332]:
data.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,272412481300040,778064028,0.008458,1,95,1.0
29030504,272412481300040,816592008,0.016932,1,95,1.0
29030505,272412481300040,621381021,0.033881,1,95,1.0
29030506,272412481300040,817477003,0.025407,1,95,1.0
29030507,272412481300040,899088002,0.025407,1,95,1.0


In [333]:

# Voor elke week: kijk alle keren dat customer het artikel koopt OF voorgesteld krijgt (kolom importance), en hou indien gekocht enkel de rij met purchased 1
# Opmerking: candidates voor week 105 zijn allemaal purchased==0
brak = data.groupby(['customer_id', 'article_id', 'week']).size().reset_index(name="importance")
print(brak)
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

data = pd.merge(
    data,
    brak,
    on=['customer_id', 'article_id', 'week']
)

data.purchased.mean()
print(data["importance"].isna().sum())
print(data["importance"].max())
print(data["importance"].mean())
print(data["importance"].min())

data.head()

                   customer_id  article_id  week  importance
0               28847241659200   372860002    96           1
1               28847241659200   448509014   105           1
2               28847241659200   547780003    96           1
3               28847241659200   600886001    96           1
4               28847241659200   610776002    96           1
...                        ...         ...   ...         ...
18253744  18446737527580148316   923758001   104           1
18253745  18446737527580148316   923758001   105           1
18253746  18446737527580148316   924243001   104           1
18253747  18446737527580148316   924243001   105           1
18253748  18446737527580148316   924243002   105           1

[18253749 rows x 4 columns]
0
74
1.0362430205433415
1


Unnamed: 0,customer_id,article_id,price,sales_channel_id,week,purchased,importance
0,272412481300040,778064028,0.008458,1,95,1.0,1
1,272412481300040,816592008,0.016932,1,95,1.0,1
2,272412481300040,621381021,0.033881,1,95,1.0,1
3,272412481300040,817477003,0.025407,1,95,1.0,1
4,272412481300040,899088002,0.025407,1,95,1.0,1


In [334]:
sales.head()

week  article_id
95    760084003     1
      866731001     2
      600886001     3
      706016001     4
      372860002     5
Name: bestseller_rank, dtype: int8

In [335]:
bestsellers_previous_week.head()

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193


### Add bestseller information

In [336]:
# Van echte transacties: bestseller onbekend, check candidates om te kijken of er toen wel bestseller rank was. Zo nee, vul later met fillna
if bestsellerFiller is None:
    full_bestsellers_previous_week = sales_nohead.copy(deep=True)
    full_bestsellers_previous_week.week += 1
    data = pd.merge(
        data,
        sales_nohead[['week', 'article_id', 'bestseller_rank']],
        on=['week', 'article_id'],
        how='left'
    )
else:
    data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [337]:
data.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week,purchased,importance,bestseller_rank
0,272412481300040,778064028,0.008458,1,95,1.0,1,
1,272412481300040,816592008,0.016932,1,95,1.0,1,
2,272412481300040,621381021,0.033881,1,95,1.0,1,
3,272412481300040,817477003,0.025407,1,95,1.0,1,
4,272412481300040,899088002,0.025407,1,95,1.0,1,


In [338]:
# Verwijder eerste week omdat er voor eerste week geen bestsellers_previous_week is
data = data[data.week != data.week.min()]  # Presumably to make sure no data of an incomplete week is included?
# Indien geen bestseller: keislecht verkocht (default bestsellerFiller is 999, wat betekent dat er zogezegd 998 beter verkopende items zijn)
data.bestseller_rank.fillna(bestsellerFiller, inplace=True)

In [339]:
# per customer per week alle transacties en/of candidates

# Steek bij elke aankoop alle info over gekocht article erbij
data = pd.merge(data, articles, on='article_id', how='left')
# Steek bij elke aankoop alle info over customer erbij
data = pd.merge(data, customers, on='customer_id', how='left')

In [340]:
# Sorteer eerst op week, dan per week op customer
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [341]:
data.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week,purchased,importance,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,28847241659200,887770001,0.016932,1,96,1.0,1,999.0,887770,727,...,10,1010,6,3692,1,1,0,1,21,57896
1,28847241659200,762846001,0.025407,1,96,0.0,1,999.0,762846,472,...,7,1010,6,492,1,1,0,1,21,57896
2,28847241659200,829308001,0.033881,1,96,0.0,1,999.0,829308,11402,...,21,1005,0,9082,1,1,0,1,21,57896
3,28847241659200,760084003,0.025094,1,96,0.0,1,1.0,760084,1134,...,1,1009,5,847,1,1,0,1,21,57896
4,28847241659200,866731001,0.024919,1,96,0.0,1,2.0,866731,3609,...,21,1005,0,3130,1,1,0,1,21,57896


In [342]:



for feature_df_partial_columns in all_new_features:
    # merge new features into training data
    data = pd.merge(data,feature_df_partial_columns[0],on=(["customer_id","week"] + feature_df_partial_columns[1]),how="left")

In [343]:
# Niet trainen op laatste week want anders hebben we geen test set
train = data[data.week != test_week]
# Laatste week, indien item in beide candidate sets, drop duplicates.
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
test.head()

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week,purchased,importance,bestseller_rank,product_code,prod_name,...,amount_of_(index_group_no_perceived_colour_value_id),amount_of_(index_group_no_perceived_colour_value_id)_rank,amount_of_(index_group_no_garment_group_no),amount_of_(index_group_no_garment_group_no)_rank,amount_of_(graphical_appearance_no_perceived_colour_value_id),amount_of_(graphical_appearance_no_perceived_colour_value_id)_rank,amount_of_(graphical_appearance_no_garment_group_no),amount_of_(graphical_appearance_no_garment_group_no)_rank,amount_of_(perceived_colour_value_id_garment_group_no),amount_of_(perceived_colour_value_id_garment_group_no)_rank
11381612,28847241659200,925246001,0.128797,2,105,0.0,1,999.0,925246,25454,...,,,,,,,,,,
11381613,28847241659200,924243001,0.041535,1,105,0.0,1,1.0,924243,19190,...,,,,,,,,,,
11381614,28847241659200,924243002,0.041877,1,105,0.0,1,2.0,924243,19190,...,,,,,,,,,,
11381615,28847241659200,918522001,0.041435,1,105,0.0,1,3.0,918522,26372,...,,,,,,,,,,
11381616,28847241659200,923758001,0.033462,1,105,0.0,1,4.0,923758,19359,...,,,,,,,,,,


In [344]:
print(train.groupby(['week', 'customer_id']).head())
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
print(train_baskets)
print(train_baskets.min())
print(train_baskets.max())
print(len(train_baskets))

                   customer_id  article_id     price  sales_channel_id  week  \
0               28847241659200   887770001  0.016932                 1    96   
1               28847241659200   762846001  0.025407                 1    96   
2               28847241659200   829308001  0.033881                 1    96   
3               28847241659200   760084003  0.025094                 1    96   
4               28847241659200   866731001  0.024919                 1    96   
...                        ...         ...       ...               ...   ...   
11381596  18446737527580148316   547780001  0.023712                 2   104   
11381597  18446737527580148316   763988001  0.023712                 2   104   
11381598  18446737527580148316   763988003  0.023712                 2   104   
11381599  18446737527580148316   547780040  0.023712                 2   104   
11381600  18446737527580148316   909370001  0.032947                 2   104   

          purchased  importance  bestse

In [345]:

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

# Model training

In [346]:
from lightgbm.sklearn import LGBMRanker

In [347]:
ranker=LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type=LGBMBoostingType,
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [348]:


ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.966881
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.597342
[LightGBM] [Debug] init for col-wise cost 0.206742 seconds, init for row-wise cost 0.673104 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 39
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13


In [349]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.957814386479696
importance 0.03715317827077021
amount_of_(perceived_colour_value_id_garment_group_no) 0.0022851503484961656
amount_of_(index_group_no_graphical_appearance_no) 0.0008381855423129702
amount_of_(graphical_appearance_no_perceived_colour_value_id) 0.0005350353642320298
amount_of_(index_group_no) 0.0004550145217848267
amount_of_(index_group_no_garment_group_no) 0.00015598598585343868
amount_of_(graphical_appearance_no_garment_group_no) 0.00012536116257250133
age 0.00011990337260238691
amount_of_(graphical_appearance_no) 8.868150822980683e-05
article_id 7.908940445814566e-05
product_type_no 5.835835446335918e-05
postal_code 5.7423780135186813e-05
club_member_status 5.3361202197422264e-05
amount_of_(index_group_no_perceived_colour_value_id) 5.2607449396314046e-05
colour_group_code 3.904877755246444e-05
section_no 3.4283764899243374e-05
graphical_appearance_no 2.8817696256820992e-05
garment_group_no 2.612701409075152e-05
Active 0.0
perceived_colour_value_id 0.0

# Calculate predictions

In [350]:

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

# Create submission

In [351]:
sub = pd.read_csv('../data/sample_submission.csv')

In [352]:

preds = []

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)


for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

In [353]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [354]:
sub_name = 'basic_model_submission_' +  str(LGBMBoostingType) + '_fillna' + str(preprocess) + 'bestsellerFiller' + str(bestsellerFiller) + "_weeks" + str(transactionBackXWeeks) + "_importance" + str(prevYear)
sub.to_csv(f'../data/subs/{sub_name}.csv.gz', index=False)
sub.to_csv(f'../data/subs/{sub_name}.csv', index=False)
print("Done")
print(sub_name)

Done
basic_model_submission_dart_fillna-1bestsellerFiller999_weeks10_importance


In [355]:
# !kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f 'data/subs/{sub_name}.csv.gz' -m {sub_name}