In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from lightgbm.sklearn import LGBMRanker
from Question1 import *
import pickle

In [2]:
%%time

transactions = pd.read_parquet('../data/transactions_train.parquet')
customers = pd.read_parquet('../data/customers.parquet')
articles = pd.read_parquet('../data/articles.parquet')

test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

CPU times: total: 672 ms
Wall time: 911 ms


# Generating candidates

### Last purchase candidates

In [3]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: total: 2.59 s
Wall time: 5.55 s


In [4]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [5]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: total: 141 ms
Wall time: 331 ms


In [6]:
candidates_last_purchase = transactions.copy()

In [7]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: total: 4.39 s
Wall time: 9.69 s


### Bestsellers candidates

In [8]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [9]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
sales

week  article_id
95    760084003      1
      866731001      2
      600886001      3
      706016001      4
      372860002      5
                    ..
104   915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, Length: 120, dtype: int8

In [10]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


In [11]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [12]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-22,200292573348128,2,96,760084003,1,0.025094
1,2020-07-22,200292573348128,2,96,866731001,2,0.024919
2,2020-07-22,200292573348128,2,96,600886001,3,0.022980
3,2020-07-22,200292573348128,2,96,706016001,4,0.033197
4,2020-07-22,200292573348128,2,96,372860002,5,0.013193
...,...,...,...,...,...,...,...
8141191,2020-09-22,18440902715633436014,1,104,918292001,8,0.041424
8141192,2020-09-22,18440902715633436014,1,104,762846027,9,0.025104
8141193,2020-09-22,18440902715633436014,1,104,809238005,10,0.041656
8141194,2020-09-22,18440902715633436014,1,104,673677002,11,0.024925


In [13]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [14]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [15]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [16]:
transactions['purchased'] = 1

In [17]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [18]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [19]:
data.purchased.mean()

0.13607582749165664

### Add bestseller information

In [20]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [21]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [22]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

# Collaborative Filtering

In [23]:
itemcf_transactions = transactions.copy()
itemcf_transactions.drop(['sales_channel_id', 'price','week'], inplace=True, axis=1)
article_bought_count = itemcf_transactions[['article_id', 't_dat']].groupby('article_id').count().reset_index().rename(columns={'t_dat': 'count'})
most_bought_articles = article_bought_count[article_bought_count['count']>10]['article_id'].values
itemcf_transactions = itemcf_transactions[itemcf_transactions['article_id'].isin(most_bought_articles)]

In [24]:
itemcf_transactions['article_id'] = itemcf_transactions['article_id'].astype(str)

In [25]:
np.random.seed(0)

negative_samples = pd.DataFrame({
    'article_id': np.random.choice(itemcf_transactions.article_id.unique(), itemcf_transactions.shape[0]),
    'customer_id': np.random.choice(itemcf_transactions.customer_id.unique(), itemcf_transactions.shape[0]),
    'purchased': np.zeros(itemcf_transactions.shape[0])
})

In [26]:
rec = ItemCF(itemcf_transactions, negative_samples, num_components=500)
rec.fit(n_epochs=60)

In [27]:
# import pickle
# with open('output/rq60_1000-notebook.pickle', 'rb') as file:
#     rec:ItemCF = pickle.load(file)

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

s2 = pd.DataFrame(cosine_similarity(rec.articles_latent_matrix, rec.articles_latent_matrix, dense_output=False), index=most_bought_articles.tolist(),columns=most_bought_articles.tolist())
ss = s2.columns.to_numpy()[np.argsort(s2.values, axis=1)]
ss = pd.DataFrame(ss[:, -4:])
ss['article_id'] = most_bought_articles.tolist()

In [47]:
ss['article_id'] = ss['article_id'].astype(int)
ss[['article_id',0,1,2]]

Unnamed: 0,article_id,0,1,2
0,108775044,900659001,823180003,761018017
1,111565001,749699002,854789001,669091046
2,111586001,760084009,880008001,857417005
3,111593001,851996001,695632113,832309007
4,111609001,793185033,763284001,886538001
...,...,...,...,...
19444,949551002,866596001,740927007,369796007
19445,952267001,882810001,754267058,776781001
19446,953450001,884345001,932383001,874704002
19447,953763001,885950003,907432002,909921001


# Training

In [30]:
data = pd.merge(data,ss, on='article_id', how='left')

In [31]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [32]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [33]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [34]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no',  'age']
# columns_to_use.append(0)
# columns_to_use.append(1)
# columns_to_use.append(2)
# columns_to_use.append(3)
# columns_to_use.append(4)
# columns_to_use.append(5)
# columns_to_use.append(6)
# columns_to_use.append(7)
columns_to_use.append(8)
columns_to_use.append(9)
columns_to_use.append(10)
# columns_to_use.append(11)

In [35]:
%%time
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

train_X = train_X.fillna(0)

# train_X[0] = train_X[0].astype(int)
# train_X[1] = train_X[1].astype(int)
# train_X[2] = train_X[2].astype(int)
# train_X[3] = train_X[3].astype(int)
# train_X[4] = train_X[4].astype(int)
# train_X[5] = train_X[5].astype(int)
# train_X[6] = train_X[6].astype(int)
# train_X[7] = train_X[7].astype(int)
train_X[8] = train_X[8].astype(int)
train_X[9] = train_X[9].astype(int)
train_X[10] = train_X[10].astype(int)
# train_X[11] = train_X[11].astype(int)
train_X 

CPU times: total: 156 ms
Wall time: 395 ms


Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,age,8,9,10
0,887770001,253,1010016,9,4,5,1510,0,1,6,1010,21,728473002,782901006,825744001
1,762846001,259,1010016,10,3,9,1515,0,1,11,1010,21,920012002,866482006,909179001
2,829308001,273,1010016,9,4,5,8310,9,26,5,1005,21,738417001,333323030,857176007
3,760084003,272,1010016,9,4,5,1747,1,2,53,1009,21,737221011,797451001,760084006
4,866731001,273,1010016,9,4,5,8310,9,26,5,1005,21,866731002,866731003,915292001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381607,918292001,273,1010010,9,4,5,8310,9,26,5,1005,60,602673015,868823008,856270002
11381608,762846027,259,1010016,13,1,1,1515,0,1,11,1010,60,941005005,762846031,809883003
11381609,809238005,252,1010010,8,4,12,1647,1,2,53,1003,60,854305001,736681001,870611003
11381610,673677002,252,1010016,9,4,5,1616,0,1,11,1003,60,704760008,902806001,875072003


# Model training

In [36]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [37]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.081943
[LightGBM] [Debug] init for col-wise cost 0.000008 seconds, init for row-wise cost 0.140769 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1561
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 15
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
CPU times: total: 4.88 s
Wall time: 1.88 s


In [38]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

article_id 0.29416790486237293
8 0.18004267422572867
10 0.13324521533735556
department_no 0.13122791338926112
product_type_no 0.09812354671428973
section_no 0.05097555297515842
9 0.03773201599629019
perceived_colour_master_id 0.02625207161275154
colour_group_code 0.024686818917617353
garment_group_no 0.023546285969174505
age 0.0
index_group_no 0.0
index_code 0.0
perceived_colour_value_id 0.0
graphical_appearance_no 0.0


# Calculate predictions

In [39]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: total: 0 ns
Wall time: 0 ns


# Create submission

In [40]:
sub = pd.read_csv('../data/sample_submission.csv')

In [41]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds
sub_name = 'rq10_1000-notebook2'
sub.to_csv(f'./output/{sub_name}.csv.gz', index=False)

CPU times: total: 7.19 s
Wall time: 13.5 s
