# Lecture 6: Improvement of the scores

## Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r /content/drive/MyDrive/recpack /content

In [None]:
!unzip /content/drive/MyDrive/transactions_train.zip
!unzip /content/drive/MyDrive/articles.zip
!unzip /content/drive/MyDrive/customers.zip

Archive:  /content/drive/MyDrive/transactions_train.zip
  inflating: transactions_train.feather  
Archive:  /content/drive/MyDrive/articles.zip
  inflating: articles.feather        
Archive:  /content/drive/MyDrive/customers.zip
  inflating: customers.feather       


In [None]:
import numpy as np 
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import string
import seaborn as sns

### In order to achieve a better perfomance in RAM and running time, we convert the CSV datasets into Feather format.

In [None]:
articles = pd.read_feather('articles.feather')
customers = pd.read_feather('customers.feather')
transactions = pd.read_feather('transactions_train.feather')

### Preprocessing the transaction dataset


In [None]:
def customer_id_to_int(x): return int(x[-16:], 16)

transactions['customer_id'] = transactions['customer_id'].apply(customer_id_to_int).astype('int32')
transactions['article_id'] = transactions['article_id'].astype('int32')

In [None]:
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
#transactions['article_id'] = transactions['article_id'].astype('int32')
#transactions['customer_id'] = transactions['customer_id'].apply(customer_id_to_int).astype('int32')
transactions.sales_channel_id = transactions.sales_channel_id.astype('int8')
transactions.price = transactions.price.astype('float32')

In [None]:
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7
transactions.week = transactions.week.astype('int8')

last_month = transactions['week'].max() - 8
transactions = transactions.loc[transactions["week"] >= last_month]
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29314980,2020-07-22,294687682,778064038,0.008458,2,96
29314981,2020-07-22,294687682,817166007,0.006763,2,96
29314982,2020-07-22,294687682,840360002,0.008458,2,96
29314983,2020-07-22,294687682,817166003,0.008458,2,96
29314984,2020-07-22,974978159,624486001,0.012729,2,96
...,...,...,...,...,...,...
31788319,2020-09-22,1362182998,929511001,0.059305,2,104
31788320,2020-09-22,1362182998,891322004,0.042356,2,104
31788321,2020-09-22,-324376415,918325001,0.043203,1,104
31788322,2020-09-22,2104975119,833459002,0.006763,1,104


### Preprocessing the article dataset

In [None]:
articles['article_id'] = articles['article_id'].astype('int32')
articles['product_code'] = articles['product_code'].astype('int8')
articles['prod_name'] = pd.factorize(articles['prod_name'])[0].astype('int8')
articles['product_group_name'] = pd.factorize(articles['product_group_name'])[0].astype('int8')
articles['product_type_no'] = articles['product_type_no'].astype('int8')
articles['graphical_appearance_no'] = pd.factorize(articles['graphical_appearance_no'])[0].astype('int8')
articles['colour_group_code'] = articles['colour_group_code'].astype('int8')
articles['perceived_colour_value_id'] = articles['perceived_colour_value_id'].astype('int8')
articles['perceived_colour_master_id'] = articles['perceived_colour_master_id'].astype('int8')
articles['department_no'] = articles['department_no'].astype('int8')
articles['index_code'] = pd.factorize(articles['department_no'])[0].astype('int8')
articles['index_group_no'] = articles['index_group_no'].astype('int8')
articles['section_no'] = articles['section_no'].astype('int8')
articles['garment_group_no'] = articles['garment_group_no'].astype('int16')

In [None]:
train_frame = articles[
        ['article_id', 'product_code', 'prod_name', 'product_type_name', 'product_group_name',
         'graphical_appearance_name', 'department_name', 'index_name', 'index_group_name', 'section_name',
         'garment_group_name']].drop_duplicates()
train_frame = train_frame.apply(lambda x: ','.join(x.astype(str)), axis=1)
train_frame = pd.DataFrame({'clean': train_frame})
lol = [row.split(',') for row in train_frame['clean']]

In [None]:
import sys
!{sys.executable} -m pip install gensim --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from gensim.models import Word2Vec

In [None]:
model = Word2Vec(min_count=1,vector_size=20,workers=7,window=3,sg=0)
model.build_vocab(lol)
model.train(lol, total_examples=model.corpus_count, epochs=30)

(24011716, 35344620)

In [None]:
articles["w2v"] = articles['article_id'].apply(lambda x: model.wv[str(x)])
articles[[f"w2v_{i}" for i in range(20)]] = pd.DataFrame(articles['w2v'].tolist(), index=articles.index)
articles.drop(columns=['w2v'], inplace=True)

In [None]:
articles.drop(
        columns=['product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name',
                 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name',
                 'garment_group_name', 'detail_desc'], inplace=True)

In [None]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,...,w2v_10,w2v_11,w2v_12,w2v_13,w2v_14,w2v_15,w2v_16,w2v_17,w2v_18,w2v_19
0,108775015,-25,0,-3,0,0,9,4,5,-116,...,0.007704,2.479805e-02,0.068470,0.043121,-0.145160,0.061358,-0.026883,0.006210,0.067468,-0.056050
1,108775044,-25,0,-3,0,0,10,3,9,-116,...,-0.092454,7.572486e-02,-0.020917,-0.019000,-0.035082,0.088324,0.023263,0.033272,0.070299,-0.056039
2,108775051,-25,1,-3,0,1,11,1,9,-116,...,-0.054498,1.676207e-01,0.103985,0.061501,-0.141416,0.164250,-0.051793,-0.068908,0.027160,-0.078507
3,110065001,-15,2,50,1,0,9,4,5,59,...,-0.094800,-1.100953e-01,-0.132556,0.046063,-0.054343,0.115471,0.046811,0.046766,0.033120,0.104933
4,110065002,-15,2,50,1,0,10,3,9,59,...,-0.020617,5.773571e-02,-0.041889,-0.086233,-0.057722,0.159183,0.028517,0.077849,0.119956,0.071445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,106,46,46,2,18,9,4,5,20,...,-0.019319,2.515537e-03,-0.044766,0.023603,0.016392,0.038854,0.012690,0.023919,0.084795,-0.031676
105538,953763001,-93,47,-3,0,0,9,4,5,127,...,-0.077943,9.505538e-02,0.001528,-0.059353,-0.087014,0.017127,-0.052961,-0.034946,0.108969,-0.043540
105539,956217002,57,48,9,11,0,9,4,5,105,...,0.007406,-3.459068e-02,0.193177,0.027697,0.029575,0.089456,-0.071702,0.026386,0.152050,-0.063461
105540,957375001,-65,49,72,4,0,9,4,5,106,...,0.010626,8.114557e-07,0.041194,0.028786,-0.000209,-0.067273,-0.107796,-0.034879,0.078106,-0.024279


### Preprocess of customer dataset

In [None]:
customers.fillna({"FN": 0, "Active": 0}, inplace=True)

def customer_id_to_int(x): return int(x[-16:], 16)
customers['customer_id'] = customers['customer_id'].apply(customer_id_to_int).astype('int32')

In [None]:
customers["FN"] = customers["FN"].astype('int8')
customers["Active"] = customers["Active"].astype('int8')
customers['age'].fillna(int((customers['age'].mean())), inplace=True)
customers["fashion_news_frequency"] = pd.factorize(customers["fashion_news_frequency"])[0].astype('int8')
customers["club_member_status"] = pd.factorize(customers["club_member_status"])[0].astype('int8')
customers['postal_code'] = pd.factorize(customers['postal_code'])[0].astype('int32')
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,-1612724649,0,0,0,0,49.0,0
1,-1740365574,0,0,0,0,25.0,1
2,277996312,0,0,0,0,24.0,2
3,-16268226,0,0,0,0,54.0,3
4,-1922717606,1,1,0,1,52.0,4
...,...,...,...,...,...,...,...
1371975,-1940645839,0,0,0,0,24.0,62927
1371976,-1245382473,0,0,0,0,21.0,6316
1371977,-47869340,1,1,0,1,21.0,273671
1371978,1238687594,1,1,0,1,18.0,218323


# Merging articles and customers dataset with transaction.

In [None]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29314980,2020-07-22,294687682,778064038,0.008458,2,96
29314981,2020-07-22,294687682,817166007,0.006763,2,96
29314982,2020-07-22,294687682,840360002,0.008458,2,96
29314983,2020-07-22,294687682,817166003,0.008458,2,96
29314984,2020-07-22,974978159,624486001,0.012729,2,96
...,...,...,...,...,...,...
31788319,2020-09-22,1362182998,929511001,0.059305,2,104
31788320,2020-09-22,1362182998,891322004,0.042356,2,104
31788321,2020-09-22,-324376415,918325001,0.043203,1,104
31788322,2020-09-22,2104975119,833459002,0.006763,1,104


In [None]:
transactions.sort_values(['week', 'customer_id'], inplace=True)

In [None]:
transactions.drop_duplicates(inplace=True)
transactions_merge = transactions.merge(articles, how="inner", on='article_id')
transactions_merge = transactions.merge(customers, how="inner", on='customer_id')
transactions_processed = transactions_merge[['t_dat','customer_id','article_id', 'age', 'price', 'sales_channel_id', 'week']].copy()
transactions_processed

Unnamed: 0,t_dat,customer_id,article_id,age,price,sales_channel_id,week
0,2020-07-27,-2147481293,697564061,20.0,0.016932,1,96
1,2020-07-27,-2147481293,865594002,20.0,0.025407,1,96
2,2020-07-27,-2147481293,697564042,20.0,0.016932,1,96
3,2020-07-27,-2147481293,854043005,20.0,0.030492,1,96
4,2020-07-27,-2147481293,852092002,20.0,0.030492,1,96
...,...,...,...,...,...,...,...
2263235,2020-09-17,2147354887,715828028,23.0,0.032593,1,104
2263236,2020-09-17,2147354887,893059004,23.0,0.040729,1,104
2263237,2020-09-17,2147354887,752814017,23.0,0.032593,1,104
2263238,2020-09-17,2147354887,843614008,23.0,0.032593,1,104


In [None]:
transactions_processed = transactions_processed.dropna()

In [None]:
transactions = transactions_processed

In [None]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

In [None]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,age,price,sales_channel_id,week
0,2020-07-27,-2147481293,697564061,20.0,0.016932,1,96
1,2020-07-27,-2147481293,865594002,20.0,0.025407,1,96
2,2020-07-27,-2147481293,697564042,20.0,0.016932,1,96
3,2020-07-27,-2147481293,854043005,20.0,0.030492,1,96
4,2020-07-27,-2147481293,852092002,20.0,0.030492,1,96
...,...,...,...,...,...,...,...
2263235,2020-09-17,2147354887,715828028,23.0,0.032593,1,104
2263236,2020-09-17,2147354887,893059004,23.0,0.040729,1,104
2263237,2020-09-17,2147354887,752814017,23.0,0.032593,1,104
2263238,2020-09-17,2147354887,843614008,23.0,0.032593,1,104


# Candidate Generation

In [None]:
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

# Bestseller candidates


In [None]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [None]:
candidates_bestsellers.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-29,1135579992,1,97,827968001,0.016591
1,2020-07-29,1135579992,1,97,706016003,0.033237
2,2020-07-29,1135579992,1,97,706016001,0.033265
3,2020-07-29,1135579992,1,97,760084003,0.024986
4,2020-07-29,1135579992,1,97,717490064,0.008299


In [None]:
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import Timed
from recpack.pipelines import PipelineBuilder

In [None]:
c = customers['customer_id'].unique().tolist()

# Calculating the popularity for each of the articles

# Creating the interaction matrix with RecPack

In [None]:
proc = DataFramePreprocessor(item_ix='article_id', user_ix='customer_id', timestamp_ix='week')
proc.add_filter(MinUsersPerItem(8, item_ix='article_id', user_ix='customer_id'))
proc.add_filter(MinItemsPerUser(8, item_ix='article_id', user_ix='customer_id'))

interaction_matrix = proc.process(transactions)

  0%|          | 0/1305264 [00:00<?, ?it/s]

  0%|          | 0/1305264 [00:00<?, ?it/s]

# Sequential Rules algorithm

In [None]:
from asyncio.format_helpers import _format_callback_source
import pandas as pd
import numpy as np
from collections import Counter
from itertools import chain
from scipy.sparse import csr_matrix, lil_matrix
from recpack.algorithms.base import ItemSimilarityMatrixAlgorithm
from recpack.algorithms.util import invert
from recpack.matrix import InteractionMatrix, Matrix, to_csr_matrix
from recpack.matrix.interaction_matrix import InteractionMatrix
from recpack.util import to_binary, get_top_K_ranks


def last_item_recommendations(X: InteractionMatrix):
    
    return get_top_K_ranks(X.last_timestamps_matrix, K=1)


class SequentialRules(ItemSimilarityMatrixAlgorithm):
    def __init__(self, max_steps=20):
        self.max_steps = max_steps

    def _transform_fit_input(self, X: Matrix) -> InteractionMatrix:
        # X_interaction_matrix = isinstance(X, InteractionMatrix)
        if isinstance(X, InteractionMatrix) == False:
            raise ValueError(f"Interaction Matrix")
        elif X.has_timestamps == False:
            raise ValueError(f"requires timestamp information")
        else:
            return X

    def _fit(self, X: InteractionMatrix):
        a = list(X.sorted_item_history)
        sim_matrix = lil_matrix((X.shape[1], X.shape[1]))
        for user, items_per_user in X.sorted_item_history:
            for i in range(0, len(items_per_user) - 1):
                for n_steps, j in enumerate(
                    range(i + 1, min(len(items_per_user), i + self.max_steps + 1)),
                    start=1,
                ):
                    LHS = items_per_user[i]
                    RHS = items_per_user[j]
                    sim_matrix[LHS, RHS] += 1 / n_steps
        support = csr_matrix(X.values.sum(axis=0))

        sim_matrix = sim_matrix.multiply(invert(support).T)
        self.similarity_matrix_ = sim_matrix.tocsr()

    def _transform_predict_input(self, X):
        X = last_item_recommendations(X)
        return X


# Registering both implemented algorithms.

In [None]:
from recpack.pipelines import ALGORITHM_REGISTRY, PipelineBuilder, pipeline_builder

ALGORITHM_REGISTRY.register('Sequential_Rules', SequentialRules)

# ItemKNN used to generate candidates

In [None]:
from recpack.algorithms import ItemKNN
knn = ItemKNN(K=20, similarity='cosine')
knn.fit(interaction_matrix)

2022-11-28 21:58:09,833 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.5s


INFO:recpack:Fitting ItemKNN complete - Took 1.5s


ItemKNN(K=20)

In [None]:
predictions = knn.predict(interaction_matrix)

# Sequential Rules used to generate candidates

In [None]:
sr = SequentialRules(max_steps=20)
sr.fit(interaction_matrix)

  self._set_arrayXarray(i, j, x)


2022-11-29 01:22:03,474 - base - recpack - INFO - Fitting SequentialRules complete - Took 2.07e+02s


INFO:recpack:Fitting SequentialRules complete - Took 2.07e+02s


SequentialRules()

In [None]:
pred = sr.predict(interaction_matrix)



In [None]:
def top_n_idx_sparse(matrix: csr_matrix, n: int):
    """Return index of top n values in each row of a sparse matrix.
    source: https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
    """
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick == n:
          top_n_idx.append(matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]].tolist())
    
    # Get the values corresponding to the indices
    top_n_values = []
    for row_idx, col_idxs in enumerate(top_n_idx):
        top_n_values.append(matrix[row_idx, col_idxs].toarray().tolist()[0])
        assert(len(top_n_values[row_idx]) == len(top_n_idx[row_idx]))
    return top_n_idx, top_n_values

In [None]:
top_k_idx, top_k_values = top_n_idx_sparse(pred, 12)

In [None]:
top_k_values

In [None]:
# source of support: https://github.com/LienM/ai-project-22-23/blob/main/ArnoTroch/lecture6/submission-KNN-similarity-feature.ipynb
uid_cid_map = interaction_matrix._df[["uid", "customer_id"]].drop_duplicates().set_index("uid").to_dict()["customer_id"]
iid_aid_map = interaction_matrix._df[["iid", "article_id"]].drop_duplicates().set_index("iid").to_dict()["article_id"]

ar_customers = []
ar_articles = []
ar_scores = []

for i, row in enumerate(top_k_idx):
        user_predictions = [iid_aid_map[iid] for iid in row]
        ar_customers.extend([uid_cid_map[i]] * len(user_predictions))
        ar_articles.extend(user_predictions)
        ar_scores.extend(top_k_values[i])
ar_items = pd.DataFrame({"customer_id": ar_customers, "article_id": ar_articles, "sr_prediction": ar_scores})

In [None]:
ar_items

Unnamed: 0,customer_id,article_id,sr_prediction
0,-1086329024,933706001,0.037037
1,-1086329024,822237002,0.037037
2,-1086329024,885989002,0.037037
3,-1086329024,781467014,0.037037
4,-1086329024,925472001,0.043210
...,...,...,...
1003603,-5381211,573085028,0.000000
1003604,-5381211,789309002,0.000000
1003605,-5381211,875784003,0.000000
1003606,-5381211,921918001,0.000000


In [None]:
candidates_similar_items = pd.merge(
    ar_items,
    test_set_transactions,
    on='customer_id',
    how='left'
)

candidates_similar_items.drop(columns='sr_prediction', inplace=True)
last_price = transactions \
    .groupby(['article_id', 'week']) \
    .price \
    .last() \
    .reset_index() \
    .groupby('article_id') \
    .price \
    .last() \
    .reset_index()

candidates_similar_items = pd.merge(
    candidates_similar_items,
    last_price,
    on='article_id',
    how='left'
)

In [None]:
popularity = transactions\
    .groupby(['article_id', 'week']).size().reset_index(name='weekly_purchase_count')
weekly_popularity = []
def func(row):
    weeks_before = popularity[(row.article_id == popularity.article_id) & (row.week > popularity.week)]
    # get last row of weeks_before
    previous_week_popularity = 0
    if weeks_before.shape[0] > 0:
        previous_week_popularity = weekly_popularity[-1]
    return previous_week_popularity / 2.0 + float(row.weekly_purchase_count)
# iterate over all rows
for i, row in popularity.iterrows():
    weekly_popularity.append(func(row))
popularity['weekly_popularity'] = weekly_popularity
popular_articles_per_week = popularity.sort_values(['week', 'weekly_popularity'], ascending=False)\
    .groupby('week').head(20).reset_index(drop=True)
popular_articles_previous_week = pd.merge(popular_articles_per_week, mean_price, on=['week', 'article_id']).reset_index(drop=True)
# make a new column to rank the weekly_popularity
popular_articles_previous_week['last_week_popularity_rank'] = popular_articles_previous_week.groupby('week')['weekly_popularity'].rank(ascending=False).astype(np.int32)
popular_articles_previous_week.week += 1

In [None]:
transactions['purchased'] = 1
data = pd.concat([transactions, candidates_last_purchase, candidates_similar_items])
data.purchased.fillna(0, inplace=True)

data.purchased.mean()

0.41566738864755637

In [None]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [None]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
data = data[data.week != data.week.min()]

In [None]:
data = pd.merge(
    data, 
    ar_items[['customer_id', 'article_id', 'sr_prediction']], 
    on=['customer_id', 'article_id'], 
    how='left'
)

In [None]:
data = pd.merge(
    data,
    popular_articles_previous_week[['week', 'article_id', 'weekly_purchase_count', 'weekly_popularity', 'last_week_popularity_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [None]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

data['weekly_purchase_count'].fillna(0, inplace=True)
data['weekly_popularity'].fillna(0, inplace=True)
data['sr_prediction'].fillna(data['purchased'], inplace=True)
data['bestseller_rank'].fillna(data.bestseller_rank.max() + 1, inplace=True)
data['last_week_popularity_rank'].fillna(data.last_week_popularity_rank.max() + 1, inplace=True)
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,sr_prediction,weekly_purchase_count,...,w2v_16,w2v_17,w2v_18,w2v_19,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-29,1135579992,799365027,0.050831,1,97,1.0,13.0,1.0,0.0,...,-0.043852,0.034088,0.150648,-0.068062,0,0,0,0,51.0,68
1,2020-07-29,-795540489,636323002,0.016932,2,97,1.0,13.0,1.0,0.0,...,-0.036174,0.005803,0.108137,0.023602,0,0,0,0,19.0,180
2,2020-07-29,-795540489,706016039,0.033881,2,97,1.0,13.0,1.0,0.0,...,-0.091274,0.012108,0.04253,-0.082185,0,0,0,0,19.0,180
3,2020-07-29,1681945608,622966014,0.033881,2,97,1.0,13.0,1.0,0.0,...,-0.116543,0.065365,0.113445,-0.041885,0,0,0,0,35.0,183
4,2020-07-29,1681945608,805917001,0.013542,2,97,1.0,13.0,1.0,0.0,...,0.00033,-0.054623,0.070279,-0.014389,0,0,0,0,35.0,183


In [None]:
data.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week', 'purchased', 'bestseller_rank', 'sr_prediction',
       'weekly_purchase_count', 'weekly_popularity',
       'last_week_popularity_rank', 'product_code_x', 'prod_name_x',
       'product_type_no_x', 'product_group_name_x',
       'graphical_appearance_no_x', 'colour_group_code_x',
       'perceived_colour_value_id_x', 'perceived_colour_master_id_x',
       'department_no_x', 'index_code_x', 'index_group_no_x', 'section_no_x',
       'garment_group_no_x', 'w2v_0_x', 'w2v_1_x', 'w2v_2_x', 'w2v_3_x',
       'w2v_4_x', 'w2v_5_x', 'w2v_6_x', 'w2v_7_x', 'w2v_8_x', 'w2v_9_x',
       'w2v_10_x', 'w2v_11_x', 'w2v_12_x', 'w2v_13_x', 'w2v_14_x', 'w2v_15_x',
       'w2v_16_x', 'w2v_17_x', 'w2v_18_x', 'w2v_19_x', 'FN_x', 'Active_x',
       'club_member_status_x', 'fashion_news_frequency_x', 'postal_code_x',
       'product_code', 'prod_name', 'product_type_no', 'product_group_name',
       'graphical_appearance

In [None]:
data.drop(
        columns=['age_y', 'product_code_y', 'prod_name_y', 'product_type_no_y',
       'product_group_name_y', 'graphical_appearance_no_y',
       'colour_group_code_y', 'perceived_colour_value_id_y',
       'perceived_colour_master_id_y', 'department_no_y', 'index_code_y',
       'index_group_no_y', 'section_no_y', 'garment_group_no_y', 'w2v_0_y',
       'w2v_1_y', 'w2v_2_y', 'w2v_3_y', 'w2v_4_y', 'w2v_5_y', 'w2v_6_y',
       'w2v_7_y', 'w2v_8_y', 'w2v_9_y', 'w2v_10_y', 'w2v_11_y', 'w2v_12_y',
       'w2v_13_y', 'w2v_14_y', 'w2v_15_y', 'w2v_16_y', 'w2v_17_y', 'w2v_18_y',
       'w2v_19_y', 'FN_y', 'Active_y', 'club_member_status_y',
       'fashion_news_frequency_y','postal_code_y'], inplace=True)

In [None]:
data.drop(
        columns=['product_code_x', 'prod_name_x', 'product_type_no_x',
       'product_group_name_x', 'graphical_appearance_no_x',
       'colour_group_code_x', 'perceived_colour_value_id_x',
       'perceived_colour_master_id_x', 'department_no_x', 'index_code_x',
       'index_group_no_x', 'section_no_x', 'garment_group_no_x', 'w2v_0_x',
       'w2v_1_x', 'w2v_2_x', 'w2v_3_x', 'w2v_4_x', 'w2v_5_x', 'w2v_6_x',
       'w2v_7_x', 'w2v_8_x', 'w2v_9_x', 'w2v_10_x', 'w2v_11_x', 'w2v_12_x',
       'w2v_13_x', 'w2v_14_x', 'w2v_15_x', 'w2v_16_x', 'w2v_17_x', 'w2v_18_x',
       'w2v_19_x', 'FN_x', 'Active_x', 'club_member_status_x',
       'fashion_news_frequency_x','postal_code_x'], inplace=True)

In [None]:
data.drop(
        columns=['age_x'], inplace=True)

In [None]:
data.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week', 'purchased', 'bestseller_rank', 'sr_prediction',
       'weekly_purchase_count', 'weekly_popularity',
       'last_week_popularity_rank', 'product_code', 'prod_name',
       'product_type_no', 'product_group_name', 'graphical_appearance_no',
       'colour_group_code', 'perceived_colour_value_id',
       'perceived_colour_master_id', 'department_no', 'index_code',
       'index_group_no', 'section_no', 'garment_group_no', 'w2v_0', 'w2v_1',
       'w2v_2', 'w2v_3', 'w2v_4', 'w2v_5', 'w2v_6', 'w2v_7', 'w2v_8', 'w2v_9',
       'w2v_10', 'w2v_11', 'w2v_12', 'w2v_13', 'w2v_14', 'w2v_15', 'w2v_16',
       'w2v_17', 'w2v_18', 'w2v_19', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')

In [None]:
data.rename(columns = {'product_code_x':'product_code',
                       'prod_name_x':'prod_name',
                       'product_type_no_x':'product_type_no',
                       'product_group_name_x':'product_group_name',
                       'graphical_appearance_no_x':'graphical_appearance_no',
                       'colour_group_code_x':'colour_group_code',
                       'perceived_colour_value_id_x':'perceived_colour_value_id',
                       'perceived_colour_master_id_x':'perceived_colour_master_id',
                       'department_no_x':'department_no',
                       'index_code_x':'index_code',
                       'index_group_no_x':'index_group_no',
                       'section_no_x':'section_no',
                       'garment_group_no_x':'garment_group_no',
                       'w2v_0_x':'w2v_0',
                       'w2v_1_x':'w2v_1',
                       'w2v_2_x':'w2v_2',
                       'w2v_3_x':'w2v_3',
                       'w2v_4_x':'w2v_4',
                       'w2v_5_x':'w2v_5',
                       'w2v_6_x':'w2v_6',
                       'w2v_7_x':'w2v_7',
                       'w2v_8_x':'w2v_8',
                       'w2v_9_x':'w2v_9',
                       'w2v_10_x':'w2v_10',
                       'w2v_11_x':'w2v_11',
                       'w2v_12_x':'w2v_12',
                       'w2v_13_x':'w2v_13',
                       'w2v_14_x':'w2v_14',
                       'w2v_15_x':'w2v_15',
                       'w2v_16_x':'w2v_16',
                       'w2v_17_x':'w2v_17',
                       'w2v_18_x':'w2v_18',
                       'w2v_19_x':'w2v_19',
                       'FN_x':'FN',
                       'Active_x':'Active',
                       'club_member_status_x':'club_member_status',
                       'fashion_news_frequency_x':'fashion_news_frequency',
                       'postal_code_x':'postal_code'}, inplace = True)

In [None]:
data.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week', 'purchased', 'bestseller_rank', 'ar_prediction',
       'weekly_purchase_count', 'weekly_popularity',
       'last_week_popularity_rank', 'product_code', 'prod_name',
       'product_type_no', 'product_group_name', 'graphical_appearance_no',
       'colour_group_code', 'perceived_colour_value_id',
       'perceived_colour_master_id', 'department_no', 'index_code',
       'index_group_no', 'section_no', 'garment_group_no', 'w2v_0', 'w2v_1',
       'w2v_2', 'w2v_3', 'w2v_4', 'w2v_5', 'w2v_6', 'w2v_7', 'w2v_8', 'w2v_9',
       'w2v_10', 'w2v_11', 'w2v_12', 'w2v_13', 'w2v_14', 'w2v_15', 'w2v_16',
       'w2v_17', 'w2v_18', 'w2v_19', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'postal_code', 'age'],
      dtype='object')

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
columns_to_scale = ['product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'last_week_popularity_rank', 'sr_prediction']
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'last_week_popularity_rank', 'sr_prediction']

In [None]:
data.week

0           97
1           97
2           97
3           97
4           97
          ... 
5152421    105
5152422    105
5152423    105
5152424    105
5152425    105
Name: week, Length: 5152426, dtype: int64

In [None]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [None]:
from lightgbm.sklearn import LGBMRanker
ranker = LGBMRanker(
    objective="lambdarank",
    num_leaves=200,
    metric="ndcg",
    boosting_type="dart",
    n_estimators=100,
    importance_type='gain',
    verbose=10,
)

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

In [None]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

sr_prediction 0.9976611384460279
postal_code 0.0008358960386946574
age 0.00045416364905672123
article_id 0.0002934602056915189
department_no 9.494764139763863e-05
product_type_no 9.288186415549709e-05
index_code 9.041459463317987e-05
colour_group_code 9.014426041813989e-05
graphical_appearance_no 6.709766910710308e-05
last_week_popularity_rank 5.79239570321964e-05
perceived_colour_value_id 5.5566035699150467e-05
perceived_colour_master_id 5.2309340186891294e-05
section_no 4.696982283424664e-05
garment_group_no 3.0596125029540896e-05
FN 2.5380688874271344e-05
Active 2.3818606755238475e-05
index_group_no 1.720460751631346e-05
fashion_news_frequency 9.151688970352965e-06
club_member_status 9.347579194054504e-07


In [None]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

In [None]:
!unzip /content/drive/MyDrive/sample_submission.csv.zip

Archive:  /content/drive/MyDrive/sample_submission.csv.zip
  inflating: sample_submission.csv   


In [None]:
sub = pd.read_csv('sample_submission.csv')

In [None]:
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    predd = c_id2predicted_article_ids.get(c_id, [])
    predd = predd + bestsellers_last_week
    preds.append(predd[:12])

In [None]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [None]:
sub.to_csv('submission_sr_.csv.gz', index=False)