## Light GBM Ranker  
This work builds on two notebooks that generate the known (positive) and negative candidates used for ranking: each customer's last-purchased articles and the weekly best-selling articles. The references below describe where each candidate type and feature set comes from:  

**Customer Last Purchase & Bestselling Articles:** This part is based on the notebook developed by [Radek](https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb). It generates the essential candidates from customer transactions and also incorporates all static metadata from the customer and article datasets.  

**Customer and Article Dynamic Attributes:** This part is based on the notebook developed by [Alex](https://www.kaggle.com/code/alexvishnevskiy/gbm-ranking/notebook). It is used to generate dynamic attributes for both [customers](https://www.kaggle.com/code/alexvishnevskiy/ranking-user-features/notebook) and [articles](https://www.kaggle.com/code/alexvishnevskiy/ranking-item-features).

Related work:  
1. Light GBM Ranker Hyper-Parameter Tuning: https://www.kaggle.com/code/rickykonwar/h-m-lgbmranker-hyperparametertuning
2. Light GBM Ranking Cross Validation: https://www.kaggle.com/code/rickykonwar/h-m-lgbmranker-crossvalidation

## Helper Functions

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Average precision at k for one list of predictions.

    Only the first ``k`` predictions are scored. A prediction counts as a
    hit when it is in ``actual`` and has not already appeared earlier in
    the (truncated) prediction list, so repeated items are never rewarded
    twice.

    Parameters
    ----------
    actual : list
             The relevant elements (order doesn't matter)
    predicted : list
                The ranked predictions (order does matter)
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            Sum of precision-at-hit values divided by ``min(len(actual), k)``,
            or 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Skip misses and items already seen at an earlier rank.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over paired lists.

    ``actual`` and ``predicted`` are walked in lockstep; each pair is
    scored with ``apk`` and the scores are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of relevant elements
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of ranked predictions
                (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements to score per pair

    Returns
    -------
    score : double
            The mean of the per-pair average precision at k

    """
    per_query_scores = []
    for truth, ranked in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranked, k))
    return np.mean(per_query_scores)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

def customer_hex_id_to_int(series):
    """Convert a Series of hex customer-id strings to ints using their last 16 characters."""
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hex string as a base-16 integer."""
    last_sixteen = str[-16:]
    return int(last_sixteen, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the ``int32`` dtype."""
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended to each value."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """
    Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column position, the values that occur more than
    ``min_examples`` times. ``transform`` maps each column onto the codes of
    those recorded values; values that were not recorded get the code -1
    (the ``pd.Categorical`` code for "not in categories").

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if its count is strictly greater
        than this threshold.
    """
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """
        Learn the retained categories for each column of ``X``.

        ``y`` is accepted and ignored so the transformer works with
        ``fit_transform(X, y)`` and inside scikit-learn pipelines.
        """
        # Rebuild from scratch: appending to the existing list would keep
        # stale entries when the same instance is fitted a second time.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """
        Return a new DataFrame (fresh default index) holding the category
        codes of each column of ``X``, keyed by the original column names.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Mean of ``apk(..., k=12)`` over paired prediction / ground-truth lists."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer = [
        apk(truth, ranked, k=12)
        for ranked, truth in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """
    Score a submission CSV against 'data/validation_ground_truth.parquet'.

    Rows of the two files are paired by position. Both ``prediction``
    columns are whitespace-split, each pair is scored with ``apk`` at k=12,
    and the mean score is returned. When ``skip_cust_with_no_purchases`` is
    true, pairs whose split ground truth equals the empty list are skipped.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    truth_lists = ground_truth.prediction.str.split()

    empty_basket = []
    scores = []
    for pred, gt in zip(predicted_lists, truth_lists):
        if skip_cust_with_no_purchases and gt == empty_basket:
            continue
        scores.append(apk(gt, pred, k=12))
    return np.mean(scores)

## Importing Modules / Reading Data

In [3]:
import os
import tqdm
import pandas as pd
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Union

In [4]:
%%time

# Load the three pre-converted parquet tables from the supporting dataset:
# purchase history, customer metadata and article metadata.
transactions = pd.read_parquet('../input/hm-lgbm-supporting-data/transactions_train.parquet')
customers = pd.read_parquet('../input/hm-lgbm-supporting-data/customers.parquet')
articles = pd.read_parquet('../input/hm-lgbm-supporting-data/articles.parquet')

CPU times: user 2.74 s, sys: 4.9 s, total: 7.64 s
Wall time: 8.27 s


In [5]:
# The week to predict is the one right after the last week present in the data.
test_week = transactions.week.max() + 1
# Keep only the most recent 10 weeks. The explicit copy makes `transactions`
# an independent frame, so the later `transactions['purchased'] = 1` assignment
# writes to it unambiguously instead of to a slice of the full history.
transactions = transactions[transactions.week > transactions.week.max() - 10].copy()

## Generating candidates

In [6]:
class GenerateCandidates:
    """
    Build ranking candidates from a transactions frame.

    Two candidate sets are produced by ``generate_candidates``:

    1. Last-purchase candidates: every transaction re-dated to the customer's
       next active week (or to the test week for their latest active week).
    2. Bestseller candidates: the previous week's top-ranked articles attached
       to every (week, customer) pair and to every customer for the test week.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must provide the columns used below: 'customer_id', 'article_id',
        'price' and 'week'.
    articles, customers : pd.DataFrame
        Stored on the instance; not used by the candidate generation itself.
    test_week : int, optional
        Week to generate test candidates for. Defaults to the last week in
        ``transactions`` plus one.
    debug_customer_id : int, optional
        Customer whose rows are printed as a worked example. A missing
        customer prints ``None`` / an empty frame instead of raising.
    """

    def __init__(self, transactions, articles, customers, test_week=None,
                 debug_customer_id=272412481300040):
        self._transactions = transactions
        self._articles = articles
        self._customer = customers
        # Derive the test week from the instance's own data rather than from a
        # notebook-level global, so the class is usable in isolation.
        self._test_week = (
            transactions['week'].max() + 1 if test_week is None else test_week
        )
        self._debug_customer_id = debug_customer_id
        # Results are filled in by generate_candidates(); None until then.
        self._last_purchase_candidates = None
        self._bestseller_candidates = None
        self._bestseller_previous_week = None

    def get_candidates(self):
        """Return ``(last_purchase_candidates, bestseller_candidates)``."""
        return self._last_purchase_candidates, self._bestseller_candidates

    def get_bestsellers(self):
        """Return the per-week bestseller table (rank and mean price)."""
        return self._bestseller_previous_week

    def generate_candidates(self):
        '''
        1. Last purchase made by each customer
        2. Bestselling articles on weekly basis
        '''
        self._last_purchase_candidates = self.last_purchase_candidates()
        self._bestseller_candidates, self._bestseller_previous_week = self.best_seller_candidates()

    def last_purchase_candidates(self):
        '''
        Return a copy of the transactions where each row's week is replaced by
        the customer's next active week; rows from a customer's latest active
        week are moved to the test week.
        '''
        # Customer id -> array of distinct weeks with at least one purchase.
        c2weeks = self._transactions.groupby('customer_id')['week'].unique()
        print('#### Customer having positive interactions at week level ####')
        print(c2weeks)

        # Customer id -> {active week: next active week}. Weeks are sorted first
        # so the mapping does not depend on the row order of the input frame.
        c2weeks2shifted_weeks = {}
        for c_id, weeks in c2weeks.items():
            ordered = sorted(weeks)
            shifted = dict(zip(ordered, ordered[1:]))
            shifted[ordered[-1]] = self._test_week
            c2weeks2shifted_weeks[c_id] = shifted
        print('\n#### Customer having positive interactions interchanging subsequent purchases ####')
        # .get() so an absent example customer does not abort the whole run.
        print(c2weeks2shifted_weeks.get(self._debug_customer_id))

        candidates_last_purchase = self._transactions.copy()

        # Re-date every transaction using the instance's own frame so the
        # new column always lines up row-for-row with the copy above.
        candidates_last_purchase['week'] = [
            c2weeks2shifted_weeks[c_id][week]
            for c_id, week in zip(self._transactions['customer_id'],
                                  self._transactions['week'])
        ]
        print('\n#### Duplicating interactions for each customer to subsequent weeks ####')
        print(candidates_last_purchase[
            candidates_last_purchase['customer_id'] == self._debug_customer_id
        ])

        return candidates_last_purchase

    def best_seller_candidates(self):
        '''
        Return ``(candidates_bestsellers, bestsellers_previous_week)``.

        ``bestsellers_previous_week`` holds, per week, the top-ranked articles
        of the week before (dense rank by transaction count) with their mean
        price. ``candidates_bestsellers`` attaches those articles to every
        (week, customer) pair seen in the data and to every customer for the
        test week.
        '''
        mean_price = self._transactions.groupby(['week', 'article_id'])['price'].mean()
        print('\n#### Mean Price of articles on weekly basis ####')
        print(mean_price)

        # Per week: count purchases per article, dense-rank them (1 = most
        # sold) and keep the first 12 rows of each week.
        sales = self._transactions \
                    .groupby('week')['article_id'].value_counts() \
                    .groupby('week').rank(method='dense', ascending=False) \
                    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
        print('\n#### Best seller ranks for articles on weekly basis and based on no of transactions made each week ####')
        print(sales)

        bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
        # Shift forward by one so week w carries the bestsellers of week w - 1.
        bestsellers_previous_week.week += 1
        print('\n#### Best seller ranks for articles on weekly basis along with mean prices ####')
        print(bestsellers_previous_week)

        # One row per (week, customer): the remaining transaction columns are
        # taken from that customer's first row of the week.
        unique_transactions = self._transactions \
                                .groupby(['week', 'customer_id']) \
                                .head(1) \
                                .drop(columns=['article_id', 'price']) \
                                .copy()
        print('\n#### Unique transactions for each customer on weekly basis ####')
        print(unique_transactions)

        candidates_bestsellers = pd.merge(
                                    unique_transactions,
                                    bestsellers_previous_week,
                                    on='week'
                                )
        print('\n#### Best selling Candidates for each week and customer combination ####')
        print(candidates_bestsellers)

        # Test week: one row per customer, paired with the bestsellers of the
        # last observed week.
        test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
        test_set_transactions.week = self._test_week

        candidates_bestsellers_test_week = pd.merge(
                                                test_set_transactions,
                                                bestsellers_previous_week,
                                                on='week'
                                            )

        # Combine the historical and test-week bestseller candidates.
        candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
        candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
        print(f'\n#### Best selling Candidates for customer with ID {self._debug_customer_id} ####')
        print(candidates_bestsellers.loc[candidates_bestsellers.customer_id.isin([self._debug_customer_id])])

        return candidates_bestsellers, bestsellers_previous_week

1. Customer Last Purchase made  
2. Article Best Sellers

In [7]:
# Build both candidate sets once, then pull the results out of the generator:
# the two candidate frames and the per-week bestseller table (with ranks).
generate_candidate_instance = GenerateCandidates(transactions, articles, customers)
generate_candidate_instance.generate_candidates()
candidates_last_purchase, candidates_bestsellers = generate_candidate_instance.get_candidates()
bestsellers_previous_week = generate_candidate_instance.get_bestsellers()

#### Customer having positive interactions at week level ####
customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

#### Customer having positive interactions interchanging subsequent purchases ####
{95: 96, 96: 103, 103: 105}

#### Duplicating interactions for each customer to subsequent weeks ####
              t_dat      customer_id  article_id     price  sales_channel_id  \
29030503 2020-07-15  272412481300040   778064028  0.008458                 1   
29030504 2020-07-15  272412481300040   816592008  0.016932       

In [8]:
# Preview the last-purchase candidates (transactions with their week shifted forward).
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,105
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,105
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,105
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,105


In [9]:
# Preview the bestseller candidates attached to each (week, customer) pair.
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


In [10]:
# Preview the per-week bestseller table: rank and mean price per article.
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


## Generating Dynamic User Features

In [11]:
class UserFeatures(ABC):
    """Interface for feature builders that expose their result through ``get``."""

    @abstractmethod
    def get(self) -> pd.DataFrame:
        """
        Return the features as a DataFrame (customer_id -> features).
        """

class AggrFeatures(UserFeatures):
    """
    Basic per-customer aggregations of the transaction price
    (mean, max, min, median, sum and max - min).
    """
    def __init__(self, transactions_df):
        self.groupby_df = transactions_df.groupby('customer_id', as_index = False)

    def get(self):
        """
        Return a float32 DataFrame indexed by customer_id with one column per
        price aggregation.
        """
        # Named aggregation (output_name=(column, func)) replaces the dict
        # "renamer" form on a selected column, which pandas no longer accepts.
        output_df = (
            self.groupby_df
            .agg(
                mean_transactions=('price', 'mean'),
                max_transactions=('price', 'max'),
                min_transactions=('price', 'min'),
                median_transactions=('price', 'median'),
                sum_transactions=('price', 'sum'),
                max_minus_min_transactions=('price', lambda x: x.max() - x.min()),
            )
            .set_index('customer_id')
            .astype('float32')
        )
        return output_df

class CountFeatures(UserFeatures):
    """
    Per-customer count features derived from the transactions.

    Parameters
    ----------
    transactions_df : pd.DataFrame
        Uses the columns 'customer_id', 'article_id', 'price' and
        'sales_channel_id'.
    topk : int, optional
        Number of most frequently bought articles to create
        ``top_article_<i>`` count columns for.
    """
    def __init__(self, transactions_df, topk = 10):
        self.transactions_df = transactions_df
        self.topk = topk

    def get(self):
        """
        Return an int32 DataFrame indexed by customer_id.

        Counts are stored as int32: the previous int8 cast silently wrapped
        around for any customer with more than 127 transactions.
        """
        grouped = self.transactions_df.groupby('customer_id', as_index = False)
        #number of transactions, number of transactions with sales_channel_id == 2,
        #number of transactions priced above the customer's mean price
        a = (
            grouped
            .agg({
                'article_id': 'count',
                'price': lambda x: sum(np.array(x) > x.mean()),
                'sales_channel_id': lambda x: sum(x == 2),
            })
            .rename(columns = {
                'article_id': 'n_transactions',
                'price': 'n_transactions_bigger_mean',
                'sales_channel_id': 'n_online_articles'
            })
            .set_index('customer_id')
            .astype('int32')
        )
        #number of unique articles, number of transactions with sales_channel_id == 1
        b = (
            grouped
            .agg({
                'article_id': 'nunique',
                'sales_channel_id': lambda x: sum(x == 1),
            })
            .rename(columns = {
                'article_id': 'n_unique_articles',
                'sales_channel_id': 'n_store_articles',
            })
            .set_index('customer_id')
            .astype('int32')
        )
        #number of transactions per customer for each of the overall top-k articles
        topk_articles = self.transactions_df['article_id'].value_counts()[:self.topk].index
        customer_ids = self.transactions_df['customer_id']
        article_ids = self.transactions_df['article_id']
        # Each column is computed against its own article id. The previous
        # lambdas all closed over the same loop variable, so every column
        # ended up counting only the last article in topk_articles.
        c = pd.DataFrame({
            f'top_article_{i}': article_ids.eq(article).groupby(customer_ids).sum()
            for i, article in enumerate(topk_articles)
        }).astype('int32')
        c.index.name = 'customer_id'

        output_df = a.merge(b, on = ('customer_id')).merge(c, on = ('customer_id'))
        return output_df

class CustomerFeatures(UserFeatures):
    """
    All columns from the customers dataframe except 'postal_code', with
    missing values filled in.
    """
    def __init__(self, customers_df):
        self.customers_df = self._prepare_customers(customers_df)

    def _prepare_customers(self, customers_df):
        """
        Return a cleaned copy of ``customers_df``.

        Missing 'FN' / 'Active' become 0, missing 'club_member_status' becomes
        'UNKNOWN', missing 'age' becomes the mean age, and both the string
        'None' and missing values in 'fashion_news_frequency' become 'NONE'.
        """
        # Work on a copy so the caller's frame is not modified as a side effect.
        customers_df = customers_df.copy()
        customers_df['FN'] = customers_df['FN'].fillna(0).astype('int8')
        customers_df['Active'] = customers_df['Active'].fillna(0).astype('int8')
        customers_df['club_member_status'] = customers_df['club_member_status'].fillna('UNKNOWN')
        customers_df['age'] = customers_df['age'].fillna(customers_df['age'].mean()).astype('int8')
        customers_df['fashion_news_frequency'] = (
            customers_df['fashion_news_frequency']
            .replace('None', 'NONE')
            .replace(np.nan, 'NONE')
        )
        return customers_df

    def get(self):
        """Return the prepared customers indexed by customer_id, without 'postal_code'."""
        # Use the instance's own frame (the old code read an undefined global)
        # and a concrete list, which is what DataFrame column selection expects.
        feature_columns = [
            column for column in self.customers_df.columns if column != 'postal_code'
        ]
        output = self.customers_df[feature_columns].set_index('customer_id')
        return output

class ArticlesFeatures(UserFeatures):
    """
    Per-customer counts of transactions whose article attribute matches one of
    the selected "top" values, for every articles column containing 'name'.
    """
    def __init__(self, transactions_df, articles, topk = 10):
        self.merged_df = transactions_df.merge(articles, on = ('article_id'))
        self.articles = articles
        self.topk = topk

    def get(self):
        """
        Return the merged per-column count features indexed by customer_id,
        or None when no articles column contains 'name'.
        """
        output_df = None

        for col in tqdm.tqdm(self.articles.columns, desc = 'extracting features'):
            if 'name' not in col:
                continue
            column_features = self.aggregate_topk(self.merged_df, col, self.topk)
            if output_df is None:
                output_df = column_features
            else:
                output_df = output_df.merge(column_features, on = ('customer_id'))
        return output_df

    def return_value_counts(self, df, column_name, k):
        """
        Return the second element of the first ``k`` index entries of
        ``df[column_name].value_counts()``.
        """
        # NOTE(review): aggregate_topk passes a customer-level groupby here, so
        # these are the first k rows of the per-customer counts rather than the
        # k globally most frequent values - confirm this is intended.
        top_index = df[column_name].value_counts()[:k].index
        return [entry[1] for entry in top_index]

    def aggregate_topk(self, merged_df, column_name, k):
        """
        Return a DataFrame indexed by customer_id with one
        ``top_<column_name>_<i>`` column per selected value, holding how many
        of the customer's transactions have that value.
        """
        grouped_df_indx = merged_df.groupby('customer_id')
        topk_values = self.return_value_counts(grouped_df_indx, column_name, k)

        customer_ids = merged_df['customer_id']
        column = merged_df[column_name]
        # Each column is computed against its own value. The previous lambdas
        # all closed over the same loop variable, so every column counted only
        # the last value in topk_values.
        n_top_k = pd.DataFrame({
            f'top_{column_name}_{i}': column.eq(value).groupby(customer_ids).sum()
            for i, value in enumerate(topk_values)
        })
        n_top_k.index.name = 'customer_id'
        return n_top_k

class UserFeaturesCollector:
    """
    Collect several feature sets and merge them on customer_id.
    """
    @staticmethod
    def collect(features: Union[List[UserFeatures], List[str]], **kwargs) -> pd.DataFrame:
        """
        Build or load every entry of ``features`` and merge the results.

        Parameters
        ----------
        features : list
            ``UserFeatures`` instances (their ``get(**kwargs)`` is called) or
            file paths (read as CSV, falling back to parquet).
        **kwargs
            Forwarded to each ``UserFeatures.get``.

        Returns
        -------
        pd.DataFrame or None
            The feature sets merged on 'customer_id', or None when
            ``features`` is empty.

        Raises
        ------
        TypeError
            If an entry is neither a ``UserFeatures`` instance nor a string.
        """
        output_df = None

        for feature in tqdm.tqdm(features):
            if isinstance(feature, UserFeatures):
                feature_out = feature.get(**kwargs)
            elif isinstance(feature, str):
                try:
                    feature_out = pd.read_csv(feature)
                except Exception:
                    # Best-effort format detection: anything read_csv cannot
                    # handle is retried as parquet, whose own error then
                    # surfaces. Unlike a bare except, KeyboardInterrupt and
                    # SystemExit are not swallowed.
                    feature_out = pd.read_parquet(feature)
            else:
                # Previously such an entry silently reused the preceding
                # result, or raised a NameError on the first iteration.
                raise TypeError(
                    'features must contain UserFeatures instances or file paths, '
                    f'got {type(feature).__name__}'
                )

            if output_df is None:
                output_df = feature_out
            else:
                output_df = output_df.merge(feature_out, on = ('customer_id'))
        return output_df

In [12]:
# Compute the customer dynamic features only when the precomputed parquet file
# is not available in the supporting dataset.
# NOTE(review): when the file exists nothing is loaded in this cell, so
# `customer_features` is not defined by it. In the else branch the result is
# written to the working directory, not to the '../input/...' path checked here.
if os.path.exists(r'../input/hm-lgbm-supporting-data/customers_dynamic_features.parquet'):
    # customer_features = pd.read_parquet(r'../input/hm-lgbm-supporting-data/customers_dynamic_features.parquet')
    pass
else:
    customer_features = UserFeaturesCollector.collect([
        AggrFeatures(transactions),
        CountFeatures(transactions, 3),
        # CustomerFeatures(customers),
        # ArticlesFeatures(transactions, articles, 3),
    ])
    customer_features.to_parquet('customers_dynamic_features.parquet')

In [13]:
# customer_features.columns

## Combining transactions and candidates / negative examples

In [14]:
# Preview the filtered transactions before labelling.
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104


In [15]:
# Label every real transaction as a positive example.
transactions['purchased'] = 1

In [16]:
# Preview the transactions with the new `purchased` column.
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1
...,...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,104,1
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,104,1
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,104,1
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,104,1


In [17]:
# Stack the real purchases on top of both candidate sets. Candidate rows have
# no `purchased` value yet, so they come out of the concat as NaN.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Mark all candidate rows as negatives. Assigning the column back (instead of
# `data.purchased.fillna(..., inplace=True)`) updates `data` itself; the
# chained in-place form acts on a temporary under pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)

In [18]:
# Keep one row per (customer, article, week). The first occurrence is kept and
# real transactions were concatenated first, so a purchased row is kept over a
# candidate duplicate of it.
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [19]:
# Share of positive (purchased) rows among all rows.
data.purchased.mean()

0.13607582749165664

## Add bestseller information

In [20]:
# Attach the bestseller rank to every (week, article_id) pair. A left merge is
# used so rows whose article is not a bestseller for that week are kept, with
# a missing rank.
data = pd.merge(
                data,
                bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
                on=['week', 'article_id'],
                how='left'
        )

In [21]:
# Give non-bestsellers a large placeholder rank. This is done before filtering
# and by assigning the column back, so it never operates in place on a
# filtered slice (which pandas may treat as a temporary copy).
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)
# Drop the earliest week - presumably because it has no preceding week to
# derive bestseller candidates from (TODO confirm).
data = data[data.week != data.week.min()]

In [22]:
# Add the article and customer tables as extra columns. Left merges keep every
# row of `data` even when no matching article or customer row is found.
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [23]:
# Order rows by week, then customer, and renumber the index. train_model
# derives its group sizes by grouping on these same two columns.
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

## Add Customer Dynamic Features

In [24]:
# Left-join the precomputed customer dynamic features onto every row.
# reset_index() turns the stored index into a column before merging on
# customer_id.
data = pd.merge(data, pd.read_parquet(r'../input/hm-lgbm-supporting-data/customers_dynamic_features.parquet').reset_index(), how='left', on='customer_id')

In [25]:
# Preview the first ten rows of the assembled training table.
data.head(10)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,top_article_2,top_article_3,top_article_4,top_article_5,top_article_6,top_article_7,top_article_8,top_article_9,top_article_10,top_article_11
0,2020-07-26,28847241659200,887770001,0.016932,1,96,1.0,999.0,887770,727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-07-18,28847241659200,762846001,0.025407,1,96,0.0,999.0,762846,472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-07-18,28847241659200,829308001,0.033881,1,96,0.0,999.0,829308,11402,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-07-26,28847241659200,760084003,0.025094,1,96,0.0,1.0,760084,1134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-07-26,28847241659200,866731001,0.024919,1,96,0.0,2.0,866731,3609,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2020-07-26,28847241659200,600886001,0.02298,1,96,0.0,3.0,600886,1424,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2020-07-26,28847241659200,706016001,0.033197,1,96,0.0,4.0,706016,172,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2020-07-26,28847241659200,372860002,0.013193,1,96,0.0,5.0,372860,19652,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2020-07-26,28847241659200,610776002,0.008318,1,96,0.0,6.0,610776,46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2020-07-26,28847241659200,877278002,0.025036,1,96,0.0,7.0,877278,11255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Clearing Variables

In [26]:
import gc
# Drop the names of the large intermediate frames; everything needed from here
# on has already been merged into `data`.
del transactions, candidates_last_purchase, candidates_bestsellers, articles, customers

In [27]:
# Run a garbage-collection pass right after deleting the names above.
gc.collect()

105

## Splitting to train and test sets / Model Train

In [28]:
# Keyword arguments passed to LGBMRanker in train_model.
optimized_params = dict(
    # objective / evaluation
    objective="lambdarank",
    metric="ndcg",
    # runtime and booster settings
    num_threads=4,
    boosting_type="dart",
    importance_type="gain",
    verbose=-1,
    # model size and learning rate
    n_estimators=30,
    learning_rate=0.13148686772117982,
    num_leaves=60,
    max_depth=9,
    min_data_in_leaf=10,
    # regularisation
    lambda_l1=55,
    lambda_l2=95,
    min_gain_to_split=4.720196844504682,
    # row / feature subsampling
    bagging_fraction=0.8,
    bagging_freq=1,
    feature_fraction=0.8,
    seed=42,
)

In [29]:
from lightgbm.sklearn import LGBMRanker

def train_model(data=None, columns_to_use=None):
    """
    Fit an LGBMRanker on every week except ``test_week``.

    Parameters
    ----------
    data : pd.DataFrame
        Rows with at least 'week', 'customer_id', 'article_id',
        'sales_channel_id', 'purchased' and the feature columns.
    columns_to_use : list of str, optional
        Feature columns fed to the ranker.

    Returns
    -------
    ranker : LGBMRanker
        The fitted model (built with ``optimized_params``).
    test : pd.DataFrame
        The ``test_week`` rows, de-duplicated on
        (customer_id, article_id, sales_channel_id).

    Raises
    ------
    TypeError
        If ``data`` is None.
    """
    if data is None:
        raise TypeError('train_model() requires a DataFrame for `data`')
    # A None default avoids sharing one mutable list object across calls.
    columns_to_use = [] if columns_to_use is None else list(columns_to_use)

    # The group sizes below come out in sorted (week, customer_id) order, so
    # the training rows must be in that order too. Sorting here removes the
    # dependency on the caller having pre-sorted `data`.
    train = data[data.week != test_week].sort_values(['week', 'customer_id'])
    test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

    # One ranking group per (week, customer): number of candidate rows in each.
    train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

    train_X = train[columns_to_use]
    train_y = train['purchased']

    ranker = LGBMRanker(
                random_state=42,
                **optimized_params
            )
    ranker = ranker.fit(
                train_X,
                train_y,
                group=train_baskets,
            )
    return ranker, test

In [30]:
# Feature columns fed to the ranker, in the order used for training/prediction.
columns_to_use = [
    # article columns
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    # customer columns
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    # candidate information
    'bestseller_rank',
    # Additional Customer dynamic features
    'mean_transactions',
    'max_transactions',
    'min_transactions',
    'median_transactions',
    'sum_transactions',
    'max_minus_min_transactions',
    'n_transactions',
    'n_transactions_bigger_mean',
    'n_online_articles',
    'n_unique_articles',
    'n_store_articles',
    # top_article_0 ... top_article_11
    *[f'top_article_{i}' for i in range(12)],
]

In [31]:
# Uncomment to train the ranking model
trained_ranker, test = train_model(data.tail(10000000), columns_to_use)

for i in trained_ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], trained_ranker.feature_importances_[i]/trained_ranker.feature_importances_.sum())

bestseller_rank 0.850483774865963
article_id 0.044663004069361655
product_type_no 0.027092853018018122
garment_group_no 0.019418875431771784
section_no 0.01430503031065723
perceived_colour_value_id 0.009123099641804215
colour_group_code 0.008463891955858902
department_no 0.00781839175117006
perceived_colour_master_id 0.006128802083123934
graphical_appearance_no 0.00601359512211231
index_group_no 0.0038261476392288042
index_code 0.0026625341109299846
max_transactions 0.0
FN 0.0
Active 0.0
club_member_status 0.0
fashion_news_frequency 0.0
age 0.0
postal_code 0.0
mean_transactions 0.0
top_article_11 0.0
top_article_10 0.0
median_transactions 0.0
top_article_9 0.0
top_article_8 0.0
top_article_7 0.0
top_article_6 0.0
top_article_5 0.0
top_article_4 0.0
top_article_3 0.0
top_article_2 0.0
top_article_1 0.0
top_article_0 0.0
n_store_articles 0.0
n_unique_articles 0.0
n_online_articles 0.0
n_transactions_bigger_mean 0.0
n_transactions 0.0
max_minus_min_transactions 0.0
sum_transactions 0.0
mi

## Calculate predictions

In [32]:
%time

test['preds'] = trained_ranker.predict(test[columns_to_use])

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.34 µs


## Create intermediate submissions

In [33]:
# Load the sample submission; its customer_id column drives the prediction loop below.
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [34]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 6.15 s, sys: 246 ms, total: 6.39 s
Wall time: 6.4 s


In [35]:
# Render each customer's article ids as '0'-prefixed strings joined by spaces
# and store them in the submission's prediction column.
preds = [' '.join(f'0{p}' for p in ps) for ps in preds]
sub['prediction'] = preds

In [36]:
# Write the predictions built above to '<sub_name>.csv.gz' without the index column.
sub_name = 'intermediate_basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

## Create final submissions

In [37]:
# NOTE(review): the final submission is a re-export of a previously saved
# basic_model_submission.csv from the supporting dataset, not of the `sub`
# frame built in this notebook - confirm this is intended.
sub_name = 'final_model_submission'
final_sub = pd.read_csv(r'../input/hm-lgbm-supporting-data/basic_model_submission.csv')
final_sub.to_csv(f'{sub_name}.csv.gz', index=False)