In [1]:
import pandas as pd
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# Step up one directory before the import below; presumably the local `model`
# module lives in the parent directory -- TODO confirm.
os.chdir('../')
# NOTE(review): the star import hides which names come from `model`. Later cells
# call train_model and make_submission, which are not defined in the visible cells.
from model import *
# Move to the data directory so that the relative paths used by the loading
# cells ('compressed_data/...', 'candidates/...') resolve.
# NOTE(review): both chdir calls are relative, so re-running this cell without a
# kernel restart moves the working directory again.
os.chdir('../../data/')

In [2]:
# Read the pickled transaction and customer tables as stored.
transactions = pd.read_pickle('compressed_data/transactions_train.pkl')
customers = pd.read_pickle('compressed_data/customers.pkl')

# From the article table keep only the id and the coded attribute columns
# listed below.
articles = pd.read_pickle('compressed_data/articles.pkl')
articles = articles[[
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]]

# Number weeks backwards from the latest transaction date: the most recent
# 7-day window is week 104 and each earlier 7-day window is one lower.
transactions['week'] = (
    104 - (transactions['t_dat'].max() - transactions['t_dat']).dt.days // 7
)

print(
    'First week num: ', transactions['week'].min(),
    '\nLast week num: ', transactions['week'].max(),
    '\n'
)

First week num:  0 
Last week num:  104 



In [3]:
# The week to predict is the one right after the last week in the training data.
test_week = transactions['week'].max() + 1

# Keep only the 10 most recent training weeks. `test_week - 11` equals
# `last training week - 10`, so the strict comparison keeps exactly 10 weeks.
transactions = transactions.loc[transactions['week'] > test_week - 11]

In [4]:
# Display the column index of the filtered transactions table.
transactions.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'week'],
      dtype='object')

In [5]:
# Load the pre-computed "radek" candidate tables from CSV: last-purchase
# candidates, bestseller candidates, and the per-week bestseller table
# (it supplies the `bestseller_rank` feature and the latest-week bestseller list
# used in the submission cell).
candidates_last_purchase = pd.read_csv('candidates/radek_last_purchase.csv')
candidates_bestsellers = pd.read_csv('candidates/radek_bestsellers.csv')
bestsellers_previous_week = pd.read_csv('candidates/radek_bestsellers_previous_week.csv')

# Load my candidates
## Seasonal candidates (best k = 20)

In [6]:
# Number of seasonal bestsellers offered to every customer.
k_seasonal = 20

seasonal_bestsellers = pd.read_csv('candidates_200_ranks/seasonal_candidates_2019.csv')

# Keep the first k seasonal articles that also occur in the recent transactions.
# `.copy()` makes this an independent frame: a helper column is assigned to it
# below, and assigning into a filter/head slice raises SettingWithCopyWarning.
# assumes the CSV is already ordered by seasonal rank -- TODO confirm
seasonal_candidates_filtered = seasonal_bestsellers[
    seasonal_bestsellers.article_id.isin(transactions.article_id.unique())
].head(k_seasonal).copy()

# One row per (customer, week) seen in the recent transactions, plus one row per
# such customer for the test week.
seasonal_candidates = transactions[['customer_id', 'week']].drop_duplicates()
test_candidates = customers[customers.customer_id.isin(transactions.customer_id.unique())][['customer_id']].drop_duplicates()
test_candidates['week'] = test_week
seasonal_candidates = pd.concat([seasonal_candidates, test_candidates], ignore_index=True)

# Cross join via a constant key: every (customer, week) row is paired with every
# selected seasonal article.
seasonal_candidates['key'] = 1
seasonal_candidates_filtered['key'] = 1

seasonal_candidates = seasonal_candidates.merge(seasonal_candidates_filtered, on='key').drop(columns='key')

# Placeholder transaction fields so the candidate rows share the transaction
# columns when they are concatenated in prepare_data.
seasonal_candidates['t_dat'] = '2020-07-15'
seasonal_candidates['price'] = 0
seasonal_candidates['sales_channel_id'] = 2
seasonal_candidates.drop(columns=['seasonal_bestseller_rank', 'year'], inplace=True)

## Similar not bought candidates (best k = 120)

In [7]:
# Number of "similar, not bought" candidates kept per (week, customer).
k_snb = 120

candidates_similar_not_bought = pd.read_csv('candidates_200_ranks/sim_not_bought.csv')

# Take the first k rows of every (week, customer) group in file order and drop
# the bookkeeping columns that are not needed downstream.
# assumes the CSV rows are already ordered best-first within each group -- TODO confirm
top_k_snb_weekly = (
    candidates_similar_not_bought
    .groupby(['week', 'customer_id'])
    .head(k_snb)
    .drop(columns=['strategy', 'similarity_score'])
)

# Release the full candidate table before adding columns to the subset.
del candidates_similar_not_bought
gc.collect()

# Placeholder transaction fields so these rows share the transaction columns.
top_k_snb_weekly['sales_channel_id'] = 2
top_k_snb_weekly['price'] = 0
top_k_snb_weekly['t_dat'] = '2020-07-15'
top_k_snb_weekly = top_k_snb_weekly[
    [c for c in top_k_snb_weekly.columns if c not in ('t_dat', 'price', 'sales_channel_id')]
    + ['t_dat', 'price', 'sales_channel_id']
] if list(top_k_snb_weekly.columns[-3:]) != ['t_dat', 'price', 'sales_channel_id'] else top_k_snb_weekly

## Not interacted with candidates
### Colour group (best k = 60)

In [8]:
# Number of colour-group "not interacted with" candidates kept per (week, customer).
k_niw_colour = 20

candidates_niw_loaded = pd.read_csv('candidates_200_ranks/niw_candidates_colour_group_name.csv')

# Rank lookup per (week, article), renamed so it becomes its own feature column.
# NOTE(review): this frame keeps the week as stored in the CSV, while the
# candidate rows below are shifted by +1 -- confirm the rank feature is meant
# to join on the unshifted week.
niw_ranks_colour = (
    candidates_niw_loaded[['week', 'article_id', 'not_interacted_weekly_rank']]
    .drop_duplicates()
    .rename(columns={'not_interacted_weekly_rank': 'niw_rank_colour'})
)

# First k rows of every (week, customer) group in file order, without the
# bookkeeping columns.
candidates_niw_colour = (
    candidates_niw_loaded
    .groupby(['week', 'customer_id'])
    .head(k_niw_colour)
    .drop(columns=['strategy', 'not_interacted_weekly_rank'])
)

del candidates_niw_loaded
gc.collect()

# Move every candidate row to the following week.
candidates_niw_colour['week'] += 1

# Placeholder transaction fields so these rows share the transaction columns.
candidates_niw_colour['t_dat'] = '2020-07-15'
candidates_niw_colour['price'] = 0
candidates_niw_colour['sales_channel_id'] = 2


### Garment group (best k = 30/40/50) 

In [9]:
# Number of garment-group "not interacted with" candidates kept per (week, customer).
k_niw_garment = 20

candidates_niw_loaded = pd.read_csv('candidates_200_ranks/niw_candidates_garment_group_name.csv')

# Rank lookup per (week, article), renamed so it becomes its own feature column.
# NOTE(review): this frame keeps the week as stored in the CSV, while the
# candidate rows below are shifted by +1 -- confirm the rank feature is meant
# to join on the unshifted week.
niw_ranks_garment = (
    candidates_niw_loaded[['week', 'article_id', 'not_interacted_weekly_rank']]
    .drop_duplicates()
    .rename(columns={'not_interacted_weekly_rank': 'niw_rank_garment'})
)

# First k rows of every (week, customer) group in file order, without the
# bookkeeping columns.
candidates_niw_garment = (
    candidates_niw_loaded
    .groupby(['week', 'customer_id'])
    .head(k_niw_garment)
    .drop(columns=['strategy', 'not_interacted_weekly_rank'])
)

del candidates_niw_loaded
gc.collect()

# Move every candidate row to the following week.
candidates_niw_garment['week'] += 1

# Placeholder transaction fields so these rows share the transaction columns.
candidates_niw_garment['t_dat'] = '2020-07-15'
candidates_niw_garment['price'] = 0
candidates_niw_garment['sales_channel_id'] = 2


### Section (best k = 40)

In [10]:
# Number of section "not interacted with" candidates kept per (week, customer).
k_niw_section = 20

candidates_niw_loaded = pd.read_csv('candidates_200_ranks/niw_candidates_section_name.csv')

# Rank lookup per (week, article), renamed so it becomes its own feature column.
# NOTE(review): this frame keeps the week as stored in the CSV, while the
# candidate rows below are shifted by +1 -- confirm the rank feature is meant
# to join on the unshifted week.
niw_ranks_section = (
    candidates_niw_loaded[['week', 'article_id', 'not_interacted_weekly_rank']]
    .drop_duplicates()
    .rename(columns={'not_interacted_weekly_rank': 'niw_rank_section'})
)

# First k rows of every (week, customer) group in file order, without the
# bookkeeping columns.
candidates_niw_section = (
    candidates_niw_loaded
    .groupby(['week', 'customer_id'])
    .head(k_niw_section)
    .drop(columns=['strategy', 'not_interacted_weekly_rank'])
)

del candidates_niw_loaded
gc.collect()

# Move every candidate row to the following week.
candidates_niw_section['week'] += 1

# Placeholder transaction fields so these rows share the transaction columns.
candidates_niw_section['t_dat'] = '2020-07-15'
candidates_niw_section['price'] = 0
candidates_niw_section['sales_channel_id'] = 2


## Submission Loop -- check best k for kaggle

In [11]:
# Feature columns handed to prepare_data (as cols_to_use) and to train_model.
columns_to_use = [
    # Article attributes: the same columns selected from the articles table
    # in the loading cell.
    'article_id', 
    'product_type_no', 
    'graphical_appearance_no', 
    'colour_group_code', 
    'perceived_colour_value_id',
    'perceived_colour_master_id', 
    'department_no', 
    'index_code',
    'index_group_no', 
    'section_no', 
    'garment_group_no', 
    # Presumably columns of the customers table -- TODO confirm against its schema.
    'FN', 
    'Active',
    'club_member_status', 
    'fashion_news_frequency', 
    'age', 
    'postal_code', 
    # Rank features merged in by prepare_data: previous-week bestseller rank,
    # seasonal bestseller rank, and the three "not interacted with" ranks.
    'bestseller_rank',
    'seasonal_bestseller_rank',
    'niw_rank_colour',
    'niw_rank_garment',
    'niw_rank_section'
]

# Parameters passed to train_model; the keys look like LightGBM ranker
# settings -- presumably forwarded to an LGBM ranker inside `model` (confirm).
# NOTE(review): n_estimators is 1, i.e. a single boosting round -- confirm this
# is intended and not a leftover from a quick test run.
model_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'dart',
    'n_estimators': 1,
    'importance_type': 'gain'
}

In [12]:
def prepare_data(t_df, candidates, features, cols_to_use, test_week=105, bestsellers_prev_week=None):
    '''
    Assemble the ranker's train/test matrices from transactions, candidates and features.

    Real transactions are labelled ``purchased = 1`` and stacked with the candidate
    rows (labelled ``purchased = 0``). Duplicate (customer_id, article_id, week) rows
    are dropped keeping the first occurrence, so a real transaction wins over a
    candidate for the same triple. Rows from the earliest week present are always
    removed, features are left-joined, and the result is split on ``test_week``.

    Neither ``t_df`` nor the ``candidates`` / ``features`` sequences are modified.

    Parameters
    ----------
    t_df : pd.DataFrame
        DataFrame with transactions. Must contain customer_id, article_id, week
        and sales_channel_id.
    candidates : list
        List of DataFrames with candidates (same key columns as ``t_df``).
    features : list
        List of DataFrames with features. Each must contain ``article_id`` or
        ``customer_id`` and may additionally contain ``week``; the merge keys are
        chosen from those columns, with ``article_id`` taking precedence when both
        id columns are present. Frames are merged from the last list element to
        the first.
    cols_to_use : list
        List of columns to use for training.
    test_week : int, default 105
        Week to use as test data.
    bestsellers_prev_week : pd.DataFrame, optional
        DataFrame with week, article_id and bestseller_rank. When given, the rank
        is left-joined on (week, article_id) and missing ranks are filled with 999.

    Returns
    -------
    train_X : pd.DataFrame
        Training features (``cols_to_use`` columns of the non-test weeks).
    train_y : pd.Series
        Training labels (the ``purchased`` column).
    test_X : pd.DataFrame
        Test data features.
    test : pd.DataFrame
        Test data rows, de-duplicated on (customer_id, article_id, sales_channel_id).
    train_baskets : np.array
        Number of training rows in each (week, customer_id) group, in sorted
        group order -- the group sizes for the ranker.

    Raises
    ------
    ValueError
        If a feature DataFrame has neither ``article_id`` nor ``customer_id``.
    KeyError
        If any column in ``cols_to_use`` is missing from the prepared data.
    AssertionError
        If the test split contains rows labelled as purchased.
    '''
    # Label the real transactions on a copy so the caller's frame is not modified.
    data = pd.concat([t_df.assign(purchased=1)] + list(candidates))
    # Candidate rows have no label yet; assign the filled column back explicitly
    # (a chained `data.purchased.fillna(..., inplace=True)` does not update
    # `data` when pandas Copy-on-Write is active).
    data['purchased'] = data['purchased'].fillna(0)
    data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

    del t_df, candidates
    gc.collect()

    print('Percentage of real transactions: ', data.purchased.mean())

    if bestsellers_prev_week is not None:
        model_data = pd.merge(
            data,
            bestsellers_prev_week[['week', 'article_id', 'bestseller_rank']],
            on=['week', 'article_id'],
            how='left'
        )
        del bestsellers_prev_week
        gc.collect()
    else:
        # No extra copy needed: `data` is already a fresh frame owned by this function.
        model_data = data

    del data
    gc.collect()

    # Remove first week of data, as we don't have bestseller rank for it
    # (week was shifted by one) and fill missing values with 999 -- really bad rank
    model_data = model_data[model_data.week != model_data.week.min()]
    model_data.fillna({'bestseller_rank':999}, inplace=True)

    print('Mergining features...')
    # Walk the list back-to-front (the order the original pop() loop used)
    # without emptying the caller's list.
    for feature_df in reversed(features):
        has_week = 'week' in feature_df.columns

        if 'article_id' in feature_df.columns:
            id_col = 'article_id'
        elif 'customer_id' in feature_df.columns:
            id_col = 'customer_id'
        else:
            raise ValueError('Feature DataFrame should have at least one of following columns: week, article_id, customer_id.')

        merge_keys = ['week', id_col] if has_week else id_col
        model_data = pd.merge(model_data, feature_df, on=merge_keys, how='left')

    del features
    gc.collect()

    print('Done.')
    print('Sorting data...')
    # Rows of one (week, customer) must be contiguous so they line up with train_baskets.
    model_data.sort_values(['week', 'customer_id'], inplace=True)
    model_data.reset_index(drop=True, inplace=True)
    print('Done.')
    print('Preparing for training...')
    train = model_data[model_data.week != test_week]
    test = model_data[model_data.week == test_week]\
        .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])\
        .copy()

    del model_data
    gc.collect()

    # Group sizes per (week, customer) -- tells the ranker which rows belong to one query.
    train_baskets = train.groupby(['week', 'customer_id'])['article_id']\
        .count()\
        .values

    # Fail with a clear KeyError naming the missing columns.
    missing_cols = [col for col in cols_to_use if col not in train.columns]
    if missing_cols:
        raise KeyError(f'Columns missing from prepared data: {missing_cols}')

    train_X = train[cols_to_use]
    train_y = train['purchased']

    test_X = test[cols_to_use]

    # Element-wise check: also holds for an empty test split, where mean() would be NaN.
    assert (test['purchased'] == 0).all(), 'Test data should not contain any actual purchases!'

    print('Done.')

    return train_X, train_y, test_X, test, train_baskets

In [13]:
# Article ids listed for the highest week number in the bestseller table.
# Taken before prepare_data runs and before the table is deleted below; passed
# to make_submission (presumably as fallback recommendations -- confirm in `model`).
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week['week'] == bestsellers_previous_week['week'].max()]['article_id'].tolist()

# Build the train/test matrices from the transactions, all candidate sets and
# all feature tables.
train_X, train_y, test_X, test, train_baskets = prepare_data(
    transactions,
    candidates=[candidates_last_purchase, candidates_bestsellers, seasonal_candidates, top_k_snb_weekly, candidates_niw_colour, candidates_niw_garment, candidates_niw_section], 
    features=[customers, articles, seasonal_bestsellers[['article_id', 'seasonal_bestseller_rank']], niw_ranks_colour, niw_ranks_garment, niw_ranks_section], 
    cols_to_use=columns_to_use,
    bestsellers_prev_week=bestsellers_previous_week
    )

# Drop the notebook-level references to the candidate and rank frames so their
# memory can be reclaimed before training.
del candidates_last_purchase, bestsellers_previous_week, candidates_bestsellers, seasonal_candidates, top_k_snb_weekly, candidates_niw_colour, candidates_niw_garment, candidates_niw_section, niw_ranks_colour, niw_ranks_garment, niw_ranks_section
gc.collect()

# Train the ranker; train_model comes from the `model` star import.
# show_importance=10 presumably prints the top 10 feature importances -- confirm.
ranker = train_model(
    train_X, 
    train_y, 
    train_baskets, 
    model_params, 
    columns_to_use, 
    show_importance=10
)

gc.collect()

# Score the test candidates and write the submission (make_submission also
# comes from `model`; the last argument is the submission name).
make_submission(customers, test, test_X, ranker, bestsellers_last_week, f'submission_all_best_with_radek')

Percentage of real transactions:  0.010623573219747605
Mergining features...


: 

In [None]:
# Shell call to the Kaggle CLI: list this competition's submissions and keep
# only the first three output lines (header, separator, first listed entry).
!kaggle competitions submissions -c h-and-m-personalized-fashion-recommendations | head -n 3

fileName                                  date                 description                        status    publicScore  privateScore  
----------------------------------------  -------------------  ---------------------------------  --------  -----------  ------------  
submission_all_best_with_radek.csv.gz     2023-12-16 20:56:36  submission_all_best_with_radek     complete  0.00974      0.0097        
