In [None]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

In [1]:
import os
import pandas as pd
import json
from tqdm import tqdm
import time
import numpy as np
import glob
import gc
import cudf

In [2]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import dask as dask, dask_cudf
import rmm
from numba import cuda
import shutil

In [3]:
DATA_PATH = '/workspace'
PREDS_DOWNLOAD_PATH = "/workspace/models_predictions/"

### Auxiliary functions

In [4]:
def softmax(x):
    """Return the softmax of the scores in ``x``, normalised over all its elements.

    The maximum score is subtracted before exponentiating; this leaves the
    result unchanged but keeps ``np.exp`` from overflowing on large scores.
    """
    shifted_scores = x - np.max(x)
    exp_scores = np.exp(shifted_scores)
    normaliser = exp_scores.sum()
    return exp_scores / normaliser

In [5]:
def save_exploded_predictions(base_path, preds_fname, output_folder):
    """Explode per-session prediction lists into one row per predicted item and save them.

    Every file below ``base_path`` whose path ends with ``preds_fname`` is
    processed. Each file must sit at ``<algo>/<jobid>/bag_<n>/fold_<n>/<file>``
    relative to ``base_path``; those path components are stored in the
    ``algorithm``, ``jobid``, ``bag`` and ``fold`` columns of the output.

    For each file, the logits of a session are turned into probabilities with
    ``softmax`` and multiplied by ``log2(number of predictions)``, then the
    ``pred_item_ids`` / ``pred_item_probs`` lists are exploded so that every
    row holds a single (session, item, probability) triple. The result is
    written to
    ``<output_folder>/preds_exploded_<algo>_bag-<bag>_fold-<fold>.parquet``.

    Args:
        base_path: Root folder that is searched recursively. A trailing path
            separator is optional.
        preds_fname: Suffix (normally the file name) of the prediction files.
        output_folder: Existing folder that receives the exploded parquet files.

    Raises:
        ValueError: If a matching file is not exactly four directories below
            ``base_path``, or a ``bag_<n>`` / ``fold_<n>`` folder name does not
            carry an integer suffix.
    """
    # relpath (instead of str.replace) keeps this working whether or not
    # base_path ends with a separator and never touches later path components.
    pred_files_paths = sorted({
        os.path.relpath(f, base_path)
        for f in glob.iglob(os.path.join(base_path, '**'), recursive=True)
        if f.endswith(preds_fname)
    })
    print('Files to process:')
    for f in pred_files_paths:
        print(f)

    print()
    print('Start processing files...')

    for path in pred_files_paths:

        print('Processing: ', path)
        path_parts = path.split(os.sep)
        if len(path_parts) != 5:
            raise ValueError(
                'Expected <algo>/<jobid>/bag_<n>/fold_<n>/<file> below the base path, got: {}'.format(path)
            )
        algo, jobid, bag, fold, _ = path_parts
        df = pd.read_parquet(os.path.join(base_path, path),
                             columns=['session_id_hash', 'pred_item_ids', 'pred_item_logits'])
        print('Sessions', len(df))

        print('Min preds length: ', df['pred_item_logits'].apply(len).min())
        # Sessions without any prediction get a dummy item id 0 ...
        df['pred_item_ids'] = df['pred_item_ids'].apply(lambda x: x if len(x) > 0 else [0])
        # ... with probability 0. Otherwise apply softmax and weight it by
        # log2(number of predictions), because the softmax of a shorter list
        # yields much higher individual probabilities.
        df['pred_item_probs'] = df['pred_item_logits'].apply(
            lambda x: softmax(x) * np.log2(len(x)) if len(x) > 0 else [0.])

        # Exploding session predictions: one row per (session, predicted item).
        # `level_1` is the position inside the original list, added by stack().
        df = (df.set_index('session_id_hash')[['pred_item_ids', 'pred_item_probs']]
                  .apply(lambda x: x.apply(pd.Series).stack())
                  .reset_index()
                  # Keyword form: pandas 2.0 no longer accepts `axis` positionally.
                  .drop(columns='level_1'))

        df['algorithm'] = algo
        df['jobid'] = jobid
        bag = int(bag.split('_')[1])
        df['bag'] = bag
        fold = int(fold.split('_')[1])
        df['fold'] = fold

        print('Predictions', len(df))
        output_file_path = os.path.join(output_folder, f'preds_exploded_{algo}_bag-{bag}_fold-{fold}.parquet')
        print('Saving to', output_file_path)
        df.to_parquet(output_file_path, row_group_size=10000)
        print()

In [6]:
def ensemble_preds(df, alg_weights_df, group_by_fold):
    """Sum the algorithm-weighted probabilities of every (session, item) pair.

    ``df`` is joined to ``alg_weights_df`` on ``algorithm``, each
    ``pred_item_probs`` value is multiplied by the matching ``alg_weight``,
    and the products are summed per ``session_id_hash`` / ``pred_item_ids``
    (and per ``fold`` as well when ``group_by_fold`` is true).

    Returns a frame with the grouping columns plus ``pred_item_scores_weighted``.
    """
    keys = ['session_id_hash', 'pred_item_ids'] + (['fold'] if group_by_fold else [])

    weighted_df = df.merge(alg_weights_df, on='algorithm')
    weighted_df['pred_item_scores_weighted'] = (
        weighted_df['pred_item_probs'] * weighted_df['alg_weight']
    )

    summed_scores = weighted_df.groupby(keys)['pred_item_scores_weighted'].sum()
    return summed_scores.reset_index()

In [7]:
def group_ensembled_session_preds(df, group_by_fold):
    """Collect the scored items of each session into lists.

    Rows are first sorted by ``session_id_hash`` and
    ``pred_item_scores_weighted`` in descending order, then grouped by
    session (and by ``fold`` when ``group_by_fold`` is true). The item ids
    and their scores are aggregated into two lists per group.
    """
    keys = ['session_id_hash', 'fold'] if group_by_fold else ['session_id_hash']

    ranked_df = df.sort_values(
        ['session_id_hash', 'pred_item_scores_weighted'], ascending=False
    )
    list_aggregations = {'pred_item_ids': list, 'pred_item_scores_weighted': list}
    grouped_df = ranked_df.groupby(keys).agg(list_aggregations)
    return grouped_df.reset_index()

In [8]:
# Ensemble weight of each algorithm, as (algorithm name, weight) pairs.
# ensemble_preds() joins this table to the exploded predictions on the
# 'algorithm' column and multiplies every prediction probability by the
# matching 'alg_weight'.
WEIGHT_ALGORITHMS = [
    #Ensembling v1
    ('xlnet', 0.6),
    ('xlnet_freqcap', 0.55),
    ('xlnet_search', 0.50),
    ('transfoxl', 0.25),
    #Ensembling v2
    ('xlnet_freqcap_m2', 0.85),
    ('xlnet_freqcap_m3', 0.65),
    ('xlnet_m2', 0.95),
    ('xlnet_m3', 0.80),
    ('xlnet_search_m2', 1.0),
    ('xlnet_search_m3', 0.75),
    ('transfoxl_m2', 0.35),
    ('transfoxl_m3', 0.30),
]
# Held as a cuDF frame so it can be merged with the cuDF-based prediction frames.
alg_weights_df = cudf.DataFrame(WEIGHT_ALGORITHMS, columns=['algorithm', 'alg_weight'])
alg_weights_df

Unnamed: 0,algorithm,alg_weight
0,xlnet,0.6
1,xlnet_freqcap,0.55
2,xlnet_search,0.5
3,transfoxl,0.25
4,xlnet_freqcap_m2,0.85
5,xlnet_freqcap_m3,0.65
6,xlnet_m2,0.95
7,xlnet_m3,0.8
8,xlnet_search_m2,1.0
9,xlnet_search_m3,0.75


## Setup GPUs

In [9]:
CUDA_VISIBLE_DEVICES="0,1"
# Max percentage of GPU memory usage. You may want to reduce this limit to a more conservative percentage if you have issues
CUDA_SPILL_LIMIT_PERC = 0.6 

In [10]:
os.environ["CUDA_VISIBLE_DEVICES"]=CUDA_VISIBLE_DEVICES

In [11]:
#Uses a RAID folder if it is available (DGX), if not it uses the /tmp folder for DASK workspace
if os.path.exists('/raid'):
    local_directory = '/raid'
else:    
    local_directory = '/tmp'
    
dask_workdir = os.path.join(local_directory, 'dask-workdir')    
print('Dask dir:', dask_workdir)

Dask dir: /tmp/dask-workdir


In [12]:
# Make sure we have a clean worker space for Dask
if os.path.isdir(dask_workdir):
    shutil.rmtree(dask_workdir)
os.mkdir(dask_workdir)

In [13]:
def _pynvml_mem_size(kind="total", index=0):
    import pynvml

    pynvml.nvmlInit()
    size = None
    if kind == "free":
        size = int(pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(index)).free)
    elif kind == "total":
        size = int(pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(index)).total)
    else:
        raise ValueError("{0} not a supported option for device_mem_size.".format(kind))
    pynvml.nvmlShutdown()
    return size


def device_mem_size(kind="total"):
    """Return the free or total memory, in bytes, of the current CUDA device.

    The value comes from ``cuda.current_context().get_memory_info()``. If that
    call raises ``NotImplementedError``, the *total* memory reported by NVML
    for device 0 is returned instead, for both kinds; a warning is emitted
    when ``"free"`` was requested.

    Args:
        kind: ``"free"`` or ``"total"``.

    Raises:
        ValueError: If ``kind`` is neither ``"free"`` nor ``"total"``.
    """
    # Imported here because the notebook's import cells do not provide it;
    # without it the fallback branch below failed with a NameError.
    import warnings

    if kind not in ["free", "total"]:
        raise ValueError("{0} not a supported option for device_mem_size.".format(kind))
    try:
        # get_memory_info() returns (free, total).
        mem_info = cuda.current_context().get_memory_info()
        return int(mem_info[0] if kind == "free" else mem_info[1])
    except NotImplementedError:
        if kind == "free":
            # Not using NVML "free" memory, because it will not include RMM-managed memory
            warnings.warn("get_memory_info is not supported. Using total device memory from NVML.")
        return _pynvml_mem_size(kind="total", index=0)


def get_rmm_size(size):
    """Round ``size`` down to a multiple of 256."""
    whole_blocks = size // 256
    return whole_blocks * 256

spill_limit = device_mem_size(kind="total") * CUDA_SPILL_LIMIT_PERC # Spill device mem to host at this limit
spill_limit

20447074713.6

In [14]:
cluster = LocalCUDACluster(dashboard_address=':7000',
                           local_directory = dask_workdir,
                           device_memory_limit = spill_limit)
client = Client(cluster)

In [15]:
# Initialize RMM pool on ALL workers
def _rmm_pool():
    """Re-initialise RMM with its pool allocator enabled.

    Takes no arguments so it can be passed to ``client.run`` and executed
    as-is in another process.
    """
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=None, # Use default size
    )
client.run(_rmm_pool)
client

0,1
Client  Scheduler: tcp://127.0.0.1:44737  Dashboard: http://127.0.0.1:7000/status,Cluster  Workers: 2  Cores: 2  Memory: 45.79 GiB


# Ensemble Evaluation

In [None]:
VALID_FILENAME = 'valid_eval_predictions.parquet'
FOR_EVAL_FOLDER = os.path.join(PREDS_DOWNLOAD_PATH, 'for_eval/tuned/')
FOR_EVAL_FOLDER

In [None]:
FOR_EVAL_OUTPUT_FOLDER = os.path.join(FOR_EVAL_FOLDER, '_ensemble_outputs')
FOR_EVAL_OUTPUT_FOLDER

In [None]:
EVAL_PREDS_EXPLODED = os.path.join(FOR_EVAL_OUTPUT_FOLDER, 'session_preds_exploded')
os.makedirs(EVAL_PREDS_EXPLODED, exist_ok=True)
EVAL_PREDS_EXPLODED

In [None]:
EVAL_PREDS_SCORED = os.path.join(FOR_EVAL_OUTPUT_FOLDER, 'session_pred_scored')
EVAL_PREDS_SCORED

In [61]:
%%time
save_exploded_predictions(FOR_EVAL_FOLDER, VALID_FILENAME, EVAL_PREDS_EXPLODED)

Files to process:
transfoxl_m2/2062991/bag_1/fold_1/valid_eval_predictions.parquet
transfoxl_m2/2063004/bag_1/fold_4/valid_eval_predictions.parquet
transfoxl_m2/2063007/bag_1/fold_5/valid_eval_predictions.parquet
transfoxl_m2/2064363/bag_1/fold_2/valid_eval_predictions.parquet
transfoxl_m2/2064365/bag_1/fold_3/valid_eval_predictions.parquet
transfoxl_m3/2063009/bag_1/fold_1/valid_eval_predictions.parquet
transfoxl_m3/2063011/bag_1/fold_2/valid_eval_predictions.parquet
transfoxl_m3/2063018/bag_1/fold_4/valid_eval_predictions.parquet
transfoxl_m3/2064368/bag_1/fold_3/valid_eval_predictions.parquet
transfoxl_m3/2064371/bag_1/fold_5/valid_eval_predictions.parquet
xlnet_freqcap_m2/2062442/bag_1/fold_3/valid_eval_predictions.parquet
xlnet_freqcap_m2/2062910/bag_1/fold_1/valid_eval_predictions.parquet
xlnet_freqcap_m2/2062915/bag_1/fold_2/valid_eval_predictions.parquet
xlnet_freqcap_m2/2064338/bag_1/fold_4/valid_eval_predictions.parquet
xlnet_freqcap_m2/2064339/bag_1/fold_5/valid_eval_predict

### Weighting algorithms

In [20]:
eval_results_merged_df = dask_cudf.read_parquet(EVAL_PREDS_EXPLODED)

In [21]:
'''
eval_results_merged_df_copy = eval_results_merged_df
ALG_SELECTED = 'transfoxl'
eval_results_merged_df = eval_results_merged_df_copy[eval_results_merged_df_copy['algorithm'] == ALG_SELECTED]
'''

"\neval_results_merged_df_copy = eval_results_merged_df\nALG_SELECTED = 'transfoxl'\neval_results_merged_df = eval_results_merged_df_copy[eval_results_merged_df_copy['algorithm'] == ALG_SELECTED]\n"

In [23]:
eval_pred_scores_summed_df = ensemble_preds(eval_results_merged_df, alg_weights_df, group_by_fold=True)

In [24]:
eval_pred_scores_summed_df.to_parquet(EVAL_PREDS_SCORED)

In [25]:
#del eval_pred_scores_summed_df
#gc.collect()

### Group top preds into sessions

In [21]:
eval_pred_scores_summed_cdf = cudf.read_parquet(EVAL_PREDS_SCORED)
eval_pred_scores_summed_cdf

Unnamed: 0_level_0,session_id_hash,pred_item_ids,fold,pred_item_scores_weighted
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,192933,71219.0,3,0.003542
1,4595181,206835.0,5,0.017974
2,4208503,403030.0,5,0.000181
3,1924050,138930.0,2,0.008736
4,4813141,87889.0,3,0.000450
...,...,...,...,...
110956592,3442333,388954.0,1,0.033866
110956593,4228459,330512.0,3,0.008260
110956594,2570110,53064.0,3,0.114907
110956595,3713002,160175.0,5,0.013244


In [22]:
eval_results_ensembled_preds_cdf = group_ensembled_session_preds(eval_pred_scores_summed_cdf, group_by_fold=True)
eval_results_ensembled_preds_cdf

Unnamed: 0,session_id_hash,fold,pred_item_ids,pred_item_scores_weighted
0,7,5,"[497554.0, 392637.0, 119568.0, 133480.0, 45045...","[29.82871984773356, 1.3419541580848537, 0.7943..."
1,21,2,"[29868.0, 143840.0, 363279.0, 417261.0, 382222...","[4.144845788822275, 2.5119240772712192, 2.1139..."
2,40,2,"[138467.0, 430380.0, 272674.0, 351998.0, 12525...","[27.980479154624362, 2.089725645239099, 1.9551..."
3,64,4,"[116165.0, 419963.0, 97619.0, 375353.0, 221531...","[26.808105501991488, 4.29676654519713, 3.93853..."
4,74,1,"[3127.0, 118937.0, 119155.0, 388720.0, 342958....","[12.404190627400428, 4.77101212225182, 3.15984..."
...,...,...,...,...
307079,5266931,4,"[386891.0, 2504.0, 50446.0, 64131.0, 473767.0,...","[24.379723152042143, 5.067942533872458, 3.0765..."
307080,5266937,5,"[150474.0, 111037.0, 110034.0, 227358.0, 47639...","[17.213528395968556, 3.849043307720547, 2.3674..."
307081,5266950,2,"[221452.0, 255478.0, 411082.0, 4830.0, 262699....","[33.10651101213773, 0.851345949847232, 0.43872..."
307082,5266963,1,"[15926.0, 439258.0, 40872.0, 261482.0, 445615....","[15.420459061156977, 6.117983856351116, 5.6842..."


In [23]:
eval_results_ensembled_preds_df = eval_results_ensembled_preds_cdf.to_pandas()

In [24]:
eval_results_ensembled_preds_df['pred_item_ids'].apply(len).describe()

count    307084.000000
mean        361.323276
std         126.423442
min         115.000000
25%         270.000000
50%         338.000000
75%         424.000000
max        1016.000000
Name: pred_item_ids, dtype: float64

In [25]:
eval_results_ensembled_preds_df['pred_item_ids'].apply(len).describe()

count    307084.000000
mean        361.323276
std         126.423442
min         115.000000
25%         270.000000
50%         338.000000
75%         424.000000
max        1016.000000
Name: pred_item_ids, dtype: float64

In [26]:
# Keep only the first 100 entries of each session's item and score lists
# (group_ensembled_session_preds sorts by descending score before building them).
# The ids are sliced *before* the int conversion so that only the retained
# ids are converted, not the whole list.
eval_results_ensembled_preds_df['pred_item_ids'] = \
        eval_results_ensembled_preds_df['pred_item_ids'].apply(lambda x: [int(y) for y in x[:100]])
eval_results_ensembled_preds_df['pred_item_scores_weighted'] = \
        eval_results_ensembled_preds_df['pred_item_scores_weighted'].apply(lambda x: x[:100])

In [27]:
eval_results_ensembled_preds_df

Unnamed: 0,session_id_hash,fold,pred_item_ids,pred_item_scores_weighted
0,7,5,"[497554, 392637, 119568, 133480, 450454, 36089...","[29.82871984773356, 1.3419541580848537, 0.7943..."
1,21,2,"[29868, 143840, 363279, 417261, 382222, 34133,...","[4.144845788822275, 2.5119240772712192, 2.1139..."
2,40,2,"[138467, 430380, 272674, 351998, 12525, 261734...","[27.980479154624362, 2.089725645239099, 1.9551..."
3,64,4,"[116165, 419963, 97619, 375353, 221531, 37459,...","[26.808105501991488, 4.29676654519713, 3.93853..."
4,74,1,"[3127, 118937, 119155, 388720, 342958, 178251,...","[12.404190627400428, 4.77101212225182, 3.15984..."
...,...,...,...,...
307079,5266931,4,"[386891, 2504, 50446, 64131, 473767, 414401, 9...","[24.379723152042143, 5.067942533872458, 3.0765..."
307080,5266937,5,"[150474, 111037, 110034, 227358, 476396, 21393...","[17.213528395968556, 3.849043307720547, 2.3674..."
307081,5266950,2,"[221452, 255478, 411082, 4830, 262699, 17661, ...","[33.10651101213773, 0.851345949847232, 0.43872..."
307082,5266963,1,"[15926, 439258, 40872, 261482, 445615, 430913,...","[15.420459061156977, 6.117983856351116, 5.6842..."


### Merging predictions with validation set

In [28]:
# Load the validation sessions of folds 1 to 5 and attach their labels.
valid_folds_dfs = []
for fold in range(1,6):
    valid_fold_df = pd.read_parquet(os.path.join(DATA_PATH, f'valid-eval-{fold}.parquet'))
    print(len(valid_fold_df))
    valid_labels_fold_df = pd.read_parquet(os.path.join(DATA_PATH, f'valid-eval-labels-{fold}.parquet'))
    # The assignment aligns on the row index.
    # NOTE(review): assumes both files of a fold share the same index — confirm.
    valid_fold_df['labels'] = valid_labels_fold_df['labels']   
    valid_folds_dfs.append(valid_fold_df)
# Stack all folds into one frame; the per-fold row indexes are kept as they are.
valid_folds_merged_df = pd.concat(valid_folds_dfs)    

61219
61377
61509
61520
61459


In [29]:
eval_results_with_labels_df = eval_results_ensembled_preds_df.merge(valid_folds_merged_df, on='session_id_hash',)
len(eval_results_with_labels_df)

307084

In [30]:
eval_results_with_labels_df.to_parquet(os.path.join(FOR_EVAL_OUTPUT_FOLDER, 'eval_preds_ensembled_with_labels.parquet'))

In [31]:
len(valid_folds_merged_df)

307084

In [32]:
assert eval_results_ensembled_preds_df['session_id_hash'].nunique() == len(valid_folds_merged_df)

In [33]:
assert valid_folds_merged_df['session_id_hash'].nunique() == len(valid_folds_merged_df)

In [34]:
assert len(eval_results_ensembled_preds_df) == len(eval_results_with_labels_df)

In [35]:
assert eval_results_with_labels_df['pred_item_ids'].isna().sum() == 0

In [36]:
assert eval_results_with_labels_df['labels'].isna().sum() == 0

### Evaluation

In [37]:
import random
from collections import defaultdict
from random import randint


def convert_list_to_top_K(items_list: list, topK: int):
    """
    Truncate every inner sequence to its first ``topK`` items.

    Each inner sequence is expected to be sorted by descending
    importance/weight/relevance, so that its head holds the best items.

    :param items_list: sequence whose elements are sequences of items
    :param topK: maximum number of items kept per element
    :return: list of lists, each holding at most ``topK`` items
    """
    truncated = []
    for items in items_list:
        head = items[:topK]
        truncated.append(list(head))
    return truncated


def mrr_at_k(preds: list, labels: list, topK: int):
    """
    Mean reciprocal rank of the first label within the top ``topK`` predictions.

    For every (predictions, labels) pair only ``labels[0]`` is used. The pair
    scores ``1 / rank`` when that item appears among the first ``topK``
    predictions (rank starts at 1) and ``0.0`` when it does not, or when the
    labels are empty. The scores are averaged over all pairs.

    :param preds: one ranked sequence of predicted items per session
    :param labels: one sequence of ground-truth items per session
    :param topK: number of leading predictions taken into account
    :return: mean reciprocal rank as a float
    """
    assert len(labels) > 0
    assert len(preds) == len(labels)

    top_preds = convert_list_to_top_K(items_list=preds, topK=topK)

    reciprocal_ranks = []
    for ranked_items, session_labels in zip(top_preds, labels):
        score = 0.0  # stays 0.0 for empty labels and for misses
        if len(session_labels) != 0:
            target_item = session_labels[0]
            if target_item in ranked_items:
                score = 1.0 / (ranked_items.index(target_item) + 1)
        reciprocal_ranks.append(score)

    return sum(reciprocal_ranks) / len(labels)


def f1_at_k(preds: list, labels: list, topK: int):
    """
    Average F1 score of the top ``topK`` predictions against the labels.

    Per (predictions, labels) pair, precision is ``hits / topK`` and recall is
    ``hits / number of distinct labels`` (``0.0`` for empty labels), where
    ``hits`` counts the distinct items present both in the first ``topK``
    predictions and in the labels. F1 is their harmonic mean, or ``0.0`` when
    both are zero.

    :param preds: one ranked sequence of predicted items per session
    :param labels: one sequence of ground-truth items per session
    :param topK: number of leading predictions taken into account
    :return: F1 averaged over all pairs
    """
    assert len(labels) > 0
    assert len(preds) == len(labels)

    top_preds = convert_list_to_top_K(items_list=preds, topK=topK)

    f1_scores = []
    for ranked_items, session_labels in zip(top_preds, labels):
        relevant_items = set(session_labels)
        hits = len(set(ranked_items) & relevant_items)

        precision = hits / topK
        recall = hits / len(relevant_items) if len(session_labels) > 0 else 0.0

        denominator = precision + recall
        if denominator > 0:
            f1_scores.append((2 * precision * recall) / denominator)
        else:
            f1_scores.append(0.0)

    return sum(f1_scores) / len(labels)

In [38]:
eval_mrr = mrr_at_k(eval_results_with_labels_df['pred_item_ids'].values, 
                    eval_results_with_labels_df['labels'].values, topK=20)

In [39]:
eval_f1 = f1_at_k(eval_results_with_labels_df['pred_item_ids'].values, 
                    eval_results_with_labels_df['labels'].values, topK=20)

In [None]:
print(WEIGHT_ALGORITHMS)
#print(ALG_SELECTED)
print('MRR', eval_mrr)
print('F1 ', eval_f1)

# Ensemble Test Set

### Saving exploded predictions

In [None]:
TEST_FILENAME = 'test_predictions.parquet'
FOR_SUBMISSION_FOLDER = os.path.join(PREDS_DOWNLOAD_PATH, 'for_submission/tuned/')
FOR_SUBMISSION_FOLDER

In [None]:
FOR_SUBMISSION_OUTPUT_FOLDER = os.path.join(FOR_SUBMISSION_FOLDER, '_ensemble_outputs')
FOR_SUBMISSION_OUTPUT_FOLDER

In [None]:
TEST_PREDS_EXPLODED = os.path.join(FOR_SUBMISSION_OUTPUT_FOLDER, 'session_preds_exploded')
os.makedirs(TEST_PREDS_EXPLODED, exist_ok=True)
TEST_PREDS_EXPLODED

In [None]:
TEST_PREDS_SCORED = os.path.join(FOR_SUBMISSION_OUTPUT_FOLDER, 'session_pred_scored')
TEST_PREDS_SCORED

In [22]:
%%time
save_exploded_predictions(FOR_SUBMISSION_FOLDER, TEST_FILENAME, TEST_PREDS_EXPLODED)

Files to process:
transfoxl_m2/2064516/bag_1/fold_1/test_predictions.parquet
transfoxl_m2/2064517/bag_1/fold_2/test_predictions.parquet
transfoxl_m2/2064518/bag_1/fold_3/test_predictions.parquet
transfoxl_m2/2064520/bag_1/fold_4/test_predictions.parquet
transfoxl_m2/2064521/bag_1/fold_5/test_predictions.parquet
transfoxl_m3/2064524/bag_1/fold_1/test_predictions.parquet
transfoxl_m3/2064525/bag_1/fold_2/test_predictions.parquet
transfoxl_m3/2064527/bag_1/fold_3/test_predictions.parquet
transfoxl_m3/2064528/bag_1/fold_4/test_predictions.parquet
transfoxl_m3/2064529/bag_1/fold_5/test_predictions.parquet
xlnet_freqcap_m2/2064468/bag_1/fold_1/test_predictions.parquet
xlnet_freqcap_m2/2064474/bag_1/fold_2/test_predictions.parquet
xlnet_freqcap_m2/2064476/bag_1/fold_3/test_predictions.parquet
xlnet_freqcap_m2/2064479/bag_1/fold_4/test_predictions.parquet
xlnet_freqcap_m2/2064480/bag_1/fold_5/test_predictions.parquet
xlnet_freqcap_m3/2064481/bag_1/fold_1/test_predictions.parquet
xlnet_freqcap_

### Weighting algorithms

In [48]:
FOLD = 'fold-5'

In [49]:
# Exploded prediction files of the selected fold. The pattern is built with
# os.path.join so the search stays inside TEST_PREDS_EXPLODED (plain string
# concatenation also matched sibling folders sharing that prefix), and the fold
# is matched on the file-name suffix written by save_exploded_predictions
# ("..._fold-<n>.parquet") rather than anywhere in the path.
sub_fold_files = sorted(
    n for n in glob.iglob(os.path.join(TEST_PREDS_EXPLODED, '**'), recursive=True)
    if n.endswith(f'_{FOLD}.parquet')
)

In [51]:
#test_results_merged_df = dask_cudf.read_parquet(TEST_PREDS_EXPLODED)
test_results_merged_df = dask_cudf.read_parquet(sub_fold_files)

In [52]:
pred_scores_summed_df = ensemble_preds(test_results_merged_df, alg_weights_df, group_by_fold=False)

In [53]:
pred_scores_summed_df.to_parquet(TEST_PREDS_SCORED+"_"+FOLD)

In [29]:
del pred_scores_summed_df
gc.collect()

473

## Summing the item scores across folds (for submission)

In [58]:
# Paths of the scored predictions of folds 3, 4 and 5. Only the first
# partition file (part.0.parquet) of each per-fold output folder is listed.
# NOTE(review): assumes every per-fold output holds a single partition — confirm.
fold_files = []
for fold in range(3,6):
    fold_files.append(TEST_PREDS_SCORED+f"_fold-{fold}/part.0.parquet")

In [59]:
test_results_merged_folds_df = dask_cudf.read_parquet(fold_files)
test_results_merged_folds_df

Unnamed: 0_level_0,session_id_hash,pred_item_ids,pred_item_scores_weighted
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,float64,float64
,...,...,...
,...,...,...
,...,...,...


In [60]:
test_results_merged_folds_grouped_df = test_results_merged_folds_df.groupby(['session_id_hash', 'pred_item_ids'])['pred_item_scores_weighted'].sum().reset_index()
test_results_merged_folds_grouped_df

Unnamed: 0_level_0,session_id_hash,pred_item_ids,pred_item_scores_weighted
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,float64,float64
,...,...,...


In [61]:
test_results_merged_folds_grouped_df.to_parquet(TEST_PREDS_SCORED+"_3-4-5")

### Another level of aggregation (for sub)

In [None]:
aggregated_sum_files = [TEST_PREDS_SCORED+"_1-2/part.0.parquet" , TEST_PREDS_SCORED+"_3-4-5/part.0.parquet"] 
aggregated_sum_files

In [17]:
test_results_merged_folds_df = dask_cudf.read_parquet(aggregated_sum_files)
test_results_merged_folds_df

Unnamed: 0_level_0,session_id_hash,pred_item_ids,pred_item_scores_weighted
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,float64,float64
,...,...,...
,...,...,...


In [18]:
test_results_merged_folds_grouped_df = test_results_merged_folds_df.groupby(['session_id_hash', 'pred_item_ids'])['pred_item_scores_weighted'].sum().reset_index()
test_results_merged_folds_grouped_df

Unnamed: 0_level_0,session_id_hash,pred_item_ids,pred_item_scores_weighted
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,float64,float64
,...,...,...


In [19]:
test_results_merged_folds_grouped_df.to_parquet(TEST_PREDS_SCORED+"_final")

### Group top preds into sessions

In [14]:
# NOTE(review): the scoring cells in this notebook write to
# TEST_PREDS_SCORED + "_<fold>", "_3-4-5" and "_final"; no visible cell writes
# the bare TEST_PREDS_SCORED path read here — confirm this is the intended input.
test_pred_scores_summed_cdf = cudf.read_parquet(TEST_PREDS_SCORED)

In [15]:
test_pred_scores_summed_cdf

Unnamed: 0_level_0,session_id_hash,pred_item_ids,pred_item_scores_weighted
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3863181,496370.0,0.032411
1,4065529,115673.0,0.045172
2,3224889,182927.0,0.003851
3,994989,22475.0,0.004814
4,769441,301606.0,0.001168
...,...,...,...
180722896,4622781,93602.0,18.983492
180722897,3883096,502762.0,0.004093
180722898,2392399,397213.0,0.004097
180722899,1189921,215467.0,0.016455


In [16]:
test_results_ensembled_preds_cdf = group_ensembled_session_preds(test_pred_scores_summed_cdf, group_by_fold=False)
test_results_ensembled_preds_cdf

Unnamed: 0,session_id_hash,pred_item_ids,pred_item_scores_weighted
0,2,"[221531.0, 105170.0, 330512.0, 8303.0, 213290....","[88.35344502508332, 35.55022974899035, 18.6937..."
1,33,"[278092.0, 376278.0, 128760.0, 351866.0, 79562...","[104.39880671402057, 12.235515430993477, 11.66..."
2,54,"[199137.0, 34047.0, 200341.0, 33193.0, 318976....","[9.008235012540148, 8.13507718646635, 7.598052..."
3,55,"[226466.0, 88631.0, 215086.0, 384756.0, 280317...","[39.57287559171993, 10.541324842056408, 10.186..."
4,63,"[79562.0, 74071.0, 351866.0, 448530.0, 372935....","[65.9833019778118, 64.49488849112365, 4.170873..."
...,...,...,...
332242,5266853,"[75016.0, 255406.0, 236069.0, 234711.0, 75213....","[93.76059495323233, 9.820606105132505, 9.31971..."
332243,5266889,"[474449.0, 91025.0, 8870.0, 298579.0, 474033.0...","[80.56800175254035, 25.245654223340853, 16.914..."
332244,5266910,"[155497.0, 403054.0, 398822.0, 347755.0, 32021...","[62.749846468035074, 36.790424820437295, 29.52..."
332245,5266915,"[357458.0, 298925.0, 194814.0, 428232.0, 24584...","[62.709502916084325, 34.50058238086534, 10.788..."


In [17]:
test_results_ensembled_preds_df = test_results_ensembled_preds_cdf.to_pandas()

In [18]:
test_results_ensembled_preds_df['pred_item_ids'].apply(len).describe()

count    332247.000000
mean        543.941408
std         265.417406
min         138.000000
25%         365.000000
50%         477.000000
75%         641.000000
max        2720.000000
Name: pred_item_ids, dtype: float64

In [19]:
# Keep only the first 100 entries of each session's item and score lists
# (group_ensembled_session_preds sorts by descending score before building them).
# The ids are sliced *before* the int conversion so that only the retained
# ids are converted, not the whole list.
test_results_ensembled_preds_df['pred_item_ids'] = \
        test_results_ensembled_preds_df['pred_item_ids'].apply(lambda x: [int(y) for y in x[:100]])
test_results_ensembled_preds_df['pred_item_scores_weighted'] = \
        test_results_ensembled_preds_df['pred_item_scores_weighted'].apply(lambda x: x[:100])

In [20]:
test_results_ensembled_preds_df

Unnamed: 0,session_id_hash,pred_item_ids,pred_item_scores_weighted
0,2,"[221531, 105170, 330512, 8303, 213290, 489691,...","[88.35344502508332, 35.55022974899035, 18.6937..."
1,33,"[278092, 376278, 128760, 351866, 79562, 74071,...","[104.39880671402057, 12.235515430993477, 11.66..."
2,54,"[199137, 34047, 200341, 33193, 318976, 45328, ...","[9.008235012540148, 8.13507718646635, 7.598052..."
3,55,"[226466, 88631, 215086, 384756, 280317, 130221...","[39.57287559171993, 10.541324842056408, 10.186..."
4,63,"[79562, 74071, 351866, 448530, 372935, 508117,...","[65.9833019778118, 64.49488849112365, 4.170873..."
...,...,...,...
332242,5266853,"[75016, 255406, 236069, 234711, 75213, 154692,...","[93.76059495323233, 9.820606105132505, 9.31971..."
332243,5266889,"[474449, 91025, 8870, 298579, 474033, 402263, ...","[80.56800175254035, 25.245654223340853, 16.914..."
332244,5266910,"[155497, 403054, 398822, 347755, 320210, 77268...","[62.749846468035074, 36.790424820437295, 29.52..."
332245,5266915,"[357458, 298925, 194814, 428232, 245849, 10760...","[62.709502916084325, 34.50058238086534, 10.788..."


In [None]:
FOR_SUBMISSION_OUTPUT_FOLDER

In [22]:
test_results_ensembled_preds_df.to_parquet(os.path.join(FOR_SUBMISSION_OUTPUT_FOLDER, 'test_preds_ensembled_5folds.parquet'))

## Submission

In [23]:
OUTPUT_ENSEMBLE_PATH = FOR_SUBMISSION_OUTPUT_FOLDER
os.makedirs(OUTPUT_ENSEMBLE_PATH, exist_ok=True)

EMAIL_SUBMISSION = 'EMAIL@DOMAIN.COM'

In [24]:
mapping_product_sku_without_urls_df = pd.read_parquet(
        os.path.join(
            DATA_PATH, "categories/mapping_product_sku_without_urls.parquet"
        )
    )
item_id_mapping = dict(
    zip(
        mapping_product_sku_without_urls_df["encoded_product_sku"].values,
        mapping_product_sku_without_urls_df["original_product_sku"].values,
    )
)

In [25]:
# Loading session id mapping to encode session ids used for prediction
session_id_mapping_df = pd.read_parquet(
        os.path.join(
            DATA_PATH, "categories/unique.session_id_hash.parquet"
        )
    ).reset_index()
session_id_mapping_df.columns = ['session_id_hash', 'original_session_id_hash']
session_id_mapping_df

Unnamed: 0,session_id_hash,original_session_id_hash
0,0,
1,1,00000114e1075962f022114fcfc17f2d874e694ac5d201...
2,2,000001cc2b6d024f5c1d93536c2f22e7adb3f390ff0212...
3,3,00000277639fc5c6f816654b78bf3654ece7fd53a7338f...
4,4,000009f36a40de1d557afc083dbb3fc03eef2473337bad...
...,...,...
5266968,5266968,ffffed918e1086333206d96f1fae684de5774866848bb3...
5266969,5266969,fffff2183ad2daa1db3bc4cd320e7248b2f5a6ef9709c4...
5266970,5266970,fffff429d16c1f43796e2c7ccad8a34f8a37b59927b5b4...
5266971,5266971,fffff68e9f346ad171103ae6125899a72c0d77d2d0637c...


In [26]:
test_results_ensembled_preds_sub = test_results_ensembled_preds_df.merge(session_id_mapping_df, on='session_id_hash') \
                    .set_index('original_session_id_hash')
test_results_ensembled_preds_sub

Unnamed: 0_level_0,session_id_hash,pred_item_ids,pred_item_scores_weighted
original_session_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000001cc2b6d024f5c1d93536c2f22e7adb3f390ff0212207d4f5532fab1b9d9,2,"[221531, 105170, 330512, 8303, 213290, 489691,...","[88.35344502508332, 35.55022974899035, 18.6937..."
000064267054198e2f5744100c27111c0d0d585dd73e7e3be246f5d853f32183,33,"[278092, 376278, 128760, 351866, 79562, 74071,...","[104.39880671402057, 12.235515430993477, 11.66..."
0000976bb88afb92b355acdad71a8951472b1c611d31c4ce1d16d9cde7a609c2,54,"[199137, 34047, 200341, 33193, 318976, 45328, ...","[9.008235012540148, 8.13507718646635, 7.598052..."
00009b888f49f538ee3efe2a897e96ce98003b8a9f9f7012535ea5402b56eaed,55,"[226466, 88631, 215086, 384756, 280317, 130221...","[39.57287559171993, 10.541324842056408, 10.186..."
0000b30a047691fe5754ff7371fa94df09bf39b8745f94f5458748151e9379a1,63,"[79562, 74071, 351866, 448530, 372935, 508117,...","[65.9833019778118, 64.49488849112365, 4.170873..."
...,...,...,...
fffe99171c38b4615075b5f1bcff7c32e1f2a7411e7f6f5e55ea219d051bad4c,5266853,"[75016, 255406, 236069, 234711, 75213, 154692,...","[93.76059495323233, 9.820606105132505, 9.31971..."
fffee976db6c288556a3e817eb0cf176430e9265fd34013549b3c30f6459fc70,5266889,"[474449, 91025, 8870, 298579, 474033, 402263, ...","[80.56800175254035, 25.245654223340853, 16.914..."
ffff3eb9d3965596ebed08310c300877e70809c47bfd59f19869b62efed1d7b3,5266910,"[155497, 403054, 398822, 347755, 320210, 77268...","[62.749846468035074, 36.790424820437295, 29.52..."
ffff4ebcfe1a162ae0ee4d33c573fd373ffd09e3595295786bb17105f8ba0109,5266915,"[357458, 298925, 194814, 428232, 245849, 10760...","[62.709502916084325, 34.50058238086534, 10.788..."


## Generating submission file

In [27]:
def generate_submission_file(
    test_predictions_df, item_id_mapping, data_path, email_submission, output_folder
):
    """Write the submission JSON: the test queries with a ``label`` list added to each.

    The queries are read from ``<data_path>/rec_test_phase_2.json``. For every
    query, the predictions of its session are mapped to original SKUs through
    ``item_id_mapping``; items mapped to ``"missing"`` and items the session
    already interacted with are removed, and the first 20 remaining items
    become the session's ``label``.

    Args:
        test_predictions_df: Frame indexed by the session id used in the
            queries, with a ``pred_item_ids`` column holding one ranked list
            of encoded item ids per session.
        item_id_mapping: Dict from encoded item id to original product SKU.
        data_path: Folder containing ``rec_test_phase_2.json``.
        email_submission: E-mail address used to build the output file name
            (``@`` is replaced by ``_``).
        output_folder: Folder that receives the generated JSON file.

    Returns:
        Path of the generated JSON file.

    Raises:
        AssertionError: If the number of prediction rows differs from the
            number of queries.
        ValueError: If the index of ``test_predictions_df`` has duplicates.
        Exception: If a query session has no row in ``test_predictions_df``.
        KeyError: If a predicted item id is absent from ``item_id_mapping``.
    """
    with open(os.path.join(data_path, "rec_test_phase_2.json")) as json_file:
        test_queries = json.load(json_file)

    assert len(test_predictions_df) == len(test_queries)
    if not test_predictions_df.index.is_unique:
        raise ValueError("test_predictions_df must hold exactly one row per session")

    # One dict built up-front replaces a (much slower) `.loc` lookup per session.
    preds_by_session = test_predictions_df["pred_item_ids"].to_dict()

    # Kept for the summary line; it stays 0 because a missing session aborts the run.
    sessions_not_found = 0
    preds_with_length_less_than_20 = 0
    sessions_lengths_list = []
    for session in tqdm(test_queries):
        session_id_hash = session["query"][0]["session_id_hash"]

        if session_id_hash not in preds_by_session:
            raise Exception(
                "Session {} not found in the preprocessed test set".format(session_id_hash)
            )

        session_product_interacted = {
            interaction["product_sku_hash"]
            for interaction in session["query"]
            if interaction["product_sku_hash"] is not None
        }

        # Convert to the original product SKU, then drop unmapped items and
        # any item that was already interacted with in the session.
        original_skus = (
            item_id_mapping[pred_item] for pred_item in preds_by_session[session_id_hash]
        )
        predictions = [
            sku
            for sku in original_skus
            if sku != "missing" and sku not in session_product_interacted
        ]

        if len(predictions) < 20:
            preds_with_length_less_than_20 += 1

        sessions_lengths_list.append(len(predictions))
        session["label"] = predictions[:20]

    total_sessions = len(test_queries)
    # Guard the average so an empty query file does not divide by zero.
    avg_length = (
        sum(sessions_lengths_list) / len(sessions_lengths_list)
        if sessions_lengths_list
        else 0.0
    )
    print(
        f"# Total sessions: {total_sessions} - # Sessions not found: {sessions_not_found} - # Sessions with length < 20: {preds_with_length_less_than_20} - avg length: "
        + str(avg_length)
    )

    local_prediction_file = "{}_{}.json".format(
        email_submission.replace("@", "_"), round(time.time() * 1000)
    )
    local_prediction_file_path = os.path.join(output_folder, local_prediction_file)
    print("Generating JSON file with predictions")
    with open(local_prediction_file_path, "w") as fp:
        json.dump(test_queries, fp, indent=2)

    return local_prediction_file_path

In [28]:
local_prediction_file_path = generate_submission_file(
    test_results_ensembled_preds_sub, item_id_mapping, DATA_PATH, EMAIL_SUBMISSION, OUTPUT_ENSEMBLE_PATH
)

100%|██████████| 332247/332247 [00:45<00:00, 7314.71it/s]


# Total sessions: 332247 - # Sessions not found: 0 - # Sessions with length < 20: 0 - avg length: 99.9966561022372
Generating JSON file with predictions


In [None]:
!ls $local_prediction_file_path

## Uploading the submission

In [5]:
from datetime import datetime

import boto3
from dotenv import load_dotenv

# load envs from env file
load_dotenv(
    verbose=True,
    dotenv_path="../transformers/sigir_ecom_challenge_code/submission/upload.env",
)

# env info should be in your env file
BUCKET_NAME = os.getenv("BUCKET_NAME")  # you received it in your e-mail
EMAIL = os.getenv("EMAIL")  # the e-mail you used to sign up
PARTICIPANT_ID = os.getenv("PARTICIPANT_ID")  # you received it in your e-mail
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")  # you received it in your e-mail
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")  # you received it in your e-mail


def upload_submission(local_file: str, task: str):
    """Upload a local prediction file to the S3 bucket named by ``BUCKET_NAME``.

    The object key is ``<task>/<PARTICIPANT_ID>/<file name>``. Credentials,
    bucket and participant id are read from the module-level ``AWS_ACCESS_KEY``,
    ``AWS_SECRET_KEY``, ``BUCKET_NAME`` and ``PARTICIPANT_ID`` values.

    :param local_file: path of the file to upload
    :param task: first component of the object key (e.g. "rec")
    :return: None
    """
    print("Starting submission at {}...\n".format(datetime.utcnow()))

    # Object key according to the spec; it needs to be like e.g. "rec/id/*.json"
    destination_key = "{}/{}/{}".format(
        task, PARTICIPANT_ID, os.path.basename(local_file)
    )

    s3 = boto3.client(
        "s3",
        region_name="us-west-2",
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
    )
    s3.upload_file(local_file, BUCKET_NAME, destination_key)

    print("\nAll done at {}: see you, space cowboy!".format(datetime.utcnow()))

In [6]:
TASK = "rec"
upload_submission(local_file=local_prediction_file_path, task=TASK)

Starting submission at 2021-06-18 13:46:39.478654...


All done at 2021-06-18 13:50:17.383293: see you, space cowboy!
