## Calibrate Results

This second step is used after the predictions, in order to increase the number of items with the same domain in the prediction N.

The final configuration in the code was defined after a lot of different gridsearch and different configurations in the small functions, providing the best results the ones that appears in the notebook.

In [1]:
import gzip
import json
import gc
import math

from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from pathlib import Path

import pandas as pd
import numpy as np
full_item_data = pd.read_json('item_data.jl.gz', lines = True)
full_item_data['domain_id'] = np.where(full_item_data.domain_id.isna(),'servicio',full_item_data.domain_id)
items_embeddings = pd.read_parquet('items_embeddings.parquet')
items_embeddings.reset_index(inplace = True, drop = False)

In [2]:
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

item_data = jl_to_list('item_data.jl.gz')

In [3]:
def get_most_similar(itemid,reco,k):
    item_vec = items_embeddings[items_embeddings.item_id==itemid][[str(x) for x in range(6)]]
    idx = np.argpartition(np.linalg.norm(items_embeddings[[str(x) for x in range(6)]].sub(np.array(item_vec)), axis=1), 10)[:10]
    df_items = full_item_data[(full_item_data.item_id.isin(items_embeddings.iloc[idx].item_id.values)) &
                              (~full_item_data.item_id.isin(reco))]
    most_similar_top1 = list(df_items.item_id.unique())
    return most_similar_top1[:k]

In [4]:
item_domain_dict = pd.Series(full_item_data.domain_id.values,index=full_item_data.item_id).to_dict()

In [5]:
latest_results = pd.read_csv('trainning_results_cut_6_most_common_5_279279.csv', header = None)
latest_results.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
82628,1059074,452045,640905,893084,122544,1162286,1970493,987543,1847698,606727
82629,1525369,982916,1450303,1690946,537495,801112,759849,1615304,831942,1101365
82630,1994487,674874,715914,2079500,835775,1146224,120717,1397059,724662,955342
82631,1828214,2087286,724879,1168577,868903,131739,907524,576479,373681,1959472
82632,1076763,1785970,1680032,1730020,2087802,1949095,972356,1926118,490841,978239


In [7]:
def calibrate_results(reco,majority_cut,starting_pos):
    """
    Given a recomendation, drop all items starting at starting_pos index, 
    and replace them with items similar at the first recommendation.
    
    :reco param: List of items already recommended in the previous step
    :majority_cut param: Integer, represent a threshold for the number of domain ids equal
    to the domain id of the first recommendation. If the amount of recommended items that belongs to that
    domain is greater than majority_cut, remove the ones that dont belong to that domain and replace them.
    :starting_pos param: Integer, index from where the algorithm will start removing the items 
    that dont belong to the most dominant domain id.
    
    :reco output: List, a new list containing the final recommendation
    """
    majority, number =  Counter([item_domain_dict[x] for x in reco]).most_common()[0]
    if number ==10:
        return reco
    f = item_domain_dict[reco[0]]
    
    if f==majority and number>=majority_cut:
        c_reco = reco.copy()
        for item in c_reco[starting_pos:]:
            if item_domain_dict[item]!=f:
                c_reco.remove(item)
        k = 10 - len(c_reco)
        if k==0:
            return reco
        relleno = get_most_similar(reco[0],c_reco,k)
        c_reco = c_reco + relleno
        return c_reco
    
    return reco

In [8]:
y_pred = latest_results.to_numpy().tolist()
y_pred_calibrated = []
for reco in tqdm(y_pred):
    new_reco = calibrate_results(reco,6,3)
    y_pred_calibrated.append(new_reco)

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [10]:
from challenge_metric import ndcg_score
from sklearn.model_selection import train_test_split

rows = jl_to_list('train_dataset.jl.gz')
rows_train, rows_test= train_test_split(rows, test_size=0.2, random_state=42)
y_true = [x['item_bought'] for x in rows_test]

score = ndcg_score(y_true, y_pred_calibrated, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.2816344741430788


We pass from  0.27927938293429105 to 0.2816344741430788

In [11]:
pd.DataFrame(y_pred_calibrated).to_csv('calibrated_s1_trainning_results_cut_6_most_common_5_279279.csv', index = False, header = None)

In [15]:
y_true = [x['item_bought'] for x in rows_test]
domain_most_sold = Counter([item_domain_dict[x] for x in y_true]).most_common()[0][0]
static_most_sold = list(full_item_data[full_item_data.domain_id == domain_most_sold].sample(10).item_id.unique())

def fill_with_domain_most_sold(reco,cut):
    """
    Given a calibrated recommenadtion, if it doesnt have more than cut items with the same domain,
    replace all the recommendations with the static one generated in the code above.
    :cut param: Integer, min number of items with the same domain in order avoid being replaced. 
    """
    majority =  Counter([item_domain_dict[x] for x in reco]).most_common()[0][1]
    if majority<=cut:
        reco = static_most_sold
        return reco
    return reco

In [19]:
y_pred_calibrated_2 = []
for reco in tqdm(y_pred_calibrated):
    y_pred_calibrated_2.append(fill_with_domain_most_sold(reco,2))

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [20]:
score = ndcg_score(y_true, y_pred_calibrated_2, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.2816344741430788


In [21]:
pd.DataFrame(y_pred_calibrated_2).to_csv('final_trainning_predictions.csv', index = False, header = None)