## Final Code

After a lot of different approaches, I ended up having a "business logic" model, empowered by an ItemEmbeddings and using the "Sparse Matrix" of iteractions Item-to-Item.

I ended up creating a similar approach as the one showed in the Workshop, trying to simulate the Colaborative Filtering Matrix (item to item), but doing some tweaks in the way of calculating the final weight in the interaction item-item.
The best result came when you normalize the interaction by the total number of interactions in that session. With this you dont over estimate items viewed in session with repetitive views/search actions over the same item.
In this process I tried different weights (fixed, incrementally to the past, random), and the best one was a fixed one.

I also have other ideas (that I left in this notebook) that could be another improvement but due my lack of a good laptop I wasn't able to ran it in a decent amount of time.
Almost every Spark or xGboost/LightBM model didn't make it using my local pc, so I stop study those ones and keep this one that was the most promissing and without breaking my laptop.

In [1]:
import gzip
import json
import gc
import math
import random
from sklearn.model_selection import train_test_split

from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from pathlib import Path

import pandas as pd
import numpy as np

full_item_data = pd.read_json('item_data.jl.gz', lines = True)
full_item_data.domain_id = np.where(full_item_data.domain_id.isna(), 'servicio', full_item_data.domain_id)

items_embeddings = pd.read_parquet('items_embeddings.parquet')
items_embeddings.reset_index(inplace = True, drop = False)

In [2]:
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output
rows = jl_to_list('train_dataset.jl.gz')
rows_train, rows_test= train_test_split(rows, test_size=0.2, random_state=42)
all_items = list(full_item_data.item_id.unique())

In [25]:
def item_domain_dict_f(item_df):
    """
    Given the Item Data Set as DataFrame,
    returns a dictionary that provide the domain of the item (key)
    like this {'10000':'MLB-TV'}
    """

    item_df = pd.Series(item_df.domain_id.values,index=item_df.item_id).to_dict()
    return item_df

item_domain_dict = item_domain_dict_f(full_item_data)

def item_to_item(rows, most_common):
    """
    Create a dictionary of dictionaries that keep the 
    times of all users who view Y ended up buying X.
    :most_common param: limit of different items viewed by
    the user. Takes the top most_common in the session.
    """
    view_purchases = defaultdict(lambda: defaultdict(int))
    search_purchases = defaultdict(lambda: defaultdict(int))
    for row in tqdm(rows):
        views = Counter([x['event_info'] for x in row['user_history'] if x['event_type']=='view']).most_common()[:most_common]
        total_views = sum([x[1] for x in views])
        searchs = Counter([x['event_info'] for x in row['user_history'] if x['event_type']!='view']).most_common()[:most_common]
        total_searchs = sum([x[1] for x in searchs])
        for v,rep in views:
            view_purchases[int(v)][int(row['item_bought'])]+=rep*(1/total_views)
        for s,rep in searchs:
            search_purchases[s][int(row['item_bought'])]+=rep*(1/total_searchs)
    return view_purchases, search_purchases

view_purchases, search_purchases = item_to_item(rows_train, 5)


def get_scores(row):
    """
    Given a user history, return a counter of the items purchased
    for the search and the viewed items by users.
    
    The weights of the wies-to-purchase was defined using different 
    combinations, but keeping the logic of view = bought should be higher
    than the others.
    """
    item_scores = defaultdict(int)
    views = [x['event_info'] for x in row['user_history'] if x['event_type']=='view']
    searchs = [x['event_info'] for x in row['user_history'] if x['event_type']!='view']
    
    for view in views:
        for k,v in view_purchases[int(view)].items():
                if view==k:
                    item_scores[k]+=6*v/len(views)
                elif item_domain_dict[view] == item_domain_dict[k]:
                    item_scores[k]+=3*v/len(views)
    for s in searchs:
        for k,v in search_purchases[s].items():
                item_scores[k]+=v/len(searchs)*0.5
    return Counter(item_scores)

HBox(children=(FloatProgress(value=0.0, max=330530.0), HTML(value='')))




Helper Functions

In [7]:
def get_domains_viewed(row):
    """
    Given a User Session, return the Items and Domains viewed by the user
    (if exists)
    """
    items = [x['event_info'] for x in row['user_history'] if x['event_type']=='view']
    domains = [item_domain_dict[x] for x in items]
    return items, domains

def get_domains_most_common(scores):
    """
    Given a list of items, returns the list of domains
    of those items.
    """
    domains = [item_domain_dict[x[0]] for x in scores]
    return domains

def find_union_domain(row, scores):
    """
    Given the session history and the Scores of that session, 
    return two values:
    :union output: Boolean, represent if there are a domain intersection
    between the scores and the domains viewed.
    :last_item output: Item id of the last item viewed in the session (if exists)
    """
    domain_scores = get_domains_most_common(scores)
    items, domains_viewed = get_domains_viewed(row)
    if items:
        return any(set(domain_scores).intersection(set(domains_viewed))) , items[-1]
    else:
        return any(set(domain_scores).intersection(set(domains_viewed))) , None

y_train = [x['item_bought'] for x in rows_train]
most_sold = [x[0] for x in Counter([item_domain_dict[x] for x in y_train]).most_common()[:1]]
items_for_most_sold = full_item_data[(full_item_data.domain_id.isin(most_sold))].sample(10).item_id.unique()

def fill_with_most_sold(reco):
    """
    Provide an static list of items that belongs to the most sold
    domain in the training data.
    """
    return [x for x in items_for_most_sold if x not in reco]

In [10]:
def get_most_similar(itemid,reco,k):
    """
    Given an especific Item Id and the items already recommended by the code,
    reutrn a list of the most similar items to the Item Id provided, with the
    help of the Item Embeddings created in the first step.
    :most_similar_top1 output: List of item ids of lenght k.
    """
    item_vec = items_embeddings[items_embeddings.item_id==itemid][[str(x) for x in range(6)]]
    idx = np.argpartition(np.linalg.norm(items_embeddings[[str(x) for x in range(6)]].sub(np.array(item_vec)), axis=1), 10)[:10]
    df_items = full_item_data[(full_item_data.item_id.isin(items_embeddings.iloc[idx].item_id.values)) &
                              (~full_item_data.item_id.isin(reco))]
    most_similar_top1 = list(df_items.item_id.unique())
    return most_similar_top1[:k]


def view_search_recom_similar(row, cut):
    """
    
    Given a defined Session, returns the top 10 (possible) items to be
    purchased given the history of views and searchs of that user.
    
    """
    reco = []
    scores = get_scores(row)
    most_common = scores.most_common()[:10]
    nan = 0
    
    union , last_item = find_union_domain(row, most_common)
    if union and last_item:
        if item_domain_dict[last_item] in get_domains_most_common(most_common):
            reco.append(last_item)
        for item, score in most_common:
            if item!=last_item:
                reco.append(item)
            if len(reco) >= cut:
                break
    
    if not union and last_item and last_item not in reco:
        reco.append(last_item)

    k = 10 - len(reco)
    queried = len(reco)
    if k<10 and k>0:
        itemid = reco[0]
        most_similar_top = get_most_similar(itemid,reco,k)
        reco = reco + most_similar_top
        return reco
    ##No last Item (aka no views)
    elif k==10:
        if len(most_common)>0:
            itemid = most_common[0][0]
            most_similar_top = get_most_similar(itemid,reco,k)
            reco = reco + most_similar_top
            return reco
        else:
            relleno = fill_with_most_sold(reco)
            reco = reco + relleno
    
    return reco

In [22]:
y_pred = []
for row in tqdm(rows_test):
    recom = view_search_recom_similar(row, cut = 6)
    y_pred.append(recom)

HBox(children=(FloatProgress(value=0.0, max=82633.0), HTML(value='')))




In [12]:
item_data = jl_to_list('item_data.jl.gz')

In [23]:
from challenge_metric import ndcg_score
y_true = [x['item_bought'] for x in rows_test]

score = ndcg_score(y_true, y_pred, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.27927938293429105


In [24]:
pd.DataFrame(y_pred).to_csv('trainning_results_cut_6_most_common_5_279279.csv', index = False, header = None)