# I. preparing 
## a. importing lib and data, data preparation and formatting 


In [None]:
# import gzip
import json
import gc
import math
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import random
import pandas as pd
from sklearn.model_selection import train_test_split 
import operator
import unidecode
from itertools import islice

def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

#######################################################

item_data = jl_to_list('item_data.jl.gz')
metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

#######################################################

samples = False
rows = jl_to_list('train_dataset.jl.gz')
print(len(rows))
if samples:
    rows = rows[:samples]
    
#######################################################

metadata_domain_quantity =defaultdict(int)
metadata_domain_id_with_item = defaultdict(lambda: defaultdict(int))
metadata_domain_item_words = defaultdict(lambda: defaultdict(lambda: []))
metadata_domain_words = defaultdict(lambda : defaultdict(int))

for x in tqdm(item_data):
    metadata_domain_id_with_item[x['domain_id']][x['item_id']]=0   
    words = x['title'].split()
    
    for w in words:
        if (len(w)>3) & (unidecode.unidecode(w).upper() not in metadata_domain_item_words[x['domain_id']][x['item_id']]):
            metadata_domain_item_words[x['domain_id']][x['item_id']].append(unidecode.unidecode(w).upper())
        if (len(w)>3):
            metadata_domain_words[x['domain_id']][unidecode.unidecode(w).upper()]+=1


for row in tqdm(rows):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    for item in viewed:
        domain = metadata[item]['domain_id']
        metadata_domain_id_with_item[domain][item]+=1
        
    metadata_domain_quantity[metadata[row['item_bought']]['domain_id']]+=1
     
metadata_domain_quantity = dict(sorted(metadata_domain_quantity.items(), key=lambda item: item[1], reverse = True))

lim_gr = 0.98
sum_item=0
sum_all_items = sum(metadata_domain_quantity.values())
list_domain_to_study = []
for dom, n_item_bought in metadata_domain_quantity.items():
    sum_item+=n_item_bought
    list_domain_to_study.append(dom)
    if float(sum_item)/float(sum_all_items)>lim_gr:
        break



## I. b. scoring metric and helper function

In [None]:
#######################################################
# score metric
def ndcg(y_pred, y_true):

    ncdg_li = []
    for y_pred_el, y_true_el in zip(y_pred, y_true):
        dcg = 0
        pos=1
        uniq_ypred = []
        for y_pred_el_pos in y_pred_el:
            if y_pred_el_pos not in uniq_ypred:
                uniq_ypred.append(y_pred_el_pos)                
                dcg+=relevance(y_true_el, y_pred_el_pos)/(math.log(1+pos))
            else:
                dcg+=0
            
            pos+=1
            
        idcg = 22.42461597
        ncdg_li.append(dcg/idcg)
    
    return ncdg_li
 
def relevance(y, y_hat):
    
    domain_y = metadata[y]['domain_id']
    domain_y_hat = metadata[y_hat]['domain_id']
    
    if y == y_hat:
        return 12
    elif domain_y == domain_y_hat:
        return 1
    else:
        return 0

    
#######################################################
# probably exist better 
from datetime import datetime

def diff_date(d1_string, d2_string):
    
    d1_date= datetime(year = int(d1_string[0:4]), month=int(d1_string[5:7]), day =int(d1_string[8:10]), hour = int(d1_string[11:13]), 
                               minute = int(d1_string[14:16]), second = int(d1_string[17:19]))

    d2_date= datetime(year = int(d2_string[0:4]), month=int(d2_string[5:7]), day =int(d2_string[8:10]), hour = int(d2_string[11:13]), 
                               minute = int(d2_string[14:16]), second = int(d2_string[17:19]))

    return (d2_date-d1_date).total_seconds()/(3600*24)

        

## I.c. some helper functions
here, trying to take into account the words from search 

In [2]:
# search_style : "most common", "last search"
def relleno_search_based(row, search_style):

    search = [ev for ev in row['user_history'] if ev['event_type'] == 'search']

    all_words_searched = Counter()
    words_last_search = []
    words_most_common_search = []
    
    if len(search)>0:

        for ev in search:
            all_words_searched[ev['event_info']]+=1
            words_last_search = ev['event_info'].split()

        if search_style == "most common":
            words_studied = all_words_searched.most_common(1)[0][0].split()
        elif search_style == "last search":
            words_studied = words_last_search
        all_domain_words_relation = defaultdict(int)

        for dom in list_domain_to_study:

            value_words = 0
            
            for word in words_studied:
                
                if len(word)>3:
                    
                    word_upper = unidecode.unidecode(word).upper()
                    if (word_upper in metadata_domain_words[dom].keys()) or (word_upper[:len(word_upper)-1] in metadata_domain_words[dom].keys()) or (word_upper[:len(word_upper)-2] in metadata_domain_words[dom].keys()):
                        value_words+= metadata_domain_words[dom][word_upper]

              
            all_domain_words_relation[dom]+=value_words

        
        all_domain_words_relation = dict(sorted(all_domain_words_relation.items(), key=lambda item: item[1], reverse = True))

        first_dom = list(islice(all_domain_words_relation, 1))[0]  
        list_items = defaultdict(int)
        
        for item_id, words in metadata_domain_item_words[first_dom].items():
            
            for word in words_studied: 
                if len(word)>3:
                    word_upper = unidecode.unidecode(word).upper()
                    if (word_upper in words) or (word_upper[:len(word_upper)-1] in words) or  (word_upper[:len(word_upper)-2] in words):
                        list_items[item_id]+=1
        
        if len(list_items)!=0:
            list_items = dict(sorted(list_items.items(), key=lambda item: item[1], reverse = True))        
           
        else:
            list_items = dict(sorted(metadata_domain_id_with_item[first_dom].items(), key=lambda item: item[1], reverse = True))      
 
        return [item[0] for item in list(islice(list_items.items(), 10))[:10]]
   

here, generating the various ranks, here, through the use of pandas dataframe ==>> not the best way to handle data, may be done probably with less lines

In [5]:
# function which produces the variables that will be used to predict the domain
def df_rows(rows, with_target=False):

    i=0
    j=0
    data_domains = []
    data_items=[]
    for row in tqdm(rows):


        i+=1
        
        
        ### search part
        search = [ev for ev in row['user_history'] if ev['event_type'] == 'search']

        all_words_searched = Counter()
        words_last_search = []
        words_most_common_search = []
        metadata_domain_info_sorted ={}
        
        if len(search)>0:
        

            for ev in search:
                all_words_searched[ev['event_info']]+=1
                words_last_search = ev['event_info'].split()
                
            search_style = "most common"
            if search_style == "most common":
                words_studied = all_words_searched.most_common(1)[0][0].split()
            elif search_style == "last search":
                words_studied = words_last_search
            all_domain_words_relation = defaultdict(int)

            for dom in list_domain_to_study:

                value_words = 0
            
                for word in words_studied:
                
                    if len(word)>3:
                    
                        word_upper = unidecode.unidecode(word).upper()
                        if (word_upper in metadata_domain_words[dom].keys()) or (word_upper[:len(word_upper)-1] in metadata_domain_words[dom].keys()) or (word_upper[:len(word_upper)-2] in metadata_domain_words[dom].keys()):
                            value_words+= metadata_domain_words[dom][word_upper]

              
                all_domain_words_relation[dom]+=value_words

        
            metadata_domain_info_sorted = dict(sorted(all_domain_words_relation.items(), key=lambda item: item[1], reverse = True))

        
        ### viewed part
        viewed = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
        if len(viewed)==0:  
            continue
            
        domains_list = defaultdict(int)
        item_list_by_domains_list = defaultdict(lambda: defaultdict(lambda:[]))

        for item in viewed:
            if metadata[item['event_info']]['domain_id']!=None:
                domains_list[metadata[item['event_info']]['domain_id']]+=1
                item_list_by_domains_list[metadata[item['event_info']]['domain_id']][item['event_info']].append(item['event_timestamp'])

        if len(domains_list)==0:  
            continue
            
        data_domain = []
        
        for dom,dom_info in item_list_by_domains_list.items():

            dom_val_search = 0
            
            if dom in metadata_domain_info_sorted.keys():
                dom_val_search = metadata_domain_info_sorted[dom]
                
            if with_target:
                if metadata[row['item_bought']]['domain_id'] == dom:    
                    in_domain_only_from_item_bought = 1
                else:
                    in_domain_only_from_item_bought = 0
            
                if metadata[row['item_bought']]['domain_id'] in item_list_by_domains_list.keys():    
                    in_1domain_of_domain_list = 1
                else:
                    in_1domain_of_domain_list = 0
            else:
                in_domain_only_from_item_bought =0
                in_1domain_of_domain_list =0
                
                
                
            data_item=[]
            in_domain_and_item_bought = 0
            for item, item_event in dom_info.items():

                if with_target:
                    if row['item_bought'] == item:
                        item_bought = 1
                        in_domain_and_item_bought =1
                        j+=1
                    else:
                        item_bought = 0
                else:
                    item_bought =0
                    in_domain_and_item_bought =0

                data_item.append([item, len(item_event), item_event[len(item_event)-1],item_bought, dom, i, 0,
                                 metadata[item]['title'], metadata[item]['price'],
                                 metadata[item]['category_id']])

            sort_item_by_item = sorted(data_item, key = lambda x:(-x[0]))
            sort_item_by_nb_event = sorted(sort_item_by_item, key = lambda x:(-x[1]))
            max_nb_event = sort_item_by_nb_event[0][1]
            sum_nb_event =0
            prom_nb_event=0
            for ev_ in sort_item_by_nb_event:
                sum_nb_event += ev_[1]
            prom_nb_event = sum_nb_event/len(sort_item_by_nb_event)
            #max_nb_event =sum_nb_event
            sort_item_by_date_event = sorted(data_item, key = lambda x:(x[2]), reverse=True)
            last_date= sort_item_by_date_event[0][2]   
            first_date = sort_item_by_date_event[len(sort_item_by_date_event)-1][2]
            time_domain= diff_date(first_date, last_date)


                        

            data_domain.append([i, dom, len(item_list_by_domains_list.keys()), len(data_item),
                                max_nb_event,last_date, in_domain_and_item_bought,
                                in_domain_only_from_item_bought, in_1domain_of_domain_list, 0,
                               time_domain, dom_val_search])
            data_items = data_items + data_item
        
        # rank by dom_val_search
        sort_domain_by_dom_val_search = sorted(data_domain, key = lambda x:(-x[11]))
        r = 0
        sort_domain_by_dom_val_search_with_rank = []
        for el in sort_domain_by_dom_val_search:
            r+=1 
            sort_domain_by_dom_val_search_with_rank.append(el+[ r])
            
            
            
        # rank by last date
        sort_domain_by_date_event = sorted(sort_domain_by_dom_val_search_with_rank, key = lambda x:(x[5]), reverse=True)
        max_time= diff_date(sort_domain_by_date_event[len(sort_domain_by_date_event)-1][5], sort_domain_by_date_event[0][5])
        r = 0
        sort_domain_by_date_event_with_rank = []
        for el in sort_domain_by_date_event:
            r+=1 
            sort_domain_by_date_event_with_rank.append(el+[max_time, r])

        # rank by nb event of 1 item
        sort_domain_by_nb_event = sorted(sort_domain_by_date_event_with_rank, key = lambda x:(-x[4]))
        r=0
        sort_domain_by_nb_event_with_rank=[]
        for el in sort_domain_by_nb_event:
            r+=1        
            sort_domain_by_nb_event_with_rank.append(el+[r])


        # rank by qty item
        sort_domain_by_qty_item = sorted(sort_domain_by_nb_event_with_rank, key = lambda x:(-x[3]))
        r=0
        sort_domain_by_qty_item_with_rank=[]
        for el in sort_domain_by_qty_item:
            r+=1        
            sort_domain_by_qty_item_with_rank.append(el+[r])            


        data_domains =data_domains+sort_domain_by_qty_item_with_rank



    return (pd.DataFrame(data_domains, columns=['i', 'dom name', 'qty domain history', 'qty item viewed', 
                                                'max nb event of 1 item', 'last date', 
                                               'item bought in this domain', 'in domain only from item bought', 
                                                'in_1domain_of_domain_list','no view','time_domain', 'dom_val_search',
                                                'rank by dom val search', 'max time',
                                                'rank by last date', 'rank by max nb event of 1 item', 'rank by qty item'
                                                ]),
            pd.DataFrame(data_items, columns = ['item', 'len(item_event)', 'last_event',
                                                    'is item bought', 'dom name', 'i', 'no view',
                                               'title', 'price', 'category id']))

   
            

# II. calibration of the logistic regression model
the idea was to use this model, to predict the next domain, based on the ranks values of each domain candidates

## a. importing lib, generating train/test dataset

In [6]:
# loading the data to build the model to predict the domain
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from IPython.display import display, HTML


n = 10000

X_train, X_test  = train_test_split(rows, test_size=0.5, random_state=523)
df_train_domains, df_train_items = df_rows(X_train[:n], True)
df_train_domains = df_train_domains[df_train_domains['in_1domain_of_domain_list'] ==1]
df_test_domains, df_test_items = df_rows(X_test[:n], True)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




## b. selecting the feature, fitting, calibrating, ...

In [7]:
# learning the model 
import pickle
import joblib
from sklearn import metrics

X = df_train_domains[['rank by last date', 
                      'rank by max nb event of 1 item', 'rank by qty item', 'rank by dom val search']]
y = df_train_domains['in domain only from item bought']

X_test_values = df_test_domains[[ 'rank by last date', 
                                 'rank by max nb event of 1 item', 'rank by qty item', 'rank by dom val search']]
y_test = df_test_domains['in domain only from item bought']


logreg = LogisticRegression(C = 1.0)
logreg.fit(X, y)


y_pred = logreg.predict(X_test_values)
y_pred_p = logreg.predict_proba(X_test_values)


print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test_values, y_test)))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print(metrics.classification_report(y_test, y_pred))



Accuracy of logistic regression classifier on test set: 0.79
[[21779  5224]
 [ 1353  3484]]
              precision    recall  f1-score   support

           0       0.94      0.81      0.87     27003
           1       0.40      0.72      0.51      4837

    accuracy                           0.79     31840
   macro avg       0.67      0.76      0.69     31840
weighted avg       0.86      0.79      0.81     31840



# Main : getting data from user history of test set, making the prediction, computing the ncdg

In [8]:
import numpy as np
import pandas as pd
from sklearn import model_selection 
from IPython.display import display, HTML

y_true_ncdg = [row['item_bought'] for row in X_test[:n]]
y_pred_ncdg =[]

df_test_domains_ncdg_calc = pd.DataFrame(df_test_domains)
df_test_domains_ncdg_calc[['y_pred_p is not dom','y_pred_p is dom'] ]= y_pred_p

for i in tqdm(range(1,n+1)):

    domains = df_test_domains_ncdg_calc[df_test_domains_ncdg_calc['i']==i]
    items = df_test_items[df_test_items['i']==i]
    row = X_test[i-1]
    n_viewed = len([ev for ev in row['user_history'] if ev['event_type'] == 'view'])
    n_search = len([ev for ev in row['user_history'] if ev['event_type'] == 'search']) 
     
    if (n_viewed/(n_viewed+n_search)<=0.05) or (len(domains)==0): 
        
        rellenos = relleno_search_based(row, "most common")    
        if rellenos == None:       
            rellenos = random.choices(all_items, k=10) 
        if len(rellenos)<10:
            rellenos = rellenos + random.choices(all_items, k=10-len(rellenos)) 
            
        y_pred_ncdg.append(rellenos)
 
    else:

        recom=[]
        domains_list_sorted = domains.sort_values(by=['y_pred_p is dom'], ascending=False) 
        first_dom = domains_list_sorted['dom name'].values[0]
               
        #item_in_first_domain = items[items['dom name'] == first_dom].sort_values(by=['len(item_event)'], ascending=False)
        item_in_first_domain = items[items['dom name'] == first_dom].sort_values(by=['last_event'], ascending=False)                
        recom1 = []

        for item in item_in_first_domain['item']:
            recom1.append(item)        
        
        if len(domains_list_sorted)>1:
            sec_dom = domains_list_sorted['dom name'].values[1]

            item_in_sec_domain = items[items['dom name'] == sec_dom].sort_values(by=['len(item_event)'], ascending=False)            
            #item_in_sec_domain = items[items['dom name'] == sec_dom].sort_values(by=['last_event'], ascending=False)    
                
            recom2 = []
            for item in item_in_sec_domain['item']:
                if item not in recom1:
                    recom2.append(item) 
             
            if len(recom1)<6:
                recom=recom1+recom2
            else:
                recom = recom1[:5]+recom2

        else:
                        
            recom = recom1
                    
        if len(recom)>=10:
            recom = recom[:10]
        else:
            k = 10 - len(recom)
            #relleno = random.choices(list(metadata_domain_id_with_item[first_dom].keys()), k=k)   
            relleno = sorted(metadata_domain_id_with_item[first_dom].items(), key=lambda item: item[1], reverse = True)[:k]
            relleno = [r[0] for r in relleno]   
            recom = recom + relleno 

            
            if recom == None:
                recom = random.choices(all_items, k=10) 
            if len(recom)<10:
                recom = recom + random.choices(all_items, k=10-len(recom)) 
            
        y_pred_ncdg.append(recom)


score = ndcg(y_pred_ncdg, y_true_ncdg)
score = sum(score)/len(score)
print("ncdg :",score)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


ncdg : 0.25154042051587294
