### UTILS

In [1]:
import pickle
import torch
import os
import random
import time
from contextlib import contextmanager
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch.nn.functional as F
from scipy.special import erfinv
from ordered_set import OrderedSet
import scipy
from collections import Counter
# from timezonefinder import TimezoneFinder
# from geopy.geocoders import Nominatim
# import pycountry
import datetime
import pytz



# tzf = TimezoneFinder()


# Lookup table from a short activation name to the callable implementing it;
# 'iden' maps to the identity function.
activation_getter = {'iden': lambda x: x, 'relu': F.relu, 'tanh': torch.tanh, 'sigm': torch.sigmoid}


def platform2country(platform):
    '''
    Return the country name for a platform code.

    The platform is passed to pycountry as an ``alpha_2`` code. The record's
    ``common_name`` is preferred; ``name`` is used when the record has no
    common name.

    args:
        platform: two-letter code used as the ``alpha_2`` lookup key
    returns:
        the country name, or np.nan when pycountry has no matching record

    NOTE(review): the ``pycountry`` import at the top of this file is
    commented out; it has to be restored before this function can run.
    '''
    # look the record up once instead of querying pycountry up to three times
    country = pycountry.countries.get(alpha_2=platform)
    if country is None:
        return np.nan

    try:
        return country.common_name
    except AttributeError:
        # the record carries no common name; fall back to the regular name
        return country.name


def location2utc_offset(location):
    '''
    Return the current UTC offset, in hours, for a location.

    The location is geocoded with Nominatim, the coordinates are mapped to a
    timezone name with ``tzf.timezone_at`` and the offset of "now" in that
    timezone is returned.

    args:
        location: value passed to ``geolocator.geocode`` (its ``str()`` is
            also used as the Nominatim user agent)
    returns:
        offset in hours as a float, or np.nan when the location cannot be
        geocoded or the timezone lookup fails

    NOTE(review): ``Nominatim`` and ``tzf`` are commented out at the top of
    this file; they have to be restored before this function can run.
    '''
    geolocator = Nominatim(user_agent=str(location))
    geocoded = geolocator.geocode(location)

    if geocoded is None:
        return np.nan

    try:
        lat = geocoded.latitude
        lon = geocoded.longitude
        local_now = datetime.datetime.now(pytz.timezone(tzf.timezone_at(lng=lon, lat=lat)))
        return local_now.utcoffset().total_seconds() / 60 / 60
    except Exception:
        # best effort: any lookup failure yields "unknown", but Ctrl-C and
        # interpreter shutdown are no longer swallowed
        return np.nan

def find_longest_repetitive_sequences(sequence):
    '''
    Return a Counter mapping each element to the length of its longest run of
    consecutive repetitions in the sequence.

    args:
        sequence: iterable of hashable elements (``None`` is a valid element)
    returns:
        collections.Counter; empty when the sequence is empty
    '''
    counter = Counter()
    current_element = None
    # a run length of 0 means "no run started yet"; the element value itself
    # is not used as the sentinel, so sequences containing None work
    current_rep = 0

    for element in sequence:
        if current_rep and element == current_element:
            current_rep += 1
            continue

        # the run just ended: keep it if it beats the best run seen so far
        if current_rep and counter[current_element] < current_rep:
            counter[current_element] = current_rep
        current_element = element
        current_rep = 1

    # the final run is never closed inside the loop
    if current_rep and counter[current_element] < current_rep:
        counter[current_element] = current_rep

    return counter




def qcut_safe(prices, q):
    '''
    Quantile-bin prices into at most q bins labelled 0..nbins-1.

    The number of bins is capped at the number of prices.
    '''
    n_quantiles = min(q, len(prices))
    bin_labels = np.arange(n_quantiles)
    return pd.qcut(prices, n_quantiles, labels=bin_labels)



class GaussRankScaler():
    '''
    Rank-based scaler: values are replaced by their rank along axis 0, the
    ranks are rescaled linearly to [-1 + epsilon, 1 - epsilon] and passed
    through the inverse error function.
    '''

    def __init__( self ):
        # epsilon keeps the rescaled ranks strictly inside (-1, 1), where
        # erfinv is finite
        self.epsilon = 1e-9
        self.lower = -1 + self.epsilon
        self.upper =  1 - self.epsilon
        self.range = self.upper - self.lower

    def fit_transform( self, X ):
        '''
        :param X: array-like, ranked along axis 0
        :return: np array with the same shape as X holding the transformed ranks;
                 all zeros when X has a single row
        '''
        # argsort of argsort yields the rank of every entry
        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        if j_range == 0:
            # a single row has no spread: return the centre of the target
            # range instead of dividing 0 by 0 and producing NaN
            return np.zeros( j.shape, dtype = float )

        transformed = j / self.divider
        # shift from [0, range] to [lower, upper]
        transformed = transformed - self.upper
        transformed = scipy.special.erfinv( transformed )

        return transformed
    
def seed_everything(seed=42):
    '''
    Seed the python, numpy and torch (CPU and all CUDA devices) random number
    generators and set the PYTHONHASHSEED environment variable.
    '''
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def compute_rank(inp, to_np=False):
    '''
    Return, for every element of inp, its position in the sorted input.
    Equal elements share the position of their first occurrence in sorted order.

    :param inp: list of mutually comparable elements
    :param to_np: return a np array instead of a list
    :return: list (or np array) of integer ranks, same length as inp
    '''
    sorted_inp = sorted(inp)
    try:
        # one pass to record the first sorted position of each value,
        # instead of a linear list.index() scan per element
        first_position = {}
        for position, value in enumerate(sorted_inp):
            first_position.setdefault(value, position)
        out = [first_position[value] for value in inp]
    except TypeError:
        # unhashable elements: fall back to the linear scan
        out = [sorted_inp.index(value) for value in inp]
    if to_np:
        out = np.array(out)
    return out

def set_seed(seed, cuda=False):
    '''
    Seed the numpy, python and torch random number generators.

    :param seed: integer seed
    :param cuda: also seed the CUDA generators explicitly
    :return: None
    '''
    np.random.seed(seed)
    random.seed(seed)
    # always seed the CPU generator: previously cuda=True skipped it, leaving
    # CPU-side torch randomness unseeded
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)


class CategoricalEncoder():
    '''
    This class is for those operating on large data, in which sklearn's LabelEncoder class may take too much time.
    This encoder is only suitable for 1-d array/ list. You may modify it to become n-d compatible.

    Elements are mapped to consecutive integers starting at 0, in order of first appearance.
    '''
    def __init__(self):
        self.f_dict = {}  # element -> integer code
        self.r_dict = {}  # integer code -> element
        # initialised here so continue_fit() also works on a fresh encoder
        self.n_elements = 0

    def fit(self, array):
        '''
        Reset the encoder and assign codes to the unique elements of array.
        :param array: list or np array
        :return: None
        '''

        unique_elements = OrderedSet(array)
        self.n_elements = 0
        self.f_dict = {}
        self.r_dict = {}

        for e in unique_elements:
            self.f_dict[e] = self.n_elements
            self.r_dict[self.n_elements] = e
            self.n_elements += 1


    def continue_fit(self, array):
        '''
        Do not refresh n_elements, count from the latest n_elements.
        Unseen elements get codes in order of first appearance, so the
        result is deterministic (a plain set has no stable iteration order).
        :param array: list or np array
        :return: None
        '''
        for e in array:
            if e not in self.f_dict:
                self.f_dict[e] = self.n_elements
                self.r_dict[self.n_elements] = e
                self.n_elements += 1


    def reverse_transform(self, transformed_array, to_np=False):
        '''
        :param transformed_array: list or np array of integer codes
        :param to_np: return a np array instead of a list
        :return: list or np array of the original elements
        :raises KeyError: for a code that was never assigned
        '''
        array = [self.r_dict[e] for e in transformed_array]
        if to_np:
            array = np.array(array)
        return array


    def transform(self, array, to_np=False):
        '''
        :param array: array list or np array
        :param to_np: return a np array instead of a list
        :return: list or np array of integer codes
        :raises KeyError: for an element that was never fitted
        '''
        transformed_array = [self.f_dict[e] for e in array]
        if to_np:
            transformed_array = np.array(transformed_array)
        return transformed_array

    def fit_transform(self, array, to_np=False):
        '''
        :param array: array list or np array
        :param to_np: return a np array instead of a list
        :return: list or np array of integer codes
        '''
        self.fit(array)
        return self.transform(array, to_np)

def str2bool(v):
    '''
    Return True when v spells "true" (case-insensitive), False otherwise.

    :param v: str
    '''
    # compare for equality: `in ('true')` tested against a plain string, so
    # '', 't' or 'ru' were accepted as substrings
    return v.lower() == 'true'

def use_optimizer(network, params):
    '''
    Build the torch optimizer selected by params['optimizer'].

    :param network: module whose parameters() are optimised
    :param params: dict with keys 'optimizer' ('adam', 'rmsprop' or 'sgd'),
                   'learning_rate' and, for adam and sgd, 'weight_decay'
    :return: torch.optim.Optimizer
    :raises ValueError: for an unsupported optimizer name
    '''
    optimizer_name = params['optimizer']
    if optimizer_name == 'adam':
        optimizer = torch.optim.Adam(network.parameters(), lr=params['learning_rate'],
                                     weight_decay=params['weight_decay'], eps=1e-07, amsgrad=True)
    elif optimizer_name == 'rmsprop':
        # NOTE(review): weight_decay is not passed here, unlike adam and sgd — confirm intended
        optimizer = torch.optim.RMSprop(network.parameters(),
                                        lr=params['learning_rate'],)
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(network.parameters(), lr=params['learning_rate'],
                                    weight_decay=params['weight_decay'])
    else:
        # fail with a clear message instead of an UnboundLocalError on return
        raise ValueError("unsupported optimizer: {!r}".format(optimizer_name))
    return optimizer

def get_attn_key_pad_mask(seq_k, seq_q, transformed_dummy_value):
    ''' Build the mask that hides padded key positions from every query position. '''

    # True wherever the key holds the padding value
    is_padding = seq_k.eq(transformed_dummy_value)

    # repeat the key mask once per query position: b x lq x lk
    query_length = seq_q.size(1)
    return is_padding.unsqueeze(1).expand(-1, query_length, -1)

def compute_mean_reciprocal_rank(rs):
    '''
    Mean reciprocal rank over a collection of relevance rows.

    rs: 2d array or list of lists; a non-zero entry marks a relevant position.
    Each row contributes 1 / (index of its first non-zero entry + 1), or 0
    when the row has no non-zero entry.

    Examples (values approximate):
        [[0, 0, 1], [0, 1, 0], [1, 0, 0]]    -> 0.6111
        [[0, 0, 0], [0, 1, 0], [1, 0, 0]]    -> 0.5
        [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]] -> 0.75
    '''
    reciprocal_ranks = []
    for row in rs:
        hit_positions = np.asarray(row).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)

@contextmanager
def timer(name):
    '''Context manager printing the wall-clock seconds spent in the with-block once it completes.'''
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print('[{}] done in {:.5f} s'.format(name, elapsed))


In [2]:
# Placeholder values used for padding and for missing entries.
DUMMY_ACTION = 'DUMMMY_A'
DUMMY_USER = -1
DUMMY_ITEM = -1
DUMMY_PRICE_RANK=25
DUMMY_IMPRESSION_INDEX = 25

In [4]:
import pandas as pd
import numpy as np
import collections
import time
from sklearn.model_selection import train_test_split
from ordered_set import OrderedSet
from tqdm import tqdm
from config import *
import xgboost as xgb
import multiprocessing
import gc

In [5]:
configuration = XGBConfiguration()

# the model name records the data size and whether test data is used
model_name = 'xgb_gic_lic_wosh_lf350_lr002_v2'
model_name += '_140k' if configuration.sub_sample else '_all'
if configuration.use_test:
    model_name += '_ut'

seed_everything(42)

In [6]:
# load the first 1000 training rows and give every row a running integer id
train = pd.read_csv(
    'D:\\Dokumenty\\Systemy_rekomendacyjne\\additional_resources\\2019-master\\data\\train.csv',
    sep=',',
    nrows=1000,
)
train['id'] = np.arange(len(train))
train.head(3)

Unnamed: 0,index,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices,id
0,0,b6b4a3c02db0c,1541030408,4JK19KX9RU36,1,search for destination,"Londrina, Brazil",BR,"Londrina, Brazil",desktop,,,,0
1,1,b6b4a3c02db0c,1541030410,4JK19KX9RU36,2,search for destination,"Londrina, Brazil",BR,"Londrina, Brazil",desktop,,,,1
2,2,5b578bc20be9f,1541030412,KQ7YR1O2APO1,1,search for destination,"Vienna, Austria",RO,"Vienna, Austria",mobile,,,,2


In [7]:
# load the first 1000 test rows; their ids continue after the train ids
test = pd.read_csv(
    'D:\\Dokumenty\\Systemy_rekomendacyjne\\additional_resources\\2019-master\\data\\test.csv',
    sep=',',
    nrows=1000,
)
test['id'] = np.arange(len(test)) + len(train)
test.head(3)

Unnamed: 0,user_id,timestamp,session_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices,id
0,MUOOQC23R8N9,1541453208,9c57f365a02a7,1,change of sort order,interaction sort button,ES,"Lisbon, Portugal",desktop,,,,1000
1,L1IOIJ47C0S6,1541453208,ee9489aaeca70,1,interaction item image,2757973,BR,"Penha, Brazil",desktop,,,,1001
2,L1IOIJ47C0S6,1541453208,ee9489aaeca70,2,interaction item image,2757973,BR,"Penha, Brazil",desktop,,,,1002


In [8]:
# load the item metadata; properties become a list of strings, item ids become strings
item_meta = pd.read_csv(
    'D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\trivagoRecSysChallengeData2019_v2\\item_metadata.csv',
    sep=',',
)
item_meta['properties'] = item_meta['properties'].apply(lambda props: props.split('|'))
item_meta['item_id'] = item_meta['item_id'].map(str)
item_meta.head(3)

Unnamed: 0,item_id,properties
0,5101,"[Satellite TV, Golf Course, Airport Shuttle, C..."
1,5416,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ..."
2,5834,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ..."


In [9]:
from contextlib import contextmanager
@contextmanager
def timer(name):
    '''Context manager printing the wall-clock seconds spent in the with-block once it completes.'''
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print('[{}] done in {:.5f} s'.format(name, elapsed))

In [10]:
# Preprocess train and test in place: rename columns, fold non-item references
# into the action string, drop clickouts whose item is not in the impressions,
# parse impressions/prices, and add shifted "last item" and concatenated
# categorical columns.
with timer("preprocessing"): 
    # change columns name
    train.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True)
    test.rename(columns={'reference': 'item_id', 'action_type': 'action'}, inplace=True)

    # for actions of type 'change of sort order' and 'filter selection':
    # concatenate the action and the reference into one string, as these references are not actually item ids
    train.loc[train.action=='change of sort order','action'] = train.loc[train.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)
    test.loc[test.action=='change of sort order','action'] = test.loc[test.action=='change of sort order'].apply(lambda row: row.action + str(row.item_id), axis=1)

    train.loc[train.action=='filter selection','action'] = train.loc[train.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
    test.loc[test.action=='filter selection','action'] = test.loc[test.action=='filter selection'].apply(lambda row: row.action + str(row.item_id), axis=1)
    
    # clear the item_id column (set it to DUMMY_ITEM) where the action is one of:
    # change of sort order, filter selection, search for poi, search for destination;
    # reason same as the above. str.contains is used because the first two
    # action strings now carry the reference as a suffix.
    train.loc[train.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM
    test.loc[test.action.str.contains('change of sort order'), 'item_id'] = DUMMY_ITEM

    train.loc[train.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM
    test.loc[test.action.str.contains('search for poi'), 'item_id'] = DUMMY_ITEM        

    train.loc[train.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM
    test.loc[test.action.str.contains('filter selection'), 'item_id'] = DUMMY_ITEM        

    train.loc[train.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM
    test.loc[test.action.str.contains('search for destination'), 'item_id'] = DUMMY_ITEM  
    
    # remove training examples where the clicked item is not in the impressions
    # (rows without impressions are kept)
    train['in_impressions'] = True
    train.loc[~train.impressions.isna(), 'in_impressions'] = train.loc[~train.impressions.isna()].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
    train = train.loc[train.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
    
    # same filter for test, but only for rows that have both impressions and an item_id
    test['in_impressions'] = True
    test.loc[(~test.impressions.isna()) & (~test.item_id.isna()), 'in_impressions'] = test.loc[(~test.impressions.isna())& (~test.item_id.isna())].apply(lambda row:row.item_id in row.impressions.split('|'), axis=1)
    test = test.loc[test.in_impressions].drop('in_impressions', axis=1).reset_index(drop=True)
    
    # cast item_id to str, then parse impressions and prices from '|'-separated strings to lists
    # (prices become lists of int)
    train['item_id'] = train['item_id'].apply(str)
    train.loc[~train.impressions.isna(),'impressions'] = train.loc[~train.impressions.isna()].impressions.apply(lambda x: x.split('|'))
    train.loc[~train.prices.isna(), 'prices'] = train.loc[~train.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])

    test['item_id'] = test['item_id'].apply(str)
    test.loc[~test.impressions.isna(),'impressions'] = test.loc[~test.impressions.isna()].impressions.apply(lambda x: x.split('|'))
    test.loc[~test.prices.isna(),'prices'] = test.loc[~test.prices.isna()].prices.apply(lambda x: x.split('|')).apply(lambda x: [int(p) for p in x])
    
    # extract the last 3 item_ids that were interacted with
    # NOTE(review): the shifts below run over the whole frame in row order, not
    # per session — confirm rows are ordered by session and step
    train['last_item'] = np.nan
    test['last_item'] = np.nan
    
    # last interacted item: item_id shifted by 1 position
    train_shifted_item_id = [DUMMY_ITEM] + train.item_id.values[:-1].tolist()
    test_shifted_item_id = [DUMMY_ITEM] + test.item_id.values[:-1].tolist()
    
    train['last_item'] = train_shifted_item_id
    test['last_item'] = test_shifted_item_id

    # second last interacted item: item_id shifted by 2 positions
    train_shifted_item_id = [DUMMY_ITEM] *2 + train.item_id.values[:-2].tolist()
    test_shifted_item_id = [DUMMY_ITEM] *2  + test.item_id.values[:-2].tolist()
    
    train['second_last_item'] = train_shifted_item_id
    test['second_last_item'] = test_shifted_item_id

    # third last interacted item: item_id shifted by 3 positions
    train_shifted_item_id = [DUMMY_ITEM] *3 + train.item_id.values[:-3].tolist()
    test_shifted_item_id = [DUMMY_ITEM] *3  + test.item_id.values[:-3].tolist()

    train['third_last_item'] = train_shifted_item_id
    test['third_last_item'] = test_shifted_item_id
    
    # rank of each row's step within its session (1 = smallest step), used to
    # mask shifted items that came from the previous session
    train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=True)
    test['step_rank'] = test.groupby('session_id')['step'].rank(method='max', ascending=True)
    
    # fill the invalid shifted last n item with a constant number
    # NOTE(review): only 'clickout item' rows are masked, and each column only
    # at one exact rank (e.g. second_last_item is not masked at rank 1) — confirm intended
    train.loc[(train.step_rank == 1) & (train.action == 'clickout item'), 'last_item'] = DUMMY_ITEM
    test.loc[(test.step_rank == 1) & (test.action == 'clickout item'), 'last_item'] = DUMMY_ITEM

    train.loc[(train.step_rank == 2) & (train.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM
    test.loc[(test.step_rank == 2) & (test.action == 'clickout item'), 'second_last_item'] = DUMMY_ITEM

    train.loc[(train.step_rank == 3) & (train.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM
    test.loc[(test.step_rank == 3) & (test.action == 'clickout item'), 'third_last_item'] = DUMMY_ITEM
    
    # column name lists (the original author marked these "ignore this");
    # all_cat_columns lists the columns treated as categorical
    keep_columns = ['session_id', 'user_id','item_id', 'impressions','prices', 'city', 'step', 'last_item']
    all_cat_columns = ['item_id', 'city', 'platform', 'device','country','country_platform','action','device_platform']

    
    # generate country from city: the text after the last comma
    # (any leading space after the comma is kept)
    train['country'] = train.city.apply(lambda x:x.split(',')[-1])
    test['country'] = test.city.apply(lambda x:x.split(',')[-1])
    
    # create columns concatenating country with platform, and device with platform,
    # as new string features
    train['country_platform'] = train.apply(lambda row: row.country + row.platform, axis=1)
    test['country_platform'] = test.apply(lambda row: row.country + row.platform, axis=1)

    train['device_platform'] = train.apply(lambda row: row.device + row.platform, axis=1)
    test['device_platform'] = test.apply(lambda row: row.device + row.platform, axis=1)
    # filter out rows where reference doesn't present in impression
    # train = train.loc[train.apply(lambda row:row.item_id in row.impressions, axis=1),:]
    print("train.shape:", train.shape)
    print("test.shape:", test.shape)

train.shape: (1000, 21)
test.shape: (1000, 20)
[preprocessing] done in 0.31856 s


In [35]:
# Map each session_id to the lists of item_ids and actions occurring in it,
# and derive a per-session step from the timestamps.

# stack train on top of test and renumber the rows
data = pd.concat([train, test], axis=0).reset_index(drop=True)

# session id -> sequence of item ids in that session
train_session_interactions = train.groupby('session_id')['item_id'].apply(list).to_dict()
test_session_interactions = test.groupby('session_id')['item_id'].apply(list).to_dict()

# session id -> sequence of actions in that session
train_session_actions = train.groupby('session_id')['action'].apply(list).to_dict()
test_session_actions = test.groupby('session_id')['action'].apply(list).to_dict()

# recompute the step inside each session from the timestamp rank, since the
# "step" column of some sessions is not correctly ordered
train_timestamp_rank = train.groupby('session_id')['timestamp'].rank(method='max')
test_timestamp_rank = test.groupby('session_id')['timestamp'].rank(method='max')
train['sess_step'] = train_timestamp_rank.apply(int)
test['sess_step'] = test_timestamp_rank.apply(int)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [36]:
# Time features per row:
#   'time_diff'   - time between consecutive actions of a session
#   'time_diff_2' / 'time_diff_3' - the same differences moved down by 1 / 2 rows
#   'hour'        - hour extracted from the timestamp (in 4-hour buckets)

data_feature = data.loc[:,['id','step','session_id', 'timestamp','platform','country']].copy()

# compute the time difference between each step
data_feature['time_diff'] = data.groupby('session_id')['timestamp'].diff()

# compute the difference of time difference between each step
data_feature['time_diff_diff'] = data_feature.groupby('session_id')['time_diff'].diff()

# compute the difference of the difference of time difference between each step
data_feature['time_diff_diff_diff'] = data_feature.groupby('session_id')['time_diff_diff'].diff()

# the per-session time difference, shifted down by one row
# NOTE(review): shift() is applied to the whole column, not per session, so
# values can cross session boundaries — confirm intended
data_feature['time_diff_2'] = data.groupby('session_id')['timestamp'].diff().shift(1)

# the per-session time difference, shifted down by two rows (same caveat)
data_feature['time_diff_3'] = data.groupby('session_id')['timestamp'].diff().shift(2)

# hour of the timestamp (interpreted as seconds), bucketed into 4-hour slots;
# this column is dropped again at the end of the cell
data_feature['hour']= pd.to_datetime(data_feature.timestamp, unit='s').dt.hour//4

# # map platform to country
# data_feature['mapped_country'] = data_feature.platform.apply(platform2country)

# # load the precomputed country to utc offsets from geopy
# with open('../input/country2offsets_dict.p','rb') as f:
#     platform_country2offsets_dict = pickle.load(f)
# data_feature['platform2country_utc_offsets'] = data_feature.mapped_country.map(platform_country2offsets_dict)


# transform time difference with rank gauss
# NOTE(review): time_diff contains NaN for the first row of each session;
# verify how GaussRankScaler ranks those
data_feature['rg_time_diff'] = GaussRankScaler().fit_transform(data_feature['time_diff'].values)

# compute the log of step
data_feature['step_log'] = np.log1p(data_feature['step'])

# drop the columns that were only needed to derive the features ('id' is kept)
# data_feature = data_feature.drop(['session_id','step','timestamp','hour','platform','country','mapped_country'], axis=1)
data_feature = data_feature.drop(['session_id','step','timestamp','hour','platform','country'], axis=1)

In [37]:
# attach the per-row time features to both splits through the shared 'id' key
train = pd.merge(train, data_feature, on='id', how='left')
test = pd.merge(test, data_feature, on='id', how='left')

print("train.shape:", train.shape)
print("test.shape:", test.shape)

train.shape: (1000, 29)
test.shape: (1000, 28)


In [38]:
# session id -> sequence of time differences in that session
train_session_time_diff = train.groupby('session_id')['time_diff'].apply(list).to_dict()
test_session_time_diff = test.groupby('session_id')['time_diff'].apply(list).to_dict()

In [49]:
# Create a dict holding one encoder for every feature that needs encoding

# encode the categorical feature
cat_encoders = {}
for col in all_cat_columns:
    cat_encoders[col] = CategoricalEncoder()


# collect every item id seen in any impression list or in the item_id column
all_items = []
for imp in data.loc[~data.impressions.isna()].impressions.tolist() + [data.item_id.apply(str).tolist()] :
    all_items += imp

# all_items = map(int, all_items)
unique_items = OrderedSet(all_items)
unique_actions = OrderedSet(data.action.values)

# the dummy item / dummy action are appended so that they receive a code too
cat_encoders['item_id'].fit(list(unique_items) + [DUMMY_ITEM])
cat_encoders['action'].fit( list(unique_actions) + [DUMMY_ACTION])
for col in  ['city', 'platform', 'device','country','country_platform', 'device_platform']:
    cat_encoders[col].fit(data[col].tolist() )
    
# transform all the categorical columns to continuous integer
for col in all_cat_columns:
    train[col] = cat_encoders[col].transform(train[col].values)
    test[col] = cat_encoders[col].transform(test[col].values)

# get the encoded values of the dummy item/action and of selected action types
# (transform raises KeyError for an action that never occurs in the data)
transformed_clickout_action = cat_encoders['action'].transform(['clickout item'])[0]
transformed_dummy_item = cat_encoders['item_id'].transform([DUMMY_ITEM])[0]
transformed_dummy_action = cat_encoders['action'].transform([DUMMY_ACTION])[0]
transformed_interaction_image = cat_encoders['action'].transform(['interaction item image'])[0]
transformed_interaction_deals = cat_encoders['action'].transform(['interaction item deals'])[0]
transformed_interaction_info = cat_encoders['action'].transform(['interaction item info'])[0]
transformed_interaction_rating = cat_encoders['action'].transform(['interaction item rating'])[0]

In [50]:
# Each session sequence becomes: configuration.sess_length encoded dummy
# entries, followed by the encoded item ids (or actions) of that session.

item_encoder = cat_encoders['item_id']
action_encoder = cat_encoders['action']

# encode the item sequences and pad dummy items in front of all of them
for session_interactions in (train_session_interactions, test_session_interactions):
    for session_id, item_list in session_interactions.items():
        item_padding = [transformed_dummy_item] * configuration.sess_length
        session_interactions[session_id] = item_padding + list(item_encoder.transform(item_list))

# encode the action sequences and pad dummy actions in front of all of them
for session_actions in (train_session_actions, test_session_actions):
    for session_id, action_list in session_actions.items():
        action_padding = [transformed_dummy_action] * configuration.sess_length
        session_actions[session_id] = action_padding + list(action_encoder.transform(action_list))

In [51]:
# import itertools
# ### compute co-occurence matrix
# implicit_train = train.loc[train.action != transformed_clickout_action, :]
# implicit_test = test.loc[test.action != transformed_clickout_action, :]

# # get all interacted items in a session
# implicit_all = pd.concat([implicit_train , implicit_test], axis=0)
# # a list of list containing items in the same session
# co_occ_items = implicit_all.groupby('session_id').item_id.apply(list).to_dict().values()
# co_occ_permutes = [list(itertools.permutations(set(items), 2)) for items in co_occ_items]

# #aggregate co-ocurrence across sessions
# co_occ_coordinates = []
# for coordinates in  co_occ_permutes:
#     co_occ_coordinates += coordinates

# #construct csr
# row, col, values = zip(*((i,j,1) for i,j in co_occ_coordinates ))
# co_occ_matrix= csr_matrix((values, (row, col)), shape=(cat_encoders['item_id'].n_elements, cat_encoders['item_id'].n_elements), dtype=np.float32)

# co_occ_matrix_csc = co_occ_matrix.tocsc()

# print("max entry: ", co_occ_matrix.max())

In [52]:
# categorically encode the last, second last and third last item columns;
# values are cast to str first, then mapped with the item_id encoder
for shifted_frame in (train, test):
    for shifted_column in ('last_item', 'second_last_item', 'third_last_item'):
        raw_item_ids = shifted_frame[shifted_column].astype(str).values
        shifted_frame[shifted_column] = cat_encoders['item_id'].transform(raw_item_ids)

In [53]:
# generate item properties features
# keep only the metadata of items that occur in the data
item_meta = item_meta.loc[item_meta.item_id.isin(unique_items),:]
# encode item_id with the same encoder as the interaction data
item_meta['item_id'] = cat_encoders['item_id'].transform(item_meta['item_id'].values)

# 'star': 1-5 taken from an 'N Star' property
item_meta['star'] = np.nan
item_meta.loc[item_meta.properties.apply(lambda x: '1 Star' in x), 'star'] = 1
item_meta.loc[item_meta.properties.apply(lambda x: '2 Star' in x), 'star'] = 2
item_meta.loc[item_meta.properties.apply(lambda x: '3 Star' in x), 'star'] = 3
item_meta.loc[item_meta.properties.apply(lambda x: '4 Star' in x), 'star'] = 4
item_meta.loc[item_meta.properties.apply(lambda x: '5 Star' in x), 'star'] = 5
# items still without a star value get 9/8/7/6 from their rating property;
# the checks run from best to worst, and each only fills rows that are still NaN
item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Excellent Rating' in y) ), 'star'] = 9
item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Very Good Rating' in y) ), 'star'] = 8
item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Good Rating' in y) ), 'star'] = 7
item_meta.loc[(item_meta.star.isna()) & (item_meta.properties.apply(lambda y: 'Satisfactory Rating' in y) ), 'star'] = 6

# 'rating': numeric value per rating property; assignments run from worst to
# best, so a later (better) rating overwrites an earlier one
item_meta['rating'] = np.nan
item_meta.loc[item_meta.properties.apply(lambda x: 'Satisfactory Rating' in x), 'rating'] = 7.0
item_meta.loc[item_meta.properties.apply(lambda x: 'Good Rating' in x), 'rating'] = 7.5
item_meta.loc[item_meta.properties.apply(lambda x: 'Very Good Rating' in x), 'rating'] = 8.0
item_meta.loc[item_meta.properties.apply(lambda x: 'Excellent Rating' in x), 'rating'] = 8.5

# get binary properties feature
item_properties_df = pd.DataFrame()
item_properties_df['item_id'] = item_meta.item_id
item_properties_df['num_properties'] = item_meta.properties.apply(len)
item_properties_df['star'] = item_meta.star
item_properties_df['item_Beach'] = item_meta.properties.apply(lambda x: 'Beach' in x).astype(np.float16)
item_properties_df['item_Bed & Breakfast'] = item_meta.properties.apply(lambda x: 'Bed & Breakfast' in x).astype(np.float16)
item_properties_df['rating'] = item_meta['rating']


# encoded item id -> star / rating lookup dictionaries
item_star_map = item_properties_df.loc[:,['item_id','star']].set_index('item_id').to_dict()['star']
item_rating_map = item_properties_df.loc[:,['item_id','rating']].set_index('item_id').to_dict()['rating']

In [54]:
# drop the item metadata frame and force a garbage-collection pass
del  item_meta
gc.collect()

139

In [55]:
# # ignore filter_df , not using, consume huge memory yet increase a little
# filter_df = data.loc[ ~data.current_filters.isna(), ['id', 'current_filters']]
# filter_df['current_filters'] = filter_df.current_filters.apply(lambda x:x.split('|'))

# # filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Star' in x), 'nights'] = 3
# filter_df['nights']=np.nan
# filter_df.loc[filter_df.current_filters.apply(lambda x: '2 Nights' in x), 'nights'] = 1
# filter_df.loc[filter_df.current_filters.apply(lambda x: '3 Nights' in x), 'nights'] = 2

# filter_set = list(set(np.hstack(filter_df['current_filters'].to_list())))

# cat_encoders['filters'] = CategoricalEncoder()
# cat_encoders['filters'].fit(filter_set)

# # get binary filter feature
# filters_df = pd.DataFrame()
# filters_df['id'] = filter_df.id
# filters_df['num_filters'] = filter_df.current_filters.apply(len)
# filters_df['breakfast_included'] = filter_df.current_filters.apply( lambda x: 'Breakfast Included' in x).astype(np.float16)
# filters_df['filters_Sort By Price'] = filter_df.current_filters.apply( lambda x: 'Sort by Price' in x).astype(np.float16)
# filters_df['filters_Sort By Popularity'] = filter_df.current_filters.apply( lambda x: 'Sort By Popularity' in x).astype(np.float16)

In [56]:
# Count interactions per item_id for different action types,
# count how many times each item_id appears in the impressions,
# and rank-gauss transform the prices.

# compute interaction image count for each item across train/ test
# (each (session, item, action) combination is counted once)
interaction_image_item_ids = train.loc[train.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_image, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
unique_interaction_image_items, counts = np.unique(interaction_image_item_ids, return_counts=True)
global_image_count_dict = dict(zip(unique_interaction_image_items, counts))  

# compute interaction count for each item across train/ test
# (every action other than clickout)
interaction_item_ids = train.loc[train.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action != transformed_clickout_action, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
unique_interaction_items, counts = np.unique(interaction_item_ids, return_counts=True)
global_interaction_count_dict = dict(zip(unique_interaction_items, counts)) 

# compute interaction deals count for each item across train/ test
interaction_deals_item_ids = train.loc[train.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist() + test.loc[test.action == transformed_interaction_deals, :].drop_duplicates(subset=['session_id','item_id','action']).item_id.tolist()
unique_interaction_deals_items, counts = np.unique(interaction_deals_item_ids, return_counts=True)
global_deals_count_dict = dict(zip(unique_interaction_deals_items, counts))

# from here on keep only the clickout rows
train = train.loc[train.action == transformed_clickout_action,:]
test = test.loc[test.action == transformed_clickout_action,:]
# compute step rank to identify the last row in each session for train/ val split
# (descending rank: 1 marks the clickout with the highest step)
train['step_rank'] = train.groupby('session_id')['step'].rank(method='max', ascending=False)


# compute the impression count for each item: how often it appears in the
# impression lists of the clickout rows
# NOTE: this rebinds unique_items, which earlier held the OrderedSet of all item ids
item_ids = np.hstack([np.hstack(train['impressions'].values), np.hstack(test.impressions.values)])
unique_items, counts = np.unique(item_ids, return_counts=True)
impression_count_dict = dict(zip(unique_items, counts))

# compute the rank gauss transformed prices: price -> transformed value
unique_prices = np.unique(np.hstack([np.hstack(train.prices.values), np.hstack(test.prices.values)]) )
rg_unique_prices = GaussRankScaler().fit_transform(unique_prices)
price_rg_price_dict = dict(zip(unique_prices, rg_unique_prices))

print("train.shape:", train.shape)
print("test.shape:", test.shape)

train.shape: (139, 29)
test.shape: (139, 28)


In [57]:
# train/ val split: validation takes up to 50000 rows with step_rank == 1,
# training keeps all remaining rows
# if configuration.debug:
#     val = train.loc[train.step_rank == 1,:].iloc[:5]
# else:
has_top_step_rank = train.step_rank == 1
val = train.loc[has_top_step_rank, :].iloc[:50000]

val_index = val.index
train = train.loc[~train.index.isin(val_index), :].drop('step_rank', axis=1)
val = val.drop('step_rank', axis=1)

print("train.shape:", train.shape)
print("test.shape:", test.shape)
print("val.shape:", val.shape)

train.shape: (24, 28)
test.shape: (139, 28)
val.shape: (115, 28)


In [58]:
# get the encoded nan item: the code assigned to the string 'nan'
# (raises KeyError if 'nan' was never seen by the item_id encoder)
transformed_nan_item = cat_encoders['item_id'].transform(['nan'])[0]

In [59]:
from collections import defaultdict, Counter

# Module-level state that parse_impressions reads and updates while it walks
# through the clickout rows. All start empty.
session_clickout_count_dict = {}   # keyed by session_id
past_interaction_dict = {}         # keyed by user_id -> list of clicked item ids
last_click_sess_dict = {}          # keyed by session_id -> last clicked item id
last_impressions_dict = {}         # keyed by session_id -> last impression list
sess_last_imp_idx_dict = {}        # keyed by session_id -> impression index of the last click
sess_last_price_dict = {}          # keyed by session_id -> price of the last clicked item
sess_time_diff_dict = {}           # keyed by session_id -> timestamp of the last clickout
sess_step_diff_dict = {}           # keyed by session_id -> step of the last clickout

# missing keys count as 0; `int` is used as the factory instead of a lambda so
# the dict stays picklable
cumulative_click_dict = defaultdict(int)

In [61]:
# for idx, row in enumerate(tqdm(train[:2].itertuples())):
#     print(type(row))
#     print((row.index))

In [123]:
# Output schema of parse_impressions as (column name, dtype) pairs in positional
# order: the position of a pair is the column index used for `current_rows`.
# A dtype of None means the column is not cast (it stays object).
_IMPRESSION_FEATURE_SPEC = (
    ('item_id', 'int32'),                            # 0
    ('label', 'int8'),                               # 1
    ('session_id', None),                            # 2
    ('equal_last_item', 'int8'),                     # 3
    ('price_rank', 'int32'),                         # 4
    ('platform', 'int32'),                           # 5
    ('device', 'int32'),                             # 6
    ('city', 'int32'),                               # 7
    ('price', 'int16'),                              # 8
    ('country', 'int32'),                            # 9
    ('impression_index', 'int32'),                   # 10
    ('step', 'int16'),                               # 11
    ('id', 'int32'),                                 # 12
    ('last_click_item', 'int32'),                    # 13
    ('equal_last_impressions', 'int8'),              # 14
    ('last_click_impression', 'int16'),              # 15
    ('last_interact_index', 'float32'),              # 16
    ('price_diff', 'float16'),                       # 17
    ('last_price', 'float16'),                       # 18
    ('price_ratio', 'float32'),                      # 19
    ('clickout_time_diff', 'float16'),               # 20
    ('country_platform', 'int32'),                   # 21
    ('impression_count', 'int32'),                   # 22
    ('is_interacted', 'int8'),                       # 23
    ('local_interaction_image_count', 'int32'),      # 24
    ('local_interaction_deals_count', 'int32'),      # 25
    ('local_interaction_clickout_count', 'int32'),   # 26
    ('global_interaction_image_count', 'int32'),     # 27
    ('global_interaction_deals_count', 'int32'),     # 28
    ('is_clicked', 'int8'),                          # 29
    ('click_diff', 'float32'),                       # 30
    ('avg_is_interacted', 'float16'),                # 31
    ('avg_liic', 'float16'),                         # 32
    ('avg_lidc', 'float32'),                         # 33
    ('avg_licc', 'float32'),                         # 34
    ('avg_giic', 'float32'),                         # 35
    ('avg_gdc', 'float32'),                          # 36
    ('avg_is_clicked', 'float32'),                   # 37
    ('impression_avg_prices', 'float32'),            # 38
    ('device_platform', 'int32'),                    # 39
    ('equal_max_liic', 'int8'),                      # 40
    ('num_interacted_items', 'int32'),               # 41
    ('equal_second_last_item', 'int8'),              # 42
    ('last_action', 'int32'),                        # 43
    ('last_second_last_imp_idx_diff', 'float32'),    # 44
    ('predicted_next_imp_idx', 'float32'),           # 45
    ('list_len', 'int16'),                           # 46
    ('imp_idx_velocity', 'float32'),                 # 47
    ('time_diff_sess_avg', 'float32'),               # 48
    ('max_time_elapse', 'float32'),                  # 49
    ('sum_time_elapse', 'float32'),                  # 50
    ('avg_time_elapse', 'float32'),                  # 51
    ('item_time_diff', 'float32'),                   # 52
    ('global_interaction_count', 'float32'),         # 53
    ('avg_gic', 'float32'),                          # 54
    ('std_giic', 'float32'),                         # 55
    ('std_gic', 'float32'),                          # 56
    ('local_interaction_count', 'int32'),            # 57
    ('target_index', 'float32'),                     # 58
    ('target_price', 'float32'),                     # 59
    ('co_occ_mean_norm', 'float32'),                 # 60
    ('co_occ_min_norm', 'float32'),                  # 61
    ('co_occ_max_norm', 'float32'),                  # 62
    ('co_occ_median_norm', 'float32'),               # 63
    ('last_item_interaction', 'int32'),              # 64
    ('target_price_rank', 'float32'),                # 65
)
_IMPRESSION_FEATURE_COLUMNS = [name for name, _ in _IMPRESSION_FEATURE_SPEC]
_IMPRESSION_FEATURE_DTYPES = {name: dtype for name, dtype in _IMPRESSION_FEATURE_SPEC if dtype is not None}


def _count_values(values):
    '''
    return a dict that maps each distinct element of values to its number of occurrences
    '''
    unique_values, counts = np.unique(values, return_counts=True)
    return dict(zip(unique_values, counts))


def _feature_blocks_to_frame(blocks):
    '''
    stack the per-clickout feature blocks into one DataFrame with the output schema
    args:
        blocks: list of 2-d object arrays, one per clickout row; may be empty

    an empty list yields an empty DataFrame with the full schema, because
    np.vstack raises ValueError when it is given no arrays
    '''
    if blocks:
        data = np.vstack(blocks)
    else:
        data = np.empty((0, len(_IMPRESSION_FEATURE_COLUMNS)), dtype=object)
    frame = pd.DataFrame(data, columns=_IMPRESSION_FEATURE_COLUMNS)
    return frame.astype(dtype=_IMPRESSION_FEATURE_DTYPES)


def parse_impressions(df, session_interactions, session_actions, session_time_diff, training=True):
    '''
    parse the data into a binary classification task: generate 1 example (row)
    for each item in the impression list of every clickout row of df

    args:
        df: DataFrame of clickout rows; iterated with itertuples() in its current order
        session_interactions: maps session_id to the sequence of encoded item ids of
            that session; the first configuration.sess_length entries are skipped
            when the sequence is "unpadded"
        session_actions: maps session_id to the sequence of encoded actions, indexed
            the same way as session_interactions
        session_time_diff: maps session_id to the sequence of time differences
            between steps (indexed by step, without the padding offset)
        training: when True every row is labelled and a single DataFrame is returned

    returns:
        training=True: one DataFrame with a row per (clickout, impression) pair
        training=False: (df, label_test) where df holds the clickouts whose item_id
            equals transformed_nan_item and label_test holds all the others

    side effects:
        reads and updates the module-level dicts past_interaction_dict,
        last_click_sess_dict, last_impressions_dict, sess_last_imp_idx_dict,
        sess_last_price_dict, sess_time_diff_dict, sess_step_diff_dict and
        session_clickout_count_dict, so the features of a row depend on the rows
        (and calls) processed before it

    raises:
        ValueError: if the clicked item of a labelled row is not in its impression list
    '''
    df_list = []
    label_test_df_list = []

    for row in tqdm(df.itertuples()):
        session_id = row.session_id
        user_id = row.user_id
        sess_step = row.sess_step

        # make sure all per-session / per-user state exists before it is read
        session_clickout_count_dict.setdefault(session_id, 0)
        past_interaction_dict.setdefault(user_id, [])
        last_click_sess_dict.setdefault(session_id, transformed_dummy_item)
        last_impressions_dict.setdefault(session_id, None)
        sess_last_imp_idx_dict.setdefault(session_id, DUMMY_IMPRESSION_INDEX)
        sess_last_price_dict.setdefault(session_id, None)
        sess_time_diff_dict.setdefault(session_id, None)
        sess_step_diff_dict.setdefault(session_id, None)

        # compute the categorically encoded impression list (array) plus a plain
        # list copy for .index() lookups and list equality
        transformed_impressions = cat_encoders['item_id'].transform(row.impressions, to_np=True)
        impression_list = transformed_impressions.tolist()

        current_rows = np.zeros([len(row.impressions), 66], dtype=object)

        # prices as an array so the arithmetic below also works when the column
        # holds plain python lists
        prices = np.asarray(row.prices)

        # compute rank of price this clickout
        price_rank = compute_rank(row.prices)

        # everything that happened in this session before the current clickout
        # (including the padding at the front), sliced once and reused below
        history_end = configuration.sess_length + sess_step - 1
        history_items = session_interactions[session_id][:history_end]
        history_items_arr = np.array(history_items)
        history_actions_arr = np.array(session_actions[session_id][:history_end])

        # compute the number of distinct actions associated with the last interacted item in this session
        last_item_interaction = len(set(history_actions_arr[history_items_arr == row.last_item]))

        # compute the local interaction count for each item id: counts the
        # interactions for every action other than clickout item
        interaction_count_dict = _count_values(history_items_arr[history_actions_arr != transformed_clickout_action])

        # compute the local interaction image / deals / clickout count for each item id
        interaction_image_count_dict = _count_values(history_items_arr[history_actions_arr == transformed_interaction_image])
        interaction_deals_count_dict = _count_values(history_items_arr[history_actions_arr == transformed_interaction_deals])
        interaction_clickout_count_dict = _count_values(history_items_arr[history_actions_arr == transformed_clickout_action])

        # get the finite (non-NaN, non-inf) time differences of this session
        # before the clickout, for later computing the average of them
        time_diffs = session_time_diff[session_id]
        past_time_diffs = np.array(time_diffs[:sess_step - 1])
        finite_time_diff_array = past_time_diffs[np.isfinite(past_time_diffs)]

        # unpad the interactions
        unpad_interactions = session_interactions[session_id][configuration.sess_length:history_end]

        # time elapse of within two steps for each item before the clickout
        item_time_elapse_dict = {}
        for it, elapse in zip(unpad_interactions[:-1], time_diffs[1:sess_step - 1]):
            item_time_elapse_dict.setdefault(it, []).append(elapse)

        # compute time_diff for each item in the session: how many steps back the
        # impression was last interacted with, then the time summed over those steps
        reversed_unpad_interactions = unpad_interactions[::-1]
        interact_diff = [reversed_unpad_interactions.index(imp) if imp in unpad_interactions else np.nan for imp in transformed_impressions]
        item_time_diff = np.array([sum(time_diffs[sess_step - diff - 1:sess_step]) if np.isfinite(diff) else np.nan for diff in interact_diff])

        # position of the clicked item; only computed in training mode, so in test
        # mode the target_* columns are NaN for every row
        target_index = impression_list.index(row.item_id) if training else np.nan

        # the position of the last / second last / third last interacted item in
        # the current impression list (NaN when the item is not displayed)
        last_interact_index = impression_list.index(row.last_item) if row.last_item in transformed_impressions else np.nan
        second_last_interact_index = impression_list.index(row.second_last_item) if row.second_last_item in transformed_impressions else np.nan
        third_last_interact_index = impression_list.index(row.third_last_item) if row.third_last_item in transformed_impressions else np.nan

        # state left behind by the previous clickout of this session / user
        # NOTE(review): the truthiness tests below treat a stored price or
        # timestamp of 0 the same as "no previous clickout"
        last_price = sess_last_price_dict[session_id]
        last_clickout_time = sess_time_diff_dict[session_id]
        past_clicks = past_interaction_dict[user_id]
        reversed_past_clicks = past_clicks[::-1]

        # item id
        current_rows[:, 0] = transformed_impressions

        # label
        current_rows[:, 1] = transformed_impressions == row.item_id
        current_rows[:, 2] = session_id

        # whether current item id equal to the last interacted item id
        current_rows[:, 3] = transformed_impressions == row.last_item
        current_rows[:, 4] = price_rank
        current_rows[:, 5] = row.platform
        current_rows[:, 6] = row.device
        current_rows[:, 7] = row.city
        current_rows[:, 8] = row.prices
        current_rows[:, 9] = row.country

        # impression index
        current_rows[:, 10] = np.arange(len(row.impressions))
        current_rows[:, 11] = row.step
        current_rows[:, 12] = row.id

        # last_click_item: last clickout item id
        current_rows[:, 13] = last_click_sess_dict[session_id]

        # equal_last_impressions: current impression list is exactly the same as the last one that the user encountered
        current_rows[:, 14] = last_impressions_dict[session_id] == impression_list

        # last_click_impression: impression index of the last clicked item
        current_rows[:, 15] = sess_last_imp_idx_dict[session_id]

        # last_interact_index
        current_rows[:, 16] = last_interact_index

        # price_diff
        current_rows[:, 17] = prices - last_price if last_price else np.nan

        # last_price
        current_rows[:, 18] = last_price if last_price else np.nan

        # price_ratio
        current_rows[:, 19] = prices / last_price if last_price else np.nan

        # clickout_time_diff
        current_rows[:, 20] = row.timestamp - last_clickout_time if last_clickout_time else np.nan

        # country_platform
        current_rows[:, 21] = row.country_platform

        # impression_count
        current_rows[:, 22] = [impression_count_dict[imp] for imp in row.impressions]

        # is_interacted: if that item has been interacted in the current session
        current_rows[:, 23] = [imp in history_items for imp in transformed_impressions]

        # local_interaction_image_count
        current_rows[:, 24] = [interaction_image_count_dict.get(imp, 0) for imp in transformed_impressions]

        # local_interaction_deals_count
        current_rows[:, 25] = [interaction_deals_count_dict.get(imp, 0) for imp in transformed_impressions]

        # local_interaction_clickout_count
        current_rows[:, 26] = [interaction_clickout_count_dict.get(imp, 0) for imp in transformed_impressions]

        # global_interaction_image_count
        current_rows[:, 27] = [global_image_count_dict.get(imp, 0) for imp in transformed_impressions]

        # global_interaction_deals_count
        current_rows[:, 28] = [global_deals_count_dict.get(imp, 0) for imp in transformed_impressions]

        # is_clicked
        current_rows[:, 29] = [imp in past_clicks for imp in transformed_impressions]

        # click_diff - how many clickouts ago the user clicked the given impression
        current_rows[:, 30] = [reversed_past_clicks.index(imp) if imp in past_clicks else np.nan for imp in transformed_impressions]

        # average of the previous features: columns 31..37 hold the impression-list
        # mean of columns 23..29
        for i in range(31, 38):
            current_rows[:, i] = np.mean(current_rows[:, i - 8])

        # impression_avg_prices
        current_rows[:, 38] = np.mean(row.prices)
        current_rows[:, 39] = row.device_platform

        # equal_max_liic: equal the maximum of local interaction image count
        current_rows[:, 40] = np.array(current_rows[:, 24]) == np.max(current_rows[:, 24]) if sum(current_rows[:, 24]) > 0 else False

        # num_interacted_items
        current_rows[:, 41] = len(np.unique(history_items_arr))

        # equal_second_last_item
        current_rows[:, 42] = transformed_impressions == row.second_last_item

        # last_action: the action right before this clickout
        current_rows[:, 43] = session_actions[session_id][history_end - 1]

        # last_second_last_imp_idx_diff
        current_rows[:, 44] = last_interact_index - second_last_interact_index

        # predicted_next_imp_idx (the idea is to trace your eyeball, last_interact_index + (last_interact_index - second_last_interact_index))
        current_rows[:, 45] = 2 * last_interact_index - second_last_interact_index

        # list_len
        current_rows[:, 46] = len(row.impressions)

        # imp_idx_velocity
        current_rows[:, 47] = last_interact_index - 2 * second_last_interact_index + third_last_interact_index

        # time_diff_sess_avg
        current_rows[:, 48] = np.mean(finite_time_diff_array)

        # max_time_elapse
        current_rows[:, 49] = [max(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]

        # sum_time_elapse
        current_rows[:, 50] = [sum(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]

        # avg_time_elapse
        current_rows[:, 51] = [np.mean(item_time_elapse_dict[imp]) if imp in item_time_elapse_dict else np.nan for imp in transformed_impressions]

        # item_time_diff
        current_rows[:, 52] = item_time_diff

        # global_interaction_count
        current_rows[:, 53] = [global_interaction_count_dict.get(imp, 0) for imp in transformed_impressions]

        # average global_interaction_count
        current_rows[:, 54] = np.mean(current_rows[:, 53])

        # std of global interaction image count
        current_rows[:, 55] = np.std(current_rows[:, 27])

        # std of global interaction count
        current_rows[:, 56] = np.std(current_rows[:, 53])

        # local_interaction_count
        current_rows[:, 57] = [interaction_count_dict.get(imp, 0) for imp in transformed_impressions]
        current_rows[:, 58] = target_index

        # target price
        current_rows[:, 59] = row.prices[target_index] if not np.isnan(target_index) else np.nan

        # columns 60..63 (normalized co-occurrence statistics) are not computed
        # here; they keep the 0 they were initialised with

        # last_item_interaction
        current_rows[:, 64] = last_item_interaction

        # target price rank
        current_rows[:, 65] = price_rank[target_index] if not np.isnan(target_index) else np.nan

        # in test mode only the rows with a known clicked item go to label_test
        if training or row.item_id == transformed_nan_item:
            df_list.append(current_rows)
        else:
            label_test_df_list.append(current_rows)

        # remember this clickout for the following rows of the same user / session
        past_interaction_dict[user_id].append(row.item_id)
        last_click_sess_dict[session_id] = row.item_id
        last_impressions_dict[session_id] = impression_list
        sess_time_diff_dict[session_id] = row.timestamp
        sess_step_diff_dict[session_id] = row.step
        if row.item_id != transformed_nan_item:
            clicked_mask = transformed_impressions == row.item_id
            sess_last_imp_idx_dict[session_id] = clicked_mask.tolist().index(True)
            sess_last_price_dict[session_id] = prices[clicked_mask][0]

    df = _feature_blocks_to_frame(df_list)
    if training:
        return df
    return df, _feature_blocks_to_frame(label_test_df_list)

In [63]:
# Order every split chronologically before parse_impressions walks through it,
# since that function builds its per-session / per-user history in row order.
# kind='mergesort' is a stable sort: rows that share a timestamp keep their
# current relative order instead of being shuffled by the default quicksort.
# The frames are rebound rather than sorted in place so that a frame that is a
# selection of another one is never modified in place.
train = train.sort_values('timestamp', kind='mergesort')
val = val.sort_values('timestamp', kind='mergesort')
test = test.sort_values('timestamp', kind='mergesort')

In [122]:
# print("sorted!!")
# Expand every split into one row per (clickout, impression) pair.
# The call order matters: parse_impressions keeps its history in module-level
# dicts, so val and test see what the earlier calls recorded.
train = parse_impressions(
    train,
    session_interactions=train_session_interactions,
    session_actions=train_session_actions,
    session_time_diff=train_session_time_diff,
)
val = parse_impressions(
    val,
    session_interactions=train_session_interactions,
    session_actions=train_session_actions,
    session_time_diff=train_session_time_diff,
)
# in test mode the rows with a known clicked item come back separately as label_test
test, label_test = parse_impressions(
    test,
    session_interactions=test_session_interactions,
    session_actions=test_session_actions,
    session_time_diff=test_session_time_diff,
    training=False,
)

print("train.shape:", train.shape)
print("test.shape:", test.shape)
print("val.shape:", val.shape)
print("label_test.shape:", label_test.shape)

0it [00:00, ?it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 112, 112, 113, 113, 113]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=21, index=21, session_id='6c4ba6d174134', timestamp=1541030436, user_id='CJ32S9QQ04RI', step=1, action=2, item_id=109, platform=5, city=16, device=1, current_filters=nan, impressions=['3143733', '103507', '5452024', '103600', '4775550', '2266812', '508561', '95011', '153936', '8162422', '980287', '103458', '1816153', '103672', '2111996', '2430542', '4774596', '1816151', '906417', '3132670', '824196', '103495', '103620', '8165060', '8350208'], prices=[164, 251, 188, 149, 133, 404, 113, 156, 84, 102, 126, 406, 9

11it [00:00, 106.06it/s]

30
3
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 2277, 2277, 470, 470, 470, 470, 470, 470]
[2277, 2277]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 2277, 2277]
[5433 2277]
Pandas(Index=246, index=246, session_id='c46caf82e4cfc', timestamp=1541030504, user_id='251AFZASCPNY', step=3, action=2, item_id=470, platform=3, city=22, device=0, current_filters=nan, impressions=['948643', '100302', '34351', '34337', '740596', '43174', '42894', '42363', '1350878', '34338', '42334', '42952', '5653536', '42402', '34343', '42450', '34348', '18230', '34342', '1346370', '34350', '43160', '42446', '2196006'], prices=[70, 93, 31, 53, 28, 44, 90, 54, 169, 55, 38, 46, 34, 87, 44, 170, 81, 108, 40, 392, 34, 138, 33, 57], id=246, last_item

21it [00:00, 101.71it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1410, 1410, 1410, 1410]
[1410]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1410]
[5433 1410]
Pandas(Index=572, index=572, session_id='8f81080d71569', timestamp=1541030558, user_id='QR045W70VKZ3', step=2, action=2, item_id=1410, platform=25, city=112, device=1, current_filters=nan, impressions=['1314005', '2809180', '2080036', '6402926', '1103070', '4626938', '4659618', '6787016'], prices=[68, 34, 44, 50, 97, 64, 71, 45], id=572, last_item=1497, second_last_item=5321, third_last_item=1493, country=31, country_platform=49, device_platform=42, sess_step=2, time_diff=6.0, time_diff_diff=nan, time_diff_diff_diff=nan, time_diff_2=nan, time_diff_3=10.0, rg_time_diff=0.1171306

24it [00:00, 95.12it/s] 
0it [00:00, ?it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 6]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=5, index=5, session_id='90ccf1b651b92', timestamp=1541030423, user_id='F0NEAMNR1WK0', step=1, action=2, item_id=6, platform=0, city=4, device=1, current_filters=nan, impressions=['507861', '2176280', '8280296', '1830637', '1944129', '7315132', '1669587', '6450704', '1887971', '2771364', '2874560', '8184784', '7880000', '1638907', '2226102', '8269412', '1431081', '3217450', '1889223', '1885013', '8902246', '1946951', '6452918', '6084136', '3951644'], prices=[214, 81, 158, 117, 152, 54, 105, 113, 88, 96, 79, 99, 79, 196, 102, 106, 143, 80, 96, 84, 95, 117, 79, 61, 85], id=5, last_item=5321, second_last

11it [00:00, 105.04it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 363, 363]
[363]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 363]
[5433  363]
Pandas(Index=107, index=107, session_id='713adad4927e3', timestamp=1541030472, user_id='GF8ANZZ3H9TI', step=2, action=2, item_id=363, platform=0, city=43, device=1, current_filters=nan, impressions=['965967', '3805414', '2784958', '4648454', '4501352', '8467488', '6405988', '9437848', '6692058', '7781972', '5601364'], prices=[43, 37, 61, 45, 47, 57, 114, 43, 54, 72, 57], id=107, last_item=5321, second_last_item=5321, third_last_item=5321, country=0, country_platform=0, device_platform=4, sess_step=2, time_diff=2.0, time_diff_diff=nan, time_diff_diff_diff=nan, time_diff_2=nan, time_diff_3=nan, 

22it [00:00, 103.56it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 651]
[5321]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321]
[5433 5321]
Pandas(Index=243, index=243, session_id='eb948b62169fb', timestamp=1541030504, user_id='UO6DEU1KN6G7', step=2, action=2, item_id=651, platform=3, city=48, device=1, current_filters=nan, impressions=['2455862', '5116230', '42864', '932875', '9234374', '2645140', '936559', '42931', '42692', '1363450', '349091', '2564824', '3148690', '2628052', '2233644', '42662', '8183468', '3479202', '10154074', '5970348', '4342578', '43133', '2552826', '4954872', '7145594'], prices=[87, 181, 117, 31, 91, 64, 34, 38, 69, 54, 32, 20, 70, 47, 13, 56, 48, 48, 25, 65, 48, 44, 21, 25, 44], id=243, last_item=964, 

33it [00:00, 104.30it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 928, 928]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=320, index=320, session_id='f22208b45282a', timestamp=1541030516, user_id='XP0ZA762RR61', step=1, action=2, item_id=928, platform=13, city=80, device=1, current_filters=nan, impressions=['2212698', '5458982', '3532752', '3463408', '5010144', '595406', '2629306', '1388828', '4669238', '4105146', '1985343', '399536', '595421', '714341', '9448386', '2709926', '2762675', '4418286', '1173292', '1101190', '6621660', '5098328', '5115654', '2379600', '1107534'], prices=[53, 104, 118, 80, 57, 130, 308, 281, 100, 274, 76, 219, 85, 67, 250, 74, 138, 98, 83, 60, 89, 96, 91, 99, 103], id=320, last_item=532

43it [00:00, 101.80it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1129]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=440, index=440, session_id='dc21fffe0bc27', timestamp=1541030535, user_id='2ZJT3X9VXAUJ', step=1, action=2, item_id=1129, platform=19, city=91, device=0, current_filters=nan, impressions=['7875360', '7126898', '3534986', '352416', '1952071', '7243590', '1625363', '5656778', '95185', '97748', '105653', '1156735', '1216906', '1552471', '2508762', '3136154', '4770336', '7195380', '1832403', '2713384', '2402373', '105671', '363316', '105672', '1810893'], prices=[71, 94, 49, 31, 135, 128, 74, 43, 231, 164, 164, 47, 154, 78, 62, 158, 89, 78, 622, 57, 193, 299, 83, 137, 50], id=440, last_item=5321, secon

54it [00:00, 101.33it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1235]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=496, index=496, session_id='c040d3f0060b7', timestamp=1541030546, user_id='NWWJSEGXU68Y', step=1, action=2, item_id=1235, platform=6, city=99, device=1, current_filters=nan, impressions=['1449201', '7107198', '44548', '3135017', '5861070', '44076', '44681', '44355', '6227386', '44372', '44153', '43371', '43946', '88550', '44493', '1021779', '44922', '43459', '44423', '101001', '44680', '44790', '88688', '44781', '44544'], prices=[107, 97, 134, 93, 103, 27, 102, 114, 73, 146, 70, 128, 138, 51, 77, 140, 93, 48, 65, 72, 75, 93, 134, 147, 83], id=496, last_item=5321, second_last_item=5335, third_last_

65it [00:00, 102.42it/s]

30
5
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 5321, 5321, 5321, 1569]
[5321, 5321, 5321, 5321]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 5321, 5321, 5321]
[5433 5321]
Pandas(Index=592, index=592, session_id='4fe40cb96a3d2', timestamp=1541030560, user_id='OZT9NAKEQMNW', step=5, action=2, item_id=1569, platform=8, city=26, device=0, current_filters='Sort by Price', impressions=['5148390', '892233', '2208510', '5842768', '5837014', '137512', '41958', '1033596', '2628242', '8403260', '149717', '4087002', '1054624', '8271090', '6448348', '10588590', '14024', '5888194', '9366736', '927037', '2459696', '4068978', '10113736', '3135551', '8760006'], prices=[21, 36, 52, 52, 55, 64, 74, 75, 77, 79, 81, 81, 83, 

76it [00:00, 103.48it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1825, 1825, 1825, 1825, 1825]
[1825]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1825]
[5433 1825]
Pandas(Index=676, index=676, session_id='136bbc9a8e1df', timestamp=1541030570, user_id='HW3JCL2KCKQ5', step=2, action=2, item_id=1825, platform=28, city=128, device=1, current_filters=nan, impressions=['908501', '15256', '95749', '17751', '81107', '18349', '6801664', '1695323', '25882', '101718', '132104', '4275286', '95748', '2161540', '7151220', '55248', '9283', '13844', '9275', '14802', '1374606', '1036356', '5681254', '15382', '18271'], prices=[52, 45, 45, 57, 124, 110, 35, 33, 118, 189, 125, 199, 60, 191, 62, 154, 73, 80, 44, 39, 125, 125, 85, 164, 64], id=676, last_

86it [00:00, 99.74it/s] 

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 817, 820]
[817]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 817]
[5433  817]
Pandas(Index=771, index=771, session_id='e2cd4a48cf65d', timestamp=1541030579, user_id='FXFSX09BGZ70', step=2, action=2, item_id=820, platform=16, city=71, device=1, current_filters=nan, impressions=['8280226', '8238358', '2812948', '7851766', '8256252'], prices=[92, 35, 35, 21, 92], id=771, last_item=2017, second_last_item=5321, third_last_item=2169, country=25, country_platform=32, device_platform=29, sess_step=2, time_diff=69.0, time_diff_diff=nan, time_diff_diff_diff=nan, time_diff_2=nan, time_diff_3=10.0, rg_time_diff=0.5398080067578629, step_log=1.0986122886681098)
30
1
[5433, 5433, 5433

96it [00:00, 98.72it/s]

30
4
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1410, 1410, 1410, 1410]
[1410, 1410, 1410]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1410, 1410, 1410]
[5433 1410]
Pandas(Index=834, index=834, session_id='8f81080d71569', timestamp=1541030589, user_id='QR045W70VKZ3', step=4, action=2, item_id=1410, platform=25, city=112, device=1, current_filters=nan, impressions=['1314005', '2809180', '2080036', '6402926', '1103070', '4626938', '4659618', '6787016'], prices=[68, 34, 44, 50, 97, 64, 71, 45], id=834, last_item=2154, second_last_item=5357, third_last_item=2133, country=31, country_platform=49, device_platform=42, sess_step=4, time_diff=28.0, time_diff_diff=25.0, time_diff_diff_diff=28.0, time_diff_2=5.0, time_diff_3=1

106it [00:01, 98.30it/s]

30
3
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 5321, 2351]
[5321, 5321]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 5321]
[5433 5321]
Pandas(Index=901, index=901, session_id='68e8bf6644593', timestamp=1541030594, user_id='221QBQ570YW4', step=3, action=2, item_id=2351, platform=3, city=111, device=0, current_filters=nan, impressions=['1954371', '67865', '66179', '57485', '881845', '687611', '426596', '10259770', '6488328', '441426', '5479518', '2635335', '71310', '376996', '893913', '2353934', '60242', '7925550', '71015', '67537', '60758', '3844108', '9270038', '1109080', '77050'], prices=[163, 264, 264, 220, 281, 135, 177, 211, 415, 211, 180, 102, 273, 49, 218, 332, 132, 203, 141, 62, 75, 840, 135, 141,

115it [00:01, 100.18it/s]
0it [00:00, ?it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 2508]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=6, user_id='ZZLBJK50TFPS', timestamp=1541453209, session_id='5c2ef481c5164', step=1, action=2, item_id=2508, platform=29, city=116, device=1, current_filters=nan, impressions=['14610', '12076', '12245', '12299', '49977', '12075', '49261', '1201264', '32725', '2009673', '99968', '82322', '2267432', '11914', '49533', '49607', '48481', '2179674', '48405', '50648', '918913', '1061826', '5180114', '6452196', '4452280'], prices=[1916, 476, 830, 2086, 546, 682, 731, 736, 1656, 696, 926, 856, 488, 688, 1415, 670, 958, 625, 688, 625, 373, 504, 596, 462, 792], id=1006, last_item=5321, second_last_item=2617,

11it [00:00, 106.05it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5368]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=24, user_id='PC3K486NASTS', timestamp=1541453217, session_id='26ae80da84843', step=1, action=2, item_id=5368, platform=10, city=167, device=0, current_filters=nan, impressions=['2813583', '8927002', '2812010', '4274890', '4274898', '4284726', '9358826'], prices=[36, 53, 57, 61, 61, 61, 88], id=1024, last_item=5321, second_last_item=2713, third_last_item=2691, step_rank=1.0, country=17, country_platform=20, device_platform=17, sess_step=1, time_diff=nan, time_diff_diff=nan, time_diff_diff_diff=nan, time_diff_2=nan, time_diff_3=nan, rg_time_diff=0.834174747032414, step_log=0.6931471805599453)
30
1
[

21it [00:00, 103.60it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 2956]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=68, user_id='L99256TEN5QM', timestamp=1541453225, session_id='6072408e453cc', step=1, action=2, item_id=2956, platform=2, city=184, device=1, current_filters=nan, impressions=['9903484', '60147', '74830', '73604', '60682', '1022827', '60819', '4440068', '71266', '57875', '813306', '65387', '73605', '64901', '899865', '77021', '61018', '62733', '831461', '67305', '78498', '153152', '64337', '58131', '3147142'], prices=[158, 251, 175, 162, 102, 281, 132, 135, 161, 288, 221, 154, 196, 218, 235, 142, 153, 259, 189, 207, 110, 135, 117, 281, 591], id=1068, last_item=5321, second_last_item=5368, third_la

32it [00:00, 101.17it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 3197, 5368, 3197, 3197, 3197, 3197]
[3197]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 3197]
[5433 3197]
Pandas(Index=123, user_id='4WK8BPN2GOXW', timestamp=1541453233, session_id='f4e49b42ebcf5', step=2, action=2, item_id=5368, platform=0, city=64, device=1, current_filters=nan, impressions=['478011', '2068758', '2627243', '918935', '929539', '925941', '104881', '5849680', '1130489', '1208722', '8128170', '1994909', '924697', '903277', '2734473', '1770325', '1713735', '1947769', '4528384', '1471997', '4986828', '3882130', '2005245', '4595526', '1921529'], prices=[23, 18, 21, 21, 20, 16, 16, 21, 19, 22, 18, 22, 21, 17, 19, 17, 10, 15, 19, 18, 18, 22, 22, 23, 17], id=11

42it [00:00, 99.10it/s] 

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 3393, 5321]
[5321]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321]
[5433 5321]
Pandas(Index=190, user_id='BR27X4XGEK89', timestamp=1541453245, session_id='648230faf612a', step=2, action=2, item_id=3393, platform=16, city=221, device=2, current_filters=nan, impressions=['9433978', '4678476', '1500637', '2441955', '2772569', '2626992', '103353', '406946', '1960971', '103350', '4700312', '5735888', '2123698', '2724682', '3142572', '1627111', '2756094', '515436', '2646856', '6445660', '5948952', '3102990', '3135080', '512206', '1472157'], prices=[41, 12, 18, 14, 29, 103, 40, 39, 26, 71, 13, 14, 39, 37, 46, 46, 18, 22, 19, 41, 13, 51, 53, 37, 32], id=1190, last_item

52it [00:00, 97.14it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1279, 5368]
[1279]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 1279]
[5433 1279]
Pandas(Index=249, user_id='ZLX4ODP074FY', timestamp=1541453252, session_id='37d5b748a9202', step=2, action=2, item_id=5368, platform=8, city=51, device=0, current_filters=nan, impressions=['8642', '986971', '2835834', '8628', '1811137', '61700', '1455521', '1104798', '1285525', '8813', '2861604', '3132652', '8134318', '10032092', '10064344', '8772', '2300750', '8729', '8793', '8738', '8771', '8777', '2499104', '2891353', '3120550'], prices=[218, 348, 338, 316, 228, 234, 342, 278, 207, 240, 404, 409, 247, 349, 234, 214, 451, 372, 334, 246, 330, 1008, 442, 211, 254], id=1249, last_item=3881,

63it [00:00, 98.32it/s]

30
3
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 3856, 3856, 3856]
[3856, 3856]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 3856, 3856]
[5433 3856]
Pandas(Index=352, user_id='Q0CLYL25MB0D', timestamp=1541453261, session_id='8e97114eb7e74', step=3, action=2, item_id=3856, platform=33, city=226, device=0, current_filters=nan, impressions=['1016411', '1066248', '2273920', '607351', '2891322', '7532814', '1085676', '2027033', '1142393', '1133071', '1016409', '3855928', '1226658', '1297156', '3119208', '1279744', '949745', '1875621', '4070392', '1288867', '4256688', '8314020', '9655450', '5627340', '4518406'], prices=[60, 111, 135, 81, 111, 51, 93, 35, 100, 21, 84, 98, 104, 43, 77, 56, 196, 59, 37, 56, 56, 23, 26, 58, 35]

74it [00:00, 98.89it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5368]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=442, user_id='KV2CBP2MA4YV', timestamp=1541453267, session_id='7895aef8a5f35', step=1, action=2, item_id=5368, platform=15, city=166, device=1, current_filters=nan, impressions=['936331', '19817', '46184', '46161', '1413032', '46094', '448196', '344741', '109994', '1474435', '468671', '19815', '46101', '46045', '5857346', '6452284', '6033592'], prices=[99, 137, 105, 108, 98, 125, 115, 189, 99, 120, 87, 129, 109, 99, 192, 179, 135], id=1442, last_item=5321, second_last_item=4226, third_last_item=4226, step_rank=1.0, country=23, country_platform=29, device_platform=55, sess_step=1, time_diff=nan, ti

84it [00:00, 98.14it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5368]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=513, user_id='SQ4EY9KC4O2O', timestamp=1541453274, session_id='243b4f053f713', step=1, action=2, item_id=5368, platform=0, city=278, device=1, current_filters=nan, impressions=['3950624', '313616', '1710653', '1555637', '9004316', '1654267', '519801', '1130829', '2124014', '908571', '1394840', '2369087', '1157950', '1088680', '505976', '10114862', '1452733', '1885959', '1387960', '1051034', '106925', '1157904', '1446837', '3822122', '1314785'], prices=[29, 26, 38, 74, 19, 38, 49, 32, 40, 42, 27, 23, 33, 51, 44, 35, 44, 35, 26, 37, 38, 36, 32, 37, 42], id=1513, last_item=5321, second_last_item=5404

94it [00:00, 96.21it/s]

30
3
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 4472, 4472, 5368]
[4472, 4472]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 4472, 4472]
[5433 4472]
Pandas(Index=586, user_id='A0QYP6RQ4GRI', timestamp=1541453279, session_id='473fc5f2cacec', step=3, action=2, item_id=5368, platform=29, city=269, device=1, current_filters=nan, impressions=['924863', '99746', '50248', '48394', '141363', '1503425', '49774', '556606', '99869', '49146', '1288605', '99747', '393216', '2952396', '5417302', '1214850', '627371', '99820', '658361', '141364', '6575614', '4947946', '659886', '394876', '894181'], prices=[82, 59, 65, 79, 35, 79, 65, 56, 50, 70, 65, 61, 59, 81, 68, 38, 46, 42, 49, 55, 81, 81, 55, 36, 58], id=1586, last_item=5395, sec

104it [00:01, 96.55it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 4664]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=680, user_id='M99Z7IYKDX4I', timestamp=1541453286, session_id='e2fa79e8a8202', step=1, action=2, item_id=4664, platform=37, city=302, device=1, current_filters=nan, impressions=['1240081', '5647680', '37714', '9500346', '8276346', '3184770', '5482466', '37711', '2029079', '4500498', '5773306', '94475', '1293576', '9504042', '4957186', '942927', '1298549', '1413728', '96783', '486771', '5188184', '10131972', '1875551', '13348', '80263'], prices=[72, 74, 173, 208, 113, 57, 135, 81, 48, 53, 38, 127, 40, 120, 41, 29, 125, 25, 103, 92, 93, 50, 63, 144, 130], id=1680, last_item=5321, second_last_item=54

114it [00:01, 97.07it/s]

30
2
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 4867, 5368]
[4867]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 4867]
[5433 4867]
Pandas(Index=741, user_id='NS51O96JPR0R', timestamp=1541453293, session_id='98646f2f9083c', step=2, action=2, item_id=5368, platform=0, city=139, device=0, current_filters=nan, impressions=['393591', '104632', '5008004', '2759718', '104828', '9317618', '9772462', '6953508', '3795382', '4504514', '5174478', '104902', '1754159', '2083424', '2061814', '9473574', '4089430', '9386646', '5122358', '1245372', '2790558', '4567728', '9761738', '1668407', '2535386'], prices=[327, 181, 44, 67, 68, 67, 30, 68, 40, 53, 67, 81, 39, 65, 33, 25, 34, 40, 37, 35, 39, 31, 23, 40, 44], id=1741, last_item=4850

125it [00:01, 98.79it/s]

30
1
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5368]
[]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433]
[5433]
Pandas(Index=896, user_id='Z1P7JSI0CXLA', timestamp=1541453302, session_id='f5944a12ace50', step=1, action=2, item_id=5368, platform=10, city=320, device=1, current_filters=nan, impressions=['325366', '171647', '171622', '1254610', '325286', '1209040', '1255389', '171597', '8798012', '495256', '885275', '506836', '2560944', '220621', '2882044', '888481', '2860916', '811616', '5824670', '2200602', '4873000', '4688542', '2680580', '4689366', '682806'], prices=[77, 74, 93, 51, 71, 101, 27, 47, 84, 41, 85, 61, 26, 84, 36, 52, 19, 270, 74, 37, 93, 67, 25, 39, 33], id=1896, last_item=5321, second_last_item=5427, thi

135it [00:01, 98.07it/s]

30
10
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 5321, 5321, 5321, 5404, 5321, 5321, 5321, 5297, 5297]
[5321, 5321, 5321, 5321, 5404, 5321, 5321, 5321, 5297]
[5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5433, 5321, 5321, 5321, 5321, 5404, 5321, 5321, 5321, 5297]
[5433 5321 5404 5297]
Pandas(Index=996, user_id='KJHA48TLPE4B', timestamp=1541453309, session_id='ba870fe896c9e', step=10, action=2, item_id=5297, platform=33, city=219, device=1, current_filters='Sort by Price', impressions=['1214732', '2862500', '2818066', '2036955', '5828952', '1214722', '6441418', '991391', '4058366', '933537', '3578602', '1875615', '7014392', '7716132', '1765699', '11088266', '1235395', '2198950', '6511874', '7883588', '1479157', '224331

139it [00:01, 97.87it/s]


train.shape: (516, 66)
test.shape: (1565, 66)
val.shape: (2570, 66)
label_test.shape: (1666, 66)


In [141]:
# Disabled: would append label_test to train when configuration.use_test is set.
# if configuration.use_test:
#     train = pd.concat([train, label_test], axis=0)

# Report (rows, columns) of every split.
for split_name, split_df in (("train", train), ("test", test), ("val", val)):
    print(f"{split_name}.shape:", split_df.shape)

train.shape: (516, 66)
test.shape: (1565, 66)
val.shape: (2570, 66)


In [143]:
print("test before merge", test.shape)

# Left-join the item property columns onto every split by item_id;
# the shape prints before/after make an unexpected row-count change visible.
item_merge_kwargs = dict(on="item_id", how="left")
train = train.merge(item_properties_df, **item_merge_kwargs)
val = val.merge(item_properties_df, **item_merge_kwargs)
test = test.merge(item_properties_df, **item_merge_kwargs)

print("test ", test.shape)

# Disabled: the same join for filters_df, keyed on 'id'.
# train = train.merge(filters_df, on='id', how="left")
# val = val.merge(filters_df, on='id', how="left")
# test = test.merge(filters_df, on='id', how="left")


# print("test ", test.shape)
# print("test before merge data_feature", test.shape)

test before merge (1565, 66)
test  (1565, 71)


In [144]:
# Left-join the per-row features in data_feature onto every split by 'id'.
feature_merge_kwargs = dict(on='id', how="left")
train = train.merge(data_feature, **feature_merge_kwargs)
val = val.merge(data_feature, **feature_merge_kwargs)
test = test.merge(data_feature, **feature_merge_kwargs)

# Report the shapes after the join.
for split_name, split_df in (("train", train), ("test", test), ("val", val)):
    print(f"{split_name}.shape:", split_df.shape)

train.shape: (516, 78)
test.shape: (1565, 78)
val.shape: (2570, 78)


In [145]:
# Drop the references to frames that were already merged into the splits,
# then force a garbage-collection pass. gc.collect() is the last expression
# of the cell, so its return value is what the notebook displays.
# del filters_df
del data_feature
del data
gc.collect()

294

In [146]:
# Target encoding: for each listed column, the mean of `label` per value is
# computed on train only and mapped onto all three splits as
# `<column>_label_avg`. Values absent from train map to NaN.
# NOTE(review): train rows are encoded with means that include their own
# label - confirm this leakage is acceptable.
agg_cols = ['price_rank', 'city', 'platform', 'device', 'country', 'impression_index', 'star']
for c in agg_cols:
    label_mean = train.groupby(c)['label'].mean()
    for split_df in (train, val, test):
        split_df[f'{c}_label_avg'] = split_df[c].map(label_mean)

In [147]:
# Mean `price` per value of each listed column, computed on train only and
# mapped onto all three splits as `<column>_price_avg`.
agg_cols = ['city', 'impression_index', 'platform']
for c in agg_cols:
    price_mean = train.groupby(c)['price'].mean()
    for split_df in (train, val, test):
        split_df[f'{c}_price_avg'] = split_df[c].map(price_mean)

In [148]:
# Mean `rg_time_diff` per city, computed on train only and mapped onto all
# three splits as `city_td_avg`.
agg_cols = ['city']
for c in agg_cols:
    td_mean = train.groupby(c)['rg_time_diff'].mean()
    for split_df in (train, val, test):
        split_df[f'{c}_td_avg'] = split_df[c].map(td_mean)

In [149]:
# Translate every raw price through price_rg_price_dict; prices that are not
# keys of the dict become NaN.
for split_df in (train, val, test):
    split_df['rg_price'] = split_df['price'].map(price_rg_price_dict)

In [150]:
# Price bins within each city.
#
# Collect the distinct (city, price) pairs over all splits, bin the prices of
# each city with qcut_safe (q=40), and turn every (city, bin) combination
# into an integer code stored in `city_price_bin`.

data = pd.concat([train, val, test], axis=0).reset_index()
data = data.loc[:, ['city', 'price']].drop_duplicates(['city', 'price'])

# group_keys=False keeps the result indexed like `data`. Without it,
# pandas >= 2.0 prepends the group key to the index of the applied result and
# the column assignment below no longer aligns with `data`.
data['city_price_bin'] = (
    data.groupby('city', group_keys=False)
    .price.apply(lambda x: qcut_safe(x, q=40).astype(str))
)

# Prefix the bin label with the city so equal labels from different cities
# stay distinct (vectorized instead of a row-wise DataFrame.apply).
data['city_price_bin'] = data['city'].astype(str) + data['city_price_bin']
data['city_price_bin'] = data['city_price_bin'].factorize()[0]

In [151]:
# Attach city_price_bin to every split through its (city, price) pair.
bin_merge_keys = ['city', 'price']
train = train.merge(data, on=bin_merge_keys, how='left')
val = val.merge(data, on=bin_merge_keys, how='left')
test = test.merge(data, on=bin_merge_keys, how='left')

# Report the shapes after the join.
for split_name, split_df in (("train", train), ("val", val), ("test", test)):
    print(split_name, split_df.shape)
# test = test.merge(item_properties_df, on="item_id", how="left")

train (516, 91)
val (2570, 91)
test (1565, 91)


In [152]:
# Columns removed from the feature matrices handed to the model.
data_drop_columns = [
    'label', 'session_id', 'step', 'id',
    'target_index', 'target_price', 'target_price_rank',
]
# data_drop_columns+= ['avg_lidc','avg_licc']

# Keep the labels separately; they are passed to the model as targets.
train_label = train['label']
val_label = val['label']

In [154]:
# Build the XGBoost matrices for the three splits.
#
# The feature frame of train is computed once (the original dropped the same
# columns from train four times, each a full copy). val and test are selected
# by train's feature list, so the data columns are guaranteed to be in the
# same order as the feature_names they are labelled with.
train_features = train.drop(data_drop_columns, axis=1)
feature_names = train_features.columns.tolist()

d_train = xgb.DMatrix(data=train_features, label=train_label.values, silent=True, nthread=-1, feature_names=feature_names)
d_val = xgb.DMatrix(data=val[feature_names], label=val_label.values, silent=True, nthread=-1, feature_names=feature_names)
d_test = xgb.DMatrix(test[feature_names], nthread=-1, feature_names=feature_names)

# The temporary copy is no longer needed once the matrices are built.
del train_features


In [155]:
# Columns treated as categorical; print each one that holds a negative value
# in train.
cat_cols = [
    'item_id', "price_rank", 'city', 'platform', 'device', 'country',
    'impression_index', 'star', 'last_click_impression', 'last_click_item',
    'last_interact_index', 'country_platform',
]

for col in cat_cols:
    has_negative = (train[col] < 0).any()
    if has_negative:
        print("contains negative ", col)

In [156]:
# The training frame is no longer referenced after d_train was built; drop the
# name and force a garbage-collection pass. gc.collect() is the last
# expression of the cell, so its return value is what the notebook displays.
del  train
gc.collect()

159

In [157]:
# Parameter dictionary passed to xgb.train.
params = dict(
    eta=0.02,  # 0.03,
    booster="gbtree",
    tree_method='hist',
    max_leaves=350,
    max_depth=10,  # 18
    # leave one CPU core free
    nthread=multiprocessing.cpu_count() - 1,
    subsample=0.9,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    min_child_weight=2,
    alpha=1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=5478,
    verbosity=0,
)

In [158]:
# Train the booster on d_train while reporting logloss on both train and
# validation. The validation set is listed last in the watchlist and is the
# one the training log reports as used for early stopping.
# NOTE(review): depending on the xgboost version, clf.predict may use all
# trained rounds rather than only those up to the best iteration - confirm
# and, if needed, restrict prediction to the best iteration.
watchlist = [ (d_train, 'train'), (d_val, 'valid')]
clf = xgb.train(
    params=params,
    dtrain=d_train,
    num_boost_round=50000, # upper bound on rounds; 11927 was noted here by the author (meaning not shown)
    evals= watchlist,
    early_stopping_rounds=500,  # stop after 500 rounds without validation improvement
    verbose_eval=500,  # log the metrics every 500 rounds
    # categorical_feature= cat_cols
)


[0]	train-logloss:0.675696	valid-logloss:0.676206
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 500 rounds.
[500]	train-logloss:0.009446	valid-logloss:0.206726
Stopping. Best iteration:
[146]	train-logloss:0.036199	valid-logloss:0.176897



In [159]:
def evaluate(val_df, clf, d_val):
    """Score the validation rows and return their mean reciprocal rank.

    Args:
        val_df: DataFrame holding at least the 'session_id' and 'label'
            columns. A 'scores' column with the model output is added to it
            in place.
        clf: trained model; only ``clf.predict(d_val)`` is called.
        d_val: model input whose rows line up with the rows of ``val_df``.

    Returns:
        The value returned by ``compute_mean_reciprocal_rank`` for the
        per-session label arrays ordered by descending score.
    """
    val_df['scores'] = clf.predict(d_val)
    rss = []
    for _, group in val_df.groupby('session_id'):
        # Rank on plain NumPy arrays. Calling np.flip on a pandas Series goes
        # through Series.__getitem__ with a tuple of slices, which newer
        # pandas versions reject. Reversing the ascending argsort keeps the
        # original ordering, including the order among tied scores.
        sorted_arg = np.argsort(group['scores'].values)[::-1]
        rss.append(group['label'].values[sorted_arg])

    mrr = compute_mean_reciprocal_rank(rss)
    return mrr

mrr = evaluate(val, clf, d_val)

print("Val MRR score: ", mrr)

Val MRR score:  0.5565912858885608


In [160]:
# Gain-based feature importance as reported by the booster, shown as a table
# sorted from most to least important (top 20 rows printed).
imp = clf.get_score(importance_type='gain')

imp_df = pd.DataFrame.from_dict(imp, orient='index').reset_index()
imp_df.columns = ['name', 'importance']
imp_df = imp_df.sort_values('importance', ascending=False)

print(imp_df.head(20))

                              name  importance
3                    is_interacted   19.161229
2                       is_clicked   17.411168
4                       click_diff   13.283170
8            last_click_impression    4.805091
10        global_interaction_count    3.333767
1                      price_ratio    2.698560
0                 impression_index    1.197605
5                 country_platform    1.114625
6       impression_index_label_avg    0.813311
11                      price_diff    0.549449
9   global_interaction_image_count    0.226105
7             price_rank_label_avg    0.131746


In [169]:
# Accumulators filled per session by the later ranking loop.
predictions, session_ids = [], []

# Score every test row with the trained model.
test['score'] = clf.predict(d_test)

# Work on a copy so `test` keeps its encoded item ids, and map the ids in the
# copy back through the item_id encoder.
save_test = test.copy()
decoded_item_ids = cat_encoders['item_id'].reverse_transform(save_test['item_id'].values)
save_test['item_id'] = decoded_item_ids

# Persist only the columns listed below.
score_columns = ['score', 'session_id', 'item_id', 'step']
with open(f'{model_name}_test_score.p', 'wb') as score_file:
    pickle.dump(save_test.loc[:, score_columns], score_file, protocol=4)

In [170]:
# Build one recommendation string per test session: the session's item ids
# ordered by descending model score, mapped back through the item_id encoder
# and joined with spaces.
grouped_test = test.groupby('session_id')
for session_id, group in grouped_test:
    # Rank on plain NumPy arrays. Calling np.flip on a pandas Series goes
    # through Series.__getitem__ with a tuple of slices, which newer pandas
    # versions reject. Reversing the ascending argsort keeps the original
    # ordering, including the order among tied scores.
    sorted_arg = np.argsort(group['score'].values)[::-1]
    sorted_item_ids = group['item_id'].values[sorted_arg]
    sorted_item_ids = cat_encoders['item_id'].reverse_transform(sorted_item_ids)
    sorted_item_string = ' '.join(str(i) for i in sorted_item_ids)
    predictions.append(sorted_item_string)
    session_ids.append(session_id)

# Two-column frame: session id and its ranked item list.
prediction_df = pd.DataFrame()
prediction_df['session_id'] = session_ids
prediction_df['item_recommendations'] = predictions


In [None]:
print("pred df shape", prediction_df.shape)

# Read the existing submission file, replace its item_recommendations column
# with the model's ranking (inner join on session_id) and write the result.
template_path = 'D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\submission_popular.csv'
output_path = f'D:\\Dokumenty\\Systemy_rekomendacyjne\\additional_resources\\2019-master\\data\\{model_name}_1mln_all.csv'

sub_df = pd.read_csv(template_path).drop('item_recommendations', axis=1)
sub_df = sub_df.merge(prediction_df, on="session_id")
# sub_df['item_recommendations'] = predictions

sub_df.to_csv(output_path, index=None)


### WYNIKI
1mln trening, wszystko test - MRR: 0.6666014120446141<br>
100k trening, wszystko test - MRR: 0.64919857565502<br>
10k trening, wszystko test - MRR: 0.6182944544861741<br>
1k trening, wszystko test - MRR: ~0.51<br>

### UWAGI

In [None]:
import collections

# Reload the raw training log and count how often `reference` is one of the
# '|'-separated entries of `impressions`. Rows without impressions keep the
# default True.
train = pd.read_csv(
    'D:\\Dokumenty\\Systemy_rekomendacyjne\\additional_resources\\2019-master\\data\\train.csv',
    sep=',',
    # nrows=100000,
)
train['id'] = np.arange(len(train))
train['in_impressions'] = True

has_impressions = train['impressions'].notna()
train.loc[has_impressions, 'in_impressions'] = train.loc[has_impressions].apply(
    lambda row: row.reference in row.impressions.split('|'), axis=1
)
collections.Counter(train['in_impressions'])