# Data preparation

In [1]:
import pandas as pd

pd.set_option('display.max_columns', 50)

track = pd.concat(
    (
        pd.read_csv('data/train_tracking.csv'),
        pd.read_csv('data/test_tracking.csv')
    ),
    ignore_index=True,
    axis='rows'
)

track['duration'] = pd.to_timedelta(track['duration'])

In [2]:
import numpy as np

train = pd.read_csv('data/train_session.csv')
test = pd.read_csv('data/random_submission.csv')
train['is_train'] = True
test['is_train'] = False
test['target'] = np.nan

df = pd.concat((train, test), ignore_index=True)

In [3]:
df.head()

Unnamed: 0,sid,target,is_train
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True


# Feature extraction

In [4]:
track['unit'] = 1
track['time_spent'] = track.groupby('sid')['duration'].diff(1).shift(-1).dt.seconds
track['time_spent'] = track['time_spent'].fillna(track['time_spent'].mean())

In [5]:
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_simple_'), on='sid')
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='time_spent', aggfunc=np.sum).fillna(0).add_prefix('type_simple_time_'), on='sid')

In [6]:
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_'), on='sid')
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='time_spent', aggfunc=np.sum).fillna(0).add_prefix('type_time_'), on='sid')

In [7]:
df = df.join(track.groupby('sid').size().rename('n_pages'), on='sid')

Amount of time spent looking at product lists.

In [8]:
df = df.join(track.query('type == "LIST_PRODUCT"').groupby('sid')['time_spent'].sum().rename('leche_vitrine'), on='sid')
df['leche_vitrine'] = df['leche_vitrine'].fillna(0)

Screen resolution.

In [9]:
track['resolution'] = track['rw'] * track['rh']
df = df.join(track.groupby('sid')['resolution'].nunique().rename('n_resolutions'), on='sid')
df = df.join(track.groupby('sid')['resolution'].max().rename('max_resolution'), on='sid')

Number of results pages seen.

In [59]:
df = df.join(track.groupby('sid')['pn'].nunique().rename('n_unique_pages'), on='sid')
df = df.join(track.groupby('sid')['pn'].max().rename('max_pn'), on='sid')
df = df.join(track.groupby('sid')['rcount'].sum().rename('rcount_sum'), on='sid')

ValueError: columns overlap but no suffix specified: Index(['n_unique_pages'], dtype='object')

In [11]:
import ast

track['n_facets'] = track['facets'].apply(lambda x: len(ast.literal_eval(x)) if pd.notnull(x) else 0)
df = df.join(track.groupby('sid')['n_facets'].max().rename('max_n_facets'), on='sid')

Max quantity in basket.

In [12]:
df = df.join(track.groupby('sid')['quantity'].max().rename('max_size_basket').fillna(0), on='sid')

Number of unique actions.

In [13]:
df = df.join(track.groupby('sid')['type'].nunique().rename('n_unique_actions').fillna(0), on='sid')

Total quantity in basket.

In [14]:
df = df.join(track.groupby('sid')['quantity'].sum().rename('sum_size_basket').fillna(0), on='sid')

Unique items in basket.

In [15]:
df['sum_over_unique'] = df['sum_size_basket'] / (df['type_simple_ADD_TO_BASKET'] + 1)

Mean quantity by type.

In [16]:
track = track.join(track.groupby(['type_simplified'])['quantity'].mean().fillna(0).rename('mean_quantity_type'), on=['type_simplified'])
df = df.join(track.groupby('sid')['mean_quantity_type'].mean(), on='sid')

track = track.join(track.groupby(['type'])['quantity'].sum().rename('sum_quantity_type').fillna(0), on = ['type'])
df = df.join(track.groupby('sid')['sum_quantity_type'].sum(), on = 'sid')

track = track.join(track.groupby(['type'])['quantity'].max().rename('max_quantity_type').fillna(0), on = ['type'])
df = df.join(track.groupby('sid')['max_quantity_type'].mean(), on = 'sid')

track = track.join(track.groupby(['type_simplified'])['quantity'].max().rename('max_quantity_category').fillna(0), on=['type_simplified'])
df = df.join(track.groupby('sid')['max_quantity_category'].mean(), on = 'sid')

Total time spent.

In [17]:
df = df.join(track.groupby('sid')['duration'].max().rename('seconds_spent').dt.seconds, on='sid')
df = df.join(track.groupby('sid')['time_spent'].max().rename('time_spent_max'), on='sid')
df = df.join(track.groupby('sid')['time_spent'].mean().rename('time_spent_mean'), on='sid')

Average time spent per page.

In [18]:
df = df.join(track.groupby('sid').apply(lambda x: x['duration'].max().seconds / len(x)).rename('average_duration'), on='sid')

First and last actions.

In [19]:
df = df.join(track.groupby('sid').nth(0)['type'].rename('action_0').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(1)['type'].rename('action_1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-2)['type'].rename('action_n-1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-1)['type'].rename('action_n').astype('category'), on='sid')

In [20]:
df['first_2_actions'] = df['action_0'].str.cat(df['action_1'], sep='_').astype('category')
df['last_2_actions'] = df['action_n-1'].str.cat(df['action_n'], sep='_').astype('category')

TF-IDF.

In [21]:
from sklearn import feature_extraction

actions = track.groupby('sid')['type'].apply(lambda x: ' '.join(x))
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_features=60)
vectorizer.fit(actions)
tfidf = pd.DataFrame(vectorizer.transform(actions).todense(), columns=vectorizer.get_feature_names(), index=actions.index).add_prefix('tfidf')
df = df.join(tfidf, on='sid')

Device.

In [22]:
df = df.join(track.groupby('sid')['device'].first().fillna('missing').astype('category').rename('device'), on='sid')
df = df.join(track.groupby('sid')['siteid'].first().fillna('missing').astype('category').rename('siteid'), on='sid')
df['site_device'] = df['siteid'].str.cat(df['device'], sep='_').astype('category')

Total number of products visited during session.

In [23]:
df = df.join(track.groupby('sid').sku.nunique().rename('n_products'), on='sid')

Total number of products visited during session normalized by number of pages visited.

In [24]:
df['n_products_normed'] = df['n_products'] / df['n_pages']
df['seconds_per_product'] = df['n_products'] / (df['seconds_spent'] + 1)

Stype counts.

In [25]:
track['stype'] = track['stype'].fillna('missing')
df = df.join(pd.pivot_table(track, index='sid', columns='stype', values='unit', aggfunc=np.sum).fillna(0).add_prefix('stype_'), on='sid')

True if the seller is not Cdiscount but storage and delivery are provided by Cdiscount.

In [26]:
track['ff'] = track['ff'].fillna('missing')
df = df.join(pd.pivot_table(track, index='sid', columns='ff', values='unit', aggfunc=np.sum).fillna(0).add_prefix('ff_'), on='sid')

Total pages per type visited normalized by number of pages visited/session duration

In [27]:
import itertools

page_types = ['type_' + elem for elem in track.type.unique().tolist()]
page_types += ['type_simple_' + elem for elem in track.type_simplified.unique().tolist()]

# remove duplicate types
for page_type in ['CAROUSEL', 'PA', 'SEARCH', 'LIST_PRODUCT']:
    page_types.remove('type_' + page_type)

denominators = ["n_pages", "seconds_spent"]

# create features
for page_type, denominator in itertools.product(page_types, denominators):
    df["{}_normed_by_{}".format(page_type, denominator)] = df[page_type] / df[denominator]

Target encoding.

In [29]:
track['previous_type'] = track.groupby('sid')['type_simplified'].shift(-1).fillna('START')
track['type_simplified_sequence'] = track['type_simplified'].str.cat(track['previous_type'], sep='_').astype('category')

In [30]:
import xam

track = track.join(df.set_index('sid')['target'], how = 'left',on ='sid')

encoder = xam.feature_extraction.SmoothTargetEncoder(
    columns = [
        'siteid',
        'device',
        'type_simplified',
        'type',
        'type_simplified_sequence',
        'idcar'
    ],
    prior_weight=100,
    suffix='target_encode'
)
track = encoder.fit_transform(track, track['target'])

col_encode = [c for c in track.columns if c.endswith('target_encode')]
df = df.join(track.groupby('sid')[col_encode].mean(), on='sid')

In [31]:
df = df.join(track.groupby('sid')[col_encode].max().add_prefix('max_'), on='sid')
df = df.join(track.groupby('sid')[col_encode].agg(np.ptp).add_prefix('ptp_'), on='sid')
df = df.join(track.groupby('sid')[col_encode].std().add_prefix('std_'), on='sid')

Number of products purchased during the session

In [32]:
df = df.join(track.query("type_simplified == 'PURCHASE_PRODUCT'").groupby('sid')['quantity'].sum().rename('n_products_purchased'), on='sid')
df["n_products_in_basket"] = df['sum_size_basket'] - df['n_products_purchased']

In [33]:
encoder = xam.feature_extraction.SmoothTargetEncoder(
    columns = [
        'action_n',
        'last_2_actions',
        'site_device'
    ],
    prior_weight=100,
    suffix='target_encode'
)
df = encoder.fit_transform(df, df['target'])

In [34]:
df.head()

Unnamed: 0,sid,target,is_train,type_simple_ADD_TO_BASKET,type_simple_CAROUSEL,type_simple_LIST_PRODUCT,type_simple_PA,type_simple_PRODUCT,type_simple_PURCHASE_PRODUCT,type_simple_SEARCH,type_simple_SHOW_CASE,type_simple_time_ADD_TO_BASKET,type_simple_time_CAROUSEL,type_simple_time_LIST_PRODUCT,type_simple_time_PA,type_simple_time_PRODUCT,type_simple_time_PURCHASE_PRODUCT,type_simple_time_SEARCH,type_simple_time_SHOW_CASE,type_ADD_TO_BASKET_CAROUSEL,type_ADD_TO_BASKET_LP,type_ADD_TO_BASKET_LR,type_ADD_TO_BASKET_PA,type_ADD_TO_BASKET_SHOW_CASE,type_CAROUSEL,...,type_simplified_sequencetarget_encode,idcartarget_encode,max_siteidtarget_encode,max_devicetarget_encode,max_type_simplifiedtarget_encode,max_typetarget_encode,max_type_simplified_sequencetarget_encode,max_idcartarget_encode,ptp_siteidtarget_encode,ptp_devicetarget_encode,ptp_type_simplifiedtarget_encode,ptp_typetarget_encode,ptp_type_simplified_sequencetarget_encode,ptp_idcartarget_encode,std_siteidtarget_encode,std_devicetarget_encode,std_type_simplifiedtarget_encode,std_typetarget_encode,std_type_simplified_sequencetarget_encode,std_idcartarget_encode,n_products_purchased,n_products_in_basket,action_ntarget_encode,last_2_actionstarget_encode,site_devicetarget_encode
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,251.34747,0.0,0.0,6.0,0.0,4152.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.133331,0.131889,0.140389,0.140517,0.150284,0.156344,0.152207,0.135867,0.0,0.0,0.016377,0.022437,0.032889,0.015913,0.0,0.0,0.007815,0.011474,0.013734,0.007957,,,0.119207,0.088601,0.097847
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True,0.0,4.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,333.34747,0.0,0.0,34.0,0.0,443.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.135798,0.130971,0.140389,0.140517,0.150284,0.156344,0.170143,0.135867,0.0,0.0,0.016377,0.035574,0.050824,0.015913,0.0,0.0,0.007337,0.01174,0.013852,0.007644,,,0.119207,0.088601,0.097847
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,51.0,54.0,0.0,360.34747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.123451,0.135867,0.140389,0.140517,0.136935,0.156344,0.152207,0.135867,0.0,0.0,0.02958,0.048988,0.08483,0.0,0.0,0.0,0.013854,0.020035,0.038855,0.0,,,0.067281,0.072089,0.097847
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,324.34747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.114393,0.135867,0.140389,0.140517,0.133907,0.133907,0.130065,0.135867,0.0,0.0,0.0,0.0,0.062688,0.0,0.0,0.0,0.0,0.0,0.031344,0.0,,,0.067281,0.058267,0.097847
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,288.34747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109169,0.135867,0.140389,0.140517,0.133907,0.133907,0.130065,0.135867,0.0,0.0,0.0,0.0,0.062688,0.0,0.0,0.0,0.0,0.0,0.036193,0.0,,,0.067281,0.058267,0.097847


In [36]:
cols = [
    'ptp_idcartarget_encode',
    'ptp_type_simplifiedtarget_encode',
    'ptp_typetarget_encode',
    'max_idcartarget_encode',
    'max_type_simplifiedtarget_encode',
    'max_typetarget_encode',
    'max_quantity_type',
    'action_ntarget_encode',
    'last_2_actionstarget_encode'
]

for (c1, c2) in itertools.combinations(cols, 2):
    df['{}_{}'.format(c1, c2)] = df[c1] / (1 + df[c2])

# Learning

Prepare the datasets.

In [60]:
to_drop = ['sid', 'is_train']

X_train = df.query('is_train == 1').drop(to_drop + ['target'], axis='columns')
y_train = df.query('is_train == 1')['target']
X_test = df.query('is_train == 0').drop(to_drop + ['target'], axis='columns')
submission = df.query('is_train == 0')['sid'].to_frame()
submission['target'] = 0

Do some sanity checks.

In [61]:
assert len(X_train) == 133123
assert len(y_train) == 133123
assert len(X_test) == 88750
assert len(submission) == 88750
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [70]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection
import xam


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 5,
    'max_depth': 10,
    'min_data_per_group': 50,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 40,
    'cat_l2': 10,
    'max_bin': 511,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'feature_fraction_seed': 42,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)
submission['target'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=100,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['binary_logloss'][-1]
    val_scores[i] = evals_result['val']['binary_logloss'][-1]
    
    # Accumulate test predictions
    submission['target'] += (model.predict(X_test) / n_splits)

print('- Fit logloss: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('- Val logloss: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))



Training until validation scores don't improve for 100 rounds.
[50]	fit's binary_logloss: 0.274642	val's binary_logloss: 0.281056
[100]	fit's binary_logloss: 0.261997	val's binary_logloss: 0.269751
[150]	fit's binary_logloss: 0.256017	val's binary_logloss: 0.265119
[200]	fit's binary_logloss: 0.252615	val's binary_logloss: 0.263091
[250]	fit's binary_logloss: 0.250265	val's binary_logloss: 0.262015
[300]	fit's binary_logloss: 0.248402	val's binary_logloss: 0.261498
[350]	fit's binary_logloss: 0.246744	val's binary_logloss: 0.261197
[400]	fit's binary_logloss: 0.245284	val's binary_logloss: 0.261013
[450]	fit's binary_logloss: 0.243905	val's binary_logloss: 0.260893
[500]	fit's binary_logloss: 0.242602	val's binary_logloss: 0.260858
[550]	fit's binary_logloss: 0.241394	val's binary_logloss: 0.260816
[600]	fit's binary_logloss: 0.240242	val's binary_logloss: 0.260741
[650]	fit's binary_logloss: 0.23907	val's binary_logloss: 0.260795
Early stopping, best iteration is:
[594]	fit's binary_l

- Fit logloss: 0.23900 (+/- 0.00259)
- Val logloss: 0.25744 (+/- 0.00379)

In [64]:
val_pred = model.predict(X_val)

In [65]:
mistakes = pd.DataFrame({
    'diff': (y_val - val_pred).abs(),
    'y_val': y_val,
    'y_pred': val_pred,
    'sid': df.iloc[val_idx]['sid']
}).sort_values('diff', ascending=False)
mistakes.head(10)

Unnamed: 0,diff,y_val,y_pred,sid
49237,0.990109,1.0,0.009891,XqaSjdOV9icBKQlxjVHsAS0PelncvO8hy3ODb9vDuYX8SM...
90266,0.989661,1.0,0.010339,YW3O32QhgG2i+kXDWICxgEJhvsxxdoMhrI9U3edE5CLSqY...
44252,0.98884,1.0,0.01116,BnQ8rM9sSsXaJgOhvk390TnkVJNygtCSrHROh54EDOP0y0...
93685,0.98741,1.0,0.01259,3hFOJne6kTO0R19S+hzRP7dUL+glnPQO7vl8Yqz25miTYt...
131290,0.9873,1.0,0.0127,WY3swNlGPy7N54hUxi1bS7suW2L33qIbL7Je4tEiWMlULE...
84600,0.986222,1.0,0.013778,R3UINGzrqsmBR3aG/pP8hSe1Dv6CwMOz8uBSkBi7qNIIEN...
130896,0.985603,1.0,0.014397,+o2ubpkA39WBUHqu5JACCwqpSmKqTmRfcmFRY1P7yio9vx...
14234,0.985594,1.0,0.014406,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...
11025,0.985324,1.0,0.014676,IttMXS9a0t3zZ3wg2siTIUVwqXwFpRQ1EWqLgPaA+cf7p5...
51756,0.985104,1.0,0.014896,jA28D/PVAItz8uRG3c6OKUIYGt1eHxig8zAMeZChNdsXJF...


In [66]:
mistakes.loc[14234]['sid']

'DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOvaQc+SNJbSv8laaDB+3'

In [68]:
df.query('sid == "DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOvaQc+SNJbSv8laaDB+3"')

Unnamed: 0,sid,target,is_train,type_simple_ADD_TO_BASKET,type_simple_CAROUSEL,type_simple_LIST_PRODUCT,type_simple_PA,type_simple_PRODUCT,type_simple_PURCHASE_PRODUCT,type_simple_SEARCH,type_simple_SHOW_CASE,type_simple_time_ADD_TO_BASKET,type_simple_time_CAROUSEL,type_simple_time_LIST_PRODUCT,type_simple_time_PA,type_simple_time_PRODUCT,type_simple_time_PURCHASE_PRODUCT,type_simple_time_SEARCH,type_simple_time_SHOW_CASE,type_ADD_TO_BASKET_CAROUSEL,type_ADD_TO_BASKET_LP,type_ADD_TO_BASKET_LR,type_ADD_TO_BASKET_PA,type_ADD_TO_BASKET_SHOW_CASE,type_CAROUSEL,...,ptp_type_simplifiedtarget_encode_action_ntarget_encode,ptp_type_simplifiedtarget_encode_last_2_actionstarget_encode,ptp_typetarget_encode_max_idcartarget_encode,ptp_typetarget_encode_max_type_simplifiedtarget_encode,ptp_typetarget_encode_max_typetarget_encode,ptp_typetarget_encode_max_quantity_type,ptp_typetarget_encode_action_ntarget_encode,ptp_typetarget_encode_last_2_actionstarget_encode,max_idcartarget_encode_max_type_simplifiedtarget_encode,max_idcartarget_encode_max_typetarget_encode,max_idcartarget_encode_max_quantity_type,max_idcartarget_encode_action_ntarget_encode,max_idcartarget_encode_last_2_actionstarget_encode,max_type_simplifiedtarget_encode_max_typetarget_encode,max_type_simplifiedtarget_encode_max_quantity_type,max_type_simplifiedtarget_encode_action_ntarget_encode,max_type_simplifiedtarget_encode_last_2_actionstarget_encode,max_typetarget_encode_max_quantity_type,max_typetarget_encode_action_ntarget_encode,max_typetarget_encode_last_2_actionstarget_encode,max_quantity_type_action_ntarget_encode,max_quantity_type_last_2_actionstarget_encode,action_ntarget_encode_last_2_actionstarget_encode,max_pn,rcount_sum
14234,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,1.0,True,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,17.0,293.34747,0.0,40.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.060915,0.060821,0.055773,0.055074,0.055074,0.06335,0.060915,0.060821,0.118116,0.118116,0.135867,0.130645,0.130444,0.130649,0.150284,0.144507,0.144285,0.150284,0.144507,0.144285,0.0,0.0,0.038381,1.0,188.0


In [69]:
track.query('sid == "DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOvaQc+SNJbSv8laaDB+3"')

Unnamed: 0,sid,type,query,nb_query_terms,rcount,pn,facets,products,dproducts,rh,rw,device,idcar,carproducts,sku,offerid,quantity,stype,sname,ff,oidcar,ocarproducts,oquery,orcount,ofacets,opn,odproducts,oproducts,siteid,duration,type_simplified,unit,time_spent,resolution,n_facets,mean_quantity_type,sum_quantity_type,max_quantity_type,max_quantity_category,previous_type,type_simplified_sequence,target,siteidtarget_encode,devicetarget_encode,type_simplifiedtarget_encode,typetarget_encode,type_simplified_sequencetarget_encode,idcartarget_encode
1202655,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,LIST_PRODUCT,,,47.0,1.0,,[{'sku': '4+6KkeXk7y/jJst9Ovva1ep6NPTGBfCKCRdX...,,643,1024,TvwEXZP7a45jv48s/MIm/w==,,,,,,missing,,missing,,,,,,,,,5QmFu8A6HVU6cIT8YqnAZg==,00:00:00,LIST_PRODUCT,1,42.0,658432,0,0.0,0.0,0.0,0.0,PRODUCT,LIST_PRODUCT_PRODUCT,1.0,0.104309,0.10434,0.086934,0.086934,0.098779,0.135867
1202656,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,PRODUCT_LP,,,47.0,1.0,,[{'sku': '4+6KkeXk7y/jJst9Ovva1ep6NPTGBfCKCRdX...,,643,1024,TvwEXZP7a45jv48s/MIm/w==,,,AbZ398R2H7G03ynGUQcPOQ==,ZfFVsbR4iilL5Np1t15R7A==,,i40+rMlxCCxz1hgpPEHVCw==,,0,,,,,,,,,5QmFu8A6HVU6cIT8YqnAZg==,00:00:42.116319,PRODUCT,1,17.0,658432,0,0.0,0.0,0.0,0.0,SHOW_CASE,PRODUCT_SHOW_CASE,1.0,0.104309,0.10434,0.136935,0.10473,0.088373,0.135867
1202657,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,SHOW_CASE,,,,,,,,643,1024,TvwEXZP7a45jv48s/MIm/w==,,,,,,missing,,missing,,,,,,,,,5QmFu8A6HVU6cIT8YqnAZg==,00:00:59.696721,SHOW_CASE,1,4.0,658432,0,0.0,0.0,0.0,0.0,PRODUCT,SHOW_CASE_PRODUCT,1.0,0.104309,0.10434,0.091013,0.091013,0.085815,0.135867
1202658,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,PRODUCT_LP,,,47.0,1.0,,[{'sku': '4+6KkeXk7y/jJst9Ovva1ep6NPTGBfCKCRdX...,,643,1024,TvwEXZP7a45jv48s/MIm/w==,,,AbZ398R2H7G03ynGUQcPOQ==,ZfFVsbR4iilL5Np1t15R7A==,,i40+rMlxCCxz1hgpPEHVCw==,,0,,,,,,,,,5QmFu8A6HVU6cIT8YqnAZg==,00:01:03.876006,PRODUCT,1,23.0,658432,0,0.0,0.0,0.0,0.0,CAROUSEL,PRODUCT_CAROUSEL,1.0,0.104309,0.10434,0.136935,0.10473,0.131733,0.135867
1202659,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,CAROUSEL,,,,,,,,643,1024,TvwEXZP7a45jv48s/MIm/w==,bKdGrrBA6Eg3OY46ALMJ3A==,"[{'sku': 'WLwh2oTRUiP5Y3GChKpo1A==', 'offerid'...",,,,missing,,missing,,,,,,,,,5QmFu8A6HVU6cIT8YqnAZg==,00:01:27.413709,CAROUSEL,1,17.0,658432,0,0.0,0.0,0.0,0.0,LIST_PRODUCT,CAROUSEL_LIST_PRODUCT,1.0,0.104309,0.10434,0.150284,0.150284,0.103862,0.119954
1202660,DUKueAl31YTLYBAwsioScpdNRy/n4aZZcMQUslWIlOIhOv...,LIST_PRODUCT,,,47.0,1.0,,[{'sku': '4+6KkeXk7y/jJst9Ovva1ep6NPTGBfCKCRdX...,,643,1024,TvwEXZP7a45jv48s/MIm/w==,,,,,,missing,,missing,,,,,,,,,5QmFu8A6HVU6cIT8YqnAZg==,00:01:45.218701,LIST_PRODUCT,1,251.34747,658432,0,0.0,0.0,0.0,0.0,START,LIST_PRODUCT_START,1.0,0.104309,0.10434,0.086934,0.086934,0.040419,0.135867


Display feature importance.

In [41]:
feature_importances_.sort_values('gain_0', ascending=False)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
max_idcartarget_encode,75542.120274,84,70394.558125,88,61739.411800,92,77540.093965,97,130140.030012,87
max_typetarget_encode_action_ntarget_encode,47893.615672,50,39030.222054,25,49769.937434,53,8280.958338,29,40192.290555,52
max_type_simplified_sequencetarget_encode,44145.781678,137,116363.087234,174,62748.415594,176,74818.901404,152,55390.550990,168
max_typetarget_encode,43891.445765,35,22373.808797,28,10508.692725,18,31994.878632,16,6231.382395,23
max_idcartarget_encode_max_typetarget_encode,31276.389190,43,2940.762684,12,26667.587083,39,40835.779187,38,30166.331051,32
sum_quantity_type,29438.182278,103,13590.309638,67,1670.218302,110,9866.250575,135,15560.905457,117
idcartarget_encode,19284.564395,373,10453.008757,313,10379.099764,571,19004.572279,425,14660.382334,469
max_idcartarget_encode_action_ntarget_encode,18165.890067,53,32858.749152,61,26907.521141,74,18111.660658,47,5691.139068,46
max_idcartarget_encode_max_quantity_type,17559.580612,97,14976.743210,55,29079.410480,124,21977.456398,81,11874.488858,74
last_2_actions,16566.083791,876,12083.965266,605,17870.952547,1148,13838.153542,779,16726.207648,1045


Make predictions.

In [42]:
submission.head()

Unnamed: 0,sid,target
133123,EhjG5b8h+RHDgxkQpkMc9sECXbdnA3JOPS07CRYvWmwYSJ...,0.024086
133124,34lu87wJJunsPz2c0SxM/aLQ9x+2dlj5W96R95DIG9yRRe...,0.019368
133125,xkSYpSt3qRk8X6Ev1W8d72Vj6oyXbI8DKjkCqHmvcaI36F...,0.040228
133126,Tu9ylHPEk5Lw5K/8TpuJOEdJv3ZUzJu0zz2sfBrJDQpmyg...,0.081275
133127,s6gbPkykrrtcYDB/OidLEdkEsk/bsWIejziprzhq2wJBij...,0.019256


In [71]:
submission.to_csv('submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(fit_scores.mean(), fit_scores.std(), val_scores.mean(), val_scores.std()), index=False)

# Parameter tuning

In [92]:
from sklearn import model_selection
import scipy.stats as stats


# Parameter spaces can be lists or statistical distributions from
# https://docs.scipy.org/doc/scipy/reference/stats.html
grid = model_selection.ParameterSampler(
    param_distributions={
        'application': ['binary'],
        'boosting_type': ['gbdt'],
        'metric': ['auc'],
        'num_threads': [8],
        'num_leaves': [2 ** 3, 2 ** 4, 2 ** 5],
        'min_data_per_group': [30],
        'max_cat_threshold': [32],
        'max_cat_to_onehot': [4],
        'cat_smooth': [5],
        'cat_l2': [10],
        'max_bin': [255],
        'min_data_in_bin': [3],
        'scale_pos_weight': [1],
        'min_data_in_leaf': [30],
        'learning_rate': stats.uniform(0.1, 0.2),
        'feature_fraction': [1],
        'feature_fraction_seed': [42],
        'bagging_fraction': [1],
        'bagging_seed': [42],
        'lambda_l1': [0],
        'lambda_l2': [0],
        'verbosity': [2]
    },
    n_iter=2,
    random_state=42
)

for config in grid:
    print(config)

{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.1749080237694725, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}
{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.13668695797323277, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}


# Stacking

In [60]:
import catboost as cb
from sklearn import impute
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'LR': pipeline.Pipeline([
        ('select', xam.pipeline.ColumnSelector(num_cols)),  
        ('impute', impute.SimpleImputer()),  
        ('logistic', linear_model.LogisticRegression(solver='lbfgs', C=1, max_iter=500)),
    ])
}

stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.LinearRegression(fit_intercept=True),
    metric=metrics.log_loss,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 50,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
    }
)

In [62]:
print(metrics.log_loss(y_train, stack.predict(X_train)))

0.25849757572185905


In [61]:
stack.fit(X_train, y_train, verbose=False)
print(metrics.log_loss(y_train, stack.predict(X_train)))



ValueError: Found input variables with inconsistent numbers of samples: [133123, 110936]

In [None]:
submission

In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)