# Data preparation

In [64]:
import pandas as pd

pd.set_option('display.max_columns', 50)

track = pd.concat(
    (
        pd.read_csv('data/train_tracking.csv'),
        pd.read_csv('data/test_tracking.csv')
    ),
    ignore_index=True,
    axis='rows'
)

track['duration'] = pd.to_timedelta(track['duration'])

In [113]:
import numpy as np

train = pd.read_csv('data/train_session.csv')
test = pd.read_csv('data/random_submission.csv')
train['is_train'] = True
test['is_train'] = False
test['target'] = np.nan

df = pd.concat((train, test), ignore_index=True)

In [67]:
df.head()

Unnamed: 0,sid,target,is_train
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True


# Feature extraction

In [114]:
track['unit'] = 1

In [115]:
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_simple_'), on='sid')

In [116]:
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_'), on='sid')

In [117]:
df = df.join(track.groupby('sid').size().rename('n_pages'), on='sid')

Screen resolution.

In [118]:
track['resolution'] = track['rw'] * track['rh']
df = df.join(track.groupby('sid')['resolution'].nunique().rename('n_resolutions'), on='sid')
df = df.join(track.groupby('sid')['resolution'].max().rename('max_resolution'), on='sid')

Number of results pages seen.

In [119]:
df = df.join(track.groupby('sid')['pn'].nunique().rename('n_unique_pages'), on='sid')

In [120]:
import ast


def count_facets(x):
    try:
        return len(ast.literal_eval(x))
    except ValueError:
        return 0


track['n_facets'] = track['facets'].apply(count_facets)
df = df.join(track.groupby('sid')['n_facets'].max().rename('max_n_facets'), on='sid')

Max quantity in basket.

In [121]:
df = df.join(track.groupby('sid')['quantity'].max().rename('max_size_basket').fillna(0), on = 'sid')

Total quantity in basket.

In [122]:
df = df.join(track.groupby('sid')['quantity'].sum().rename('sum_size_basket').fillna(0), on = 'sid')

Mean quantity by type.

In [138]:
track = track.join(track.groupby(['type_simplified'])['quantity'].mean().rename('mean_quantity_type'), on=['type_simplified'])
df = df.join(track.groupby('sid')['mean_quantity_type'].mean(), on='sid')

In [141]:
track = track.join(track.groupby(['type_simplified'])['quantity'].max().rename('max_quantity_category').fillna(0), on = ['type_simplified'])
df = df.join(track.groupby('sid')['max_quantity_category'].mean(), on = 'sid')

Total time spent.

In [123]:
df = df.join(track.groupby('sid')['duration'].max().rename('seconds_spent').dt.seconds, on='sid')

Average time spent per page.

In [124]:
df = df.join(track.groupby('sid').apply(lambda x: x['duration'].max().seconds / len(x)).rename('average_duration'), on='sid')

Last action.

In [158]:
df.drop(['first_action'], axis='columns', inplace=True)

In [159]:
df = df.join(track.groupby('sid').first()['type'].rename('first_action').astype('category'), on='sid')
df = df.join(track.groupby('sid').last()['type'].rename('last_action').astype('category'), on='sid')

In [125]:
from sklearn import feature_extraction

actions = track.groupby('sid')['type'].apply(lambda x: ' '.join(x))
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2), min_df=10)
vectorizer.fit(actions)
tfidf = pd.DataFrame(vectorizer.transform(actions).todense(), columns=vectorizer.get_feature_names(), index=actions.index).add_prefix('tfidf')
df = df.join(tfidf, on='sid')

In [127]:
df.head()

Unnamed: 0,sid,target,is_train,type_simple_ADD_TO_BASKET,type_simple_CAROUSEL,type_simple_LIST_PRODUCT,type_simple_PA,type_simple_PRODUCT,type_simple_PURCHASE_PRODUCT,type_simple_SEARCH,type_simple_SHOW_CASE,type_ADD_TO_BASKET_CAROUSEL,type_ADD_TO_BASKET_LP,type_ADD_TO_BASKET_LR,type_ADD_TO_BASKET_PA,type_ADD_TO_BASKET_SHOW_CASE,type_CAROUSEL,type_LIST_PRODUCT,type_PA,type_PRODUCT_CAROUSEL,type_PRODUCT_LP,type_PRODUCT_LR,type_PRODUCT_PA,type_PRODUCT_SHOW_CASE,type_PURCHASE_PRODUCT_CAROUSEL,...,max_size_basket,sum_size_basket,seconds_spent,average_duration,tfidfadd_to_basket_carousel,tfidfadd_to_basket_lp,tfidfadd_to_basket_lr,tfidfadd_to_basket_pa,tfidfadd_to_basket_show_case,tfidfcarousel,tfidflist_product,tfidfpa,tfidfproduct_carousel,tfidfproduct_lp,tfidfproduct_lr,tfidfproduct_pa,tfidfproduct_show_case,tfidfpurchase_product_carousel,tfidfpurchase_product_lp,tfidfpurchase_product_lr,tfidfpurchase_product_pa,tfidfpurchase_product_show_case,tfidfpurchase_product_unknow_origin,tfidfsearch,tfidfshow_case
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,4159,1039.75,0.0,0.0,0.0,0.0,0.0,0.382993,0.0,0.0,0.0,0.0,0.516153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.766096,0.0
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True,0.0,4.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,565,43.461538,0.0,0.0,0.0,0.0,0.0,0.512122,0.0,0.0,0.244686,0.0,0.517634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.640245,0.0
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,215,53.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.448262,0.0,0.0,0.499475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.741341,0.0
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,75,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,38,12.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [160]:
df.corr()['target'].sort_values()

tfidflist_product                     -0.054357
tfidfshow_case                        -0.047770
tfidfproduct_pa                       -0.027792
tfidfproduct_lp                       -0.025089
tfidfproduct_show_case                -0.022797
tfidfpa                               -0.022292
average_duration                      -0.020962
type_PRODUCT_PA                       -0.012290
tfidfproduct_carousel                 -0.012006
tfidfsearch                           -0.009421
type_PRODUCT_SHOW_CASE                -0.004875
type_simple_LIST_PRODUCT              -0.004404
type_LIST_PRODUCT                     -0.004404
type_SHOW_CASE                        -0.003709
type_simple_SHOW_CASE                 -0.003709
n_resolutions                          0.004350
tfidfpurchase_product_pa               0.004637
max_resolution                         0.006043
type_PRODUCT_LP                        0.006148
tfidfadd_to_basket_pa                  0.006605
type_PURCHASE_PRODUCT_PA               0

In [88]:
useless = [
    'type_PURCHASE_PRODUCT_SHOW_CASE',
    'type_PURCHASE_PRODUCT_PA',
    'type_PURCHASE_PRODUCT_LP'
]

df = df.drop(useless, axis='columns')

Checkpoint.

In [11]:
df.to_feather('data/features.ftr')

ValueError: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)

# Learning

In [None]:
df = pd.read_feather('data/features.ftr')

Prepare the datasets.

In [161]:
to_drop = ['sid', 'is_train']

X_train = df.query('is_train == 1').drop(to_drop + ['target'], axis='columns')
y_train = df.query('is_train == 1')['target']
X_test = df.query('is_train == 0').drop(to_drop + ['target'], axis='columns')
submission = df.query('is_train == 0')['sid'].to_frame()
submission['target'] = 0

Do some sanity checks.

In [162]:
assert len(X_train) == 133123
assert len(y_train) == 133123
assert len(X_test) == 88750
assert len(submission) == 88750
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [163]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection
import xam


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 4,
    'min_data_per_group': 30,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 5,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 120,
    'learning_rate': 0.02,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)
submission['target'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=40,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['binary_logloss'][-1]
    val_scores[i] = evals_result['val']['binary_logloss'][-1]
    
    # Accumulate test predictions
    submission['target'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))



Training until validation scores don't improve for 40 rounds.
[50]	fit's binary_logloss: 0.272799	val's binary_logloss: 0.278661
[100]	fit's binary_logloss: 0.26657	val's binary_logloss: 0.273145
[150]	fit's binary_logloss: 0.264343	val's binary_logloss: 0.271617
[200]	fit's binary_logloss: 0.262892	val's binary_logloss: 0.270744
[250]	fit's binary_logloss: 0.261805	val's binary_logloss: 0.270354
[300]	fit's binary_logloss: 0.260935	val's binary_logloss: 0.270216
[350]	fit's binary_logloss: 0.260141	val's binary_logloss: 0.270127
[400]	fit's binary_logloss: 0.259418	val's binary_logloss: 0.270063
[450]	fit's binary_logloss: 0.258721	val's binary_logloss: 0.270046
[500]	fit's binary_logloss: 0.258067	val's binary_logloss: 0.269984
[550]	fit's binary_logloss: 0.257426	val's binary_logloss: 0.269948
[600]	fit's binary_logloss: 0.256801	val's binary_logloss: 0.269901
[650]	fit's binary_logloss: 0.256187	val's binary_logloss: 0.269899
[700]	fit's binary_logloss: 0.255587	val's binary_loglos

- Fit AUC: 0.26127 (+/- 0.00123)
- Val AUC: 0.26857 (+/- 0.00393)

Display feature importance.

In [164]:
feature_importances_.sort_values('gain_0')

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
type_SEARCH,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
tfidfpurchase_product_lp,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_SHOW_CASE,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
tfidfpurchase_product_pa,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_PURCHASE_PRODUCT_SHOW_CASE,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_PURCHASE_PRODUCT_PA,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_PURCHASE_PRODUCT_LP,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_PA,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_LIST_PRODUCT,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0
type_CAROUSEL,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0


Make predictions.

In [58]:
submission.head()

Unnamed: 0,sid,target
0,EhjG5b8h+RHDgxkQpkMc9sECXbdnA3JOPS07CRYvWmwYSJ...,0.021754
1,34lu87wJJunsPz2c0SxM/aLQ9x+2dlj5W96R95DIG9yRRe...,0.025196
2,xkSYpSt3qRk8X6Ev1W8d72Vj6oyXbI8DKjkCqHmvcaI36F...,0.03189
3,Tu9ylHPEk5Lw5K/8TpuJOEdJv3ZUzJu0zz2sfBrJDQpmyg...,0.05303
4,s6gbPkykrrtcYDB/OidLEdkEsk/bsWIejziprzhq2wJBij...,0.047834


In [165]:
submission.to_csv('submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(fit_scores.mean(), fit_scores.std(), val_scores.mean(), val_scores.std()), index=False)

# Parameter tuning

In [92]:
from sklearn import model_selection
import scipy.stats as stats


# Parameter spaces can be lists or statistical distributions from
# https://docs.scipy.org/doc/scipy/reference/stats.html
grid = model_selection.ParameterSampler(
    param_distributions={
        'application': ['binary'],
        'boosting_type': ['gbdt'],
        'metric': ['auc'],
        'num_threads': [8],
        'num_leaves': [2 ** 3, 2 ** 4, 2 ** 5],
        'min_data_per_group': [30],
        'max_cat_threshold': [32],
        'max_cat_to_onehot': [4],
        'cat_smooth': [5],
        'cat_l2': [10],
        'max_bin': [255],
        'min_data_in_bin': [3],
        'scale_pos_weight': [1],
        'min_data_in_leaf': [30],
        'learning_rate': stats.uniform(0.1, 0.2),
        'feature_fraction': [1],
        'feature_fraction_seed': [42],
        'bagging_fraction': [1],
        'bagging_seed': [42],
        'lambda_l1': [0],
        'lambda_l2': [0],
        'verbosity': [2]
    },
    n_iter=2,
    random_state=42
)

for config in grid:
    print(config)

{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.1749080237694725, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}
{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.13668695797323277, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}


# Stacking

In [26]:
import catboost as cb
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'XGBoost': xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.06,
        n_estimators=300,
        random_state=42
    )
}


stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.Regression(fit_intercept=True),
    metric=metrics.accuracy_score,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'CatBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'early_stopping_rounds': 80,
            'verbose': False
        }
    }
)

In [27]:
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    stack.fit(X_fit, y_fit, verbose=False)
    
    # Store the training scores
    fit_scores[i] = metrics.roc_auc_score(y_fit, stack.predict(X_fit))
    val_scores[i] = metrics.roc_auc_score(y_val, stack.predict(X_val))
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Fit AUC: 0.89920 (+/- 0.00975)
Val AUC: 0.79175 (+/- 0.02826)


In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)