# Data preparation

In [1]:
import pandas as pd

pd.set_option('display.max_columns', 50)

track = pd.concat(
    (
        pd.read_csv('data/train_tracking.csv'),
        pd.read_csv('data/test_tracking.csv')
    ),
    ignore_index=True,
    axis='rows'
)

track['duration'] = pd.to_timedelta(track['duration'])

In [2]:
import numpy as np

train = pd.read_csv('data/train_session.csv')
test = pd.read_csv('data/random_submission.csv')
train['is_train'] = True
test['is_train'] = False
test['target'] = np.nan

df = pd.concat((train, test), ignore_index=True)

In [3]:
df.head()

Unnamed: 0,sid,target,is_train
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True


# Feature extraction

In [4]:
track['unit'] = 1
track['time_spent'] = track.groupby('sid')['duration'].diff(-1).dt.seconds

In [5]:
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_simple_'), on='sid')
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='time_spent', aggfunc=np.sum).fillna(0).add_prefix('type_simple_time_'), on='sid')

In [6]:
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_'), on='sid')
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='time_spent', aggfunc=np.sum).fillna(0).add_prefix('type_time_'), on='sid')

In [7]:
df = df.join(track.groupby('sid').size().rename('n_pages'), on='sid')

Screen resolution.

In [8]:
track['resolution'] = track['rw'] * track['rh']
df = df.join(track.groupby('sid')['resolution'].nunique().rename('n_resolutions'), on='sid')
df = df.join(track.groupby('sid')['resolution'].max().rename('max_resolution'), on='sid')

Number of results pages seen.

In [9]:
df = df.join(track.groupby('sid')['pn'].nunique().rename('n_unique_pages'), on='sid')

In [10]:
import ast

track['n_facets'] = track['facets'].apply(lambda x: len(ast.literal_eval(x)) if pd.notnull(x) else 0)
df = df.join(track.groupby('sid')['n_facets'].max().rename('max_n_facets'), on='sid')

Max quantity in basket.

In [11]:
df = df.join(track.groupby('sid')['quantity'].max().rename('max_size_basket').fillna(0), on='sid')

Number of unique actions.

In [12]:
df = df.join(track.groupby('sid')['type'].nunique().rename('n_unique_actions').fillna(0), on='sid')

Total quantity in basket.

In [13]:
df = df.join(track.groupby('sid')['quantity'].sum().rename('sum_size_basket').fillna(0), on='sid')

Unique items in basket.

In [14]:
df['sum_over_unique'] = df['sum_size_basket'] / (df['type_simple_ADD_TO_BASKET'] + 1)

Mean quantity by type.

In [15]:
track = track.join(track.groupby(['type_simplified'])['quantity'].mean().fillna(0).rename('mean_quantity_type'), on=['type_simplified'])
df = df.join(track.groupby('sid')['mean_quantity_type'].mean(), on='sid')

track = track.join(track.groupby(['type'])['quantity'].sum().rename('sum_quantity_type').fillna(0), on = ['type'])
df = df.join(track.groupby('sid')['sum_quantity_type'].sum(), on = 'sid')

track = track.join(track.groupby(['type'])['quantity'].max().rename('max_quantity_type').fillna(0), on = ['type'])
df = df.join(track.groupby('sid')['max_quantity_type'].mean(), on = 'sid')

track = track.join(track.groupby(['type_simplified'])['quantity'].max().rename('max_quantity_category').fillna(0), on=['type_simplified'])
df = df.join(track.groupby('sid')['max_quantity_category'].mean(), on = 'sid')

Total time spent.

In [24]:
df = df.join(track.groupby('sid')['duration'].max().rename('seconds_spent').dt.seconds, on='sid')
df = df.join(track.groupby('sid')['time_spent'].max().rename('time_spent_max'), on='sid')
df = df.join(track.groupby('sid')['time_spent'].mean().rename('time_spent_mean'), on='sid')

Average time spent per page.

In [17]:
df = df.join(track.groupby('sid').apply(lambda x: x['duration'].max().seconds / len(x)).rename('average_duration'), on='sid')

First and last actions.

In [18]:
df = df.join(track.groupby('sid').nth(0)['type'].rename('action_0').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(1)['type'].rename('action_1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-2)['type'].rename('action_n-1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-1)['type'].rename('action_n').astype('category'), on='sid')

In [19]:
df['first_2_actions'] = df['action_0'].str.cat(df['action_1'], sep='_').astype('category')
df['last_2_actions'] = df['action_n-1'].str.cat(df['action_n'], sep='_').astype('category')

TF-IDF.

In [20]:
from sklearn import feature_extraction

actions = track.groupby('sid')['type'].apply(lambda x: ' '.join(x))
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_features=60)
vectorizer.fit(actions)
tfidf = pd.DataFrame(vectorizer.transform(actions).todense(), columns=vectorizer.get_feature_names(), index=actions.index).add_prefix('tfidf')
df = df.join(tfidf, on='sid')

Device.

In [31]:
df = df.join(track.groupby('sid')['device'].first().fillna('missing').astype('category').rename('device'), on='sid')
df = df.join(track.groupby('sid')['siteid'].first().fillna('missing').astype('category').rename('siteid'), on='sid')
df['site_device'] = df['siteid'].str.cat(df['device'], sep='_').astype('category')

Total number of products visited during session.

In [22]:
df = df.join(track.groupby('sid').sku.nunique().rename('n_products'), on='sid')

Total number of products visited during session normalized by number of pages visited.

In [25]:
df['n_products_normed'] = df['n_products'] / df['n_pages']
df['seconds_per_product'] = df['n_products'] / (df['seconds_spent'] + 1)

Stype counts.

In [26]:
track['stype'] = track['stype'].fillna('missing')
df = df.join(pd.pivot_table(track, index='sid', columns='stype', values='unit', aggfunc=np.sum).fillna(0).add_prefix('stype_'), on='sid')

True if the seller is not Cdiscount but storage and delivery are provided by Cdiscount.

In [27]:
track['ff'] = track['ff'].fillna('missing')
df = df.join(pd.pivot_table(track, index='sid', columns='ff', values='unit', aggfunc=np.sum).fillna(0).add_prefix('ff_'), on='sid')

Total pages per type visited normalized by number of pages visited/session duration

In [28]:
import itertools

page_types = ['type_' + elem for elem in track.type.unique().tolist()]
page_types += ['type_simple_' + elem for elem in track.type_simplified.unique().tolist()]

# remove duplicate types
for page_type in ['CAROUSEL', 'PA', 'SEARCH', 'LIST_PRODUCT']:
    page_types.remove('type_' + page_type)

denominators = ["n_pages", "seconds_spent"]

# create features
for page_type, denominator in itertools.product(page_types, denominators):
    df["{}_normed_by_{}".format(page_type, denominator)] = df[page_type] / df[denominator]

Target encoding.

In [33]:
import xam

track = track.join(df.set_index('sid')['target'], how = 'left',on ='sid')

encoder = xam.feature_extraction.SmoothTargetEncoder(
    columns = [
        'siteid',
        'device',
        'type_simplified',
        'type',
        'idcar'
    ],
    prior_weight=100,
    suffix='target_encode'
)
track = encoder.fit_transform(track, track['target'])

col_encode = [c for c in track.columns if c.endswith('target_encode')]
df = df.join(track.groupby('sid')[col_encode].mean(), on='sid')

In [40]:
df = df.join(track.groupby('sid')[col_encode].max().add_prefix('max_'), on='sid')
df = df.join(track.groupby('sid')[col_encode].agg(np.ptp).add_prefix('ptp_'), on='sid')
df = df.join(track.groupby('sid')[col_encode].std().add_prefix('std_'), on='sid')

Number of products purchased during the session

In [41]:
df = df.join(track.query("type_simplified == 'PURCHASE_PRODUCT'").groupby('sid')['quantity'].sum().rename('n_products_purchased'), on='sid')
df["n_products_in_basket"] = df['sum_size_basket'] - df['n_products_purchased']

In [42]:
def target_encode(df, on, by, m):
    """From https://www.wikiwand.com/en/Bayes_estimator#/Practical_example_of_Bayes_estimators"""
    
    agg = df.groupby(by)[on].agg(['count', 'sum'])
    R = agg['sum'] / agg['count']
    v = agg['count']
    C = df[on].mean()
    W = (R*v + C*m) / (v + m)
    
    # Automatically generate the feature's name
    if not isinstance(by, (list, tuple)):
        by = [by]
    name = "{}_mean_by_{}".format(on, '_'.join(by))  # e.g. 'interest_mean_by_bond_and_action'
    
    return W.rename(name)

track = track.join(target_encode(track,'target',['idcar','type'],100),on = ['idcar','type'])

df = df.join(track.groupby('sid')['target_mean_by_idcar_type'].mean(), on='sid')

ValueError: columns overlap but no suffix specified: Index(['target_mean_by_idcar_type'], dtype='object')

In [43]:
encoder = xam.feature_extraction.SmoothTargetEncoder(
    columns = [
        'action_n',
        'last_2_actions',
        'site_device'
    ],
    prior_weight=100,
    suffix='target_encode'
)
df = encoder.fit_transform(df, df['target'])

In [44]:
df.head()

Unnamed: 0,sid,target,is_train,type_simple_ADD_TO_BASKET,type_simple_CAROUSEL,type_simple_LIST_PRODUCT,type_simple_PA,type_simple_PRODUCT,type_simple_PURCHASE_PRODUCT,type_simple_SEARCH,type_simple_SHOW_CASE,type_simple_time_ADD_TO_BASKET,type_simple_time_CAROUSEL,type_simple_time_LIST_PRODUCT,type_simple_time_PA,type_simple_time_PRODUCT,type_simple_time_PURCHASE_PRODUCT,type_simple_time_SEARCH,type_simple_time_SHOW_CASE,type_ADD_TO_BASKET_CAROUSEL,type_ADD_TO_BASKET_LP,type_ADD_TO_BASKET_LR,type_ADD_TO_BASKET_PA,type_ADD_TO_BASKET_SHOW_CASE,type_CAROUSEL,...,devicetarget_encode,type_simplifiedtarget_encode,typetarget_encode,idcartarget_encode,target_mean_by_idcar_type,action_ntarget_encode,last_2_actionstarget_encode,site_devicetarget_encode,max_siteidtarget_encode,max_devicetarget_encode,max_type_simplifiedtarget_encode,max_typetarget_encode,max_idcartarget_encode,ptp_siteidtarget_encode,ptp_devicetarget_encode,ptp_type_simplifiedtarget_encode,ptp_typetarget_encode,ptp_idcartarget_encode,std_siteidtarget_encode,std_devicetarget_encode,std_type_simplifiedtarget_encode,std_typetarget_encode,std_idcartarget_encode,n_products_purchased,n_products_in_basket
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,86393.0,0.0,168646.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.140517,0.138758,0.14361,0.131889,0.119954,0.119207,0.088601,0.097847,0.140389,0.140517,0.150284,0.156344,0.135867,0.0,0.0,0.016377,0.022437,0.015913,0.0,0.0,0.007815,0.011474,0.007957,,
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True,0.0,4.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,259115.0,0.0,0.0,345562.0,0.0,431552.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.140517,0.139878,0.143113,0.130971,0.119954,0.119207,0.088601,0.097847,0.140389,0.140517,0.150284,0.156344,0.135867,0.0,0.0,0.016377,0.035574,0.015913,0.0,0.0,0.007337,0.01174,0.007644,,
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,86348.0,86345.0,0.0,86290.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.140517,0.128026,0.132878,0.135867,,0.067281,0.072089,0.097847,0.140389,0.140517,0.136935,0.156344,0.135867,0.0,0.0,0.02958,0.048988,0.0,0.0,0.0,0.013854,0.020035,0.0,,
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,259124.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.140517,0.133907,0.133907,0.135867,,0.067281,0.058267,0.097847,0.140389,0.140517,0.133907,0.133907,0.135867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,172761.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.140517,0.133907,0.133907,0.135867,,0.067281,0.058267,0.097847,0.140389,0.140517,0.133907,0.133907,0.135867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [45]:
df.corr()['target'].sort_values()

type_simple_LIST_PRODUCT_normed_by_n_pages         -0.058889
type_simple_SHOW_CASE_normed_by_n_pages            -0.054394
type_SHOW_CASE_normed_by_n_pages                   -0.054394
tfidflist_product                                  -0.053311
tfidfshow_case                                     -0.045724
type_simple_LIST_PRODUCT_normed_by_seconds_spent   -0.042339
time_spent_mean                                    -0.040491
tfidflist_product list_product                     -0.038050
type_simple_SEARCH_normed_by_seconds_spent         -0.033950
time_spent_max                                     -0.033577
type_simple_PA_normed_by_n_pages                   -0.031883
tfidfshow_case list_product                        -0.031561
type_simple_SEARCH_normed_by_n_pages               -0.031256
tfidfshow_case show_case                           -0.030242
type_SHOW_CASE_normed_by_seconds_spent             -0.029214
type_simple_SHOW_CASE_normed_by_seconds_spent      -0.029214
type_PRODUCT_LP_normed_b

In [46]:
cols = [
    'ptp_idcartarget_encode',
    'target_mean_by_idcar_type',
    'ptp_type_simplifiedtarget_encode',
    'ptp_typetarget_encode',
    'max_idcartarget_encode',
    'max_type_simplifiedtarget_encode',
    'max_typetarget_encode',
    'max_quantity_type',
    'action_ntarget_encode',
    'last_2_actionstarget_encode'
]

for (c1, c2) in itertools.combinations(cols, 2):
    df['{}_{}'.format(c1, c2)] = df[c1] / (1 + df[c2])

# Learning

Prepare the datasets.

In [47]:
to_drop = ['sid', 'is_train']

X_train = df.query('is_train == 1').drop(to_drop + ['target'], axis='columns')
y_train = df.query('is_train == 1')['target']
X_test = df.query('is_train == 0').drop(to_drop + ['target'], axis='columns')
submission = df.query('is_train == 0')['sid'].to_frame()
submission['target'] = 0

Do some sanity checks.

In [48]:
assert len(X_train) == 133123
assert len(y_train) == 133123
assert len(X_test) == 88750
assert len(submission) == 88750
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [49]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection
import xam


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 5,
    'max_depth': 10,
    'min_data_per_group': 50,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 40,
    'cat_l2': 10,
    'max_bin': 511,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 20,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'feature_fraction_seed': 42,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)
submission['target'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=100,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['binary_logloss'][-1]
    val_scores[i] = evals_result['val']['binary_logloss'][-1]
    
    # Accumulate test predictions
    submission['target'] += (model.predict(X_test) / n_splits)

print('- Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('- Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))



Training until validation scores don't improve for 100 rounds.
[50]	fit's binary_logloss: 0.274743	val's binary_logloss: 0.281302
[100]	fit's binary_logloss: 0.262206	val's binary_logloss: 0.270075
[150]	fit's binary_logloss: 0.256258	val's binary_logloss: 0.265462
[200]	fit's binary_logloss: 0.252826	val's binary_logloss: 0.263492
[250]	fit's binary_logloss: 0.250478	val's binary_logloss: 0.262545
[300]	fit's binary_logloss: 0.248615	val's binary_logloss: 0.26211
[350]	fit's binary_logloss: 0.246953	val's binary_logloss: 0.261818
[400]	fit's binary_logloss: 0.245461	val's binary_logloss: 0.261626
[450]	fit's binary_logloss: 0.244063	val's binary_logloss: 0.261426
[500]	fit's binary_logloss: 0.24277	val's binary_logloss: 0.261433
[550]	fit's binary_logloss: 0.241541	val's binary_logloss: 0.261415
Early stopping, best iteration is:
[487]	fit's binary_logloss: 0.243099	val's binary_logloss: 0.261396
Training until validation scores don't improve for 100 rounds.
[50]	fit's binary_logloss:

- Fit AUC: 0.23967 (+/- 0.00281)
- Val AUC: 0.25772 (+/- 0.00390)

Display feature importance.

In [60]:
feature_importances_.sort_values('gain_0', ascending=False)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
max_idcartarget_encode,68559.521290,79,68330.252863,83,65053.216497,73,82651.521495,92,136953.350730,94
max_typetarget_encode,60419.243761,46,49389.550770,37,23196.999593,48,85133.030742,54,13515.203154,45
max_typetarget_encode_action_ntarget_encode,55150.092732,61,68059.982800,48,46121.307423,67,22406.908130,45,46007.224591,58
max_idcartarget_encode_max_typetarget_encode,33197.842196,49,692.156277,11,36299.135240,30,40438.611877,33,19768.286019,43
max_quantity_type,23255.207603,42,37316.602908,34,45300.706661,47,7377.234693,30,8226.561025,44
max_idcartarget_encode_action_ntarget_encode,22071.857261,56,32147.943007,65,21234.466241,59,23560.242803,57,6806.380424,56
max_idcartarget_encode_max_quantity_type,21773.864398,101,27193.356444,69,33720.997959,138,31999.071686,101,12424.353162,80
action_ntarget_encode,21067.697495,134,14910.172369,107,14177.483215,132,12519.915674,102,19917.996862,97
max_quantity_type_action_ntarget_encode,20408.899992,45,5245.840532,27,431.271885,30,8079.758084,36,4731.856550,34
last_2_actions,19389.158182,1162,12233.501306,625,20047.784517,1585,16811.472963,1068,19235.307395,1254


Make predictions.

In [136]:
submission.head()

Unnamed: 0,sid,target
133123,EhjG5b8h+RHDgxkQpkMc9sECXbdnA3JOPS07CRYvWmwYSJ...,0.025495
133124,34lu87wJJunsPz2c0SxM/aLQ9x+2dlj5W96R95DIG9yRRe...,0.019041
133125,xkSYpSt3qRk8X6Ev1W8d72Vj6oyXbI8DKjkCqHmvcaI36F...,0.041879
133126,Tu9ylHPEk5Lw5K/8TpuJOEdJv3ZUzJu0zz2sfBrJDQpmyg...,0.083346
133127,s6gbPkykrrtcYDB/OidLEdkEsk/bsWIejziprzhq2wJBij...,0.019004


In [50]:
submission.to_csv('submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(fit_scores.mean(), fit_scores.std(), val_scores.mean(), val_scores.std()), index=False)

# Parameter tuning

In [92]:
from sklearn import model_selection
import scipy.stats as stats


# Parameter spaces can be lists or statistical distributions from
# https://docs.scipy.org/doc/scipy/reference/stats.html
grid = model_selection.ParameterSampler(
    param_distributions={
        'application': ['binary'],
        'boosting_type': ['gbdt'],
        'metric': ['auc'],
        'num_threads': [8],
        'num_leaves': [2 ** 3, 2 ** 4, 2 ** 5],
        'min_data_per_group': [30],
        'max_cat_threshold': [32],
        'max_cat_to_onehot': [4],
        'cat_smooth': [5],
        'cat_l2': [10],
        'max_bin': [255],
        'min_data_in_bin': [3],
        'scale_pos_weight': [1],
        'min_data_in_leaf': [30],
        'learning_rate': stats.uniform(0.1, 0.2),
        'feature_fraction': [1],
        'feature_fraction_seed': [42],
        'bagging_fraction': [1],
        'bagging_seed': [42],
        'lambda_l1': [0],
        'lambda_l2': [0],
        'verbosity': [2]
    },
    n_iter=2,
    random_state=42
)

for config in grid:
    print(config)

{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.1749080237694725, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}
{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.13668695797323277, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}


# Stacking

In [60]:
import catboost as cb
from sklearn import impute
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'LR': pipeline.Pipeline([
        ('select', xam.pipeline.ColumnSelector(num_cols)),  
        ('impute', impute.SimpleImputer()),  
        ('logistic', linear_model.LogisticRegression(solver='lbfgs', C=1, max_iter=500)),
    ])
}

stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.LinearRegression(fit_intercept=True),
    metric=metrics.log_loss,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 50,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
    }
)

In [62]:
print(metrics.log_loss(y_train, stack.predict(X_train)))

0.25849757572185905


In [61]:
stack.fit(X_train, y_train, verbose=False)
print(metrics.log_loss(y_train, stack.predict(X_train)))



ValueError: Found input variables with inconsistent numbers of samples: [133123, 110936]

In [None]:
submission

In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)