# Data preparation

In [1]:
import pandas as pd

pd.set_option('display.max_columns', 50)

track = pd.concat(
    (
        pd.read_csv('data/train_tracking.csv'),
        pd.read_csv('data/test_tracking.csv')
    ),
    ignore_index=True,
    axis='rows'
)

track['duration'] = pd.to_timedelta(track['duration'])

In [2]:
import numpy as np

train = pd.read_csv('data/train_session.csv')
test = pd.read_csv('data/random_submission.csv')
train['is_train'] = True
test['is_train'] = False
test['target'] = np.nan

df = pd.concat((train, test), ignore_index=True)

In [3]:
df.head()

Unnamed: 0,sid,target,is_train
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True


# Feature extraction

In [4]:
track['unit'] = 1

In [5]:
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_simple_'), on='sid')

In [6]:
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_'), on='sid')

In [7]:
df = df.join(track.groupby('sid').size().rename('n_pages'), on='sid')

Screen resolution.

In [8]:
track['resolution'] = track['rw'] * track['rh']
df = df.join(track.groupby('sid')['resolution'].nunique().rename('n_resolutions'), on='sid')
df = df.join(track.groupby('sid')['resolution'].max().rename('max_resolution'), on='sid')

Number of results pages seen.

In [9]:
df = df.join(track.groupby('sid')['pn'].nunique().rename('n_unique_pages'), on='sid')

In [10]:
import ast


def count_facets(x):
    try:
        return len(ast.literal_eval(x))
    except ValueError:
        return 0


track['n_facets'] = track['facets'].apply(count_facets)
df = df.join(track.groupby('sid')['n_facets'].max().rename('max_n_facets'), on='sid')

Max quantity in basket.

In [11]:
df = df.join(track.groupby('sid')['quantity'].max().rename('max_size_basket').fillna(0), on='sid')

Total quantity in basket.

In [12]:
df = df.join(track.groupby('sid')['quantity'].sum().rename('sum_size_basket').fillna(0), on='sid')

Unique items in basket.

In [14]:
df['sum_over_unique'] = df['sum_size_basket'] / (df['type_simple_ADD_TO_BASKET'] + 1)

Mean quantity by type.

In [15]:
track = track.join(track.groupby(['type_simplified'])['quantity'].mean().rename('mean_quantity_type'), on=['type_simplified'])
df = df.join(track.groupby('sid')['mean_quantity_type'].mean(), on='sid')

Max quantity by type.

In [16]:
track = track.join(track.groupby(['type_simplified'])['quantity'].max().rename('max_quantity_category').fillna(0), on=['type_simplified'])
df = df.join(track.groupby('sid')['max_quantity_category'].mean(), on = 'sid')

Total time spent.

In [17]:
df = df.join(track.groupby('sid')['duration'].max().rename('seconds_spent').dt.seconds, on='sid')

Average time spent per page.

In [18]:
df = df.join(track.groupby('sid').apply(lambda x: x['duration'].max().seconds / len(x)).rename('average_duration'), on='sid')

First and last actions.

In [19]:
df = df.join(track.groupby('sid').nth(0)['type'].rename('action_0').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(1)['type'].rename('action_1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-2)['type'].rename('action_n-1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-1)['type'].rename('action_n').astype('category'), on='sid')

In [21]:
df['first_2_actions'] = df['action_0'].str.cat(df['action_0'], sep='_').astype('category')
df['last_2_actions'] = df['action_n-1'].str.cat(df['action_n'], sep='_').astype('category')

TF-IDF.

In [24]:
from sklearn import feature_extraction

actions = track.groupby('sid')['type'].apply(lambda x: ' '.join(x))
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_features=50)
vectorizer.fit(actions)
tfidf = pd.DataFrame(vectorizer.transform(actions).todense(), columns=vectorizer.get_feature_names(), index=actions.index).add_prefix('tfidf')
df = df.join(tfidf, on='sid')

Device.

In [25]:
df = df.join(track.groupby('sid')['device'].first().fillna('missing').astype('category').rename('device'), on='sid')

Total number of products visited during session.

In [26]:
df = df.join(track.groupby('sid').sku.nunique().rename('n_products'), on='sid')

Total number of products visited during session normalized by number of pages visited.

In [27]:
df['n_products_normed'] = df['n_products'] / df['n_pages']

Stype counts.

In [28]:
df = df.join(pd.pivot_table(track, index='sid', columns='stype', values='unit', aggfunc=np.sum).fillna(0).add_prefix('stype_'), on='sid')

True if the seller is not Cdiscount but storage and delivery are provided by Cdiscount.

In [29]:
df = df.join(pd.pivot_table(track, index='sid', columns='ff', values='unit', aggfunc=np.sum).fillna(0).add_prefix('ff_'), on='sid')

Statistics about quantities regarding categories.

In [22]:
category = pd.read_csv('data/productid_category.csv')

In [24]:
track = track.join(track.groupby(['category_product_id_level1'])['quantity'].sum().rename('sum_quantity_category_1_sid').fillna(0), on=['category_product_id_level1'])
df = df.join(track.groupby('sid')['sum_quantity_category_1_sid'].mean(), on = 'sid')

track = track.join(track.groupby(['category_product_id_level2'])['quantity'].sum().rename('sum_quantity_category_2_sid').fillna(0), on=['category_product_id_level2'])
df = df.join(track.groupby('sid')['sum_quantity_category_2_sid'].mean(), on = 'sid')

track = track.join(track.groupby(['category_product_id_level3'])['quantity'].sum().rename('sum_quantity_category_3_sid').fillna(0), on=['category_product_id_level3'])
df = df.join(track.groupby('sid')['sum_quantity_category_3_sid'].mean(), on = 'sid')

track = track.join(track.groupby(['category_product_id_level3'])['quantity'].max().rename('max_quantity_category_3_sid').fillna(0), on=['category_product_id_level3'])
df = df.join(track.groupby('sid')['max_quantity_category_3_sid'].mean(), on = 'sid')

track = track.join(track.groupby(['category_product_id_level2'])['quantity'].max().rename('max_quantity_category_2_sid').fillna(0), on=['category_product_id_level2'])
df = df.join(track.groupby('sid')['max_quantity_category_2_sid'].mean(), on = 'sid')

track = track.join(track.groupby(['category_product_id_level1'])['quantity'].max().rename('max_quantity_category_1_sid').fillna(0), on=['category_product_id_level1'])
df = df.join(track.groupby('sid')['max_quantity_category_1_sid'].mean(), on = 'sid')

In [51]:
import xam

track = track.join(df.set_index('sid')['target'], how = 'left',on ='sid')

encoder = xam.feature_extraction.SmoothTargetEncoder(
            columns = ['device',
                       'type_simplified',
                       'type',
                      'idcar'],
            prior_weight=100,
            suffix='target_encode')
track = encoder.fit_transform(track,track['target'])

col_encode = ['devicetarget_encode','type_simplifiedtarget_encode','typetarget_encode', 'idcartarget_encode']
df = df.join(track.groupby('sid')[col_encode].mean(), on='sid')

In [30]:
df.head()

Unnamed: 0,sid,target,is_train,type_simple_ADD_TO_BASKET,type_simple_CAROUSEL,type_simple_LIST_PRODUCT,type_simple_PA,type_simple_PRODUCT,type_simple_PURCHASE_PRODUCT,type_simple_SEARCH,type_simple_SHOW_CASE,type_ADD_TO_BASKET_CAROUSEL,type_ADD_TO_BASKET_LP,type_ADD_TO_BASKET_LR,type_ADD_TO_BASKET_PA,type_ADD_TO_BASKET_SHOW_CASE,type_CAROUSEL,type_LIST_PRODUCT,type_PA,type_PRODUCT_CAROUSEL,type_PRODUCT_LP,type_PRODUCT_LR,type_PRODUCT_PA,type_PRODUCT_SHOW_CASE,type_PURCHASE_PRODUCT_CAROUSEL,...,tfidfproduct_lr product_lr,tfidfproduct_lr search,tfidfproduct_pa,tfidfproduct_show_case,tfidfproduct_show_case carousel,tfidfsearch,tfidfsearch add_to_basket_lr,tfidfsearch carousel,tfidfsearch pa,tfidfsearch product_lr,tfidfsearch search,tfidfsearch show_case,tfidfshow_case,tfidfshow_case carousel,tfidfshow_case list_product,tfidfshow_case product_show_case,tfidfshow_case search,tfidfshow_case show_case,device,n_products,n_products_normed,stype_i40+rMlxCCxz1hgpPEHVCw==,stype_nH9oPdOoBjQ6KgoScH5o4Q==,ff_0.0,ff_1.0
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.544708,0.0,0.0,0.0,0.368268,0.357044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,1,0.25,1.0,0.0,1.0,0.0
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True,0.0,4.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.435791,0.0,0.0,0.0,0.235705,0.342782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,3,0.230769,4.0,0.0,4.0,0.0
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.487751,0.0,0.0,0.0,0.499872,0.0,0.0,0.0,0.337955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,1,0.25,0.0,1.0,1.0,0.0
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.713065,0.0,0.0,0.0,0.0,0.701098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,0,0.0,,,,
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.752961,0.0,0.0,0.0,0.0,0.658066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,0,0.0,,,,


In [32]:
useless = [
    'type_ADD_TO_BASKET_PA',
    'type_ADD_TO_BASKET_SHOW_CASE',
    'type_CAROUSEL',
    'type_LIST_PRODUCT',
    'type_PA',
    'type_PURCHASE_PRODUCT_LP',
    'type_PURCHASE_PRODUCT_PA',
    'type_PURCHASE_PRODUCT_SHOW_CASE',
    'type_SEARCH',
    'type_SHOW_CASE'
]

df = df.drop(useless, axis='columns')

Checkpoint.

In [11]:
df.to_feather('data/features.ftr')

ValueError: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)

# Learning

In [None]:
df = pd.read_feather('data/features.ftr')

Prepare the datasets.

In [52]:
to_drop = ['sid', 'is_train']

X_train = df.query('is_train == 1').drop(to_drop + ['target'], axis='columns')
y_train = df.query('is_train == 1')['target']
X_test = df.query('is_train == 0').drop(to_drop + ['target'], axis='columns')
submission = df.query('is_train == 0')['sid'].to_frame()
submission['target'] = 0

Do some sanity checks.

In [54]:
assert len(X_train) == 133123
assert len(y_train) == 133123
assert len(X_test) == 88750
assert len(submission) == 88750
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [55]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection
import xam


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 5,
    'min_data_per_group': 50,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 40,
    'cat_l2': 10,
    'max_bin': 511,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.01,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)
submission['target'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=40,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['binary_logloss'][-1]
    val_scores[i] = evals_result['val']['binary_logloss'][-1]
    
    # Accumulate test predictions
    submission['target'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))



Training until validation scores don't improve for 40 rounds.
[50]	fit's binary_logloss: 0.275847	val's binary_logloss: 0.282105
[100]	fit's binary_logloss: 0.263612	val's binary_logloss: 0.271024
[150]	fit's binary_logloss: 0.257965	val's binary_logloss: 0.266466
[200]	fit's binary_logloss: 0.254753	val's binary_logloss: 0.264279
[250]	fit's binary_logloss: 0.252711	val's binary_logloss: 0.26331
[300]	fit's binary_logloss: 0.251098	val's binary_logloss: 0.262711
[350]	fit's binary_logloss: 0.249647	val's binary_logloss: 0.262408
[400]	fit's binary_logloss: 0.248323	val's binary_logloss: 0.26222
[450]	fit's binary_logloss: 0.247154	val's binary_logloss: 0.262097
[500]	fit's binary_logloss: 0.246076	val's binary_logloss: 0.261996
[550]	fit's binary_logloss: 0.245088	val's binary_logloss: 0.261929
[600]	fit's binary_logloss: 0.244208	val's binary_logloss: 0.26187
[650]	fit's binary_logloss: 0.243389	val's binary_logloss: 0.261862
[700]	fit's binary_logloss: 0.242584	val's binary_logloss:

- Fit AUC: 0.25716 (+/- 0.00172)
- Val AUC: 0.26552 (+/- 0.00407)

Display feature importance.

In [40]:
feature_importances_.sort_values('gain_0', ascending=False)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
mean_quantity_type,237833.869883,316,333683.002117,309,335898.510732,335,199910.411096,305,305819.554309,244
max_quantity_category,120185.074494,306,33160.988158,237,28086.741577,347,159347.086917,414,50149.039942,223
last_2_actions,80602.815437,4229,61399.296790,2001,86723.627518,4900,80935.540974,4347,72993.141889,3322
action_n,15824.550923,106,15536.468282,100,16605.493379,120,12799.082788,88,12125.436512,84
average_duration,15123.723035,1353,11333.710094,708,14483.754621,1513,14880.613343,1345,11121.358566,788
tfidfcarousel carousel,13227.261819,706,12855.086130,393,10388.697818,721,13006.828346,544,10820.897435,469
n_products_normed,12398.550203,614,9040.442481,354,12421.336407,882,10060.707519,573,10399.784157,427
seconds_spent,10600.214912,786,6976.926063,413,13665.451474,1286,11351.299180,946,8910.949534,609
max_resolution,9558.071371,1066,4608.453556,359,11157.117164,1470,9090.027534,1080,6683.664976,575
n_pages,7285.047448,482,4782.285008,248,9451.681765,737,6481.151825,471,7782.492428,387


Make predictions.

In [56]:
submission.head()

Unnamed: 0,sid,target
133123,EhjG5b8h+RHDgxkQpkMc9sECXbdnA3JOPS07CRYvWmwYSJ...,0.025669
133124,34lu87wJJunsPz2c0SxM/aLQ9x+2dlj5W96R95DIG9yRRe...,0.017837
133125,xkSYpSt3qRk8X6Ev1W8d72Vj6oyXbI8DKjkCqHmvcaI36F...,0.042671
133126,Tu9ylHPEk5Lw5K/8TpuJOEdJv3ZUzJu0zz2sfBrJDQpmyg...,0.081247
133127,s6gbPkykrrtcYDB/OidLEdkEsk/bsWIejziprzhq2wJBij...,0.019063


In [57]:
submission.to_csv('submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(fit_scores.mean(), fit_scores.std(), val_scores.mean(), val_scores.std()), index=False)

# Parameter tuning

In [92]:
from sklearn import model_selection
import scipy.stats as stats


# Parameter spaces can be lists or statistical distributions from
# https://docs.scipy.org/doc/scipy/reference/stats.html
grid = model_selection.ParameterSampler(
    param_distributions={
        'application': ['binary'],
        'boosting_type': ['gbdt'],
        'metric': ['auc'],
        'num_threads': [8],
        'num_leaves': [2 ** 3, 2 ** 4, 2 ** 5],
        'min_data_per_group': [30],
        'max_cat_threshold': [32],
        'max_cat_to_onehot': [4],
        'cat_smooth': [5],
        'cat_l2': [10],
        'max_bin': [255],
        'min_data_in_bin': [3],
        'scale_pos_weight': [1],
        'min_data_in_leaf': [30],
        'learning_rate': stats.uniform(0.1, 0.2),
        'feature_fraction': [1],
        'feature_fraction_seed': [42],
        'bagging_fraction': [1],
        'bagging_seed': [42],
        'lambda_l1': [0],
        'lambda_l2': [0],
        'verbosity': [2]
    },
    n_iter=2,
    random_state=42
)

for config in grid:
    print(config)

{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.1749080237694725, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}
{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.13668695797323277, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}


# Stacking

In [26]:
import catboost as cb
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'XGBoost': xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.01,
        n_estimators=10000,
        random_state=42
    )
}

params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 5,
    'min_data_per_group': 50,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 40,
    'cat_l2': 10,
    'max_bin': 511,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.01,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2
}


stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.Regression(fit_intercept=True),
    metric=metrics.accuracy_score,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'CatBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'early_stopping_rounds': 80,
            'verbose': False
        }
    }
)

In [27]:
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    stack.fit(X_fit, y_fit, verbose=False)
    
    # Store the training scores
    fit_scores[i] = metrics.roc_auc_score(y_fit, stack.predict(X_fit))
    val_scores[i] = metrics.roc_auc_score(y_val, stack.predict(X_val))
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Fit AUC: 0.89920 (+/- 0.00975)
Val AUC: 0.79175 (+/- 0.02826)


In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)