# Data preparation

In [79]:
import pandas as pd

pd.set_option('display.max_columns', 50)

track = pd.concat(
    (
        pd.read_csv('data/train_tracking.csv'),
        pd.read_csv('data/test_tracking.csv')
    ),
    ignore_index=True,
    axis='rows'
)

track['duration'] = pd.to_timedelta(track['duration'])

In [80]:
import numpy as np

train = pd.read_csv('data/train_session.csv')
test = pd.read_csv('data/random_submission.csv')
train['is_train'] = True
test['is_train'] = False
test['target'] = np.nan

df = pd.concat((train, test), ignore_index=True)

In [81]:
df.head()

Unnamed: 0,sid,target,is_train
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True


# Feature extraction

In [82]:
track['unit'] = 1

In [83]:
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_simple_'), on='sid')

In [84]:
df = df.join(pd.pivot_table(track, index='sid', columns='type', values='unit', aggfunc=np.sum).fillna(0).add_prefix('type_'), on='sid')

In [85]:
df = df.join(track.groupby('sid').size().rename('n_pages'), on='sid')

Screen resolution.

In [86]:
track['resolution'] = track['rw'] * track['rh']
df = df.join(track.groupby('sid')['resolution'].nunique().rename('n_resolutions'), on='sid')
df = df.join(track.groupby('sid')['resolution'].max().rename('max_resolution'), on='sid')

Number of results pages seen.

In [87]:
df = df.join(track.groupby('sid')['pn'].nunique().rename('n_unique_pages'), on='sid')

In [88]:
import ast

track['n_facets'] = track['facets'].apply(lambda x: len(ast.literal_eval(x)) if pd.notnull(x) else 0)
df = df.join(track.groupby('sid')['n_facets'].max().rename('max_n_facets'), on='sid')

Max quantity in basket.

In [89]:
df = df.join(track.groupby('sid')['quantity'].max().rename('max_size_basket').fillna(0), on='sid')

Total quantity in basket.

In [90]:
df = df.join(track.groupby('sid')['quantity'].sum().rename('sum_size_basket').fillna(0), on='sid')

Unique items in basket.

In [91]:
df['sum_over_unique'] = df['sum_size_basket'] / (df['type_simple_ADD_TO_BASKET'] + 1)

Mean quantity by type.

In [92]:
track = track.join(track.groupby(['type_simplified'])['quantity'].mean().rename('mean_quantity_type'), on=['type_simplified'])
df = df.join(track.groupby('sid')['mean_quantity_type'].mean(), on='sid')

track = track.join(track.groupby(['type'])['quantity'].sum().rename('sum_quantity_type').fillna(0), on = ['type'])
df = df.join(track.groupby('sid')['sum_quantity_type'].sum(), on = 'sid')

track = track.join(track.groupby(['type'])['quantity'].max().rename('max_quantity_type').fillna(0), on = ['type'])
df = df.join(track.groupby('sid')['max_quantity_type'].mean(), on = 'sid')

track = track.join(track.groupby(['type_simplified'])['quantity'].max().rename('max_quantity_category').fillna(0), on=['type_simplified'])
df = df.join(track.groupby('sid')['max_quantity_category'].mean(), on = 'sid')

Total time spent.

In [93]:
df = df.join(track.groupby('sid')['duration'].max().rename('seconds_spent').dt.seconds, on='sid')

Average time spent per page.

In [94]:
df = df.join(track.groupby('sid').apply(lambda x: x['duration'].max().seconds / len(x)).rename('average_duration'), on='sid')

First and last actions.

In [95]:
df = df.join(track.groupby('sid').nth(0)['type'].rename('action_0').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(1)['type'].rename('action_1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-2)['type'].rename('action_n-1').astype('category'), on='sid')
df = df.join(track.groupby('sid').nth(-1)['type'].rename('action_n').astype('category'), on='sid')

In [96]:
df['first_2_actions'] = df['action_0'].str.cat(df['action_1'], sep='_').astype('category')
df['last_2_actions'] = df['action_n-1'].str.cat(df['action_n'], sep='_').astype('category')

TF-IDF.

In [97]:
from sklearn import feature_extraction

actions = track.groupby('sid')['type'].apply(lambda x: ' '.join(x))
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_features=50)
vectorizer.fit(actions)
tfidf = pd.DataFrame(vectorizer.transform(actions).todense(), columns=vectorizer.get_feature_names(), index=actions.index).add_prefix('tfidf')
df = df.join(tfidf, on='sid')

Device.

In [98]:
df = df.join(track.groupby('sid')['device'].first().fillna('missing').astype('category').rename('device'), on='sid')

Total number of products visited during session.

In [99]:
df = df.join(track.groupby('sid').sku.nunique().rename('n_products'), on='sid')

Total number of products visited during session normalized by number of pages visited.

In [100]:
df['n_products_normed'] = df['n_products'] / df['n_pages']

Stype counts.

In [101]:
df = df.join(pd.pivot_table(track, index='sid', columns='stype', values='unit', aggfunc=np.sum).fillna(0).add_prefix('stype_'), on='sid')

True if the seller is not Cdiscount but storage and delivery are provided by Cdiscount.

In [102]:
df = df.join(pd.pivot_table(track, index='sid', columns='ff', values='unit', aggfunc=np.sum).fillna(0).add_prefix('ff_'), on='sid')

Target encoding.

In [103]:
import xam

track = track.join(df.set_index('sid')['target'], how = 'left',on ='sid')

encoder = xam.feature_extraction.SmoothTargetEncoder(
            columns = ['device',
                       'type_simplified',
                       'type',
                      'idcar'],
            prior_weight=100,
            suffix='target_encode')
track = encoder.fit_transform(track,track['target'])

col_encode = ['devicetarget_encode','type_simplifiedtarget_encode','typetarget_encode', 'idcartarget_encode']
df = df.join(track.groupby('sid')[col_encode].mean(), on='sid')

In [104]:
def target_encode(df, on, by, m):
    """From https://www.wikiwand.com/en/Bayes_estimator#/Practical_example_of_Bayes_estimators"""
    
    agg = df.groupby(by)[on].agg(['count', 'sum'])
    R = agg['sum'] / agg['count']
    v = agg['count']
    C = df[on].mean()
    W = (R*v + C*m) / (v + m)
    
    # Automatically generate the feature's name
    if not isinstance(by, (list, tuple)):
        by = [by]
    name = "{}_mean_by_{}".format(on, '_'.join(by))  # e.g. 'interest_mean_by_bond_and_action'
    
    return W.rename(name)

track = track.join(target_encode(track,'target',['idcar','type'],100),on = ['idcar','type'])

df = df.join(track.groupby('sid')['target_mean_by_idcar_type'].mean(), on='sid')

In [105]:
df.head()

Unnamed: 0,sid,target,is_train,type_simple_ADD_TO_BASKET,type_simple_CAROUSEL,type_simple_LIST_PRODUCT,type_simple_PA,type_simple_PRODUCT,type_simple_PURCHASE_PRODUCT,type_simple_SEARCH,type_simple_SHOW_CASE,type_ADD_TO_BASKET_CAROUSEL,type_ADD_TO_BASKET_LP,type_ADD_TO_BASKET_LR,type_ADD_TO_BASKET_PA,type_ADD_TO_BASKET_SHOW_CASE,type_CAROUSEL,type_LIST_PRODUCT,type_PA,type_PRODUCT_CAROUSEL,type_PRODUCT_LP,type_PRODUCT_LR,type_PRODUCT_PA,type_PRODUCT_SHOW_CASE,type_PURCHASE_PRODUCT_CAROUSEL,...,tfidfsearch,tfidfsearch add_to_basket_lr,tfidfsearch carousel,tfidfsearch pa,tfidfsearch product_lr,tfidfsearch search,tfidfsearch show_case,tfidfshow_case,tfidfshow_case carousel,tfidfshow_case list_product,tfidfshow_case product_show_case,tfidfshow_case search,tfidfshow_case show_case,device,n_products,n_products_normed,stype_i40+rMlxCCxz1hgpPEHVCw==,stype_nH9oPdOoBjQ6KgoScH5o4Q==,ff_0.0,ff_1.0,devicetarget_encode,type_simplifiedtarget_encode,typetarget_encode,idcartarget_encode,target_mean_by_idcar_type
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.544708,0.0,0.0,0.0,0.368268,0.357044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,1,0.25,1.0,0.0,1.0,0.0,0.140517,0.138758,0.14361,0.131889,0.119954
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True,0.0,4.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,...,0.435791,0.0,0.0,0.0,0.235705,0.342782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,3,0.230769,4.0,0.0,4.0,0.0,0.140517,0.139878,0.143113,0.130971,0.119954
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.499872,0.0,0.0,0.0,0.337955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,1,0.25,0.0,1.0,1.0,0.0,0.140517,0.128026,0.132878,0.135867,
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.713065,0.0,0.0,0.0,0.0,0.701098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,0,0.0,,,,,0.140517,0.133907,0.133907,0.135867,
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.752961,0.0,0.0,0.0,0.0,0.658066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9DP4L/tKWLkYUystNndR3g==,0,0.0,,,,,0.140517,0.133907,0.133907,0.135867,


# Outliers

In [107]:
df['is_outlier'] = df['is_train'] & (train['n_pages'] > train['n_pages'].quantile(0.98))

# Learning

In [123]:
nulls = df.isnull().sum()
nulls = nulls[nulls > 0]
nulls

target                             88750
mean_quantity_type                181925
stype_i40+rMlxCCxz1hgpPEHVCw==     74196
stype_nH9oPdOoBjQ6KgoScH5o4Q==     74196
ff_0.0                             74196
ff_1.0                             74196
target_mean_by_idcar_type          72260
dtype: int64

In [124]:
df.shape

(221873, 115)

Prepare the datasets.

In [116]:
to_drop = ['sid', 'is_train', 'is_outlier']

X_train = df.query('is_train == 1').drop(to_drop + ['target'], axis='columns')
y_train = df.query('is_train == 1')['target']
X_test = df.query('is_train == 0').drop(to_drop + ['target'], axis='columns')
submission = df.query('is_train == 0')['sid'].to_frame()
submission['target'] = 0

Do some sanity checks.

In [117]:
#assert len(X_train) == 133123
#assert len(y_train) == 133123
assert len(X_test) == 88750
assert len(submission) == 88750
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [118]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection
import xam


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 5,
    'min_data_per_group': 50,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 40,
    'cat_l2': 10,
    'max_bin': 511,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.01,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=42)
submission['target'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=40,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['binary_logloss'][-1]
    val_scores[i] = evals_result['val']['binary_logloss'][-1]
    
    # Accumulate test predictions
    submission['target'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))



Training until validation scores don't improve for 40 rounds.
[50]	fit's binary_logloss: 0.275771	val's binary_logloss: 0.282052
[100]	fit's binary_logloss: 0.263467	val's binary_logloss: 0.270865
[150]	fit's binary_logloss: 0.257777	val's binary_logloss: 0.266334
[200]	fit's binary_logloss: 0.254513	val's binary_logloss: 0.264248
[250]	fit's binary_logloss: 0.252379	val's binary_logloss: 0.263358
[300]	fit's binary_logloss: 0.25069	val's binary_logloss: 0.26288
[350]	fit's binary_logloss: 0.249139	val's binary_logloss: 0.262526
[400]	fit's binary_logloss: 0.247784	val's binary_logloss: 0.262244
[450]	fit's binary_logloss: 0.24656	val's binary_logloss: 0.262122
[500]	fit's binary_logloss: 0.245512	val's binary_logloss: 0.262046
[550]	fit's binary_logloss: 0.24451	val's binary_logloss: 0.261943
[600]	fit's binary_logloss: 0.243599	val's binary_logloss: 0.261905
[650]	fit's binary_logloss: 0.242765	val's binary_logloss: 0.261868
[700]	fit's binary_logloss: 0.241944	val's binary_logloss: 

- Fit AUC: 0.24485 (+/- 0.00223)
- Val AUC: 0.25841 (+/- 0.00377)

Display feature importance.

In [77]:
feature_importances_.sort_values('gain_0', ascending=False)

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
max_quantity_type,211464.283101,194,43721.387043,136,271616.193340,168,15559.978102,106,17871.006657,213
target_mean_by_idcar_type,150869.048888,802,178310.961407,643,157284.783222,832,197419.966426,651,221687.572051,834
sum_quantity_type,43689.836526,267,235665.030148,212,22726.201487,207,175367.168972,235,177288.495174,300
idcartarget_encode,40386.286584,487,7328.555309,311,14153.342978,576,42701.158331,489,17058.363374,552
last_2_actions,31180.620956,1588,28169.259928,1179,32602.523172,1854,33638.784100,1660,33309.897056,1891
max_quantity_category,23931.375503,123,3066.393748,55,12500.458356,106,21938.307033,106,20545.819272,114
type_simpl_seq_PURCHASE_PRODUCT_STOP,21655.592681,196,21088.944796,176,22438.381207,204,20234.560895,224,19789.513287,201
first_2_actions,15827.521908,1336,10622.367228,759,14694.591687,1273,13642.398104,1077,15175.429479,1385
average_duration,12057.668879,1033,9808.467931,712,10605.238842,976,9553.546491,826,12069.686214,1173
sum_over_unique,12002.876852,89,12158.529796,37,2569.541006,70,20539.263985,133,25049.121293,122


Make predictions.

In [119]:
submission.head()

Unnamed: 0,sid,target
133123,EhjG5b8h+RHDgxkQpkMc9sECXbdnA3JOPS07CRYvWmwYSJ...,0.027021
133124,34lu87wJJunsPz2c0SxM/aLQ9x+2dlj5W96R95DIG9yRRe...,0.019615
133125,xkSYpSt3qRk8X6Ev1W8d72Vj6oyXbI8DKjkCqHmvcaI36F...,0.042035
133126,Tu9ylHPEk5Lw5K/8TpuJOEdJv3ZUzJu0zz2sfBrJDQpmyg...,0.082323
133127,s6gbPkykrrtcYDB/OidLEdkEsk/bsWIejziprzhq2wJBij...,0.01949


In [120]:
submission.to_csv('submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(fit_scores.mean(), fit_scores.std(), val_scores.mean(), val_scores.std()), index=False)

# Parameter tuning

In [92]:
from sklearn import model_selection
import scipy.stats as stats


# Parameter spaces can be lists or statistical distributions from
# https://docs.scipy.org/doc/scipy/reference/stats.html
grid = model_selection.ParameterSampler(
    param_distributions={
        'application': ['binary'],
        'boosting_type': ['gbdt'],
        'metric': ['auc'],
        'num_threads': [8],
        'num_leaves': [2 ** 3, 2 ** 4, 2 ** 5],
        'min_data_per_group': [30],
        'max_cat_threshold': [32],
        'max_cat_to_onehot': [4],
        'cat_smooth': [5],
        'cat_l2': [10],
        'max_bin': [255],
        'min_data_in_bin': [3],
        'scale_pos_weight': [1],
        'min_data_in_leaf': [30],
        'learning_rate': stats.uniform(0.1, 0.2),
        'feature_fraction': [1],
        'feature_fraction_seed': [42],
        'bagging_fraction': [1],
        'bagging_seed': [42],
        'lambda_l1': [0],
        'lambda_l2': [0],
        'verbosity': [2]
    },
    n_iter=2,
    random_state=42
)

for config in grid:
    print(config)

{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.1749080237694725, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}
{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.13668695797323277, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}


# Stacking

In [26]:
import catboost as cb
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'XGBoost': xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.01,
        n_estimators=10000,
        random_state=42
    )
}

params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 5,
    'min_data_per_group': 50,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 40,
    'cat_l2': 10,
    'max_bin': 511,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.01,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 1,
    'lambda_l2': 2,
    'verbosity': 2
}


stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.Regression(fit_intercept=True),
    metric=metrics.accuracy_score,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'CatBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'early_stopping_rounds': 80,
            'verbose': False
        }
    }
)

In [27]:
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    stack.fit(X_fit, y_fit, verbose=False)
    
    # Store the training scores
    fit_scores[i] = metrics.roc_auc_score(y_fit, stack.predict(X_fit))
    val_scores[i] = metrics.roc_auc_score(y_val, stack.predict(X_val))
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Fit AUC: 0.89920 (+/- 0.00975)
Val AUC: 0.79175 (+/- 0.02826)


In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)