# Data preparation

In [3]:
import pandas as pd

pd.set_option('display.max_columns', 50)

track = pd.concat(
    (
        pd.read_csv('data/train_tracking.csv'),
        pd.read_csv('data/test_tracking.csv')
    ),
    axis='rows'
)

In [41]:
import numpy as np

train = pd.read_csv('data/train_session.csv')
test = pd.read_csv('data/random_submission.csv')
train['is_train'] = True
test['is_train'] = False
test['target'] = np.nan

df = pd.concat((train, test))

In [34]:
df.head()

Unnamed: 0,sid,target,is_train
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True


In [43]:
track['unit'] = 1

In [45]:
df = df.join(pd.pivot_table(track, index='sid', columns='type_simplified', values='unit', aggfunc=np.sum).fillna(0).add_prefix('action_'), on='sid')

In [48]:
df = df.join(track.groupby('sid').size().rename('n_pages'), on='sid')

In [49]:
df.head()

Unnamed: 0,sid,target,is_train,action_ADD_TO_BASKET,action_CAROUSEL,action_LIST_PRODUCT,action_PA,action_PRODUCT,action_PURCHASE_PRODUCT,action_SEARCH,action_SHOW_CASE,n_pages
0,U6clt0UYaQB7vJQFmSWYymsAfwmT9SMhrm2oXQ8TC5M8mf...,0.0,True,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,4
1,0qRbLkxeOjeEo6CtxtB6VI8FNPt4Vl8niySzZLqSWiQOka...,0.0,True,0.0,4.0,0.0,0.0,4.0,0.0,5.0,0.0,13
2,cnCRGomfPqEWuhMZkA0RQfvTQiwVoyHXUqvqnPCutVouFO...,0.0,True,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,4
3,zBwrofN0r2ps9u/UCUS134SiZIqB+UgIEr0MZGHzksqR4f...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4
4,Kbr60r9eaX/ENOJ81R1YBxfwrEohoxyQI9Ma0fODsS+/XN...,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3


In [None]:
import multiprocessing as mp

def apply_parallel(groups, func):
    with mp.Pool(mp.cpu_count()) as p:
        return pd.Series(
            p.map(func, [group for name, group in groups]),
            index=[name for name, _ in groups]
        )

In [69]:
df = pd.DataFrame({
    'session': [1, 1, 1, 2, 2, 2, 3],
    'page': [1, 2, 3, 4, 1, 2, 4],
    'time': [1, 1, 1, 1, 1, 1, 1]
})

df

Unnamed: 0,session,page,time
0,1,1,1
1,1,2,1
2,1,3,1
3,2,4,1
4,2,1,1
5,2,2,1
6,3,4,1


# Feature extraction

In [76]:
df.join(df.pivot('session', 'page')['time'], on='session')

Unnamed: 0,session,page,time,1,2,3,4
0,1,1,1,1.0,1.0,1.0,
1,1,2,1,1.0,1.0,1.0,
2,1,3,1,1.0,1.0,1.0,
3,2,4,1,1.0,1.0,,1.0
4,2,1,1,1.0,1.0,,1.0
5,2,2,1,1.0,1.0,,1.0
6,3,4,1,,,,1.0


Return one feature.

In [None]:
def get_feature(g):
    return g['page'].count() / g['page'].nunique()


df = df.join(apply_parallel(df.groupby('session'), get_feature).rename('count_over_unique'), on='session')

df = df.join(df.groupby('session').apply(get_feature).rename('count_over_unique'), on='session')

Return multiple features.

In [67]:
pd.DataFrame(
    df.groupby('session').apply(lambda g: [g['page'].count(), g['page'].nunique()]).values.tolist(),
    columns=['count', 'nunique']
)

Unnamed: 0,count,nunique
0,3,3
1,3,2
2,1,1


Pivot to one-hot encode sequences of variables lengths.

In [60]:
import numpy as np

pd.pivot_table(df, index='session', columns='page', values='time', aggfunc=np.sum).fillna(0).add_prefix('visited_')

page,visited_1,visited_2,visited_3,visited_4
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,1.0,1.0,0.0
2,2.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0


In [9]:
import functools

import gensim
import numpy as np
from sklearn import decomposition
from sklearn import feature_extraction
from sklearn import pipeline


text = functools.reduce(
    lambda a, b: a.astype(str) + ' ' + b.astype(str), 
    [
        df['Name'],
        df['Sex'],
        df['cabin_letter'].str.replace(' ', '_')
    ]
)


class MeanEmbeddingVectorizer():
    
    def __init__(self, size):
        self.size = size
        
    def fit(self, X, y=None):
        model = gensim.models.Word2Vec(text, size=self.size)
        self.word2vec_ = dict(zip(model.wv.index2word, model.wv.vectors))
        return self

    def transform(self, X):
        dim = len(self.word2vec_.values())
        return np.array([
            np.mean([self.word2vec_[w] for w in words if w in self.word2vec_]
                    or [np.zeros(dim)], axis=0)
            for words in text
        ])

pipe = pipeline.FeatureUnion([
    ('cat2vec', MeanEmbeddingVectorizer(size=5)),
    ('nmf', pipeline.Pipeline([
        ('count', feature_extraction.text.TfidfVectorizer(max_df=0.95, min_df=2, max_features=400)),
        ('vectorize', decomposition.NMF(n_components=5)),
    ]))
])

text_features = pipe.fit_transform(text)

df = pd.concat((df, pd.DataFrame(text_features).add_prefix('text_vec_')), axis='columns')

One-hot encode.

Checkpoint.

In [11]:
df.to_feather('data/features.ftr')

# Learning

In [None]:
df = pd.read_feather('data/features.ftr')

Prepare the datasets.

In [53]:
to_drop = ['sid', 'is_train']

X_train = df.query('is_train == 1').drop(to_drop + ['target'], axis='columns')
y_train = df.query('is_train == 1')['target']
X_test = df.query('is_train == 0').drop(to_drop + ['target'], axis='columns')
submission = df.query('is_train == 0')['sid'].to_frame()
submission['target'] = 0

Do some sanity checks.

In [55]:
assert len(X_train) == 133123
assert len(y_train) == 133123
assert len(X_test) == 88750
assert len(submission) == 88750
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [66]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection
import xam


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary',
    'num_threads': 8,
    'num_leaves': 2 ** 3,
    'min_data_per_group': 30,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 5,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 30,
    'learning_rate': 0.08,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
submission['target'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=20,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['binary_logloss'][-1]
    val_scores[i] = evals_result['val']['binary_logloss'][-1]
    
    # Accumulate test predictions
    submission['target'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Training until validation scores don't improve for 20 rounds.
[50]	fit's binary_logloss: 0.269965	val's binary_logloss: 0.275023
[100]	fit's binary_logloss: 0.268578	val's binary_logloss: 0.274388
[150]	fit's binary_logloss: 0.267612	val's binary_logloss: 0.274269
Early stopping, best iteration is:
[141]	fit's binary_logloss: 0.267745	val's binary_logloss: 0.274237
Training until validation scores don't improve for 20 rounds.
[50]	fit's binary_logloss: 0.269594	val's binary_logloss: 0.276518
[100]	fit's binary_logloss: 0.268078	val's binary_logloss: 0.276128
[150]	fit's binary_logloss: 0.267046	val's binary_logloss: 0.27596
Early stopping, best iteration is:
[156]	fit's binary_logloss: 0.266954	val's binary_logloss: 0.275949
Training until validation scores don't improve for 20 rounds.
[50]	fit's binary_logloss: 0.272073	val's binary_logloss: 0.266709
[100]	fit's binary_logloss: 0.270613	val's binary_logloss: 0.26603
[150]	fit's binary_logloss: 0.269556	val's binary_logloss: 0.265975
E

Display feature importance.

In [70]:
feature_importances_

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
action_ADD_TO_BASKET,40782.077949,115,41751.940363,136,41601.895153,130,40541.652144,113,40642.662423,93
action_CAROUSEL,3402.649148,156,3437.42917,136,3341.524972,128,3446.227654,112,3210.832094,91
action_LIST_PRODUCT,1101.353293,79,1264.840563,104,1002.984701,73,1129.202471,64,897.8157,53
action_PA,1146.366207,105,1345.181643,122,1139.013318,97,1004.601848,87,1256.676168,76
action_PRODUCT,1936.50724,140,2266.38267,164,1930.449559,128,1979.154397,144,1801.920318,93
action_PURCHASE_PRODUCT,6940.588735,92,6863.872362,92,6735.852578,82,6602.024999,86,6596.954216,73
action_SEARCH,1660.365706,129,1798.70725,151,1724.648487,144,1648.69967,106,1784.970814,83
action_SHOW_CASE,192.166104,44,274.59049,50,313.797566,51,264.123432,39,147.57525,23
n_pages,1629.835321,127,1300.835523,137,1511.394947,133,1278.360706,96,1225.276385,73


Make predictions.

In [58]:
submission.head()

Unnamed: 0,sid,target
0,EhjG5b8h+RHDgxkQpkMc9sECXbdnA3JOPS07CRYvWmwYSJ...,0.021754
1,34lu87wJJunsPz2c0SxM/aLQ9x+2dlj5W96R95DIG9yRRe...,0.025196
2,xkSYpSt3qRk8X6Ev1W8d72Vj6oyXbI8DKjkCqHmvcaI36F...,0.03189
3,Tu9ylHPEk5Lw5K/8TpuJOEdJv3ZUzJu0zz2sfBrJDQpmyg...,0.05303
4,s6gbPkykrrtcYDB/OidLEdkEsk/bsWIejziprzhq2wJBij...,0.047834


In [69]:
submission.to_csv('submissions/lgbm_{:.5f}_{:.5f}_{:.5f}_{:.5f}.csv'.format(fit_scores.mean(), fit_scores.std(), val_scores.mean(), val_scores.std()), index=False)

# Parameter tuning

In [92]:
from sklearn import model_selection
import scipy.stats as stats


# Parameter spaces can be lists or statistical distributions from
# https://docs.scipy.org/doc/scipy/reference/stats.html
grid = model_selection.ParameterSampler(
    param_distributions={
        'application': ['binary'],
        'boosting_type': ['gbdt'],
        'metric': ['auc'],
        'num_threads': [8],
        'num_leaves': [2 ** 3, 2 ** 4, 2 ** 5],
        'min_data_per_group': [30],
        'max_cat_threshold': [32],
        'max_cat_to_onehot': [4],
        'cat_smooth': [5],
        'cat_l2': [10],
        'max_bin': [255],
        'min_data_in_bin': [3],
        'scale_pos_weight': [1],
        'min_data_in_leaf': [30],
        'learning_rate': stats.uniform(0.1, 0.2),
        'feature_fraction': [1],
        'feature_fraction_seed': [42],
        'bagging_fraction': [1],
        'bagging_seed': [42],
        'lambda_l1': [0],
        'lambda_l2': [0],
        'verbosity': [2]
    },
    n_iter=2,
    random_state=42
)

for config in grid:
    print(config)

{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.1749080237694725, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}
{'application': 'binary', 'bagging_fraction': 1, 'bagging_seed': 42, 'boosting_type': 'gbdt', 'cat_l2': 10, 'cat_smooth': 5, 'feature_fraction': 1, 'feature_fraction_seed': 42, 'lambda_l1': 0, 'lambda_l2': 0, 'learning_rate': 0.13668695797323277, 'max_bin': 255, 'max_cat_threshold': 32, 'max_cat_to_onehot': 4, 'metric': 'auc', 'min_data_in_bin': 3, 'min_data_in_leaf': 30, 'min_data_per_group': 30, 'num_leaves': 8, 'num_threads': 8, 'scale_pos_weight': 1, 'verbosity': 2}


# Stacking

In [26]:
import catboost as cb
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'XGBoost': xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.06,
        n_estimators=300,
        random_state=42
    )
}


stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.Regression(fit_intercept=True),
    metric=metrics.accuracy_score,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'CatBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'early_stopping_rounds': 80,
            'verbose': False
        }
    }
)

In [27]:
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    stack.fit(X_fit, y_fit, verbose=False)
    
    # Store the training scores
    fit_scores[i] = metrics.roc_auc_score(y_fit, stack.predict(X_fit))
    val_scores[i] = metrics.roc_auc_score(y_val, stack.predict(X_val))
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Fit AUC: 0.89920 (+/- 0.00975)
Val AUC: 0.79175 (+/- 0.02826)


In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)