In [1]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
df = pd.concat((train, test), sort=False, ignore_index=True)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
import multiprocessing as mp

def apply_parallel(groups, func):
    with mp.Pool(mp.cpu_count()) as p:
        return pd.Series(
            p.map(func, [group for name, group in groups]),
            index=[name for name, _ in groups]
        )

In [40]:
df = pd.DataFrame({
    'session': [1, 1, 1, 2, 2, 2],
    'page': [1, 2, 3, 1, 1, 2]
})

df

Unnamed: 0,session,page
0,1,1
1,1,2
2,1,3
3,2,1
4,2,1
5,2,2


In [41]:
def get_feature(g):
    return g['page'].count() / g['page'].nunique()


df = df.join(apply_parallel(df.groupby('session'), get_feature).rename('count_over_unique'), on='session')

df = df.join(df.groupby('session').apply(get_feature).rename('count_over_unique'), on='session')

df

Unnamed: 0,session,page,count_over_unique
0,1,1,1.0
1,1,2,1.0
2,1,3,1.0
3,2,1,1.5
4,2,1,1.5
5,2,2,1.5


In [39]:
df

Unnamed: 0,session,page,count_over_unique
0,1,1,1.0
1,1,2,1.0
2,1,3,1.0
3,2,1,1.5
4,2,1,1.5
5,2,2,1.5


## Feature extraction

In [2]:
df['is_male'] = df['Sex'].apply(lambda x: x == 'male')

In [3]:
df['Embarked'] = pd.Categorical(df['Embarked'])

In [4]:
df['family_name'] = df['Name'].apply(lambda x: x.split(',')[0])
df = df.join(df['family_name'].value_counts().rename('family_name_count'), on='family_name')

In [5]:
df['title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0])
df['title'] = pd.Categorical(df['title'])

In [6]:
df['cabin_letter'] = df['Cabin'].apply(lambda x: x[0] if isinstance(x, str) else 'no cabin')
df['cabin_letter'] = pd.Categorical(df['cabin_letter'])

In [7]:
df['Embarked'].fillna(df['Embarked'].mode().iloc[0], inplace=True) 

In [8]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_male,family_name,family_name_count,title,cabin_letter
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True,Braund,2,Mr,no cabin
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False,Cumings,2,Mrs,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,False,Heikkinen,1,Miss,no cabin
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,False,Futrelle,2,Mrs,C
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True,Allen,2,Mr,no cabin


In [9]:
import functools

import gensim
import numpy as np
from sklearn import decomposition
from sklearn import feature_extraction
from sklearn import pipeline


text = functools.reduce(
    lambda a, b: a.astype(str) + ' ' + b.astype(str), 
    [
        df['Name'],
        df['Sex'],
        df['cabin_letter'].str.replace(' ', '_')
    ]
)


class MeanEmbeddingVectorizer():
    
    def __init__(self, size):
        self.size = size
        
    def fit(self, X, y=None):
        model = gensim.models.Word2Vec(text, size=self.size)
        self.word2vec_ = dict(zip(model.wv.index2word, model.wv.vectors))
        return self

    def transform(self, X):
        dim = len(self.word2vec_.values())
        return np.array([
            np.mean([self.word2vec_[w] for w in words if w in self.word2vec_]
                    or [np.zeros(dim)], axis=0)
            for words in text
        ])

pipe = pipeline.FeatureUnion([
    ('cat2vec', MeanEmbeddingVectorizer(size=5)),
    ('nmf', pipeline.Pipeline([
        ('count', feature_extraction.text.TfidfVectorizer(max_df=0.95, min_df=2, max_features=400)),
        ('vectorize', decomposition.NMF(n_components=5)),
    ]))
])

text_features = pipe.fit_transform(text)

df = pd.concat((df, pd.DataFrame(text_features).add_prefix('text_vec_')), axis='columns')

One-hot encode.

In [10]:
df = pd.get_dummies(df, columns=df.select_dtypes('category').columns)

Checkpoint.

In [11]:
df.to_feather('data/features.fth')

In [12]:
df = pd.read_feather('data/features.fth')

  return feather.read_dataframe(path, nthreads=nthreads)


## Learning

Prepare the datasets.

In [13]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,title_ the Countess,cabin_letter_A,cabin_letter_B,cabin_letter_C,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_T,cabin_letter_no cabin
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,0,0,0,0,0,0,0,0,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,0,0,0,1,0,0,0,0,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,0,0,0,0,0,0,0,0,0,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,0,0,0,1,0,0,0,0,0,0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,0,0,0,0,0,0,0,0,0,1


In [14]:
is_train = df['Survived'].notnull()
to_drop = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'family_name']

X_train = df[is_train].drop(to_drop + ['Survived'], axis='columns')
y_train = df[is_train]['Survived']
X_test = df[~is_train].drop(to_drop + ['Survived'], axis='columns')
submission = df[~is_train]['PassengerId'].to_frame()

Do some sanity checks.

In [15]:
assert len(X_train) == 891
assert len(y_train) == 891
assert len(X_test) == 418
assert len(submission) == 418
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [16]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'num_threads': 8,
    'num_leaves': 2 ** 3,
    'min_data_per_group': 30,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 5,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 30,
    'learning_rate': 0.08,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=20,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['auc'][-1]
    val_scores[i] = evals_result['val']['auc'][-1]
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Training until validation scores don't improve for 20 rounds.
[50]	fit's auc: 0.950803	val's auc: 0.877606
Early stopping, best iteration is:
[39]	fit's auc: 0.939954	val's auc: 0.882497
Training until validation scores don't improve for 20 rounds.
[50]	fit's auc: 0.948755	val's auc: 0.875622
Early stopping, best iteration is:
[32]	fit's auc: 0.926393	val's auc: 0.879656
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[10]	fit's auc: 0.894816	val's auc: 0.888029
Training until validation scores don't improve for 20 rounds.
[50]	fit's auc: 0.95408	val's auc: 0.842127
[100]	fit's auc: 0.98263	val's auc: 0.846861
Early stopping, best iteration is:
[105]	fit's auc: 0.984322	val's auc: 0.851867
Training until validation scores don't improve for 20 rounds.
[50]	fit's auc: 0.948978	val's auc: 0.891627
Early stopping, best iteration is:
[47]	fit's auc: 0.946527	val's auc: 0.892444
Fit AUC: 0.95699 (+/- 0.02045)
Val AUC: 0.87587 (+/- 0.01337)


Display feature importance.

In [17]:
feature_importances_

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
Pclass,405.801406,23,346.540541,22,292.2216,10,327.718702,24,338.893342,20
Age,127.41606,32,154.4337,33,25.894771,5,350.972167,114,178.610441,44
SibSp,0.0,0,3.96485,1,0.0,0,20.249259,14,36.20646,8
Parch,0.0,0,6.79476,2,0.0,0,0.0,0,0.0,0
Fare,248.934542,35,257.019152,35,128.02454,11,298.971003,84,272.4266,40
is_male,11.24399,3,0.0,0,0.0,0,8.18416,3,7.63622,2
family_name_count,51.49052,13,37.92006,8,0.0,0,129.071291,34,41.57655,11
text_vec_0,96.67858,20,139.904751,23,63.45359,8,146.243627,55,301.014132,43
text_vec_1,175.144522,22,9.08614,2,87.278802,5,148.123692,37,31.42438,9
text_vec_2,50.912079,12,85.343349,9,24.82329,4,120.229845,41,64.94236,16


Make predictions.

In [18]:
submission.head()

Unnamed: 0,PassengerId,Survived
891,892,0.090694
892,893,0.473247
893,894,0.107661
894,895,0.155314
895,896,0.525012


## Stacking

In [26]:
import catboost as cb
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'XGBoost': xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.06,
        n_estimators=300,
        random_state=42
    )
}


stack = xam.ensemble.BaggedStackingClassifier(
    models=models,
    meta_model=linear_model.Regression(fit_intercept=True),
    metric=metrics.accuracy_score,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'CatBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'early_stopping_rounds': 80,
            'verbose': False
        }
    }
)

In [27]:
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    stack.fit(X_fit, y_fit, verbose=False)
    
    # Store the training scores
    fit_scores[i] = metrics.roc_auc_score(y_fit, stack.predict(X_fit))
    val_scores[i] = metrics.roc_auc_score(y_val, stack.predict(X_val))
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Fit AUC: 0.89920 (+/- 0.00975)
Val AUC: 0.79175 (+/- 0.02826)


In [21]:
Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)

SyntaxError: invalid syntax (<ipython-input-21-ff1a4e09e53e>, line 1)