In [1]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
df = pd.concat((train, test), sort=False)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Feature extraction

In [2]:
df['is_male'] = df['Sex'].apply(lambda x: x == 'male')

In [3]:
df['Embarked'] = pd.Categorical(df['Embarked'])

In [4]:
df['family_name'] = df['Name'].apply(lambda x: x.split(',')[0])
df = df.join(df['family_name'].value_counts().rename('family_name_count'), on='family_name')

In [5]:
df['title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0])
df['title'] = pd.Categorical(df['title'])

In [6]:
df['cabin_letter'] = df['Cabin'].apply(lambda x: x[0] if isinstance(x, str) else 'no cabin')
df['cabin_letter'] = pd.Categorical(df['cabin_letter'])

In [7]:
df['Embarked'].fillna(df['Embarked'].mode().iloc[0], inplace=True) 

One-hot encode.

In [8]:
df = pd.get_dummies(df, columns=df.select_dtypes('category').columns)

## Learning

Prepare the datasets.

In [9]:
is_train = df['Survived'].notnull()
to_drop = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'family_name']

X_train = df[is_train].drop(to_drop + ['Survived'], axis='columns')
y_train = df[is_train]['Survived']
X_test = df[~is_train].drop(to_drop + ['Survived'], axis='columns')
submission = df[~is_train]['PassengerId'].to_frame()

Do some sanity checks.

In [10]:
assert len(X_train) == 891
assert len(y_train) == 891
assert len(X_test) == 418
assert len(submission) == 418
assert len(X_train.columns) == len(X_test.columns)

Find the optimal number of boosting rounds through cross-validation.

In [12]:
import time

import lightgbm as lgbm
import numpy as np
from sklearn import model_selection


# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'application': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'num_threads': 8,
    'num_leaves': 2 ** 3,
    'min_data_per_group': 30,
    'max_cat_threshold': 32,
    'max_cat_to_onehot': 4,
    'cat_smooth': 5,
    'cat_l2': 10,
    'max_bin': 255,
    'min_data_in_bin': 3,
    'scale_pos_weight': 1,
    'min_data_in_leaf': 30,
    'learning_rate': 0.08,
    'feature_fraction': 1,
    'feature_fraction_seed': 42,
    'bagging_fraction': 1,
    'bagging_seed': 42,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'verbosity': 2
}

n_splits = 5
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)
feature_importances_ = pd.DataFrame(index=X_train.columns)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.Dataset
    fit_set = lgbm.Dataset(X_fit, y_fit)
    val_set = lgbm.Dataset(X_val, y_val)

    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.train
    evals_result = {}
    model = lgbm.train(
        params=params,
        train_set=fit_set,
        valid_sets=(fit_set, val_set),
        valid_names=('fit', 'val'),
        num_boost_round=10000,
        verbose_eval=50,
        early_stopping_rounds=20,
        evals_result=evals_result
    )

    # Store the feature importances
    feature_importances_[f'gain_{i}'] = model.feature_importance('gain')
    feature_importances_[f'split_{i}'] = model.feature_importance('split')
    
    # Store the training scores
    fit_scores[i] = evals_result['fit']['auc'][-1]
    val_scores[i] = evals_result['val']['auc'][-1]
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[26]	fit's auc: 0.899069	val's auc: 0.899807
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[20]	fit's auc: 0.90186	val's auc: 0.865873
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[24]	fit's auc: 0.898693	val's auc: 0.904299
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[14]	fit's auc: 0.905975	val's auc: 0.829478
Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[8]	fit's auc: 0.888327	val's auc: 0.895711
Fit AUC: 0.91646 (+/- 0.00617)
Val AUC: 0.87468 (+/- 0.02973)


Display feature importance.

In [13]:
feature_importances_

Unnamed: 0,gain_0,split_0,gain_1,split_1,gain_2,split_2,gain_3,split_3,gain_4,split_4
Pclass,436.16064,24,332.807412,16,412.409901,25,280.929058,14,189.848204,7
Age,114.02562,34,108.2942,26,111.10374,33,102.33104,22,90.41671,14
SibSp,34.17811,9,55.67861,8,3.44981,1,0.0,0,89.0805,3
Parch,2.45844,1,0.0,0,8.74353,3,0.0,0,0.0,0
Fare,225.316279,34,257.751248,34,354.320639,47,192.30531,20,158.564569,16
is_male,707.63168,18,943.025303,14,314.131588,13,736.834106,9,621.42849,5
family_name_count,35.25009,13,54.8551,11,44.42173,12,64.551409,13,4.38681,1
Embarked_C,10.97211,4,0.0,0,1.4012,1,0.0,0,0.0,0
Embarked_Q,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
Embarked_S,43.42269,11,42.035339,7,17.44951,6,11.62508,3,0.0,0


Make predictions.

In [14]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0.160266
1,893,0.472379
2,894,0.16339
3,895,0.187622
4,896,0.443754


## Stacking

In [51]:
import catboost as cb
from sklearn import ensemble
from sklearn import linear_model 
from sklearn import metrics
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
import xam
import xgboost as xgb


cat_cols = X_train.select_dtypes('category').columns.tolist()
num_cols = list(set(X_train.columns) - set(cat_cols))

models = {
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html#lightgbm.LGBMClassifier
    'LightGBM': lgbm.LGBMClassifier(**params),
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
    'XGBoost': xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.06,
        n_estimators=10000,
        random_state=42
    )
}


stack = xam.ensemble.StackingClassifier(
    models=models,
    meta_model=linear_model.LinearRegression(),
    metric=metrics.accuracy_score,
    use_base_features=False,
    use_probas=True,
    fit_handlers={
        'LightGBM': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'eval_names': ['fit', 'val'],
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'XGBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'eval_metric': 'auc',
            'early_stopping_rounds': 80,
            'verbose': False
        },
        'CatBoost': lambda X_fit, y_fit, X_val, y_val: {
            'eval_set': [(X_fit, y_fit), (X_val, y_val)],
            'early_stopping_rounds': 80,
            'verbose': False
        }
    }
)

In [52]:
submission['Survived'] = 0
fit_scores = np.zeros(n_splits)
val_scores = np.zeros(n_splits)

for i, (fit_idx, val_idx) in enumerate(cv.split(X_train, y_train)):

    X_fit = X_train.iloc[fit_idx]
    X_val = X_train.iloc[val_idx]
    y_fit = y_train.iloc[fit_idx]
    y_val = y_train.iloc[val_idx]

    stack.fit(X_fit, y_fit, verbose=False)
    
    # Store the training scores
    fit_scores[i] = metrics.roc_auc_score(y_fit, stack.predict(X_fit))
    val_scores[i] = metrics.roc_auc_score(y_val, stack.predict(X_val))
    
    # Accumulate test predictions
    submission['Survived'] += (model.predict(X_test) / n_splits)

print('Fit AUC: {:.5f} (+/- {:.5f})'.format(fit_scores.mean(), fit_scores.std()))
print('Val AUC: {:.5f} (+/- {:.5f})'.format(val_scores.mean(), val_scores.std()))

Fit AUC: 0.94612 (+/- 0.00661)
Val AUC: 0.87672 (+/- 0.03101)
