## Benchmark for measuring performance of engineered features and models



In [1]:
import sys
sys.path.insert(0, '../../bosch_helper')
from bosch_helper import *

%matplotlib inline

In [2]:
# params = {'max_depth':14, 'eta':0.03, 'silent':1, 'objective':'binary:logistic', 'nthread':16,
#          'lambda':4, 'subsample':0.9, 'min_child_weight':5, 'booster':'gbtree', 'alpha':0,
#          'base_score':0.0058, 'colsample_bytree':0.6}

In [3]:
# Search space for the random hyper-parameter search over XGBoost settings.
# Every value is a list of candidates; single-element lists pin that setting.
param_grid = {'max_depth': [13, 14, 15, 16],
              'eta': [0.025, 0.03, 0.035],
              'silent': [1],
              'objective': ['binary:logistic'],
              'nthread': [16],
              'lambda': [3.5, 4, 4.5],
              'alpha': [0, 0.25],
              # Row-subsampling ratio must lie in (0, 1]; 0.95 replaces the
              # out-of-range 9.5 typo. Note this changes which configurations
              # a seeded sampler draws from this grid.
              'subsample': [0.85, 0.9, 0.95],
              'min_child_weight': [4.5, 5, 5.5],
              'booster': ['gbtree', 'dart'],
              'base_score': [0.0058],
              'colsample_bytree': [0.5, 0.55, 0.6, 0.65]}

In [4]:
# Materialise 100 sampled configurations from param_grid with a fixed seed.
# ParameterSampler is expected to arrive via the bosch_helper star import
# (presumably scikit-learn's) — TODO confirm.
param_list = [*ParameterSampler(param_grid, n_iter=100, random_state=285749)]

### Load train and test data

In [3]:
# Names of the important numeric features saved by benchmark_1, followed by
# the 'Id' and 'Response' columns that are also read from the raw CSVs.
important_features = [
    *pd.read_csv('../benchmark_1/important_numeric_features.csv',
                 index_col=0, header=None).values.ravel(),
    'Id',
    'Response',
]

In [4]:
# Read only the selected columns of the training set as float32; index_col=0
# makes the first selected column (expected to be 'Id') the index.
x_train_numeric = pd.read_csv(
    '../../data/train_numeric.csv.zip',
    index_col=0,
    usecols=important_features,
    dtype=np.float32,
)

# Keep the label separately so the frame holds features only.
y_train = x_train_numeric['Response']
x_train_numeric.drop(columns=['Response'], inplace=True)

In [5]:
# Station-flow data written by benchmark_2 for the training rows; the file has
# no header row and its first column becomes the index.
date_train = pd.read_csv(
    '../benchmark_2/train_station_flow.csv.gz',
    index_col=0,
    header=None,
)

In [6]:
# Append the station-flow data to the numeric features (index-aligned join).
x_train = x_train_numeric.join(date_train)

# Give the last column the label 'station_flow'. `columns` is kept because the
# test frame reuses the same labels.
columns = [*x_train.columns[:-1], 'station_flow']
x_train.columns = columns

In [7]:
# Test set: same column selection minus the trailing 'Response' entry.
x_test_numeric = pd.read_csv(
    '../../data/test_numeric.csv.zip',
    index_col=0,
    usecols=important_features[:-1],
    dtype=np.float32,
)

date_test = pd.read_csv(
    '../benchmark_2/test_station_flow.csv.gz',
    index_col=0,
    header=None,
)

# Mirror the training frame: join the station-flow data and reuse its labels.
x_test = x_test_numeric.join(date_test)
x_test.columns = columns

In [8]:
# Cast both indexes to 64-bit integers (the numeric CSVs were parsed with
# dtype=np.float32, so the index is presumably float until now).
x_train.index, x_test.index = (
    x_train.index.astype(np.int64),
    x_test.index.astype(np.int64),
)

In [9]:
# The raw numeric frames have been joined into x_train / x_test; drop the
# names and run a garbage-collection pass.
del x_train_numeric
del x_test_numeric
gc.collect()

60

### Load benchmark_3 features

In [10]:
# benchmark_3 features; one file holds the rows for both splits.
start_chunk = pd.read_csv('../benchmark_3/start_chunk.csv.gz', index_col=0)

# Rows with Response == -1 go to the test part, all other rows to the train
# part; the Response column itself is not kept.
start_chunk_train = start_chunk[start_chunk['Response'] != -1].drop(columns=['Response'])
start_chunk_test = start_chunk[start_chunk['Response'] == -1].drop(columns=['Response'])

In [11]:
# Attach the benchmark_3 columns to each split, matching on 'Id'.
x_train = x_train.join(other=start_chunk_train, on='Id')
x_test = x_test.join(other=start_chunk_test, on='Id')

gc.collect()

35

### Load benchmark_4 features

In [12]:
# benchmark_4 neighbor features; one file holds the rows for both splits.
n = pd.read_csv('../benchmark_4/benchmark_4_neighbors.csv.gz', index_col=0)

# Rows with Response == -1 go to the test part, all other rows to the train
# part. The drop is non-inplace so that no slice of `n` is mutated, which
# pandas flags with SettingWithCopyWarning.
neighbor_train = n.loc[n.Response != -1].drop(['Response'], axis=1)
neighbor_test = n.loc[n.Response == -1].drop(['Response'], axis=1)

print(neighbor_test.shape, neighbor_train.shape)

# Attach the neighbor columns to each split, matching on 'Id'.
x_train = x_train.join(neighbor_train, on='Id')
x_test = x_test.join(neighbor_test, on='Id')

gc.collect()

(1183748, 14) (1183747, 14)


28

### Add neighbor numeric features

In [13]:
# Stack train and test under an outer 'train' / 'test' key, then order the
# rows by Id so that rows of both splits are interleaved.
x = pd.concat([x_train, x_test], keys=['train', 'test']).sort_index(axis=0, level='Id')

# Add the neighbor records: the first 150 columns of the preceding row
# ('_previous') and of the following row ('_next') are joined onto each row.
# NOTE(review): 150 is assumed to cover the raw numeric features — confirm.
x = x.join(x.iloc[:, :150].shift(1), rsuffix='_previous')
x = x.join(x.iloc[:, :150].shift(-1), rsuffix='_next')

### Load benchmark_7 features

In [16]:
# benchmark_7 time features, without the raw start/end/duration columns and
# without the Response column.
time_features = pd.read_hdf('../benchmark_7/time_features_diff.hdf', 'time_features').drop(
    columns=['time_start', 'time_end', 'time_duration', 'Response']
)

# The trailing 40 columns are excluded from the join (the MeanTimeDiff
# columns, according to the original note). To inspect them:
# time_features.columns.tolist()[-40:]
x = x.join(time_features.iloc[:, :-40])

### Load benchmark_8 features

In [25]:
# Row-hash features: hash of rows and counts of duplications.
hash_numeric = pd.read_hdf(
    'benchmark_8_numeric_features_2.hdf',
    key='hash_numeric',
)

x = x.join(other=hash_numeric)

### Load zscore per week

In [30]:
# Per-week z-score table stored alongside the hash features.
x_zscore_per_week = pd.read_hdf('benchmark_8_numeric_features_2.hdf', 'zscore_per_week')

# Restrict to the important feature columns: work on a copy of the list and
# take 'Response' and 'Id' back out of it.
tmp = list(important_features)
tmp.remove('Response')
tmp.remove('Id')

# Column names that clash with ones already in x receive the '_zscore' suffix.
x = x.join(x_zscore_per_week[tmp], rsuffix='_zscore')

del x_zscore_per_week
gc.collect()

### Load count encoding

In [45]:
# x_count_encoded = pd.read_hdf('benchmark_8_numeric_features_2.hdf', 'count_encoded')

# tmp = ['count_'+k for k in tmp]

# x_count_encoded_if = x_count_encoded[tmp]

# x_count_encoded_if.shape

# x = x.join(x_count_encoded_if)

# del x_count_encoded, x_count_encoded_if
# gc.collect()

In [61]:
# Undo the stacking: pull each split back out by its outer index key.
x_train, x_test = x.loc['train'], x.loc['test']

In [64]:
# The stacked frame has already been split into x_train / x_test, so drop the
# name and run a garbage-collection pass.
del x
gc.collect()

Exception ignored in: <function DMatrix.__del__ at 0x7f3bd4348510>
Traceback (most recent call last):
  File "/c3se/NOBACKUP/users/lyaa/conda_dir/miniconda/envs/kaggle/lib/python3.7/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


28632

### CV score based on stratified KFold with repeated models

In [None]:
# Cross-validation run with the hand-picked 'gbtree' configuration.
params = {
    'max_depth': 18,
    'eta': 0.03,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 16,
    'lambda': 4,
    'subsample': 0.9,
    'min_child_weight': 5,
    'booster': 'gbtree',
    'alpha': 0,
    'base_score': 0.0058,
    'colsample_bytree': 0.6,
}

# Alternative 'dart' configuration, kept for reference:
# params = {'max_depth':14, 'eta':0.03, 'silent':1, 'objective':'binary:logistic', 'nthread':20,
#         'lambda':4, 'subsample':0.9, 'min_child_weight':5, 'booster':'dart', 'alpha':0,
#         'base_score':0.0058, 'nthread':20, 'colsample_bytree':0.6, 
#         'sample_type':'uniform', 'normalize_type':'tree', 'rate_drop':0.1, 'skip_drop':0.2, 'one_drop':True}

# Helper from bosch_helper; judging by its name and arguments it runs repeated
# stratified K-fold CV (5 splits x 3 repeats here) — confirm in the helper.
cv_results, clfs, running_time = cross_val_predict_skf_rm_xgb(
    params,
    x_train,
    y_train,
    num_boost_round=100,
    n_splits=5,
    n_repeats=3,
    random_state=70864,
    verbose_eval=True,
)

# Bundle everything the helper returned and store it on disk.
results = dict(clfs=clfs, cv_results=cv_results, running_time=running_time)
save_pickle(results, 'results_benchmark_8_cv_1.pickle')

[0]	train-error:0.005811	test-error:0.005812	train-MCC:0.276114	test-MCC:0.265946
[1]	train-error:0.005811	test-error:0.005812	train-MCC:0.340044	test-MCC:0.338623
[2]	train-error:0.005811	test-error:0.005812	train-MCC:0.346387	test-MCC:0.353783
[3]	train-error:0.005811	test-error:0.005812	train-MCC:0.371335	test-MCC:0.38326
[4]	train-error:0.005811	test-error:0.005812	train-MCC:0.380611	test-MCC:0.389524
[5]	train-error:0.005811	test-error:0.005812	train-MCC:0.387751	test-MCC:0.3906
[6]	train-error:0.005811	test-error:0.005812	train-MCC:0.397668	test-MCC:0.395693
[7]	train-error:0.005811	test-error:0.005812	train-MCC:0.411766	test-MCC:0.398945
[8]	train-error:0.005811	test-error:0.005812	train-MCC:0.422518	test-MCC:0.406896
[9]	train-error:0.005811	test-error:0.005812	train-MCC:0.431184	test-MCC:0.411414
[10]	train-error:0.005811	test-error:0.005812	train-MCC:0.435415	test-MCC:0.415496
[11]	train-error:0.005811	test-error:0.005812	train-MCC:0.443826	test-MCC:0.418605
[12]	train-error:

In [None]:
# Mean and standard deviation of the CV scores taken along axis 1
# (presumably one column per fold/repeat — confirm against the helper).
cv_train_mean, cv_train_std = cv_results['train'].mean(axis=1), cv_results['train'].std(axis=1)
cv_test_mean, cv_test_std = cv_results['test'].mean(axis=1), cv_results['test'].std(axis=1)

In [None]:
# Learning curves: mean CV score with a +/- one standard deviation band.
plt.figure(figsize=(14, 7))

# Each curve gets an x-axis derived from its own length, so the test curve no
# longer depends on the length of the train array.
train_rounds = np.arange(len(cv_train_mean))
test_rounds = np.arange(len(cv_test_mean))

# Labels are attached to the lines themselves. A positional
# plt.legend(['train', 'test']) pairs labels with artists by order, which can
# hit a fill_between band instead of the second line.
plt.plot(train_rounds, cv_train_mean, label='train')
plt.fill_between(train_rounds, cv_train_mean-cv_train_std, cv_train_mean+cv_train_std, alpha=0.5)
plt.plot(test_rounds, cv_test_mean, label='test')
plt.fill_between(test_rounds, cv_test_mean-cv_test_std, cv_test_mean+cv_test_std, alpha=0.5)

# NOTE(review): assumes one row of cv_results per boosting round — confirm.
plt.xlabel('Boosting round')
plt.ylabel('CV score (mean +/- std)')
plt.legend();

In [None]:
# Fit the final model on the whole training set with a fixed seed.
dtrain = xgb.DMatrix(x_train, label=y_train)
params['seed'] = 28537894
clf = xgb.train(
    params,
    dtrain,
    num_boost_round=60,
    feval=mcc_eval,
    evals=[(dtrain, 'train')],
)

y_train_pred = clf.predict(dtrain)

# Scan 400 evenly spaced cut-offs between 0.01 and 0.99 and keep the one with
# the highest MCC.
# NOTE(review): y_train_pred is predicted on dtrain itself, so the threshold
# is tuned on in-sample predictions.
thresholds = np.linspace(0.01, 0.99, num=400)
mcc = np.array([matthews_corrcoef(y_train, y_train_pred > cutoff) for cutoff in thresholds])
plt.plot(thresholds, mcc)
best_threshold = thresholds[np.argmax(mcc)]

print('Optimal MCC = {:.3f}'.format(np.max(mcc)))
print('Optimal threshold = {:.3f}'.format(best_threshold))

### Predict on test data

In [None]:
# Score the test set and binarise the predictions with best_threshold.
dtest = xgb.DMatrix(data=x_test)
y_test_pred = clf.predict(dtest)
y_test_pred_int = (y_test_pred > best_threshold).astype(int)

# Put the predictions into the sample-submission frame and write it out
# gzip-compressed.
# NOTE(review): the output name says "benchmark_7" although the CV results of
# this notebook are saved as benchmark_8 — confirm the intended file name.
sub = pd.read_csv("../../data/sample_submission.csv.zip", index_col=0).assign(
    Response=y_test_pred_int
)
sub.to_csv("15-benchmark_7_submission_1.csv.gz", compression="gzip")

In [None]:
# Spot checks of the CV test statistics at individual positions.
# NOTE: a notebook cell displays only the value of its final expression, so
# the first three expressions below are evaluated and then discarded.

# Mean plus one standard deviation at position 60.
cv_test_mean[60]+cv_test_std[60]

# Mean minus one standard deviation at position 60.
cv_test_mean[60]-cv_test_std[60]

# Mean at position 79.
cv_test_mean[79]

# Standard deviation at position 60 (the value shown as the cell output).
cv_test_std[60]