## Benchmark for measuring performance of engineered features and models

Test benchmark 8, z-score per week

In [1]:
import sys
sys.path.insert(0, '../../bosch_helper')
from bosch_helper import *

%matplotlib inline

  from collections import Sequence


In [2]:
# Display the numeric limits (resolution, min, max) of the float16 dtype.
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

### Load train and test data

In [3]:
# Names of the numeric features listed by benchmark_1, followed by the
# 'Id' and 'Response' columns that are needed when reading the raw CSV files.
feature_table = pd.read_csv('../benchmark_1/important_numeric_features.csv', index_col=0, header=None)
important_features = list(feature_table.values.ravel()) + ['Id', 'Response']

In [4]:
# Read only the selected columns of the training set, stored as float32.
x_train_numeric = pd.read_csv(
    '../../data/train_numeric.csv.zip',
    index_col=0,
    usecols=important_features,
    dtype=np.float32,
)

# Separate the target: pop() returns the column and removes it from the frame.
y_train = x_train_numeric.pop('Response')

In [5]:
# Station-flow data written by benchmark_2. The file is read without a header
# row and its first column is used as the index.
date_train = pd.read_csv('../benchmark_2/train_station_flow.csv.gz', index_col=0, header=None)

In [6]:
# Attach the station-flow column to the numeric training features.
x_train = x_train_numeric.join(date_train)

# The joined column is the last one; label it 'station_flow'.
# `columns` is kept because the test frame reuses the same labels.
columns = list(x_train.columns[:-1]) + ['station_flow']
x_train.columns = columns

In [7]:
# Test set: same columns as training except the trailing 'Response' entry.
x_test_numeric = pd.read_csv(
    '../../data/test_numeric.csv.zip',
    index_col=0,
    usecols=important_features[:-1],
    dtype=np.float32,
)
date_test = pd.read_csv(
    '../benchmark_2/test_station_flow.csv.gz',
    index_col=0,
    header=None,
)

# Join the station-flow column and apply the training column labels.
x_test = x_test_numeric.join(date_test)
x_test.columns = columns

In [8]:
# Convert the index of both frames to int64.
for frame in (x_train, x_test):
    frame.index = frame.index.astype(np.int64)

In [9]:
# The raw numeric frames have been joined into x_train / x_test; drop the
# names and run the garbage collector (its return value is the cell output).
del x_train_numeric, x_test_numeric
gc.collect()

60

### Load benchmark_3 features

In [10]:
# benchmark_3 features for train and test rows in a single file.
start_chunk = pd.read_csv('../benchmark_3/start_chunk.csv.gz', index_col=0)

# Rows with Response == -1 go to the test part, all other rows to the train part.
is_test_row = start_chunk.Response == -1
start_chunk_train = start_chunk.loc[~is_test_row].drop('Response', axis=1)
start_chunk_test = start_chunk.loc[is_test_row].drop('Response', axis=1)

In [11]:
# Add the benchmark_3 columns to both feature frames, matching rows on 'Id'.
x_train = x_train.join(start_chunk_train, on='Id')
x_test = x_test.join(start_chunk_test, on='Id')

# Return value of gc.collect() is displayed as the cell output.
gc.collect()

35

### Load benchmark_4 features

In [12]:
# benchmark_4 neighbor features for train and test rows in a single file.
n = pd.read_csv('../benchmark_4/benchmark_4_neighbors.csv.gz', index_col=0)

# Rows with Response == -1 go to the test part, all other rows to the train part.
# drop() is used without inplace=True: dropping in place on the result of a
# .loc selection modifies a possible copy and triggers SettingWithCopyWarning.
is_test_row = n.Response == -1
neighbor_train = n.loc[~is_test_row].drop(['Response'], axis=1)
neighbor_test = n.loc[is_test_row].drop(['Response'], axis=1)

print(neighbor_test.shape, neighbor_train.shape)

# Add the neighbor columns to both feature frames, matching rows on 'Id'.
x_train = x_train.join(neighbor_train, on='Id')
x_test = x_test.join(neighbor_test, on='Id')

gc.collect()

(1183748, 14) (1183747, 14)


28

### Add neighbor numeric features
For every record, add the important numeric features of the previous and the next record (in Id order) as extra columns.

In [13]:
# Stack train and test (tagged by an outer 'train' / 'test' key) and order the
# rows by Id so that consecutive rows are consecutive records.
x = pd.concat([x_train, x_test], keys=['train', 'test']).sort_index(axis=0, level='Id')

In [14]:
from sklearn.preprocessing import LabelEncoder
# Replace the station_flow values by integer labels fitted on train + test together.
le = LabelEncoder()
x['station_flow'] = le.fit_transform(x['station_flow'])

In [15]:
# Append the first 150 columns of the preceding row (shift by +1) and of the
# following row (shift by -1); overlapping names receive the given suffix.
for periods, suffix in ((1, '_previous'), (-1, '_next')):
    x = x.join(x.iloc[:, :150].shift(periods), rsuffix=suffix)

In [16]:
# Split the combined frame back into its two parts using the outer index key.
x_train, x_test = x.loc['train'], x.loc['test']

# The combined frame is no longer needed.
del x
gc.collect()

63

### Load benchmark_7 features

Time series features

In [17]:
# Time-series features from benchmark_7, without the raw time columns and the target.
time_features = pd.read_hdf('../benchmark_7/time_features_diff.hdf', 'time_features').drop(
    ['time_start', 'time_end', 'time_duration', 'Response'], axis=1
)

In [18]:
# Keep every column name except the last 40.
time_feature_names = time_features.columns[:-40].tolist()

In [19]:
# Do not use MeanTimeDiff features
# Only the columns in time_feature_names are joined; the 'train' / 'test' key
# selects the matching part of time_features.
x_train = x_train.join(time_features.loc['train', time_feature_names])
x_test = x_test.join(time_features.loc['test', time_feature_names])

In [20]:
# Sanity check: both frames should report the same number of columns.
print(x_train.shape, x_test.shape)

(1183747, 754) (1183748, 754)


In [21]:
# time_features has been joined into x_train / x_test; drop the name and
# run the garbage collector (its return value is the cell output).
del time_features
gc.collect()

121

### zscore each week

In [31]:
# Per-week z-scores of the numeric features, indexed by (train/test, Id).
x_zscore_per_week = pd.read_hdf('benchmark_8_numeric_features_2.hdf', 'zscore_per_week')

# Order the rows by the 'Id' index level. The `by` keyword of sort_index is
# deprecated and absent from newer pandas releases; sorting on the index
# level is the supported way to get the same ordering.
x_zscore_per_week.sort_index(level='Id', inplace=True)
x_zscore_per_week.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,L0_S0_F0,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,L0_S0_F2,L0_S0_F20,L0_S0_F22,L0_S0_F4,...,L3_S50_F4243,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262
Unnamed: 0_level_1,Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
test,1,,,,,,,,,,,...,,,,,,,,,,
test,2,,,,,,,,,,,...,,,,,,,,,,
test,3,,,,,,,,,,,...,,,,,,,,,,
train,4,0.445557,1.416992,-0.343506,0.793945,-0.671875,-0.125977,-0.80127,-1.272461,-1.272461,-0.39209,...,,,,,,,,,,
test,5,0.701172,-0.612305,-0.835449,-0.766602,1.463867,1.358398,0.578613,-0.429199,-0.429199,-0.5,...,,,,,,,,,,


In [32]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0_level_0,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,L0_S0_F18,L0_S0_F20,...,L3_S33_F3857_zscore,L3_S33_F3859_zscore,L3_S33_F3865_zscore,L3_S34_F3882_zscore,L3_S35_F3889_zscore,L3_S35_F3896_zscore,L3_S36_F3920_zscore,L3_S36_F3924_zscore,L3_S38_F3952_zscore,L3_S48_F4196_zscore
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.03,-0.034,-0.197,-0.179,0.116,-0.015,-0.032,0.02,0.083,-0.273,...,-2.185547,-0.652344,0.36792,-0.03067,0.283691,0.291992,,,,
6,,,,,,,,,,,...,0.57959,-0.741699,-0.345947,-0.021423,-0.528809,-0.52832,,,,
7,0.088,0.086,0.003,-0.052,0.025,-0.015,-0.072,-0.225,-0.147,0.25,...,2.005859,1.958008,0.841309,-0.034668,-0.023499,-0.016174,,,,
9,-0.036,-0.064,0.294,0.33,0.161,0.022,0.128,-0.026,-0.046,-0.253,...,0.217651,1.295898,0.473145,-0.046051,,,-0.049255,-0.049255,,
11,-0.055,-0.086,0.294,0.33,0.025,0.03,0.168,-0.169,-0.099,0.042,...,-0.182495,0.943848,-0.286865,-0.039459,,,1.396484,1.396484,,


In [26]:
# Feature names only: a copy of important_features without the target and key columns.
tmp = list(important_features)
for label in ('Response', 'Id'):
    tmp.remove(label)

In [28]:
# Restrict the z-score table to the columns named in tmp.
x_zscore_per_week_if = x_zscore_per_week[tmp]

# The z-score columns share their names with columns already present in
# x_train / x_test, so the joined copies receive the '_zscore' suffix.
x_train = x_train.join(x_zscore_per_week_if.loc['train'], rsuffix='_zscore')
x_test = x_test.join(x_zscore_per_week_if.loc['test'], rsuffix='_zscore')

del x_zscore_per_week, x_zscore_per_week_if
gc.collect()

88

### CV score based on stratified KFold with repeated models

In [30]:
# Cross-validation with the gbtree booster.
params = {
    'max_depth': 14,
    'eta': 0.03,
    'silent': 1,
    'objective': 'binary:logistic',
    'nthread': 20,
    'lambda': 4,
    'subsample': 0.9,
    'min_child_weight': 5,
    'booster': 'gbtree',
    'alpha': 0,
    'base_score': 0.0058,
    'colsample_bytree': 0.6,
}

# Alternative kept for reference: the same settings with the dart booster.
# params = {'max_depth':14, 'eta':0.03, 'silent':1, 'objective':'binary:logistic', 'nthread':20,
#         'lambda':4, 'subsample':0.9, 'min_child_weight':5, 'booster':'dart', 'alpha':0,
#         'base_score':0.0058, 'colsample_bytree':0.6,
#         'sample_type':'uniform', 'normalize_type':'tree', 'rate_drop':0.1, 'skip_drop':0.2, 'one_drop':True}

# Helper from bosch_helper: 5 splits, 3 repeats, 80 boosting rounds.
cv_results, clfs, running_time = cross_val_predict_skf_rm_xgb(
    params,
    x_train,
    y_train,
    num_boost_round=80,
    n_splits=5,
    n_repeats=3,
    random_state=5870577,
    verbose_eval=True,
)

# Persist the classifiers, the scores and the timing of this run.
results = {'clfs': clfs, 'cv_results': cv_results, 'running_time': running_time}
save_pickle(results, 'results_benchmark_8_cv_4.pickle')

[0]	train-error:0.005811	test-error:0.005812	train-MCC:0.280293	test-MCC:0.257913
[1]	train-error:0.005811	test-error:0.005812	train-MCC:0.329268	test-MCC:0.31285
[2]	train-error:0.005811	test-error:0.005812	train-MCC:0.35113	test-MCC:0.341077
[3]	train-error:0.005811	test-error:0.005812	train-MCC:0.363343	test-MCC:0.357184
[4]	train-error:0.005811	test-error:0.005812	train-MCC:0.384067	test-MCC:0.362402
[5]	train-error:0.005811	test-error:0.005812	train-MCC:0.391983	test-MCC:0.362593
[6]	train-error:0.005811	test-error:0.005812	train-MCC:0.402009	test-MCC:0.368709
[7]	train-error:0.005811	test-error:0.005812	train-MCC:0.41533	test-MCC:0.375617
[8]	train-error:0.005811	test-error:0.005812	train-MCC:0.42549	test-MCC:0.37867
[9]	train-error:0.005811	test-error:0.005812	train-MCC:0.435589	test-MCC:0.383508
[10]	train-error:0.005811	test-error:0.005812	train-MCC:0.442813	test-MCC:0.39121
[11]	train-error:0.005811	test-error:0.005812	train-MCC:0.451443	test-MCC:0.399592
[12]	train-error:0.0

[19]	train-error:0.005773	test-error:0.00577	train-MCC:0.478851	test-MCC:0.424677
[20]	train-error:0.005763	test-error:0.005757	train-MCC:0.481965	test-MCC:0.426839
[21]	train-error:0.005701	test-error:0.005719	train-MCC:0.485196	test-MCC:0.425701
[22]	train-error:0.005609	test-error:0.005677	train-MCC:0.487108	test-MCC:0.426001
[23]	train-error:0.005564	test-error:0.005618	train-MCC:0.490929	test-MCC:0.429012
[24]	train-error:0.005527	test-error:0.005592	train-MCC:0.492113	test-MCC:0.433269
[25]	train-error:0.005471	test-error:0.005508	train-MCC:0.493404	test-MCC:0.433456
[26]	train-error:0.005399	test-error:0.005432	train-MCC:0.496035	test-MCC:0.432891
[27]	train-error:0.005306	test-error:0.00536	train-MCC:0.497576	test-MCC:0.434713
[28]	train-error:0.005247	test-error:0.005318	train-MCC:0.500192	test-MCC:0.434733
[29]	train-error:0.005187	test-error:0.005267	train-MCC:0.501783	test-MCC:0.435923
[30]	train-error:0.005142	test-error:0.005229	train-MCC:0.504623	test-MCC:0.437139
[31]	t

[38]	train-error:0.004943	test-error:0.005048	train-MCC:0.503816	test-MCC:0.450023
[39]	train-error:0.004923	test-error:0.005048	train-MCC:0.505885	test-MCC:0.449957
[40]	train-error:0.004909	test-error:0.005039	train-MCC:0.507034	test-MCC:0.454176
[41]	train-error:0.004886	test-error:0.005043	train-MCC:0.510592	test-MCC:0.451595
[42]	train-error:0.004861	test-error:0.005026	train-MCC:0.510992	test-MCC:0.453327
[43]	train-error:0.004848	test-error:0.005018	train-MCC:0.513681	test-MCC:0.453342
[44]	train-error:0.004825	test-error:0.004997	train-MCC:0.514839	test-MCC:0.453327
[45]	train-error:0.004808	test-error:0.004984	train-MCC:0.51526	test-MCC:0.453342
[46]	train-error:0.004793	test-error:0.004976	train-MCC:0.515963	test-MCC:0.453619
[47]	train-error:0.004766	test-error:0.004959	train-MCC:0.518489	test-MCC:0.453996
[48]	train-error:0.004749	test-error:0.004933	train-MCC:0.518539	test-MCC:0.453557
[49]	train-error:0.004739	test-error:0.004912	train-MCC:0.520087	test-MCC:0.454572
[50]	

KeyboardInterrupt: 

In [None]:
# Mean and standard deviation along axis 1 of the train and test CV scores.
cv_train, cv_test = cv_results['train'], cv_results['test']
cv_train_mean, cv_train_std = cv_train.mean(axis=1), cv_train.std(axis=1)
cv_test_mean, cv_test_std = cv_test.mean(axis=1), cv_test.std(axis=1)

In [None]:
# x positions for the curves; the last band is indexed by the test series length.
train_rounds = np.arange(len(cv_train_mean))
test_rounds = np.arange(len(cv_test_mean))

plt.figure(figsize=(14, 7))

# Training score: mean curve with a +/- one standard deviation band.
plt.plot(train_rounds, cv_train_mean)
plt.fill_between(train_rounds, cv_train_mean - cv_train_std, cv_train_mean + cv_train_std, alpha=0.5)

# Test score: mean curve with a +/- one standard deviation band.
plt.plot(train_rounds, cv_test_mean)
plt.fill_between(test_rounds, cv_test_mean - cv_test_std, cv_test_mean + cv_test_std, alpha=0.5)

plt.legend(['train', 'test'])

In [None]:
# Fit one model on the complete training set with a fixed seed.
dtrain = xgb.DMatrix(x_train, label=y_train)
params['seed'] = 28537894
clf = xgb.train(
    params,
    dtrain,
    num_boost_round=60,
    feval=mcc_eval,
    evals=[(dtrain, 'train')],
)

y_train_pred = clf.predict(dtrain)

# Scan 400 cut-offs between 0.01 and 0.99 and keep the one that gives the
# highest MCC on the training predictions.
thresholds = np.linspace(0.01, 0.99, 400)
mcc = np.array([matthews_corrcoef(y_train, y_train_pred > cutoff) for cutoff in thresholds])
plt.plot(thresholds, mcc)
best_threshold = thresholds[np.argmax(mcc)]

print('Optimal MCC = {:.3f}'.format(np.max(mcc)))
print('Optimal threshold = {:.3f}'.format(best_threshold))

### Predict on test data

In [None]:
# Score the test set and binarise the probabilities with the chosen threshold.
dtest = xgb.DMatrix(x_test)
y_test_pred = clf.predict(dtest)
y_test_pred_int = (y_test_pred>best_threshold).astype(int)

sub = pd.read_csv("../../data/sample_submission.csv.zip", index_col=0)

# Assign through a Series indexed like x_test so that pandas matches each
# prediction to its submission row by Id instead of by position; a plain
# array would be written in row order and silently mislabel rows if the two
# frames were ordered differently.
sub["Response"] = pd.Series(y_test_pred_int, index=x_test.index)
if sub["Response"].isnull().any():
    raise ValueError("x_test does not provide a prediction for every Id in the sample submission")
sub["Response"] = sub["Response"].astype(int)

sub.to_csv("benchmark_8_submission_cv_5.csv.gz", compression="gzip")

Only with MeanTimeDiff features:
- LB: `n_estimators=60`: Private MCC = 0.44401, public MCC = 0.43569
- CV: `n_estimators=80`: Score mean = 0.448, std = 0.013
- CV: `n_estimators=60`: Score mean = 0.445491, std = 0.0133, MCC~[0.43218, 0.45880]

Time series features, excluding MeanTimeDiff:
- LB: `n_estimators=60`: Private MCC = 0.46212, public MCC = 0.44761
- CV: `n_estimators=80`: Score mean = 0.454509, std = 0.00586
- CV: `n_estimators=60`: Score mean = 0.4534794, std = 0.0067, MCC~[0.44682, 0.46014]

Time series features, including MeanTimeDiff:
- LB: `n_estimators=60`: Private MCC = 0.43169, public MCC = 0.42776
- CV: `n_estimators=80`: Score mean = 0.452, std = 0.009
- CV: `n_estimators=60`: Score mean = 0.449, std = 0.009, MCC~[0.440109, 0.458029]
- Apparently overfitting, since the CV score is higher than the LB score

In [28]:
# Quick look at the CV test score at index 60 (mean +/- std) and at index 79.
# Every expression is evaluated, but the notebook only displays the value of
# the last one (cv_test_std[60]).
cv_test_mean[60]+cv_test_std[60]

cv_test_mean[60]-cv_test_std[60]

cv_test_mean[79]

cv_test_std[60]

0.008573770361371