In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os
import importlib

from kaggle_submit import submit_to_kaggle


from objects import *
from helpers import *
# from utils.globals import *
from utils.distribution_statistics import *

# Locations of the Kaggle HDF5 inputs, relative to the notebook's working directory.
# NOTE(review): the repeated "X_train.h5/X_train.h5" segment suggests each archive was
# extracted into a folder named after the file itself -- confirm against the local layout.
train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

def get_train_test_connections():
    """Open the train and test HDF5 files and return both handles.

    Both files are opened with ``mode='a'`` (read/write, created if missing),
    which lets callers add datasets to them.

    Returns
    -------
    tuple
        ``(h5_train, h5_test)`` as open ``h5py.File`` objects. The caller owns
        both handles and must close them (e.g. with ``close_connections``).

    Raises
    ------
    Exception
        Whatever ``h5py.File`` raises when a file cannot be opened. If the
        test file fails to open, the train handle is closed before re-raising.
    """
    h5_train = h5py.File(train_file, mode='a')
    try:
        h5_test = h5py.File(test_file, mode='a')
    except BaseException:
        # Do not leak the already-open train handle when the second open fails.
        h5_train.close()
        raise
    return h5_train, h5_test

def close_connections(*h5_conns):
    """Close each of the given open handles, in argument order.

    Every positional argument must expose a ``close()`` method. Nothing is
    returned; calling with no arguments is a no-op.
    """
    for handle in h5_conns:
        handle.close()
    
from additional_features.make_features import make_all_features

# Build the engineered features for both files (overwrite=False, verbose=True).
# The handles are always closed afterwards, even if feature generation raises,
# so the HDF5 files are not left open.
h5_train, h5_test = get_train_test_connections()
try:
    make_all_features(h5_train, h5_test, overwrite=False, verbose=True)
finally:
    close_connections(h5_train, h5_test)

# Labels as a Series indexed by the first CSV column.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single-column frame explicitly works on old and new versions.
y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0).squeeze("columns")
y_train_arr = y_train.to_numpy()

from utils.globals import *

# Fresh handles for the rest of the notebook (the model cells read from them).
h5_train, h5_test = get_train_test_connections()

_create_band_signals
_create_log_energy
_create_log_modulus
_create_pulse_max_log_energy_and_freq
_create_speed_and_acceleration
_create_time_features


In [2]:
from helpers import get_subject_ids, subjects_ids_to_indexers

# Number of shuffled subjects kept for training; the rest are held out for validation.
n_train_subjects = 28

all_subject_ids = get_subject_ids(h5_train)

# Seed the global NumPy RNG so the subject shuffle is reproducible.
np.random.seed(1)
shuffled_ids = np.random.permutation(all_subject_ids)
train_ids = shuffled_ids[:n_train_subjects]
validation_ids = shuffled_ids[n_train_subjects:]

# Boolean indexer over the training rows selecting the held-out subjects,
# then the matching ground-truth labels.
validation_ix = subjects_ids_to_indexers(
    h5_train,
    validation_ids,
    as_boolean_array=True,
)
y_validation_true = y_train_arr[validation_ix]


In [4]:
from final_models.best_rf import BestRF
from final_models.best_rf2 import BestRF2
from final_models.best_hgb import BestHGB
from final_models.classic_et import ClassicET
from final_models.classic_bg import ClassicBG

# Every model takes the same constructor arguments: both HDF5 handles,
# the label array, and the subject ids kept for training.
model_args = (h5_train, h5_test, y_train_arr, train_ids)

# Each model is constructed and then trained immediately, one after another.
# After training, `<model>.validation_score` can be inspected (it is read by
# the harmonic-weight ensemble further down).
rf = BestRF(*model_args)
rf.train()

rf2 = BestRF2(*model_args)
rf2.train()

hgb = BestHGB(*model_args)
hgb.train()

et = ClassicET(*model_args)
et.train()

bg = ClassicBG(*model_args)
bg.train()

Feature #9/9[1K1K

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   17.5s finished


Feature #28/28[1K

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   23.1s finished


Binning 0.115 GB of training data: 2.301 s
Binning 0.013 GB of validation data: 0.063 s
Fitting gradient boosted rounds:
[1/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 1.26780, val loss: 1.28141, in 0.947s
[2/1000] 5 trees, 155 leaves (31 on avg), max depth = 11, train loss: 1.13080, val loss: 1.15616, in 0.969s
[3/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 1.02543, val loss: 1.05826, in 1.524s
[4/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 0.93966, val loss: 0.98096, in 0.663s
[5/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 0.86914, val loss: 0.92067, in 0.676s
[6/1000] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 0.80959, val loss: 0.87099, in 1.110s
[7/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.75788, val loss: 0.82642, in 1.186s
[8/1000] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 0.71364, val loss: 0.78840, in 0.948s
[9/1000] 5 trees, 1

[76/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.10340, val loss: 0.39519, in 0.663s
[77/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.10114, val loss: 0.39398, in 0.691s
[78/1000] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.09895, val loss: 0.39365, in 0.663s
[79/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.09688, val loss: 0.39324, in 0.852s
[80/1000] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.09483, val loss: 0.39285, in 0.705s
[81/1000] 5 trees, 155 leaves (31 on avg), max depth = 15, train loss: 0.09282, val loss: 0.39213, in 0.662s
[82/1000] 5 trees, 155 leaves (31 on avg), max depth = 16, train loss: 0.09080, val loss: 0.39114, in 0.785s
[83/1000] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.08896, val loss: 0.39086, in 0.711s
[84/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.08717, val loss: 0.38947, in 1.345s
[85/1000] 5 trees, 

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    4.1s finished


Feature #9/9[1K1K

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  6.6min finished


In [7]:
## ENSEMBLE LAST HARMONIC (not submitted)
from final_models.vote import vote 

def get_harmonic_weight(model):
    """Return the voting weight ``1 / (1 - model.validation_score)``.

    The weight is the reciprocal of the gap between the model's validation
    score and 1, so it grows as the score approaches 1.
    """
    shortfall = 1 - model.validation_score
    return 1 / shortfall
    
# Ensemble members and their harmonic weights (one weight per member, same order).
models = [hgb, rf, et, bg]
weights = [get_harmonic_weight(member) for member in models]

# Keyword arguments shared by every vote() call in this cell.
ensemble = dict(models=models, weights=weights)

# Soft vote on the validation subjects.
y_validation_pred_soft = vote(policy='soft', kind='validation', **ensemble)
score_soft = custom_score(y_validation_pred_soft, y_validation_true)

# Hard vote on the validation subjects.
y_validation_pred_hard = vote(policy='hard', kind='validation', **ensemble)
score_hard = custom_score(y_validation_pred_hard, y_validation_true)

# Soft-vote predictions for the test set.
y_test_pred_harmonic = vote(policy='soft', kind='test', **ensemble)

print("weights :", weights)
print('soft :', score_soft)
print('hard :', score_hard)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n

weights : [4.903715297865709, 4.151362419976312, 3.5474887027100754, 3.684401471977287]
soft : 0.7945263019564238
hard : 0.7815097746360565


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.4s finished


In [8]:
## ENSEMBLE LAST (submitted)

# Ensemble members with hand-set weights (one weight per member, same order).
models = [hgb, rf, et, bg]
weights = [1.9, 1.8, 1, 1]

# Keyword arguments shared by both vote() calls in this cell.
ensemble = dict(models=models, weights=weights)

# Soft vote on the validation subjects, scored against the held-out labels.
y_validation_pred_soft = vote(policy='soft', kind='validation', **ensemble)
score_soft = custom_score(y_validation_pred_soft, y_validation_true)

# Soft-vote predictions for the test set.
y_test_pred_last = vote(policy='soft', kind='test', **ensemble)

# Only the soft policy is evaluated for this ensemble.
print("weights :", weights)
print('soft :', score_soft)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n

weights : [1.9, 1.8, 1, 1]
soft : 0.7945806234501889


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.8s finished


In [10]:
## FINAL MODEL (submitted)

# Two-member ensemble with hand-set weights (one weight per member, same order).
models = [hgb, rf2]
weights = [0.5, 0.9]

# Soft vote on the validation subjects, scored against the held-out labels.
y_validation_pred_soft = vote(
    models=models,
    weights=weights,
    policy='soft',
    kind='validation'
)
score_soft = custom_score(y_validation_pred_soft, y_validation_true)

# Hard voting is not evaluated for this ensemble. The previous `score_hard`
# line scored `y_validation_pred_hard` left over from a different ensemble
# cell (or raised NameError when that cell had not been run), so it is removed.

# Soft-vote predictions for the test set.
y_test_pred_final = vote(
    models=models,
    weights=weights,
    policy='soft',
    kind='test'
)

print("weights :", weights)
print('soft :', score_soft)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s


weights : [0.5, 0.9]
soft : 0.7820900556852667


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.4s finished


In [11]:
#from kaggle_submit import submit_to_kaggle # needs a set up of the kaggle API
#submit_to_kaggle(y_test_pred_final, h5_test, fname='submission_final.csv', msg='')
#submit_to_kaggle(y_test_pred_last, h5_test, fname='submission_last.csv', msg='')