In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os
import importlib

from kaggle_submit import submit_to_kaggle


from objects import *
from helpers import *
# from utils.globals import *
from utils.distribution_statistics import *

train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

def get_train_test_connections():
    h5_train = h5py.File(train_file, mode='a')
    h5_test = h5py.File(test_file, mode='a')
    return h5_train, h5_test

def close_train_test_connections(h5_train, h5_test):
    h5_train.close()
    h5_test.close()
    
#h5_train, h5_test = get_train_test_connections()

y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0, squeeze=True)
y_train_arr = y_train.to_numpy()

from utils.globals import *

h5_train, h5_test = get_train_test_connections()

In [2]:
from helpers import get_subject_ids

train_ids = get_subject_ids(h5_train)
np.random.seed(1)
train_ids = np.random.permutation(train_ids)
train_ids, validation_ids = train_ids[:28], train_ids[28:]


In [3]:
from final_models.best_hgb import BestHGB

hgb = BestHGB(h5_train, h5_test, y_train_arr, train_ids)
hgb.train()
print(hgb.validation_score)

Binning 0.115 GB of training data: 3.092 s
Binning 0.013 GB of validation data: 0.070 s
Fitting gradient boosted rounds:
[1/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 1.26780, val loss: 1.28141, in 1.370s
[2/1000] 5 trees, 155 leaves (31 on avg), max depth = 11, train loss: 1.13080, val loss: 1.15616, in 1.362s
[3/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 1.02543, val loss: 1.05826, in 1.437s
[4/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 0.93966, val loss: 0.98096, in 2.375s
[5/1000] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 0.86914, val loss: 0.92067, in 1.259s
[6/1000] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 0.80959, val loss: 0.87099, in 1.030s
[7/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.75788, val loss: 0.82642, in 1.246s
[8/1000] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 0.71364, val loss: 0.78840, in 2.075s
[9/1000] 5 trees, 1

[76/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.10340, val loss: 0.39519, in 1.109s
[77/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.10114, val loss: 0.39398, in 1.516s
[78/1000] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.09895, val loss: 0.39365, in 1.141s
[79/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.09688, val loss: 0.39324, in 1.174s
[80/1000] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.09483, val loss: 0.39285, in 1.171s
[81/1000] 5 trees, 155 leaves (31 on avg), max depth = 15, train loss: 0.09282, val loss: 0.39213, in 1.505s
[82/1000] 5 trees, 155 leaves (31 on avg), max depth = 16, train loss: 0.09080, val loss: 0.39114, in 1.093s
[83/1000] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.08896, val loss: 0.39086, in 1.204s
[84/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.08717, val loss: 0.38947, in 1.547s
[85/1000] 5 trees, 

In [4]:
from final_models.best_rf import BestRF

rf = BestRF(h5_train, h5_test, y_train_arr, train_ids)
rf.train()
print(rf.validation_score)

Feature #10/10[1K

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:   13.1s


0.7499833379996307


[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   29.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished


In [16]:
from helpers import subjects_ids_to_indexers, custom_score

y_validation_true = y_train_arr[subjects_ids_to_indexers(h5_train, subjects_ids=sorted(validation_ids), as_indices=True)]

from final_models.vote import vote

print("Validation score after vote (hard policy):")
y_validation_pred_hard = vote(models=[hgb, rf], weights=[0.5, 0.5], policy='hard', kind='validation')
custom_score(y_validation_pred_hard, y_validation_true)


Validation score after vote (hard policy):


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7642678599848916

In [17]:
from final_models.vote import vote
print("Validation score after vote (soft policy):")
y_validation_pred_soft = vote(models=[hgb, rf], weights=[0.5, 0.9], policy='soft', kind='validation')
custom_score(y_validation_pred_soft, y_validation_true)


Validation score after vote (soft policy):


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7965753765487312

In [7]:
y_test = vote(models=[hgb, rf], weights=[0.5, 0.9], policy='soft', kind='test')

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.4s finished


In [14]:
import matplotlib.pyplot as plt
example_id = validation_ids[0]
validation_ix = np.array(subjects_ids_to_indexers(h5_train, validation_ids, as_indices=True))
example_ix = validation_ix[subjects_ids_to_indexers(h5_train, [example_id], as_boolean_array=True)]
example_predicted = y_validation_pred_soft[example_ix]
example_true = y_validation_true[subjects_ids_to_indexers(h5_train, [example_id], as_indices=True)]
plt.plot(example_predicted)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 1541 but corresponding boolean dimension is 811

In [15]:
example_ix

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [8]:
y_test = y_test.astype(int)
y_test

array([0, 0, 0, ..., 2, 2, 0])

In [9]:
from kaggle_submit import submit_to_kaggle

#submit_to_kaggle(y_test, h5_test, fname="final_model.csv", msg="")