In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os

In [2]:
# MAKE CUSTOM FEATURES
# Runs the project's feature-generation entry point before the HDF5 files are
# opened in the next cell.
# NOTE(review): the meaning of the arguments is defined in
# additional_features.make_features, not here. Presumably `overwrite=False`
# skips features that already exist and `n_chunks` controls batching — confirm.
from additional_features.make_features import make_all_features
make_all_features(n_chunks=10, verbose=True, overwrite=False)

In [3]:
from kaggle_submit import submit_to_kaggle
from helpers import *
from utils.globals import *
from utils.distribution_statistics import *
from objects import *

# Locations of the competition data files (relative to the notebook directory).
train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

# Both files are opened in append mode ('a'): read/write, and h5py creates the
# file if it does not exist. The handles stay open for the rest of the notebook.
# NOTE(review): if nothing downstream writes to these files, mode='r' would be
# safer (a mistyped path would fail instead of creating an empty file) — confirm.
h5_train = h5py.File(train_file, mode='a')
h5_test = h5py.File(test_file, mode='a')

# Labels: the first CSV column is the index; the single remaining column is
# collapsed to a Series. `read_csv(..., squeeze=True)` was deprecated in
# pandas 1.4 and removed in 2.0, so the squeeze is applied explicitly.
y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0).squeeze("columns")
y_train_arr = y_train.to_numpy()

In [4]:
def ensure_integrity(h5_file):
    """Assert that no dataset in ``h5_file`` contains NaN or infinite values.

    Every key of ``h5_file`` is read fully into memory (``h5_file[key][:]``)
    and checked in turn; progress is reported through ``print_bis`` as
    ``"<index>/<total>"``.

    Parameters
    ----------
    h5_file
        Any object exposing ``keys()`` and item access returning something
        sliceable with ``[:]`` into a numeric array (e.g. an open HDF5 file).

    Raises
    ------
    AssertionError
        On the first dataset holding a NaN or an infinite value. The message
        names the offending key and how many bad values were found.
    """
    keys = list(h5_file.keys())
    total = len(keys)
    for i, key in enumerate(keys):
        print_bis(f'{i+1}/{total}')
        x = h5_file[key][:]
        nan_count = int(np.sum(np.isnan(x)))
        assert nan_count == 0, f"dataset {key!r} contains {nan_count} NaN value(s)"
        inf_count = int(np.sum(np.isinf(x)))
        assert inf_count == 0, f"dataset {key!r} contains {inf_count} infinite value(s)"
        
# ensure_integrity(h5_train)
# ensure_integrity(h5_test)

In [5]:
# DISABLED CELL: the model-archiving / leaderboard helpers below are wrapped in
# a triple-quoted string, so none of this code is executed and none of the
# names (save_model, load_model, write_model_to_leaderboard, ...) are defined.
# The trailing `_ = ""` makes the cell end with an assignment instead of the
# bare string, so the notebook does not echo the string as the cell output.
# NOTE(review): in IPython `_` normally holds the last cell output; assigning
# to it here shadows that. Consider deleting this cell or moving the helpers
# to a module if they are still wanted.
"""
# SAVE MODEL

from joblib import dump, load
ARCHIVES_FOLDER = "models_archives"
if not os.path.exists(ARCHIVES_FOLDER):
    os.makedirs(ARCHIVES_FOLDER)

def save_model(model, name):
    fpath = os.path.join(ARCHIVES_FOLDER, f"{name}.joblib")
    dump(model, fpath)
    print(f"New model saved at {fpath}")
    return fpath
    
    
def load_model(name):
    if not name.startswith(ARCHIVES_FOLDER):
        name = os.path.join(ARCHIVES_FOLDER, name)
    if not name.endswith(".joblib"):
        name = f"{name}.joblib"
    model = load(name)
    return model

LEADERBOARD_FILE = "leaderboard.txt"
if not os.path.exists(LEADERBOARD_FILE):
    with open(LEADERBOARD_FILE, 'a') as leaderboard:
        leaderboard.write(";;;".join(['path', 'training_score', 'validation_score', 'comments']))
    
def write_model_to_leaderboard(model, model_name, train_score, val_score, comments=""):
    fpath = save_model(model, model_name)
    with open(LEADERBOARD_FILE, "a") as leaderboard:
        leaderboard.write("\n" + ';;;'.join([fpath, str(train_score), str(val_score), comments]))
    
"""
_ = ""

In [6]:
## Input maker

def make_input_for_svm_extreme(h5_file):
    """Build the model input for the "extreme" SVM setup.

    Thin wrapper around ``make_input``: forwards ``h5_file`` unchanged and
    fixes the options to the ``FEATURES`` set, the ``TAIL_QUANTILES``
    quantiles, and both ``dist_char`` and ``truncate_dist`` switched off.
    """
    options = dict(
        features=FEATURES,
        quantiles=TAIL_QUANTILES,
        dist_char=False,
        truncate_dist=False,
    )
    return make_input(h5_file, **options)


# Wrap the builder so it can be handed to PoolModels as its input maker.
svm_extreme_input_maker = InputMaker(make_input_for_svm_extreme)

## input shaper
import re
from sklearn.preprocessing import StandardScaler

# Column predicates, one per PCA group. Each receives a column key and looks
# only at its first element (x[0]), which is treated as a string label.
#   0: label contains both 'eeg' and 'logmod'
#   1: label contains 'eeg_' followed by a digit
# The regex is a raw string: '\d' in a plain literal is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning on Python >= 3.12).
# NOTE(review): a label such as 'eeg_1_logmod' satisfies both predicates —
# confirm that CustomPCA / InputShaper handle overlapping groups as intended.
pca_cols_filters = [
    lambda x: 'eeg' in x[0] and 'logmod' in x[0],
    lambda x: bool(re.search(r'eeg_\d', x[0])),
]

# One CustomPCA per predicate, named PCA_0, PCA_1, ... with var_capture=0.95.
pca_list = [
    CustomPCA(columns_filter=filt, name=f"PCA_{i}", var_capture=0.95)
    for i, filt in enumerate(pca_cols_filters)
]

# The PCAs are passed first, followed by a StandardScaler.
svm_extreme_input_shaper = InputShaper(*pca_list, StandardScaler())

## parameters grid
from sklearn.model_selection import ParameterGrid

# Search space for the SVC candidates, split in two grids because the
# relevant extra parameter differs per kernel family:
#   - rbf / sigmoid kernels vary `gamma`   (2 * 5 * 2 = 20 combinations)
#   - poly kernel varies `degree`          (1 * 5 * 4 = 20 combinations)
svm_grid_1 = ParameterGrid(dict(
    kernel=["rbf", "sigmoid"],
    C=[0.01, 0.1, 1, 10, 100],
    gamma=["auto", "scale"],
))
svm_grid_2 = ParameterGrid(dict(
    kernel=["poly"],
    C=[0.01, 0.1, 1, 10, 100],
    degree=[1, 2, 3, 4],
))

# Flat list of parameter dicts: all of grid 1 followed by all of grid 2.
svm_hyperparameters = [*svm_grid_1, *svm_grid_2]


# generic object
from sklearn.svm import SVC

# Assemble the model pool for the "extreme" SVM setup: SVC is the blueprint,
# svm_hyperparameters supplies the candidate parameter dicts, and the input
# maker / shaper are the ones built earlier in this cell.
# NOTE(review): the meaning of the numeric arguments is defined inside
# PoolModels (project code, not visible here) — see the hedged notes below.
svm_extreme = PoolModels(
    input_maker=svm_extreme_input_maker,
    n_splits=5,
    train_size=27,  # NOTE(review): unit not visible here (count vs. share) — confirm
    input_shaper=svm_extreme_input_shaper,
    blueprint=SVC,  # the class itself is passed, not an instance
    parameters_list=svm_hyperparameters,
    iterations_to_warm=10,
    seed=3,
    h5_train=h5_train,
    h5_test=h5_test,
    y_train_arr=y_train_arr
)

# train
#import matplotlib.pyplot as plt
#svm_extreme.warm_up()
#svm_extreme.plot_validation()
#plt.show()
#best_svm_extremes = svm_extreme.train_n_best_models_until_convergence(3)

In [None]:
# glouton

# input_maker

def make_input_glouton(h5_file):
    """Build the model input for the "glouton" setup.

    Thin wrapper around ``make_input``: forwards ``h5_file`` unchanged and
    fixes the options to the ``FEATURES`` set, the ``QUANTILES`` quantiles,
    and both ``dist_char`` and ``truncate_dist`` switched on.
    """
    options = dict(
        features=FEATURES,
        quantiles=QUANTILES,
        dist_char=True,
        truncate_dist=True,
    )
    return make_input(h5_file, **options)


# Wrap the builder so it can be handed to PoolModels as its input maker.
glouton_im = InputMaker(make_input_glouton)

# input_shaper 
def make_filter(pat):
    """Return a predicate telling whether a column key matches ``pat``.

    The returned callable takes a column key (an indexable whose first
    element is a string label) and returns ``True`` when ``re.search`` finds
    ``pat`` anywhere in that label, ``False`` otherwise.
    """
    def matches_column(column_key):
        label = column_key[0]
        return re.search(pat, label) is not None

    return matches_column

# Regex patterns defining the column groups; each group gets its own PCA and
# the pattern string doubles as the PCA's name.
# The last pattern is a raw string: '\d' in a plain literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python >= 3.12).
# The resulting string value is unchanged.
groups_pats = ["alpha", "beta", "delta", "theta", r"eeg_\d.*logmod"]

# One CustomPCA per pattern, selecting columns via make_filter, var_capture=0.95.
custom_pcas = [
    CustomPCA(make_filter(gp), name=gp, var_capture=0.95)
    for gp in groups_pats
]

# The PCAs are passed first, followed by a StandardScaler.
glouton_is = InputShaper(*custom_pcas, StandardScaler())

# Model pool for the "glouton" setup. It reuses SVC and svm_hyperparameters
# from the previous cell, so that cell must have been run first.
# Apart from the input maker / shaper, the only argument that differs from
# the svm_extreme pool is iterations_to_warm (1000 here, 10 there).
# NOTE(review): the meaning of the numeric arguments is defined inside
# PoolModels (project code, not visible here) — confirm before tuning.
glouton_pool = PoolModels(
    input_maker=glouton_im,
    n_splits=5,
    train_size=27,  # NOTE(review): unit not visible here (count vs. share) — confirm
    input_shaper=glouton_is,
    blueprint=SVC,  # the class itself is passed, not an instance
    parameters_list=svm_hyperparameters,
    iterations_to_warm=1000,
    seed=3,
    h5_train=h5_train,
    h5_test=h5_test,
    y_train_arr=y_train_arr
)

# Run the pool: warm up, plot the validation results, then train the 5 best
# models until convergence and keep whatever that call returns.
# NOTE(review): this runs unguarded on cell execution and is long-running —
# the captured log below shows warm-up ETAs from ~2,600 s up to ~15,000 s
# within a split. Consider caching results or gating this behind a flag so a
# full "Restart & Run All" stays feasible.
glouton_pool.warm_up()
glouton_pool.plot_validation()
plt.show()
best_gloutons = glouton_pool.train_n_best_models_until_convergence(5)

Split #1/5 - WARM UP Model #1/40 [ETA: ?][1K



Split #1/5 - WARM UP Model #2/40 [ETA: 2643.87s][1K



Split #1/5 - WARM UP Model #3/40 [ETA: 2631.34s][1K



Split #1/5 - WARM UP Model #4/40 [ETA: 2615.59s][1K



Split #1/5 - WARM UP Model #5/40 [ETA: 2519.7s][1K



Split #1/5 - WARM UP Model #6/40 [ETA: 2471.96s][1K



Split #1/5 - WARM UP Model #7/40 [ETA: 2415.13s][1K



Split #1/5 - WARM UP Model #8/40 [ETA: 2351.83s][1K



Split #1/5 - WARM UP Model #9/40 [ETA: 2286.63s][1K



Split #1/5 - WARM UP Model #10/40 [ETA: 2202.1s][1K



Split #1/5 - WARM UP Model #11/40 [ETA: 2106.44s][1K



Split #1/5 - WARM UP Model #12/40 [ETA: 2029.51s][1K



Split #1/5 - WARM UP Model #13/40 [ETA: 1942.82s][1K



Split #1/5 - WARM UP Model #14/40 [ETA: 1848.26s][1K



Split #1/5 - WARM UP Model #15/40 [ETA: 1765.0s][1K



Split #1/5 - WARM UP Model #16/40 [ETA: 1680.21s][1K



Split #1/5 - WARM UP Model #17/40 [ETA: 1602.61s][1K



Split #1/5 - WARM UP Model #18/40 [ETA: 1505.66s][1K



Split #1/5 - WARM UP Model #19/40 [ETA: 1433.25s][1K



Split #1/5 - WARM UP Model #20/40 [ETA: 1345.32s][1K



Split #1/5 - WARM UP Model #21/40 [ETA: 1277.03s][1K



Split #1/5 - WARM UP Model #22/40 [ETA: 1213.46s][1K



Split #1/5 - WARM UP Model #23/40 [ETA: 1149.41s][1K



Split #1/5 - WARM UP Model #24/40 [ETA: 1085.27s][1K



Split #1/5 - WARM UP Model #25/40 [ETA: 1020.73s][1K



Split #1/5 - WARM UP Model #26/40 [ETA: 957.23s][1K



Split #1/5 - WARM UP Model #27/40 [ETA: 15003.49s][1K



Split #1/5 - WARM UP Model #28/40 [ETA: 13447.25s][1K



Split #1/5 - WARM UP Model #29/40 [ETA: 11996.45s][1K



Split #1/5 - WARM UP Model #30/40 [ETA: 10641.7s][1K



Split #1/5 - WARM UP Model #31/40 [ETA: 9373.05s][1K



Split #1/5 - WARM UP Model #32/40 [ETA: 8181.98s][1K



Split #1/5 - WARM UP Model #33/40 [ETA: 7061.38s][1K



Split #1/5 - WARM UP Model #34/40 [ETA: 6002.54s][1K



Split #1/5 - WARM UP Model #35/40 [ETA: 5004.4s][1K



Split #1/5 - WARM UP Model #36/40 [ETA: 4059.94s][1K



Split #1/5 - WARM UP Model #37/40 [ETA: 3164.64s][1K



Split #1/5 - WARM UP Model #38/40 [ETA: 2311.89s][1K



Split #1/5 - WARM UP Model #39/40 [ETA: 1524.17s][1K



Split #1/5 - WARM UP Model #40/40 [ETA: 743.83s][1K



Split #2/5 - WARM UP Model #1/40 [ETA: ?][1K



Split #2/5 - WARM UP Model #2/40 [ETA: 2640.75s][1K



Split #2/5 - WARM UP Model #3/40 [ETA: 2568.7s][1K



Split #2/5 - WARM UP Model #4/40 [ETA: 2502.32s][1K



Split #2/5 - WARM UP Model #5/40 [ETA: 2433.69s][1K



Split #2/5 - WARM UP Model #6/40 [ETA: 2367.23s][1K



Split #2/5 - WARM UP Model #7/40 [ETA: 2295.54s][1K



Split #2/5 - WARM UP Model #8/40 [ETA: 2230.48s][1K



Split #2/5 - WARM UP Model #9/40 [ETA: 3942.61s][1K



Split #2/5 - WARM UP Model #10/40 [ETA: 3636.0s][1K