In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os
import importlib

from kaggle_submit import submit_to_kaggle


from objects import *
from helpers import *
# from utils.globals import *
from utils.distribution_statistics import *

train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

def get_train_test_connections():
    """Open the train and test HDF5 files and return both handles.

    Both files are opened with ``mode='a'`` (read/write, created if missing),
    at the paths held in the module-level ``train_file`` and ``test_file``.

    Returns
    -------
    tuple
        ``(h5_train, h5_test)``, two open ``h5py.File`` objects. The caller
        is responsible for closing them, e.g. with
        ``close_train_test_connections``.
    """
    h5_train = h5py.File(train_file, mode='a')
    try:
        h5_test = h5py.File(test_file, mode='a')
    except Exception:
        # Do not leave the train file open (and locked) when the test file
        # cannot be opened.
        h5_train.close()
        raise
    return h5_train, h5_test

def close_train_test_connections(h5_train, h5_test):
    """Close both file handles.

    Parameters
    ----------
    h5_train, h5_test
        Objects exposing ``close()`` (the notebook passes the two
        ``h5py.File`` handles returned by ``get_train_test_connections``).

    Raises
    ------
    Whatever ``close()`` raises. ``h5_test`` is closed even when closing
    ``h5_train`` fails.
    """
    try:
        h5_train.close()
    finally:
        # Runs whether or not the first close succeeded, so the second
        # handle is never leaked.
        h5_test.close()
    
#h5_train, h5_test = get_train_test_connections()

# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the column axis after the read gives the same result: a Series when
# a single data column remains next to the index column.
y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0).squeeze("columns")
y_train_arr = y_train.to_numpy()

# MAKE CUSTOM FEATURES
#from additional_features.make_features import make_all_features
#make_all_features(h5_train, h5_test, n_chunks=10, verbose=True, overwrite=False)
#from additional_features.features_to_frequential import _create_log_modulus, BAND_EEG
#from additional_features.eeg_band_signals import _create_band_signals
#for h5_file in (h5_train, h5_test):
#    _create_band_signals(h5_file, n_chunks=10, overwrite=False, verbose=True)
#    _create_log_modulus(h5_file, n_chunks=10, features_to_convert=BAND_EEG, overwrite=False, verbose=True)

# Close connections because utils.globals needs them
#close_train_test_connections(h5_train, h5_test)

from utils.globals import *

# Open both HDF5 files only now, after `utils.globals` has been imported
# (see the note about closed connections above).
h5_train, h5_test = get_train_test_connections()

In [2]:
import itertools

# Channel numbers and band prefixes from which the feature names are built.
eeg_nums = [*range(1, 8)]
greek_letters = ["alpha", "beta", "delta", "theta"]

# Feature names: one per EEG channel, one per (band, channel) pair with the
# band varying slowest, and the remaining time series.
EEG_FEATURES = ["eeg_" + str(num) for num in eeg_nums]
EEG_BAND_FEATURES = [f"{band}_eeg_{num}" for band in greek_letters for num in eeg_nums]
OTHER_TIME_FEATURES = ["speed_norm", "accel_norm", "pulse"]

# The same three groups with the "_ft_logmod" suffix appended.
EEG_LOGMOD_FEATURES = [name + "_ft_logmod" for name in EEG_FEATURES]
EEG_BAND_LOGMOD_FEATURES = [name + "_ft_logmod" for name in EEG_BAND_FEATURES]
OTHER_LOGMOD_FEATURES = [name + "_ft_logmod" for name in OTHER_TIME_FEATURES]

# Every key of the training file whose name contains "sleep".
SLEEP_FEATURES = list(filter(lambda key: "sleep" in key, h5_train.keys()))

# OLD NAMES
BAND_LOG_ENERGY_FEATURES_OLD = [
    f"{band}_{eeg}_logE" for band in greek_letters for eeg in EEG_FEATURES
]
LOGMOD_FEATURES_OLD = [*EEG_LOGMOD_FEATURES, *OTHER_LOGMOD_FEATURES]
TIME_FEATURES_OLD = [*EEG_FEATURES, *OTHER_TIME_FEATURES]

In [3]:
# Display the "sleep" keys found in the training file.
SLEEP_FEATURES

['sleep_left', 'sleep_time', 'sleep_time_relative']

In [4]:
# Show the old-style log-modulus feature names (EEG channels followed by the other time series).
print(LOGMOD_FEATURES_OLD)

['eeg_1_ft_logmod', 'eeg_2_ft_logmod', 'eeg_3_ft_logmod', 'eeg_4_ft_logmod', 'eeg_5_ft_logmod', 'eeg_6_ft_logmod', 'eeg_7_ft_logmod', 'speed_norm_ft_logmod', 'accel_norm_ft_logmod', 'pulse_ft_logmod']


In [5]:
def make_input_default(h5_file):
    """
    Build the default feature table for one HDF5 file.

    Several `make_input_new` calls, one per feature group, are collected in a
    list and concatenated column-wise. The concatenation adds an outer column
    level holding the position of each group in that list, as a string
    ("0", "1", ...). Columns that contain missing values are printed and the
    missing values are then replaced by zero.

    Parameters
    ----------
    h5_file
        Passed unchanged as the first argument of every `make_input_new`
        call (the notebook passes an open `h5py.File`).

    Returns
    -------
    The concatenated table produced by `pd.concat`, with no missing values left.

    Notes
    -----
    The commented-out blocks are earlier feature-group experiments that are
    currently disabled; they are kept for reference.
    """
    
    dfs = list()
    
    # Group "0": old-style per-band log-energy features.
    dfs.append( # df_bandlog
        make_input_new(
            h5_file,
            features=BAND_LOG_ENERGY_FEATURES_OLD,
            rescale_by_subject=False,
            moments=[1],
        )
    )
    
    #dfs.append( # df_spectral_entropy = 
    #    make_input_new(
    #        h5_file,
    #        features=EEG_LOGMOD_FEATURES,
    #        rescale_by_subject=False,
    #        entropy=True,
    #        pre_op=lambda x: np.exp(2 * x),
    #        pre_op_name='energy'
    #    )
    #)
    
    #dfs.append( # df_time_hjorth = 
    #    make_input_new(
    #        h5_file,
    #        features=EEG_FEATURES,
    #        rescale_by_subject=False,
    #        hjorth=True,
            #pre_op=lambda x: np.exp(2 * x),
            #pre_op_name='energy'
    #    )
    #)
        
    # Group "1": only the first two entries of SLEEP_FEATURES are used.
    dfs.append( # df_sleep = 
        make_input_new(
            h5_file,
            features=SLEEP_FEATURES[:2],
            rescale_by_subject=False,
            moments=[1]
        )
    )
    
    ## LOGMOD RENAME COLUMNS
    
    # Group "2": log-modulus features of the non-EEG time series.
    dfs.append( # df_logmod = 
        make_input_new(
            h5_file,
            features=OTHER_LOGMOD_FEATURES,
            rescale_by_subject=False,
            #interquantiles=[(0.2, 0.8)],
            quantiles_inv=[0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99],
            diff_orders=[0],
            interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)],
        )
    )
    
 
    # Group "3": old-style time features except "speed_norm"; `sorted` makes
    # the order of the set difference deterministic.
    dfs.append( # df_time_diff_0 = 
        make_input_new(
            h5_file,
            features=sorted(set(TIME_FEATURES_OLD) - {"speed_norm"}),
            rescale_by_subject=False,
            hjorth=True, mmd=True,
            # hjorth=True,
            # moments=[1, 2],
            quantiles=[1e-4, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 1-1e-4],
            # interquantiles=[(0.1, 0.9), (0.3, 0.7)],
            diff_orders=[0]
        )
    )
    
    # Group "4": per-band EEG log-modulus features. `pre_op` maps x to
    # exp(2*x) (named 'energy') and `post_op` is the natural log.
    # NOTE(review): how make_input_new applies pre_op/post_op is not visible here.
    dfs.append( # df_eeg_band_logmod = 
        make_input_new(
            h5_file,
            features=EEG_BAND_LOGMOD_FEATURES,
            rescale_by_subject=False,
            entropy=True, renyi_entropy=True,
            # hjorth=True,
            # moments=[1, 2],
            #quantiles=[1e-4, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 1-1e-4],
            #interquantiles=[(0.1, 0.9), (0.3, 0.7)],
            diff_orders=[0],
            pre_op=lambda x: np.exp(2*x),
            pre_op_name='energy',
            post_op=np.log,
            post_op_name='log',
        )
    )
    
    
    
    #dfs.append( # df_eeg_bandlog =
    #    make_input_new(
    #        h5_file,
    #        features=EEG_LOGMOD_FEATURES,
    #        renyi_entropy=True,
    #        diff_orders=[0],
    #        pre_op=lambda x: np.exp(2*x),
    #        pre_op_name='energy',
    #        post_op=np.log,
    #        post_op_name='log'
    #    )
    #)
    #dfs.append( # df_eeg_bands =
    #    make_input_new(
    #        h5_file,
    #        features=EEG_BAND_FEATURES,
    #       hjorth=True,
    #       diff_orders=[0]
    #    )
    #)
    
    # EEG INTEGRATED
    #eeg_features = [feat for feat in FEATURES if re.search('^eeg_\d$', feat)]
    # print(eeg_features)
    #df_eeg_integrated = make_input_new(
    #    h5_file,
    #    features=eeg_features,
    #    rescale_by_subject=False,
    #    # moments=[1, 2],
    #    quantiles=[0.1, 0.5, 0.9],
    #    #interquantiles=[(0.1, 0.9), (0.3, 0.7)],
    #    diff_orders=[-1]
    #)
    
    
    #df_time_diff_1 = make_input_new(
     #   h5_file,
     #   features=TIME_FEATURES_OLD,
     #   rescale_by_subject=False,
     #   moments=[1, 2],
        #quantiles=[1e-4, 1-1e-4],
     #   diff_orders=[1]
    #)
    
    #df_pulse_max_freq = make_input_new(
    #    h5_file,
    #    features=["pulse_max_freq"],
    #    rescale_by_subject=True,
    #    moments=[1],
    #)
    
    #df_pulse_max_logE = make_input_new(
    #    h5_file,
    #    features=["pulse_max_logE"],
    #    rescale_by_subject=False,
    #    moments=[1],
        #pre_op=np.exp,
        #pre_op_name="energy"
    #)
    
    
    # Column-wise concatenation; the outer key of each group is its position
    # in `dfs`, as a string.
    res_df = pd.concat(dfs, axis=1, keys=[str(i) for i in range(len(dfs))])
    
    # Filling policy: report the per-column NaN counts, then replace NaNs by 0.
    missing_values = res_df.isna().sum(axis=0)
    missing_values = missing_values.loc[missing_values > 0]
    if len(missing_values) > 0:
        print("Missing values :")
        print(missing_values)
        print("Filling missing values with zero")
        res_df = res_df.fillna(0)
        
    return res_df

def shift_and_fill(df, shift):
    """Shift `df` by `shift` rows and fill the gap this opens.

    A positive shift leaves missing rows at the top, which are back-filled;
    a negative shift leaves them at the bottom, which are forward-filled.
    A zero shift returns the shifted copy untouched. The input is not
    modified.
    """
    moved = df.shift(shift)
    if shift > 0:
        return moved.bfill()
    if shift < 0:
        return moved.ffill()
    return moved


def roll_and_concat(df, shifts_range):
    """Place one shifted-and-filled copy of `df` per shift side by side.

    The copies are concatenated column-wise and keyed by `shifts_range`,
    so the shift value becomes the outer column level of the result.
    """
    shifted_copies = [shift_and_fill(df, offset) for offset in shifts_range]
    return pd.concat(shifted_copies, axis=1, keys=shifts_range)
    
def concat_windows(df, ids, shifts):
    """Apply `roll_and_concat` separately to each group of rows of `df`.

    Rows are grouped by `ids` (with ``as_index=False``) so that the shifts
    never mix rows from different groups.

    NOTE(review): a later cell defines another `concat_windows` with a
    different signature; whichever cell ran last wins.
    """
    per_group = df.groupby(ids, as_index=False)
    return per_group.apply(roll_and_concat, shifts_range=shifts)
    
def make_input_default_test(h5_file):
    """Small feature table for quick checks: `eeg_1` and `eeg_2` with ``moments=[1]``."""
    quick_features = ["eeg_1", "eeg_2"]
    return make_input_new(h5_file, quick_features, moments=[1])

def make_input_default_rolling(h5_file, shifts):
    """
    Default feature table with one shifted copy of every column per shift.

    The table from `make_input_default` is grouped by subject and each group
    is passed through `roll_and_concat`, so shifted rows never come from
    another subject.

    Parameters
    ----------
    h5_file
        Open HDF5 file; it is passed to `make_input_default`, and its
        "index" dataset is read as the per-row subject ids.
    shifts
        Shift values forwarded to `roll_and_concat` as `shifts_range`.

    Returns
    -------
    The result of the group-wise `apply`.

    !!! not suited for pca because columns have 3 levels
    """
    df = make_input_default(h5_file)
    # Per-row subject ids, read the same way `subjects_ids_col` reads them.
    # NOTE(review): assumes one "index" entry per row of `df` — confirm.
    row_subject_ids = h5_file["index"][:]
    # The previous call, `concat_windows(h5_file, df, shifts)`, matched neither
    # definition of `concat_windows` in this notebook. The grouping is done
    # here directly so the result does not depend on which definition is
    # currently bound.
    df_with_window = df.groupby(row_subject_ids, as_index=False).apply(
        roll_and_concat, shifts_range=shifts
    )
    return df_with_window


In [6]:
# Build the default feature table from the training file.
X_train_raw = make_input_default(h5_train)

Feature #28/28[1K

In [7]:
# Subject-wise train/validation split: shuffle the subject ids with a fixed
# seed, keep the first 28 for training and the rest for validation.
train_ids = get_subject_ids(h5_train)
np.random.seed(1)
train_ids = np.random.permutation(train_ids)
# Each half is sorted after the split; the commented-out line is the unsorted variant.
train_train_ids, train_val_ids = sorted(train_ids[:28]), sorted(train_ids[28:])
#train_train_ids, train_val_ids = train_ids[:28], train_ids[28:]

# Rows and labels are selected with the same indexers so they stay aligned.
# NOTE(review): this relies on the labels of X_train_raw's index being usable
# as positions into y_train_arr — confirm against subjects_ids_to_indexers.
X_train_train = X_train_raw.loc[subjects_ids_to_indexers(h5_train, train_train_ids, as_indices=True), :]
y_train_train = y_train_arr[subjects_ids_to_indexers(h5_train, train_train_ids, as_indices=True)]

X_train_val = X_train_raw.loc[subjects_ids_to_indexers(h5_train, train_val_ids, as_indices=True), :]
y_train_val = y_train_arr[subjects_ids_to_indexers(h5_train, train_val_ids, as_indices=True)]

"""
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

scaler_ = MinMaxScaler()
pca_ = PCA(0.99)

X_train_train = pca_.fit_transform(scaler_.fit_transform(X_train_train))
X_train_val = pca_.transform(scaler_.transform(X_train_val))
X_test = pca_.transform(scaler_.transform(X_test_raw))
"""
def subjects_ids_col(h5_file):
    """Return the full contents of the "index" entry of `h5_file`."""
    index_entry = h5_file["index"]
    return index_entry[:]

def concat_windows(arr, subjects_ids, h5_file, shifts): # subjects_ids must be sorted
    """Add shifted copies of the rows of `arr`, one subject at a time.

    The per-row subject ids of `h5_file` are restricted to the rows whose id
    is in `subjects_ids`; the remaining ids are used as grouping keys for
    `arr`, and `roll_and_concat` is applied to every group.

    NOTE(review): this redefines the three-argument `concat_windows` from an
    earlier cell.
    """
    all_row_ids = subjects_ids_col(h5_file)
    keep_mask = np.isin(all_row_ids, subjects_ids)
    group_keys = all_row_ids[keep_mask]
    frame = pd.DataFrame(arr)

    grouped = frame.groupby(group_keys)
    return grouped.apply(roll_and_concat, shifts_range=shifts)

# Offsets handed to `roll_and_concat` through `concat_windows`: each one adds
# a shifted copy of every feature column.
shifts = [-1, 0, 1]
# Both splits read their subject ids from h5_train.
X_train_train_rolled = concat_windows(X_train_train, train_train_ids, h5_train, shifts)
X_train_val_rolled = concat_windows(X_train_val, train_val_ids, h5_train, shifts)


TypeError: concat_windows() missing 1 required positional argument: 'shifts'

In [10]:
# Display the window offsets currently in use.
shifts

[-1, 0, 1]

In [None]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, ExtraTreesClassifier

 
class VotingClassifier_:
    """Voting classifier for estimators that are already fitted.

    The estimators are used as given; this class never fits them. Weights
    are supplied per call rather than stored on the instance.

    Parameters
    ----------
    estimators : sequence
        Fitted classifiers. `predict_hard` needs `predict`; `predict_soft`
        needs `predict_proba`.
    """

    def __init__(self, estimators):
        self.estimators = estimators
        # self.weights = weights

    def predict_hard(self, X, weights=None):
        """Weighted majority vote on the predicted labels.

        Labels must be non-negative integers because votes are counted with
        `np.bincount`. Ties go to the smallest label.

        Parameters
        ----------
        X : array-like with a `shape` attribute
            Samples, one per row.
        weights : sequence of float, optional
            One weight per estimator; every estimator counts once when omitted.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(X.shape[0],)`` holding the winning labels.
        """
        n_samples = X.shape[0]
        # One column of predicted labels per estimator.
        votes = np.zeros([n_samples, len(self.estimators)], dtype=int)
        for j, clf in enumerate(self.estimators):
            votes[:, j] = clf.predict(X)
        # Per-row weighted count of each label, then the most voted label.
        y = np.zeros(n_samples)
        for i in range(n_samples):
            y[i] = np.argmax(np.bincount(votes[i, :], weights=weights))
        return y

    def predict_soft(self, X, weights=None):
        """Weighted sum of the predicted class probabilities, then argmax.

        The number of classes is taken from the `predict_proba` outputs
        instead of being fixed to 5, so any number of classes works as long
        as every estimator returns the same number of columns.

        The returned value is the column index of the largest summed
        probability; it equals the class label only when the estimators'
        classes are ``0 .. n_classes - 1`` in column order.

        Parameters
        ----------
        X : array-like with a `shape` attribute
            Samples, one per row.
        weights : sequence of float, optional
            One weight per estimator; all ones when omitted.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(X.shape[0],)``.

        Raises
        ------
        ValueError
            If there is no estimator, or if the estimators disagree on the
            shape of their probability output.
        """
        n_estimators = len(self.estimators)
        if n_estimators == 0:
            raise ValueError("predict_soft needs at least one estimator")
        if weights is None:
            weights = np.ones((n_estimators,), dtype=float)
        # Shape (1, n_estimators, 1) so the weights broadcast over samples and classes.
        weights = np.asarray(weights, dtype=float).reshape((1, n_estimators, 1))
        # Shape (n_samples, n_estimators, n_classes).
        probas = np.stack(
            [np.asarray(clf.predict_proba(X), dtype=float) for clf in self.estimators],
            axis=1,
        )
        summed = np.sum(probas * weights, axis=1)
        return np.argmax(summed, axis=1).astype(float)

    
# HistGradientBoostingClassifier is not part of this cell's import line. The
# experimental flag is imported first, as the later cells of this notebook do.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# The ensemble built at the end of this cell needs all four fitted models.
# The three fits below were commented out, which left `etc_`, `bc_` and
# `hgb_` undefined on a fresh kernel.

# Random Forest
rfc_ = RandomForestClassifier(random_state=1, n_estimators=300, verbose=1, n_jobs=-2)
rfc_.fit(X_train_train_rolled, y_train_train)
print("Random Forest validation score :", custom_score(rfc_.predict(X_train_val_rolled), y_train_val))

# Extra Trees
etc_ = ExtraTreesClassifier(verbose=1, random_state=1, n_estimators=1000, n_jobs=-2)
etc_.fit(X_train_train_rolled, y_train_train)
print("Extra Trees validation score :", custom_score(etc_.predict(X_train_val_rolled), y_train_val))

# Bagging
bc_ = BaggingClassifier(verbose=1, random_state=1, n_estimators=70, n_jobs=-2)
bc_.fit(X_train_train_rolled, y_train_train)
print('Bagging validation score :', custom_score(bc_.predict(X_train_val_rolled), y_train_val))

# Gradient Boosting
hgb_ = HistGradientBoostingClassifier(verbose=1, random_state=1, max_iter=1000)
print("fit")
hgb_.fit(X_train_train_rolled, y_train_train)
print('HistGradientBoosting validation score :', custom_score(hgb_.predict(X_train_val_rolled), y_train_val))


# Order matters: the weight lists used when scoring the ensemble are positional.
meta_estimator = VotingClassifier_([rfc_, etc_, bc_, hgb_])


In [82]:
print("Individual predictions")
for est in meta_estimator.estimators:
    print(est, "----->", custom_score(est.predict(X_train_val_rolled), y_train_val))

# The same four weightings are scored with hard voting first, then soft voting.
# The last weight belongs to the last estimator of the ensemble.
weight_grid = (
    [1, 1, 1, 1],
    [0.2, 0.2, 0.2, 1],
    [0.4, 0.4, 0.4, 1],
    [0.7, 0.7, 0.7, 1],
)
for voting_mode in ("hard", "soft"):
    for vote_weights in weight_grid:
        print(f"{voting_mode.capitalize()} prediction weights={vote_weights}")
        vote = getattr(meta_estimator, "predict_" + voting_mode)
        print(custom_score(vote(X_train_val_rolled, weights=vote_weights), y_train_val))

Individual predictions
RandomForestClassifier(n_jobs=-2, random_state=1, verbose=1) -----> 0.7192152460425681


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.3s


ExtraTreesClassifier(n_estimators=1000, n_jobs=-2, random_state=1, verbose=1) -----> 0.6991284274439796


[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    1.9s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    4.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


BaggingClassifier(n_estimators=70, n_jobs=-2, random_state=1, verbose=1) -----> 0.7254172221127266
HistGradientBoostingClassifier(max_iter=1000, random_state=1, verbose=1) -----> 0.796072989711446
Hard prediction weights=[1, 1, 1, 1]


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s


0.7298699342541014
Hard prediction weights=[0.2, 0.2, 0.2, 1]


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s


0.796072989711446
Hard prediction weights=[0.4, 0.4, 0.4, 1]


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


0.7755866640989398
Hard prediction weights=[0.7, 0.7, 0.7, 1]


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


0.7659853502503688
Soft prediction weights=[1, 1, 1, 1]


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


0.7715646707697053
Soft prediction weights=[0.2, 0.2, 0.2, 1]


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7957171332063077
Soft prediction weights=[0.4, 0.4, 0.4, 1]


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s


0.7880804187412094
Soft prediction weights=[0.7, 0.7, 0.7, 1]


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


0.7787980580047907


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s


Soft prediction weights=None


[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


0.7774488783662253
Soft prediction weights=[0.2, 0.2, 0.2, 1]


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7773833762779745
Soft prediction weights=[0.4, 0.4, 0.4, 1]


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


0.7804111096740698
Soft prediction weights=[0.7, 0.7, 0.7, 1]


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


0.781832453189057


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a 100-tree random forest fitted on the windowed training features
# and scored on the validation subjects.
rf_base = RandomForestClassifier(verbose=1, random_state=1, n_estimators=100, n_jobs=-2)
rf_base.fit(X_train_train_rolled, y_train_train)
custom_score(rf_base.predict(X_train_val_rolled), y_train_val)

In [26]:
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting limited to 100 boosting iterations.
# NOTE(review): `gg` is assigned again in a later cell with max_iter=1000.
gg = HistGradientBoostingClassifier(verbose=1, random_state=1, max_iter=100)
print("fit")
gg.fit(X_train_train_rolled, y_train_train)
print("predict")
print(custom_score(gg.predict(X_train_val_rolled), y_train_val))

fit
Binning 0.139 GB of training data: 2.250 s
Binning 0.016 GB of validation data: 0.066 s
Fitting gradient boosted rounds:
[1/100] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 1.25911, val loss: 1.27682, in 1.398s
[2/100] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 1.11466, val loss: 1.14326, in 0.692s
[3/100] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 1.00447, val loss: 1.04346, in 0.716s
[4/100] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.91655, val loss: 0.96353, in 1.277s
[5/100] 5 trees, 155 leaves (31 on avg), max depth = 11, train loss: 0.84523, val loss: 0.89942, in 1.722s
[6/100] 5 trees, 155 leaves (31 on avg), max depth = 9, train loss: 0.78476, val loss: 0.84468, in 0.727s
[7/100] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 0.73287, val loss: 0.80091, in 1.042s
[8/100] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 0.68810, val loss: 0.76328, in 1.611s
[9/100] 5 trees, 155 l

[76/100] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.09353, val loss: 0.37282, in 1.771s
[77/100] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.09138, val loss: 0.37172, in 1.580s
[78/100] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.08944, val loss: 0.37184, in 0.952s
[79/100] 5 trees, 155 leaves (31 on avg), max depth = 15, train loss: 0.08765, val loss: 0.37121, in 1.051s
[80/100] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.08584, val loss: 0.37071, in 2.200s
[81/100] 5 trees, 155 leaves (31 on avg), max depth = 15, train loss: 0.08404, val loss: 0.37056, in 2.828s
[82/100] 5 trees, 155 leaves (31 on avg), max depth = 14, train loss: 0.08228, val loss: 0.37020, in 0.904s
[83/100] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.08053, val loss: 0.36921, in 0.794s
[84/100] 5 trees, 155 leaves (31 on avg), max depth = 15, train loss: 0.07879, val loss: 0.36839, in 0.840s
[85/100] 5 trees, 155 leaves

In [73]:
# Score every fitted member of `ens_model` on the validation split.
# NOTE(review): `ens_model` is not assigned in the visible part of the notebook,
# so this cell raises NameError on a fresh kernel (see the recorded error).
for est in ens_model.estimators_:
    print(est)
    print(custom_score(est.predict(X_train_val_rolled), y_train_val))

NameError: name 'ens_model' is not defined

In [36]:
# NOTE(review): `ens_model` is not assigned in the visible part of the notebook.
ens_model.estimators_

[RandomForestClassifier(random_state=1, verbose=1),
 BaggingClassifier(random_state=1, verbose=1),
 GradientBoostingClassifier(random_state=1, verbose=1)]

In [46]:
# ! pip3 install mlxtend
from mlxtend.classifier import EnsembleVoteClassifier

# Random Forest
rfc_ = RandomForestClassifier(random_state=6, n_estimators=100, verbose=1, n_jobs=-2)
rfc_.fit(X_train_train_rolled, y_train_train)
print("Random Forest validation score :", custom_score(rfc_.predict(X_train_val_rolled), y_train_val))

# Extra Trees
etc_ = ExtraTreesClassifier(verbose=1, random_state=1, n_estimators=1000, n_jobs=-2)
etc_.fit(X_train_train_rolled, y_train_train)
print("Extra Trees validation score :", custom_score(etc_.predict(X_train_val_rolled), y_train_val))

# Bagging
bc_ = BaggingClassifier(verbose=1, random_state=1, n_estimators=70, n_jobs=-2)
bc_.fit(X_train_train_rolled, y_train_train)
print('Bagging validation score :', custom_score(bc_.predict(X_train_val_rolled), y_train_val))

# Gradient Boosting

meta_estimator = EnsembleVoteClassifier(clfs=[rfc_, etc_, bc_, gbc_], weights=[1, 1, 1, 1], refit=False, voting='soft')
print('Meta validation score :', custom_score(meta_estimator.predict(X_train_val_rolled), y_train_val))
      


[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   17.6s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished


Random Forest validation score : 0.7585675942465293


[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-2)]: Done 1000 out of 1000 | elapsed:   40.2s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.3s finished


Extra Trees validation score : 0.7151015595476256


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  6.5min finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Bagging validation score : 0.7614552994819377
Gradient Boosting validation score : 0.7676057390387728


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished


TypeError: __init__() got an unexpected keyword argument 'refit'

In [84]:
import numpy as np 
from joblib import load

#ens_model_orig = load("models_archives/")
#gbc_ = ens_model_orig.estimators_[-1]


class VotingClassifier_(object):
    """Weighted hard-voting wrapper around classifiers that are already fitted.

    `estimators` are used as given (never refitted); `weights` holds one
    vote weight per estimator.
    """

    def __init__(self, estimators, weights):
        self.estimators = estimators
        self.weights = weights

    def predict(self, X):
        """Return, for each row of `X`, the label with the largest weighted vote.

        Votes are counted with `np.bincount`, so predicted labels must be
        non-negative integers; ties go to the smallest label. The result is
        a float array of length ``X.shape[0]``.
        """
        n_rows = X.shape[0]
        # One column of integer labels per estimator.
        label_table = np.empty((n_rows, len(self.estimators)), dtype=int)
        for col, model in enumerate(self.estimators):
            label_table[:, col] = model.predict(X)
        # Weighted count per label for every row, keeping the best label.
        winners = [
            np.argmax(np.bincount(row_labels, weights=self.weights))
            for row_labels in label_table
        ]
        return np.array(winners, dtype=float)
    
# Hard vote over four fitted models; the zero weight gives `etc_` no say in the vote.
z = VotingClassifier_([rfc_, etc_, bc_, gg], weights=[0.6, 0, 0.6, 0.6])
pred = z.predict(X_train_val_rolled)
print(custom_score(pred, y_train_val))

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


0.7665147580716193


[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished


In [65]:
# Score whatever `pred` currently holds against the validation labels.
custom_score(pred, y_train_val)

0.7472935982285043

In [77]:
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting: fixed seed, at most 1000 boosting iterations,
# verbose=1 so the per-iteration losses are logged during fit.
gg = HistGradientBoostingClassifier(verbose=1, random_state=1, max_iter=1000)
print("fit")
gg.fit(X_train_train_rolled, y_train_train)
print("predict")
# Validation score of the fitted model.
print(custom_score(gg.predict(X_train_val_rolled), y_train_val))


fit
Binning 0.103 GB of training data: 2.037 s
Binning 0.011 GB of validation data: 0.068 s
Fitting gradient boosted rounds:
[1/1000] 5 trees, 155 leaves (31 on avg), max depth = 11, train loss: 1.27188, val loss: 1.29291, in 0.777s
[2/1000] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 1.12838, val loss: 1.16196, in 0.707s
[3/1000] 5 trees, 155 leaves (31 on avg), max depth = 10, train loss: 1.01860, val loss: 1.06476, in 0.810s
[4/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.93048, val loss: 0.98763, in 0.937s
[5/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.85840, val loss: 0.92206, in 1.221s
[6/1000] 5 trees, 155 leaves (31 on avg), max depth = 11, train loss: 0.79825, val loss: 0.87061, in 1.028s
[7/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.74767, val loss: 0.82639, in 0.703s
[8/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.70255, val loss: 0.78754, in 0.557s
[9/1000] 5 

[76/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.10418, val loss: 0.38777, in 0.931s
[77/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.10210, val loss: 0.38716, in 1.014s
[78/1000] 5 trees, 155 leaves (31 on avg), max depth = 16, train loss: 0.10014, val loss: 0.38724, in 0.498s
[79/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.09809, val loss: 0.38690, in 0.518s
[80/1000] 5 trees, 155 leaves (31 on avg), max depth = 12, train loss: 0.09616, val loss: 0.38665, in 1.044s
[81/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.09415, val loss: 0.38590, in 1.022s
[82/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.09228, val loss: 0.38499, in 0.555s
[83/1000] 5 trees, 155 leaves (31 on avg), max depth = 11, train loss: 0.09049, val loss: 0.38420, in 0.523s
[84/1000] 5 trees, 155 leaves (31 on avg), max depth = 13, train loss: 0.08885, val loss: 0.38274, in 1.092s
[85/1000] 5 trees, 

In [40]:
from sklearn.ensemble import BaggingClassifier

# Seed sweep: refit the same 70-estimator bagging ensemble under seeds 0..7 and
# print each validation score, to see how much the score moves with the seed.
# `bc` is overwritten on every pass, so only the last fitted model survives the loop.
for i in range(0, 8):
    bc = BaggingClassifier(
        verbose=1,
        random_state=i,
        n_estimators=70,
        n_jobs=-2  # joblib convention: use all CPUs but one
    )
    bc.fit(X_train_train_rolled, y_train_train)
    print("random_state =", i, " ", custom_score(bc.predict(X_train_val_rolled), y_train_val))

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


KeyboardInterrupt: 

In [39]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees baseline: 1000 trees, fixed seed.
etc = ExtraTreesClassifier(
    verbose=1,
    random_state=7,
    n_estimators=1000,
    n_jobs=-2  # joblib convention: use all CPUs but one
)
etc.fit(X_train_train_rolled, y_train_train)
# Last expression of the cell: the validation score is shown as the cell output.
custom_score(etc.predict(X_train_val_rolled), y_train_val)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-2)]: Done 194 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-2)]: Done 444 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-2)]: Done 794 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-2)]: Done 1000 out of 1000 | elapsed:   43.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    1.0s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    1.0s finished


0.7152021912689559

In [33]:
# Fraction of validation samples on which the extra-trees model and the bagging
# model predict the same label (uses whichever `etc` / `bc` are live in the kernel).
np.mean(etc.predict(X_train_val_rolled) == bc.predict(X_train_val_rolled))

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished


0.8425553319919518

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Seed sweep: ten random forests that differ only in `random_state`, used to
# gauge how much the validation score depends on the seed.
estimator_rfs = [RandomForestClassifier(
    verbose=1,
    random_state=seed,
    n_estimators=100, # default=100
    n_jobs=-2,  # joblib convention: use all CPUs but one
) for seed in range(10)]

print('Training')
# Plain loop: fitting is a side effect, so no throwaway list of results is built.
for rf_model in estimator_rfs:
    rf_model.fit(X_train_train_rolled, y_train_train)

# train_score_rf = custom_score(estimator_rf.predict(X_train_train_rolled), y_train_train)
print("Validation")
# One validation score per seed, in seed order.
val_score_rfs = [custom_score(rf_model.predict(X_train_val_rolled), y_train_val) for rf_model in estimator_rfs]
print(val_score_rfs)

Training


[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   19.0s finished
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   18.4s finished
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   19.7s finished
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   18.4s finished
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:    8.1s
[

Validation


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_job

[0.7460787543984956, 0.7578073111516713, 0.7498914084261744, 0.7568112500771721, 0.7521785044982809, 0.7532130867032345, 0.7585675942465293, 0.7495620166087473, 0.7447450983784023, 0.7418007638569297]


In [76]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
# Same experimental opt-in + import that the standalone HistGradientBoosting cell
# uses, repeated here so this cell does not depend on that cell having run first.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

# Boosting-iteration budget for the histogram gradient-boosting member.
# NOTE(review): the cell previously had the placeholder `max_iter=??` (a syntax
# error); 1000 mirrors the standalone HistGradientBoostingClassifier cell --
# confirm this is the intended value.
HGBC_MAX_ITER = 1000

# Four-member ensemble, equally weighted, averaging predicted probabilities.
ens_model = VotingClassifier(
    [('random_forest', RandomForestClassifier(n_estimators=100, random_state=1, verbose=1, n_jobs=-2)),
     ('bag_clf', BaggingClassifier(n_estimators=70, random_state=1, verbose=1, n_jobs=-2)),
     ('hgbc', HistGradientBoostingClassifier(random_state=1, verbose=1, max_iter=HGBC_MAX_ITER)),
     ('etc', ExtraTreesClassifier(n_estimators=1000, random_state=1, verbose=1, n_jobs=-2))
    ],
    voting='soft',
    weights=[1, 1, 1, 1])

print('Fitting')
ens_model.fit(X_train_train_rolled, y_train_train)

# Score every fitted member on its own before looking at the ensemble.
print('Elementary validation score')
for est in ens_model.estimators_:
    print(est, "------> validation_score :", custom_score(est.predict(X_train_val_rolled), y_train_val))

print('Predicting')
print("soft voting validation score :", custom_score(ens_model.predict(X_train_val_rolled), y_train_val))

# Temporarily switch the fitted ensemble to hard voting for comparison; the
# finally-clause guarantees the original policy is restored even if scoring fails.
ens_model.set_params(voting='hard') # Change voting policy
try:
    print('hard voting validation score :', custom_score(ens_model.predict(X_train_val_rolled), y_train_val))
finally:
    ens_model.set_params(voting='soft') # Back to original voting policy

Fitting


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  44 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   23.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


KeyboardInterrupt: 

In [74]:
# Re-score the already fitted `ens_model` under a different weight vector.
# The 'Fitting' message is printed but the fit call itself is commented out.
print('Fitting')
# ens_model.fit(X_train_train_rolled, y_train_train)
# NOTE(review): three weights are set here, while the ensemble-definition cell
# builds four estimators -- this only matches a three-estimator `ens_model`
# left in the kernel; confirm before re-running.
ens_model.set_params(weights=[1, 1, 1])
print('Predicting')
print("soft voting validation score :", custom_score(ens_model.predict(X_train_val_rolled), y_train_val))

ens_model.set_params(voting='hard') # Change voting policy
print('hard voting validation score :', custom_score(ens_model.predict(X_train_val_rolled), y_train_val))
# Last expression of the cell: set_params returns the estimator, whose repr is displayed.
ens_model.set_params(voting='soft') # Back to original voting policy

Fitting
Predicting


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished


soft voting validation score : 0.760879703732226


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.5s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.3s


hard voting validation score : 0.7429610390794625


[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.6s finished


VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(n_jobs=-2, random_state=1,
                                                     verbose=1)),
                             ('bag_clf',
                              BaggingClassifier(n_estimators=70, n_jobs=-2,
                                                random_state=1, verbose=1)),
                             ('etc',
                              ExtraTreesClassifier(n_estimators=1000, n_jobs=-2,
                                                   random_state=1,
                                                   verbose=1))],
                 voting='soft', weights=[1, 1, 0])

In [48]:
from joblib import dump, load
# Persist the ensemble with joblib; dump returns the list of written file names.
# NOTE(review): the file name advertises "rf-bc-gbc" -- confirm it matches the
# estimators actually held by `ens_model` when this cell is run.
dump(ens_model, "models_archives/ensemble_model_rf-bc-gbc_rs=1.joblib")

['models_archives/ensemble_model_rf-bc-gbc_rs=1.joblib']

| Model / Params | Time Features | LogMod Features | Sleep Features | Shifts | Comments | Training Score | Validation Score |
| :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | 
| `random_state=1, n_estimators=100` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.3, 0.7)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | 0.758 |
| `random_state=1, n_estimators=100, max_features='log2'` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.3, 0.7)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | 0.735 |
| `random_state=1, n_estimators=100` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.45, 0.55)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.45, 0.55)]`| Yes | (-1, 0, 1) | - | - | 0.759 |
| `GradientBoostingClassifier` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.45, 0.55)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | 0.768 |
| `VotingClassifier([RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier], weights=[0.3, 0.3, 0.4]); random_state=1` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.3, 0.7)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | hard=0.767; soft=0.771 |

| Random Forest Params | Time Features Quantiles | Time Features Moments | Sleep Features | Pulse Freq (f_max, A_max) | Shifts | Comments | Training Score | Validation Score |
| :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: |
| - | 0.1, 0.5, 0.9 | - | No | - | 0 | - | 1| 0.67|
| - | 0.1, 0.5, 0.9 | - | Yes | - | 0 | - | 1 | 0.69|
| - | 0.1, 0.5, 0.9 | - | Yes | - | -1, 0, 1 | - | 1 | 0.7|
| - | 0.01, 0.1, 0.5, 0.9, 0.99 | - | Yes | - | -1, 0, 1 | - | 1| 0.7  |
| `min_samples_leaf=10` | 0.01, 0.1, 0.5, 0.9, 0.99 | - | Yes | - | -1, 0, 1 | - | 0.89 | 0.69  |
| `min_samples_leaf=10` | 0.01, 0.1, 0.5, 0.9, 0.99 | 1, 2 | Yes | - | -1, 0, 1 |  - | 0.89 | 0.69  |
| - | 0.01, 0.1, 0.5, 0.9, 0.99 | 1, 2 | Yes | - | -1, 0, 1 |  - | 1 | 0.697  |
| - | 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99 | - | Yes | - | -1, 0, 1 |  - | 1 | 0.708  |
| - | 0.01, DECILES, 0.99 | - | Yes | - | -1, 0, 1 |  - | 1 | 0.709  |
| `min_samples_leaf=10` | 0.01, DECILES, 0.99 | - | Yes | - | -1, 0, 1 |  - | 0.89 | 0.697 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 |  - | 0.89 | 0.699 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 |  - | 1 | 0.713929 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Yes | -1, 0, 1 |  - | 1 | 0.7 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Yes | -1, 0, 1 | - | 0.89 | 0.697 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Pulse Only | -1, 0, 1 | - | 0.89 | 0.7 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Pulse Only | -1, 0, 1 |  - | 1 | 0.7055 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | - |  1 | 0.7055 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee 0.5 | - | Yes | - | -1, 0, 1 |  - | 1 | 0.708 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee 0.5 | - | Yes | - | -1, 0, 1 |  - | 0.89 | 0.700|
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee MIN, MAX | - | Yes | - | -1, 0, 1 | - | 0.89| 0.7 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee MIN, MAX | - | Yes | - | -1, 0, 1 | - | 1| 0.703|
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `bandlog rescaled`| 1| 0.665 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = 10%, 90% for logmod`| 1| 0.7165 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = 10%, 90% for logmod`| ? | 0.70 < x < 0.71 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES for logmod`| 1 | 0.737|
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES for logmod`| 0.898 | 0.721 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES for logmod; eeg_mean only`| 1 | 0.645 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and quantiles = 0.1, 0.5, 0.9 for logmod`| 1 | 0.719 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `interquantiles_inv = (0.1, 0.9), (0.3, 0.7) and quantiles_inv = 0.5 for logmod`| 1 | 0.74.. |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and interquantiles_inv = (0.1, 0.9), (0.3, 0.7) for logmod`| 1 | 0.753 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and interquantiles_inv = (0.1, 0.9), (0.3, 0.7) for logmod, interquantiles = (0.1, 0.9), (0.3, 0.7) for time features`| 1 | 0.754 (Alex) - 0.742 (Mrml) |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and interquantiles_inv = (0.1, 0.9), (0.3, 0.7) for logmod, interquantiles = (0.1, 0.9), (0.3, 0.7) for time features;  n_estimators = 300`| 1 | 0.756 (Alex)|


In [64]:
# Grid of 30 candidate weights shared by the first three estimators; the
# fourth estimator keeps a fixed weight of 1.
w_noob = np.linspace(0.1, 0.7, 30)
scores = []
for w in w_noob:
    scores.append(
        custom_score(
            meta_estimator.predict_soft(X_train_val_rolled, weights=[w, w, w, 1]),
            y_train_val,
        )
    )

# Validation score per weight, sorted ascending so the best weights come last.
pd.Series(scores, index=w_noob).sort_values()

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    3.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend T

[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.4s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: D

0.700000    0.774816
0.679310    0.776229
0.658621    0.776255
0.637931    0.776255
0.617241    0.776266
0.596552    0.776984
0.575862    0.776993
0.555172    0.779529
0.534483    0.780217
0.513793    0.781572
0.472414    0.781634
0.493103    0.781705
0.451724    0.783088
0.431034    0.784910
0.410345    0.786205
0.389655    0.786205
0.348276    0.786207
0.368966    0.786274
0.306897    0.786526
0.244828    0.786706
0.265517    0.786708
0.327586    0.787061
0.286207    0.787190
0.224138    0.788614
0.203448    0.789349
0.100000    0.792994
0.182759    0.793041
0.162069    0.793193
0.141379    0.794047
0.120690    0.794047
dtype: float64

In [65]:
# Reference point for the weight sweep: only the fourth estimator has a
# non-zero weight, so this is that estimator's validation score on its own.
custom_score(meta_estimator.predict_soft(X_train_val_rolled, weights=[0, 0, 0, 1]), y_train_val)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.3s finished


0.7958563452403618

In [67]:
# Test-set prediction with the soft-voting meta estimator.
# The three lines below rebuild X_test_rolled; they are kept commented out, so
# X_test_rolled must already exist in the kernel.
#test_ids = get_subject_ids(h5_test)
#X_test_raw = make_input_default(h5_test)
#X_test_rolled = concat_windows(X_test_raw, test_ids, h5_test, shifts)
# Weight shared by the first three estimators. 0.120690 is one of the two
# top-scoring grid values in the recorded output of the weight sweep.
# Defined here so the prediction line does not rely on a leftover kernel variable.
opt_w = 0.120690
y_pred = meta_estimator.predict_soft(X_test_rolled, weights=[opt_w, opt_w, opt_w, 1]).astype(int)
#print(custom_score(meta_estimator.predict_soft(X_train_val_rolled, weights=[opt_w, opt_w, opt_w, 1]), y_train_val))
#submit_to_kaggle(y_pred, h5_test, fname='ensemble_21-01-03.csv', msg="")

New submission file at submissions/ensemble_21-01-03.csv


In [75]:
# y_pred.astype(int)

array([0, 0, 0, ..., 0, 2, 0])

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling for the SVM: the scaler is fitted on the training split only
# and then applied unchanged to the validation and test matrices.
scaler_ = MinMaxScaler()

X_train_train_rolled_svc = scaler_.fit_transform(X_train_train_rolled) 
X_train_val_rolled_svc = scaler_.transform(X_train_val_rolled)
X_test_rolled_svc = scaler_.transform(X_test_rolled)


# RBF-kernel SVM; max_iter=1000 caps the solver's iterations.
estimator_svc = SVC(verbose=1, kernel='rbf', C=1, max_iter=1000, random_state=1)
estimator_svc.fit(X_train_train_rolled_svc, y_train_train)

# Both scores are stored but not displayed by this cell.
train_score_svc = custom_score(estimator_svc.predict(X_train_train_rolled_svc), y_train_train)
val_score_svc = custom_score(estimator_svc.predict(X_train_val_rolled_svc), y_train_val)

In [125]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is used here as a supervised projection, not as the final classifier:
# the random forest is trained on the LDA-transformed features.
clf = LinearDiscriminantAnalysis()

estimator_rf = RandomForestClassifier(
    random_state=1,
    n_estimators=100
)

clf.fit(X_train_train_rolled, y_train_train)
estimator_rf.fit(clf.transform(X_train_train_rolled), y_train_train)

# train_score_rf = custom_score(estimator_rf.predict(X_train_train_rolled), y_train_train)
# Validation data goes through the same fitted projection; the score is stored
# in val_score_rf but not displayed by this cell.
val_score_rf = custom_score(estimator_rf.predict(clf.transform(X_train_val_rolled)), y_train_val)


In [127]:
# Inspect the LDA projection of the validation set.
clf.transform(X_train_val_rolled)

array([[-3.13757384,  2.20432082, -0.70161822,  1.40496954],
       [-4.363443  ,  3.82483367, -0.03437306,  3.44951271],
       [-3.89253356,  4.15378957, -0.03478012,  1.36605611],
       ...,
       [-2.07190603,  1.44258113, -1.40363889,  0.08353262],
       [-3.11006764,  3.8393016 , -1.80742658, -1.15457525],
       [-4.44476345,  5.3927919 , -1.67107598, -0.19829525]])

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost on top of a random forest. `estimator_rf` is whichever forest was
# last assigned in the kernel -- NOTE(review): confirm it is the intended base model.
estimator_ab = AdaBoostClassifier(
    base_estimator = estimator_rf,
    # verbose=1,
    random_state=1,
    n_estimators=50, # sklearn's AdaBoostClassifier default is already 50
    #learning_rate=0.5
)

estimator_ab.fit(X_train_train_rolled, y_train_train)

train_score_ab = custom_score(estimator_ab.predict(X_train_train_rolled), y_train_train)
val_score_ab = custom_score(estimator_ab.predict(X_train_val_rolled), y_train_val)

print(train_score_ab)
print(val_score_ab)

In [10]:
# Scratch array used to try out np.cumsum.
a = np.array([1, 2, 3, 4])

In [14]:
# Running total of the scratch array.
np.cumsum(a)

array([ 1,  3,  6, 10])

In [16]:
# Scratch 100x100 frame of uniform random numbers (unseeded, so values differ per run).
# NOTE(review): this rebinds `z`, which the voting-classifier cell also assigns.
z = pd.DataFrame(np.random.rand(100, 100))

In [18]:
# Row-wise sums of the scratch frame (axis=1 sums across the columns of each row).
z.sum(axis=1)

0     49.739217
1     52.057421
2     46.613845
3     53.308325
4     43.123208
        ...    
95    50.187987
96    53.308523
97    48.514607
98    54.877092
99    49.926575
Length: 100, dtype: float64

### IDÉES
- recherche de l'harmonique dans le spectre (donc ne garder que les quantiles inverses qui sont moindres)
- (codé) essayer l'entropie
- (codé) essayer les paramètres de Hjorth
- essayer EMD (Empirical Mode Decomposition)
- (codé) essayer MMD (Minimum-Maximum Distance) 

In [27]:
# Try the MMD (Minimum-Maximum Distance) helper on the first three rows of the
# "alpha_eeg_2" dataset.
get_mmd(h5_train["alpha_eeg_2"][:3])

array([[399056.36732905],
       [179230.45407763],
       [ 75138.16948643]])

In [50]:
# Hjorth parameters of exp(2 * signal) for the first 30 rows of "eeg_1".
# NOTE(review): the recorded output is all-NaN and comes with numpy runtime
# warnings -- presumably np.exp overflows on these values; confirm whether the
# input was meant to be a log-modulus feature rather than the raw signal.
get_hjorth_parameters(np.exp(2*h5_train["eeg_1"][:30]))

  """Entry point for launching an IPython kernel.
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  x = asanyarray(arr - arrmean)
  a = op(a[slice1], a[slice2])
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  a = op(a[slice1], a[slice2])


array([[nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan]])

In [7]:
# Missing-value count per feature column, sorted ascending so the columns with
# the most NaNs appear at the bottom of the output.
X_train_train.isna().sum(axis=0).sort_values()

>>alpha_eeg_1_logE>>  moment_1                 0
>>eeg_4>>             qt_0.01                  0
                      qt_0.1                   0
                      qt_0.3                   0
                      qt_0.5                   0
                                           ...  
energy>>eeg_1>>       Hjorth_complexity    22694
energy>>eeg_6>>       Hjorth_mobility      22697
energy>>eeg_5>>       Hjorth_mobility      22698
                      Hjorth_complexity    22698
energy>>eeg_6>>       Hjorth_complexity    22698
Length: 170, dtype: int64