In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os
import importlib

from kaggle_submit import submit_to_kaggle


from objects import *
from helpers import *
# from utils.globals import *
from utils.distribution_statistics import *

# Paths of the train / test HDF5 files (relative paths, so they resolve against the working directory).
train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

def get_train_test_connections():
    """Open the train and test HDF5 files in append mode.

    Returns:
        tuple: ``(h5_train, h5_test)``, the two open ``h5py.File`` handles.

    Raises:
        Whatever ``h5py.File`` raises when a file cannot be opened. If the
        test file fails to open, the already-open train handle is closed
        before the error is re-raised so that no handle is leaked.
    """
    h5_train = h5py.File(train_file, mode='a')
    try:
        h5_test = h5py.File(test_file, mode='a')
    except Exception:
        # Do not leave the train file open when only half of the pair is usable.
        h5_train.close()
        raise
    return h5_train, h5_test

def close_train_test_connections(h5_train, h5_test):
    """Close both HDF5 handles.

    Args:
        h5_train: open handle of the train file (anything with ``close()``).
        h5_test: open handle of the test file (anything with ``close()``).

    Raises:
        Whatever ``close()`` raises. The test handle is closed even when
        closing the train handle fails.
    """
    try:
        h5_train.close()
    finally:
        # Always attempt the second close so a failure on the first handle
        # does not leave the test file open.
        h5_test.close()
    
# Open both files; the handles are used by the feature-creation loop further down in this cell.
h5_train, h5_test = get_train_test_connections()

# Labels: the first CSV column is used as the index and the single remaining
# column is collapsed into a Series.
# `DataFrame.squeeze("columns")` replaces the `squeeze=True` keyword of
# `read_csv`, which was deprecated in pandas 1.4 and removed in pandas 2.0.
y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0).squeeze("columns")
y_train_arr = y_train.to_numpy()

# MAKE CUSTOM FEATURES
#from additional_features.make_features import make_all_features
#make_all_features(h5_train, h5_test, n_chunks=10, verbose=True, overwrite=False)
from additional_features.features_to_frequential import _create_log_modulus, BAND_EEG

# Run `_create_log_modulus` on both files for the BAND_EEG features.
# NOTE(review): with overwrite=False this presumably leaves already-existing
# datasets untouched — confirm in additional_features.features_to_frequential.
for h5_file in (h5_train, h5_test):
    _create_log_modulus(h5_file, n_chunks=10, features_to_convert=BAND_EEG, overwrite=False, verbose=True)

# Close connections because utils.globals needs them
close_train_test_connections(h5_train, h5_test)

# The star-import must run while no handle is open (see the comment above);
# the order close -> import -> reopen is therefore significant.
from utils import *

# Reopen the handles for the rest of the notebook.
h5_train, h5_test = get_train_test_connections()

In [8]:
import itertools

# EEG channel numbers (1..7) and the frequency-band prefixes used in dataset names.
eeg_nums = [*range(1, 8)]
greek_letters = ["alpha", "beta", "delta", "theta"]

# Time-domain dataset names.
EEG_FEATURES = [f"eeg_{i}" for i in eeg_nums]
# Band-filtered names, band-major order: alpha_eeg_1..alpha_eeg_7, beta_eeg_1, ...
EEG_BAND_FEATURES = [
    f"{greek}_eeg_{i}"
    for greek in greek_letters
    for i in eeg_nums
]
OTHER_TIME_FEATURES = ["speed_norm", "accel_norm", "pulse"]

# Same three families with the "_ft_logmod" suffix appended.
EEG_LOGMOD_FEATURES = [f"{eeg}_ft_logmod" for eeg in EEG_FEATURES]
EEG_BAND_LOGMOD_FEATURES = [f"{eeg_band}_ft_logmod" for eeg_band in EEG_BAND_FEATURES]
OTHER_LOGMOD_FEATURES = [f"{time_feat}_ft_logmod" for time_feat in OTHER_TIME_FEATURES]

# Every dataset of the train file whose name contains "sleep".
SLEEP_FEATURES = list(filter(lambda name: "sleep" in name, h5_train.keys()))

# OLD NAMES
# Band log-energy names, band-major order: alpha_eeg_1_logE, ..., theta_eeg_7_logE.
BAND_LOG_ENERGY_FEATURES_OLD = [
    f"{greek}_{eeg}_logE"
    for greek in greek_letters
    for eeg in EEG_FEATURES
]
LOGMOD_FEATURES_OLD = [*EEG_LOGMOD_FEATURES, *OTHER_LOGMOD_FEATURES]
TIME_FEATURES_OLD = [*EEG_FEATURES, *OTHER_TIME_FEATURES]

In [9]:
# Dataset names of the train file that belong to none of the legacy feature lists.
all_features = set(h5_train.keys())
all_features.difference(
    BAND_LOG_ENERGY_FEATURES_OLD,
    LOGMOD_FEATURES_OLD,
    TIME_FEATURES_OLD,
    SLEEP_FEATURES,
)

{'alpha_eeg_1',
 'alpha_eeg_1_ft_logmod',
 'alpha_eeg_2',
 'alpha_eeg_2_ft_logmod',
 'alpha_eeg_3',
 'alpha_eeg_3_ft_logmod',
 'alpha_eeg_4',
 'alpha_eeg_4_ft_logmod',
 'alpha_eeg_5',
 'alpha_eeg_5_ft_logmod',
 'alpha_eeg_6',
 'alpha_eeg_6_ft_logmod',
 'alpha_eeg_7',
 'alpha_eeg_7_ft_logmod',
 'alpha_eeg_mean_frontal_logE',
 'alpha_eeg_mean_frontal_occipital_logE',
 'beta_eeg_1',
 'beta_eeg_1_ft_logmod',
 'beta_eeg_2',
 'beta_eeg_2_ft_logmod',
 'beta_eeg_3',
 'beta_eeg_3_ft_logmod',
 'beta_eeg_4',
 'beta_eeg_4_ft_logmod',
 'beta_eeg_5',
 'beta_eeg_5_ft_logmod',
 'beta_eeg_6',
 'beta_eeg_6_ft_logmod',
 'beta_eeg_7',
 'beta_eeg_7_ft_logmod',
 'beta_eeg_mean_frontal_logE',
 'beta_eeg_mean_frontal_occipital_logE',
 'delta_eeg_1',
 'delta_eeg_1_ft_logmod',
 'delta_eeg_2',
 'delta_eeg_2_ft_logmod',
 'delta_eeg_3',
 'delta_eeg_3_ft_logmod',
 'delta_eeg_4',
 'delta_eeg_4_ft_logmod',
 'delta_eeg_5',
 'delta_eeg_5_ft_logmod',
 'delta_eeg_6',
 'delta_eeg_6_ft_logmod',
 'delta_eeg_7',
 'delta_eeg_

In [10]:
def make_input_default(h5_file):
    """Build the default feature table for one HDF5 file.

    Four feature families are computed with ``make_input_new`` (none of them
    rescaled by subject) and concatenated column-wise, in this order:

    0. band log-energies (``BAND_LOG_ENERGY_FEATURES_OLD``), ``moments=[1]``
    1. sleep features (``SLEEP_FEATURES``), ``moments=[1]``
    2. log-modulus features (``LOGMOD_FEATURES_OLD``): inverse quantiles and
       inverse inter-quantiles, ``diff_orders=[0]``
    3. time features (``TIME_FEATURES_OLD``): quantiles and inter-quantiles,
       ``diff_orders=[0]``

    Args:
        h5_file: open HDF5 handle, forwarded as-is to ``make_input_new``.

    Returns:
        pandas.DataFrame: the concatenated table; the outermost column level
        is the family position as a string ("0" .. "3"). Remaining NaNs are
        reported on stdout and replaced by 0.
    """
    # Previously tried and currently disabled families: spectral entropy of the
    # log-modulus energy, Hjorth parameters (raw EEG and EEG bands), integrated
    # EEG (diff_orders=[-1]), first-order differences of the time features,
    # pulse_max_freq and pulse_max_logE.
    family_specs = (
        dict(  # df_bandlog
            features=BAND_LOG_ENERGY_FEATURES_OLD,
            rescale_by_subject=False,
            moments=[1],
        ),
        dict(  # df_sleep
            features=SLEEP_FEATURES,
            rescale_by_subject=False,
            moments=[1],
        ),
        dict(  # df_logmod
            features=LOGMOD_FEATURES_OLD,
            rescale_by_subject=False,
            quantiles_inv=[0.1, 0.3, 0.5, 0.7, 0.9],
            diff_orders=[0],
            interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)],
        ),
        dict(  # df_time_diff_0
            features=TIME_FEATURES_OLD,
            rescale_by_subject=False,
            quantiles=[1e-4, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 1 - 1e-4],
            interquantiles=[(0.1, 0.9), (0.3, 0.7)],
            diff_orders=[0],
        ),
    )

    family_frames = [make_input_new(h5_file, **spec) for spec in family_specs]
    family_keys = list(map(str, range(len(family_frames))))
    combined = pd.concat(family_frames, axis=1, keys=family_keys)

    # Filling policy: report the columns that still hold NaNs, then zero-fill.
    nan_counts = combined.isna().sum(axis=0)
    nan_counts = nan_counts.loc[nan_counts > 0]
    if nan_counts.empty:
        return combined

    print("Missing values :")
    print(nan_counts)
    print("Filling missing values with zero")
    return combined.fillna(0)

def shift_and_fill(df, shift):
    """Shift the rows of ``df`` by ``shift`` and fill the gap created at the edge.

    Args:
        df: pandas DataFrame to shift.
        shift: number of rows to shift by (positive moves rows down).

    Returns:
        A new DataFrame. For a positive shift the leading empty rows are
        back-filled; for a negative shift the trailing empty rows are
        forward-filled; a zero shift returns the shifted copy unchanged.
    """
    moved = df.shift(shift)
    if shift == 0:
        return moved
    return moved.bfill() if shift > 0 else moved.ffill()


def roll_and_concat(df, shifts_range):
    """Stack shifted copies of ``df`` side by side.

    One copy is produced per entry of ``shifts_range`` with ``shift_and_fill``;
    the copies are concatenated column-wise and the shift value becomes the
    outermost column level.
    """
    shifted_frames = [shift_and_fill(df, shift) for shift in shifts_range]
    return pd.concat(shifted_frames, axis=1, keys=shifts_range)
    
def concat_windows(h5_file, df, shifts):
    """Apply ``roll_and_concat`` to ``df`` separately for each value of the file's "index" dataset.

    NOTE: a later cell defines another ``concat_windows`` with the signature
    ``(arr, subjects_ids, h5_file, shifts)``; once that cell has run, this
    three-argument version is no longer reachable under this name.
    """
    row_groups = h5_file["index"][:]
    grouped = df.groupby(row_groups, as_index=False)
    return grouped.apply(roll_and_concat, shifts_range=shifts)
    
def make_input_default_test(h5_file):
    """Small input for quick checks: ``make_input_new`` on "eeg_1" and "eeg_2" with ``moments=[1]``."""
    quick_features = ["eeg_1", "eeg_2"]
    return make_input_new(h5_file, quick_features, moments=[1])

def make_input_default_rolling(h5_file, shifts):
    """Default feature table with shifted copies of every row's neighbours.

    The table from ``make_input_default`` is grouped by the file's "index"
    dataset and ``roll_and_concat`` is applied to each group, so shifts never
    cross a group boundary.

    !!! not suited for pca because columns have 3 levels

    Args:
        h5_file: open HDF5 handle exposing an "index" dataset.
        shifts: iterable of row shifts forwarded to ``roll_and_concat``.

    Returns:
        pandas.DataFrame: the windowed feature table.
    """
    df = make_input_default(h5_file)
    # The grouping is done here rather than through `concat_windows`: that name
    # is rebound later in the notebook to a function with a different signature
    # (arr, subjects_ids, h5_file, shifts), which would make a
    # `concat_windows(h5_file, df, shifts)` call fail with a TypeError.
    grouped = df.groupby(h5_file["index"][:], as_index=False)
    df_with_window = grouped.apply(roll_and_concat, shifts_range=shifts)
    return df_with_window


In [11]:
# Default feature table for the training file (progress is printed while features are computed).
X_train_raw = make_input_default(h5_train)

Feature #10/10[1K

In [12]:
# Subject-wise hold-out split: the first N_TRAIN_SUBJECTS subjects are used for
# fitting, the remaining ones for validation.
N_TRAIN_SUBJECTS = 28

train_ids = get_subject_ids(h5_train)
train_train_ids, train_val_ids = train_ids[:N_TRAIN_SUBJECTS], train_ids[N_TRAIN_SUBJECTS:]

# Row indexers are computed once per split and shared by the features and the labels.
train_train_rows = subjects_ids_to_indexers(h5_train, train_train_ids, as_indices=True)
train_val_rows = subjects_ids_to_indexers(h5_train, train_val_ids, as_indices=True)

X_train_train = X_train_raw.loc[train_train_rows, :]
y_train_train = y_train_arr[train_train_rows]

X_train_val = X_train_raw.loc[train_val_rows, :]
y_train_val = y_train_arr[train_val_rows]

# Disabled experiment (MinMax scaling followed by PCA), kept as a no-op string literal.
"""
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

scaler_ = MinMaxScaler()
pca_ = PCA(0.99)

X_train_train = pca_.fit_transform(scaler_.fit_transform(X_train_train))
X_train_val = pca_.transform(scaler_.transform(X_train_val))
X_test = pca_.transform(scaler_.transform(X_test_raw))
"""
def subjects_ids_col(h5_file):
    """Return the whole "index" dataset of ``h5_file`` (callers use it as the per-row subject id)."""
    index_dataset = h5_file["index"]
    return index_dataset[:]

def concat_windows(arr, subjects_ids, h5_file, shifts): # subjects_ids must be sorted
    """Apply ``roll_and_concat`` subject by subject to the rows of ``arr``.

    Args:
        arr: feature table (anything ``pd.DataFrame`` accepts) holding exactly
            the rows of the subjects listed in ``subjects_ids``.
        subjects_ids: ids of the subjects present in ``arr``; must be sorted.
        h5_file: handle whose "index" dataset holds the subject id of every
            row of the full file.
        shifts: row shifts forwarded to ``roll_and_concat``.

    Returns:
        The result of the per-subject ``groupby(...).apply``.
    """
    every_row_subject = subjects_ids_col(h5_file)
    is_selected = np.isin(every_row_subject, subjects_ids)
    row_subjects = every_row_subject[is_selected]

    frame = pd.DataFrame(arr)
    per_subject = frame.groupby(row_subjects)
    return per_subject.apply(roll_and_concat, shifts_range=shifts)

# Window of neighbouring rows attached to every sample: previous, current, next.
shifts = [-1, 0, 1]
X_train_train_rolled, X_train_val_rolled = (
    concat_windows(split_features.fillna(0), split_ids, h5_train, shifts)
    for split_features, split_ids in (
        (X_train_train, train_train_ids),
        (X_train_val, train_val_ids),
    )
)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. The fit / scoring calls below are commented out,
# so this cell only constructs the (unfitted) estimator.
estimator_rf = RandomForestClassifier(n_estimators=100, random_state=1)  # n_estimators: default=100

#estimator_rf.fit(X_train_train_rolled, y_train_train)

# train_score_rf = custom_score(estimator_rf.predict(X_train_train_rolled), y_train_train)
#val_score_rf = custom_score(estimator_rf.predict(X_train_val_rolled), y_train_val)

In [47]:
from sklearn.ensemble import VotingClassifier

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC


# Voting ensemble of three classifiers; the weights are listed in the same
# order as the estimators (random forest, bagging, gradient boosting).
ens_model = VotingClassifier(
    [('random_forest', RandomForestClassifier(random_state=1, verbose=1)),
     ('bag_clf', BaggingClassifier(random_state=1, verbose=1)),
     ('gbc', GradientBoostingClassifier(random_state=1, verbose=1)),
    ],
    voting='soft',
    weights=[0.3, 0.3, 0.4])

# Single fit on the training subjects; both scores below reuse this fitted
# ensemble (there is no other fit call in this cell).
print('Fitting')
ens_model.fit(X_train_train_rolled, y_train_train)

print('Predicting')
print("soft voting validation score :", custom_score(ens_model.predict(X_train_val_rolled), y_train_val))

# Score the same ensemble under hard voting, then restore the original policy.
# The last call is also the cell's displayed value.
ens_model.set_params(voting='hard') # Change voting policy
print('hard voting validation score :', custom_score(ens_model.predict(X_train_val_rolled), y_train_val))
ens_model.set_params(voting='soft') # Back to original voting policy

Fitting


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   42.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min finished


      Iter       Train Loss   Remaining Time 
         1           1.3639           24.98m
         2           1.2785           24.33m
         3           1.2105           23.97m
         4           1.1536           23.65m
         5           1.1048           23.42m
         6           1.0639           23.26m
         7           1.0277           23.34m
         8           0.9947           23.27m
         9           0.9621           23.10m
        10           0.9344           22.81m
        20           0.7561           19.88m
        30           0.6604           17.28m
        40           0.5988           14.78m
        50           0.5537           12.46m
        60           0.5211           10.05m
        70           0.4924            7.66m
        80           0.4688            5.15m
        90           0.4501            2.57m
       100           0.4332            0.00s
Predicting


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


soft voting validation score : 0.7709691807713865
hard voting validation score : 0.7673198539290149


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(random_state=1,
                                                     verbose=1)),
                             ('bag_clf',
                              BaggingClassifier(random_state=1, verbose=1)),
                             ('gbc',
                              GradientBoostingClassifier(random_state=1,
                                                         verbose=1))],
                 voting='soft', weights=[0.3, 0.3, 0.4])

In [48]:
from joblib import dump, load

# Persist the fitted ensemble. The target directory is created first so the
# dump does not fail with FileNotFoundError on a fresh checkout.
os.makedirs("models_archives", exist_ok=True)
dump(ens_model, "models_archives/ensemble_model_rf-bc-gbc_rs=1.joblib")

['models_archives/ensemble_model_rf-bc-gbc_rs=1.joblib']

| Model / Params | Time Features | LogMod Features | Sleep Features | Shifts | Comments | Training Score | Validation Score |
| :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | 
| `random_state=1, n_estimators=100` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.3, 0.7)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | 0.758 |
| `random_state=1, n_estimators=100, max_features='log2'` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.3, 0.7)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | 0.735 |
| `random_state=1, n_estimators=100` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.45, 0.55)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.45, 0.55)]`| Yes | (-1, 0, 1) | - | - | 0.759 |
| `GradientBoostingClassifier` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.45, 0.55)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | 0.768 |
| `VotingClassifier([RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier], weights=[0.3, 0.3, 0.4]); random_state=1` | `quantiles=[0.0001, 0.01, ODD_DECILES, 0.99, 0.9999], interquantiles=[(0.1, 0.9), (0.3, 0.7)]` | `quantiles_inv=[ODD_DECILES], interquantiles_inv=[(0.1, 0.9), (0.3, 0.7)]`| Yes | (-1, 0, 1) | - | - | hard=0.767; soft=0.771 |

| Random Forest Params | Time Features Quantiles | Time Features Moments | Sleep Features | Pulse Freq (f_max, A_max) | Shifts | Comments | Training Score | Validation Score |
| :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: |
| - | 0.1, 0.5, 0.9 | - | No | - | 0 | - | 1| 0.67|
| - | 0.1, 0.5, 0.9 | - | Yes | - | 0 | - | 1 | 0.69|
| - | 0.1, 0.5, 0.9 | - | Yes | - | -1, 0, 1 | - | 1 | 0.7|
| - | 0.01, 0.1, 0.5, 0.9, 0.99 | - | Yes | - | -1, 0, 1 | - | 1| 0.7  |
| `min_samples_leaf=10` | 0.01, 0.1, 0.5, 0.9, 0.99 | - | Yes | - | -1, 0, 1 | - | 0.89 | 0.69  |
| `min_samples_leaf=10` | 0.01, 0.1, 0.5, 0.9, 0.99 | 1, 2 | Yes | - | -1, 0, 1 |  - | 0.89 | 0.69  |
| - | 0.01, 0.1, 0.5, 0.9, 0.99 | 1, 2 | Yes | - | -1, 0, 1 |  - | 1 | 0.697  |
| - | 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99 | - | Yes | - | -1, 0, 1 |  - | 1 | 0.708  |
| - | 0.01, DECILES, 0.99 | - | Yes | - | -1, 0, 1 |  - | 1 | 0.709  |
| `min_samples_leaf=10` | 0.01, DECILES, 0.99 | - | Yes | - | -1, 0, 1 |  - | 0.89 | 0.697 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 |  - | 0.89 | 0.699 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 |  - | 1 | 0.713929 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Yes | -1, 0, 1 |  - | 1 | 0.7 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Yes | -1, 0, 1 | - | 0.89 | 0.697 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Pulse Only | -1, 0, 1 | - | 0.89 | 0.7 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | Pulse Only | -1, 0, 1 |  - | 1 | 0.7055 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | - |  1 | 0.7055 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee 0.5 | - | Yes | - | -1, 0, 1 |  - | 1 | 0.708 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee 0.5 | - | Yes | - | -1, 0, 1 |  - | 0.89 | 0.700|
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee MIN, MAX | - | Yes | - | -1, 0, 1 | - | 0.89| 0.7 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX + derivee MIN, MAX | - | Yes | - | -1, 0, 1 | - | 1| 0.703|
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `bandlog rescaled`| 1| 0.665 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = 10%, 90% for logmod`| 1| 0.7165 |
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = 10%, 90% for logmod`| ? | 0.70 < x < 0.71 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES for logmod`| 1 | 0.737|
| `min_samples_leaf=10` | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES for logmod`| 0.898 | 0.721 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES for logmod; eeg_mean only`| 1 | 0.645 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and quantiles = 0.1, 0.5, 0.9 for logmod`| 1 | 0.719 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `interquantiles_inv = (0.1, 0.9), (0.3, 0.7) and quantiles_inv = 0.5 for logmod`| 1 | 0.74.. |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and interquantiles_inv = (0.1, 0.9), (0.3, 0.7) for logmod`| 1 | 0.753 |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and interquantiles_inv = (0.1, 0.9), (0.3, 0.7) for logmod, interquantiles = (0.1, 0.9), (0.3, 0.7) for time features`| 1 | 0.754 (Alex) - 0.742 (Mrml) |
| - | MIN, 0.01, ODD_DECILES, 0.99, MAX | - | Yes | - | -1, 0, 1 | `quantiles_inv = ODD_DECILES and interquantiles_inv = (0.1, 0.9), (0.3, 0.7) for logmod, interquantiles = (0.1, 0.9), (0.3, 0.7) for time features;  n_estimators = 300`| 1 | 0.756 (Alex)|


In [150]:
# Build the windowed test features with the same pipeline and shifts as the training data.
test_ids = get_subject_ids(h5_test)
X_test_raw = make_input_default(h5_test)
X_test_rolled = concat_windows(X_test_raw, test_ids, h5_test, shifts)
# NOTE(review): when the notebook is run top to bottom, `estimator_rf` has not
# been fitted at this point (its fit call in the earlier cell is commented
# out), and a later cell rebinds the name to a forest trained on LDA-projected
# features. Confirm which fitted model is meant to produce the submission.
y_pred = estimator_rf.predict(X_test_rolled)
#submit_to_kaggle(y_pred, h5_test, fname='rf_best_alternate_2021-01-01.csv', msg="")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.6s finished


New submission file at submissions/rf_best_alternate_2021-01-01.csv


In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Scaler is fitted on the training split only and then reused, unchanged, on
# the validation and test matrices.
scaler_ = MinMaxScaler()

X_train_train_rolled_svc = scaler_.fit_transform(X_train_train_rolled) 
X_train_val_rolled_svc = scaler_.transform(X_train_val_rolled)
# Requires `X_test_rolled`, which is created in the test-prediction cell.
X_test_rolled_svc = scaler_.transform(X_test_rolled)


# RBF-kernel SVC on the scaled features; the solver is capped at max_iter=1000.
estimator_svc = SVC(verbose=1, kernel='rbf', C=1, max_iter=1000, random_state=1)
estimator_svc.fit(X_train_train_rolled_svc, y_train_train)

train_score_svc = custom_score(estimator_svc.predict(X_train_train_rolled_svc), y_train_train)
val_score_svc = custom_score(estimator_svc.predict(X_train_val_rolled_svc), y_train_val)

In [125]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is used as a supervised projection; the random forest is then trained on
# the projected features instead of the raw windowed features.
clf = LinearDiscriminantAnalysis()

# NOTE: this rebinds `estimator_rf`; after this cell the name refers to a
# forest that expects LDA-projected inputs.
estimator_rf = RandomForestClassifier(
    random_state=1,
    n_estimators=100
)

clf.fit(X_train_train_rolled, y_train_train)
estimator_rf.fit(clf.transform(X_train_train_rolled), y_train_train)

# train_score_rf = custom_score(estimator_rf.predict(X_train_train_rolled), y_train_train)
# Validation inputs go through the same fitted projection before prediction.
val_score_rf = custom_score(estimator_rf.predict(clf.transform(X_train_val_rolled)), y_train_val)


In [127]:
# Inspect the LDA projection of the validation features (the recorded output has 4 columns).
clf.transform(X_train_val_rolled)

array([[-3.13757384,  2.20432082, -0.70161822,  1.40496954],
       [-4.363443  ,  3.82483367, -0.03437306,  3.44951271],
       [-3.89253356,  4.15378957, -0.03478012,  1.36605611],
       ...,
       [-2.07190603,  1.44258113, -1.40363889,  0.08353262],
       [-3.11006764,  3.8393016 , -1.80742658, -1.15457525],
       [-4.44476345,  5.3927919 , -1.67107598, -0.19829525]])

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over the current `estimator_rf` configuration.
# NOTE(review): recent scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version.
estimator_ab = AdaBoostClassifier(
    base_estimator = estimator_rf,
    # verbose=1,
    random_state=1,
    n_estimators=50, # equals the AdaBoostClassifier default (50)
    #learning_rate=0.5
)

estimator_ab.fit(X_train_train_rolled, y_train_train)

train_score_ab = custom_score(estimator_ab.predict(X_train_train_rolled), y_train_train)
val_score_ab = custom_score(estimator_ab.predict(X_train_val_rolled), y_train_val)

print(train_score_ab)
print(val_score_ab)

In [10]:
# Scratch array used by the cumulative-sum check in the next cell.
a = np.array([1, 2, 3, 4])

In [14]:
# Scratch check: running total of `a`.
np.cumsum(a)

array([ 1,  3,  6, 10])

In [16]:
# Scratch 100x100 frame of random values (no seed is set, so values differ between runs).
z = pd.DataFrame(np.random.rand(100, 100))

In [18]:
# Scratch check: axis=1 sums across the columns, giving one total per row.
z.sum(axis=1)

0     49.739217
1     52.057421
2     46.613845
3     53.308325
4     43.123208
        ...    
95    50.187987
96    53.308523
97    48.514607
98    54.877092
99    49.926575
Length: 100, dtype: float64

### IDEAS
- look for the harmonic in the spectrum (i.e. keep only the lower inverse quantiles)
- (implemented) try entropy
- (implemented) try the Hjorth parameters
- try EMD (Empirical Mode Decomposition)
- (implemented) try MMD (Minimum-Maximum Distance)

In [27]:
# Scratch check of `get_mmd` on the first 3 rows of "alpha_eeg_2" (one value per row in the recorded output).
get_mmd(h5_train["alpha_eeg_2"][:3])

array([[399056.36732905],
       [179230.45407763],
       [ 75138.16948643]])

In [50]:
# Scratch check of `get_hjorth_parameters` on exp(2 * x) of the first 30 rows of "eeg_1".
# NOTE(review): the recorded output is all-NaN and comes with numpy warnings —
# presumably exp() overflows on this input; confirm before using this transform.
get_hjorth_parameters(np.exp(2*h5_train["eeg_1"][:30]))

  """Entry point for launching an IPython kernel.
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  x = asanyarray(arr - arrmean)
  a = op(a[slice1], a[slice2])
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)
  a = op(a[slice1], a[slice2])


array([[nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan],
       [nan, nan, nan]])

In [7]:
# Number of NaNs per feature column of the training split, in ascending order.
X_train_train.isna().sum(axis=0).sort_values()

>>alpha_eeg_1_logE>>  moment_1                 0
>>eeg_4>>             qt_0.01                  0
                      qt_0.1                   0
                      qt_0.3                   0
                      qt_0.5                   0
                                           ...  
energy>>eeg_1>>       Hjorth_complexity    22694
energy>>eeg_6>>       Hjorth_mobility      22697
energy>>eeg_5>>       Hjorth_mobility      22698
                      Hjorth_complexity    22698
energy>>eeg_6>>       Hjorth_complexity    22698
Length: 170, dtype: int64