In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os
import importlib

from kaggle_submit import submit_to_kaggle
from helpers import *
from utils.globals import *
from utils.distribution_statistics import *

train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

h5_train = h5py.File(train_file, mode='a')
h5_test = h5py.File(test_file, mode='a')

y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0, squeeze=True)
y_train_arr = y_train.to_numpy()

# MAKE CUSTOM FEATURES
from additional_features.make_features import make_all_features
make_all_features(h5_train, h5_test, n_chunks=10, verbose=True, overwrite=False)


from objects import *


In [2]:
features_df = pd.DataFrame(list(set(h5_train.keys()) - set(IRRELEVANT_FEATURES)), columns=["Feature"])
features_df.loc[:, "Dim"] = features_df['Feature'].apply(lambda x: h5_train[x][0].shape[0])
features_df.sort_values(by=["Dim", "Feature"])

import re

BAND_LOG_ENERGY_FEATURES = [feat for feat in FEATURES if re.search("(?:(?:alpha)|(?:beta)|(?:delta)|(?:theta)){1}.*_logE", feat)]

SLEEP_FEATURES = [feat for feat in FEATURES if re.search('sleep.*[^(?:logmod)]', feat)]

LOGMOD_FEATURES = [feat for feat in FEATURES if re.search('.*_logmod', feat)]

TIME_FEATURES

OTHER_FEATURES = ["pulse_max_freq", "pulse_max_logE"]

_features = sum([BAND_LOG_ENERGY_FEATURES, SLEEP_FEATURES, LOGMOD_FEATURES, TIME_FEATURES, OTHER_FEATURES], [])
assert sorted(_features) == sorted(FEATURES)



In [3]:

    
def make_input_default(h5_file):
    df_bandlog = make_input_new(
        h5_file,
        features=BAND_LOG_ENERGY_FEATURES,
        rescale_by_subject=False,
        moments=[1],
    )
    #pre_op=np.exp,
    #    pre_op_name="energy"
    #)
    
    df_sleep = make_input_new(
        h5_file,
        features=SLEEP_FEATURES,
        rescale_by_subject=False,
        moments=[1]
    )
    
    ## LOGMOD RENAME COLUMNS
    
    df_logmod_no_rescale = make_input_new(
        h5_file,
        features=LOGMOD_FEATURES,
        rescale_by_subject=False,
        moments=[1],
        quantiles_inv=[0.25, 0.5, 0.75],
        diff_orders=[0],
        pre_op=lambda x: np.exp(2 * x),
        pre_op_name="energy"
    )
    
   # cols_no_rescale = [(col[0] + "_no_rescale", *col[1:]) for col in df_logmod_no_rescale.columns]
   # df_logmod_no_rescale.columns = pd.MultiIndex.from_tuples(cols_no_rescale)
    
   # df_logmod_with_rescale = make_input_bis(
   #     h5_file,
   #     features=LOGMOD_FEATURES,
   #     rescale=True,
   #     moments=[1, 2],
   #     quantiles=[0.05, 0.95],
   #     quantiles_inv=[0.1, 0.3, 0.5, 0.7, 0.9],
   #     diff_orders=[0, 1],
   #     pre_op=lambda x: np.exp(2 * x),
   #     pre_op_name="energy"
   # )
    
   # cols_with_rescale = [(col[0] + "_with_rescale", *col[1:]) for col in df_logmod_with_rescale.columns]
   # df_logmod_with_rescale.columns = pd.MultiIndex.from_tuples(cols_with_rescale)
    
    ## END LOGMOD RENAME COLUMNS
    
    
    df_time = make_input_new(
        h5_file,
        features=TIME_FEATURES,
        rescale_by_subject=False,
        # moments=[1, 2, 3, 4],
        quantiles=[0.1, 0.5, 0.9],
        diff_orders=[0]
    )
    
    df_pulse_max_freq = make_input_new(
        h5_file,
        features=["pulse_max_freq"],
        rescale_by_subject=False,
        moments=[1],
    )
    
    df_pulse_max_logE = make_input_new(
        h5_file,
        features=["pulse_max_logE"],
        rescale_by_subject=False,
        moments=[1, 2],
        pre_op=np.exp,
        pre_op_name="energy"
    )
    
    
    return pd.concat([
        df_bandlog,
        #df_sleep,
        #df_logmod_no_rescale, 
        #df_logmod_with_rescale, 
        df_time,
        #df_pulse_max_freq,
        #df_pulse_max_logE,
        ],
        axis=1)

def shift_and_fill(df, shift):
    shifted_df = df.shift(shift)
    if shift > 0:
        shifted_df.bfill(inplace=True)
    elif shift < 0:
        shifted_df.ffill(inplace=True)
    return shifted_df


def roll_and_concat(df, shifts_range):
    return pd.concat(map(lambda shift: shift_and_fill(df, shift), shifts_range), 
                     axis=1, keys=shifts_range)    
    
def concat_windows(h5_file, df, shifts):
    df = df.groupby(h5_file["index"][:], as_index=False).apply(roll_and_concat, shifts_range=shifts)
    return df
    
def make_input_default_test(h5_file):
    return make_input_new(h5_file, ["eeg_1", "eeg_2"], moments=[1])

def make_input_default_rolling(h5_file, shifts):
    """
    !!! not suited for pca because columns have 3 levels
    """
    df = make_input_default(h5_file)
    df_with_window = concat_windows(h5_file, df, shifts)
    return df_with_window


In [4]:
X_train_raw, X_test_raw = make_input_default(h5_train), make_input_default(h5_test)

Feature #1/1[1K1K

In [5]:
train_ids = get_subject_ids(h5_train)
train_train_ids, train_val_ids = train_ids[:28], train_ids[28:]
test_ids = get_subject_ids(h5_test)

X_train_train = X_train_raw.loc[subjects_ids_to_indexers(h5_train, train_train_ids, as_indices=True), :]
y_train_train = y_train_arr[subjects_ids_to_indexers(h5_train, train_train_ids, as_indices=True)]

X_train_val = X_train_raw.loc[subjects_ids_to_indexers(h5_train, train_val_ids, as_indices=True), :]
y_train_val = y_train_arr[subjects_ids_to_indexers(h5_train, train_val_ids, as_indices=True)]

"""
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

scaler_ = MinMaxScaler()
pca_ = PCA(0.99)

X_train_train = pca_.fit_transform(scaler_.fit_transform(X_train_train))
X_train_val = pca_.transform(scaler_.transform(X_train_val))
X_test = pca_.transform(scaler_.transform(X_test_raw))

def subjects_ids_col(h5_file):
    return h5_file["index"][:]

def concat_windows(arr, subjects_ids, h5_file, shifts): # subjects_ids must be sorted
    sid_col = subjects_ids_col(h5_file)
    sid_col = sid_col[np.isin(sid_col, subjects_ids)]
    df = pd.DataFrame(arr)
    
    return df.groupby(sid_col).apply(roll_and_concat, shifts_range=shifts)
    
shifts = [-1, 0, 1]
X_train_train_rolled = concat_windows(X_train_train, train_train_ids, h5_train, shifts)
X_train_val_rolled = concat_windows(X_train_val, train_val_ids, h5_train, shifts)
X_test_rolled = concat_windows(X_test, test_ids, h5_test, shifts)
"""
_ = None

In [None]:
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier(verbose=1, min_samples_leaf=10)
estimator.fit(X_train_train, y_train_train)

train_score = custom_score(estimator.predict(X_train_train), y_train_train)
val_score = custom_score(estimator.predict(X_train_val), y_train_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
print(train_score)
print(val_score)

In [10]:
from sklearn.svm import SVC

estimator_svc = SVC(verbose=1, kernel='rbf', C=1)
estimator_svc.fit(X_train_train, y_train_train)

train_score_svc = custom_score(estimator_svc.predict(X_train_train), y_train_train)
val_score_svc = custom_score(estimator_svc.predict(X_train_val), y_train_val)

[LibSVM]

In [11]:
print(train_score_svc)
print(val_score_svc)

0.2892077938342528
0.24134018697794937
