In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
import os


PICTURES_FOLDER = "pictures"
os.makedirs(PICTURES_FOLDER, exist_ok=True)

SLEEP_STAGES_COLORS = {
    0: "blue",
    1: "green",
    2: "red",
    3: "black",
    4: "orange",
}

In [48]:
train_file = "kaggle_data/X_train.h5/X_train.h5"
test_file = "kaggle_data/X_test.h5/X_test.h5"

h5_train = h5py.File(train_file, mode='a')
h5_test = h5py.File(test_file, mode='a')

y_train = pd.read_csv("kaggle_data/y_train.csv", index_col=0, squeeze=True)

In [143]:
IRRELEVANT_FEATURES = ['index', 'index_absolute', 'index_window',
                       'x', 'y', 'z',
                       'speed_x', 'speed_y', 'speed_z']

def update_globals():
    features = [feat for feat in h5_train.keys() if feat not in IRRELEVANT_FEATURES]
    frequencies = {feat: h5_train[feat][0].size // 30 for feat in features}
    frequencies = {feat: freq if int(freq) in (10, 50) else 0 
                   for feat, freq in frequencies.items()}
    return features, frequencies
    
FEATURES, FREQUENCIES = update_globals()
print("FEATURES =", FEATURES)
print("FREQUENCIES =", FREQUENCIES)

FEATURES = ['accel_norm', 'eeg_1', 'eeg_2', 'eeg_3', 'eeg_4', 'eeg_5', 'eeg_6', 'eeg_7', 'pulse', 'speed_norm']
FREQUENCIES = {'accel_norm': 10, 'eeg_1': 50, 'eeg_2': 50, 'eeg_3': 50, 'eeg_4': 50, 'eeg_5': 50, 'eeg_6': 50, 'eeg_7': 50, 'pulse': 10, 'speed_norm': 10}


In [90]:
# HELPERS

def print_bis(txt):
    print(txt, end='\x1b[1K\r')
    
def print_ter(txt):
    print(f"\n{txt}")

    
def make_timeline(freq):
    """
    ARGS:
        freq (int): frequency in Hertz
    
    RETURNS:
        (pd.timedelta_range) : timestamps for a signal sampled at <freq> Hz for 30 seconds
    """
    return pd.timedelta_range(start='0s', end='30s', periods=freq*30)


def make_full_timeline(windows, freq):
    # test there is no missing data
    deltas = np.unique(np.diff(windows))
    assert (len(deltas) == 1) and (int(deltas[0]) == 1)
    return pd.timedelta_range(start='0s',
                              end=pd.to_timedelta('30s') * (windows[-1] + 1),
                              periods=freq * 30 * (windows[-1] + 1))

def get_subject_ids(h5_file):
    return np.unique(h5_file["index"][:])

    
def get_subject_boundaries(h5_file, subject_id, ready_to_use=True):
    """
    Helper function to select data relating to a given subject (on numpy arrays)
    
    ARGS:
        h5_file (h5py.File)
        subject_id (int)
        ready_to_use (bool, default=True): return a slice or a tuple
        
    RETURNS:
        subject_boundaries : (slice) (index_start, index_end+1) if <ready_to_use>
                             (tuple) (index_start, index_end) if not <ready_to_use>
                        
    """
    sids = h5_file['index'][:]
    start = np.argmax(sids == subject_id)
    end = len(sids) - 1 - np.argmax(sids[::-1] == subject_id)
    
    indexers = h5_file['index_absolute'][:]
    start = indexers[start]
    end = indexers[end]
    if ready_to_use:
        return slice(start, end + 1) # for numpy arrays
    return (start, end)


def get_subject_feature_signals(h5_file, subject_id, feature, as_timeseries=False):
    """
    Get the full timeseries for a given (subject_id, feature) pair.
    
    ARGS:
        h5_file (h5py.File)
        subject_id (int)
        feature (str)
        
    RETURNS:
        timeseries : (pd.Series if <as_timeseries>) represents the <feature> timeseries of the subject 
                     (list[np.array[?]] if not <as_timeseries>) list of <feature> signals from the subject
    """
    # Fetch subject boundaries
    boundaries = get_subject_boundaries(h5_file, subject_id)
    # Retrieve samples
    feature_timeseries = h5_file[feature][boundaries]
    if not as_timeseries:
        return feature_timeseries
    feature_timeseries = np.concatenate(feature_timeseries, axis=0)
    # Build timeline
    feature_frequency = FREQUENCIES[feature]
    windows = h5_file['index_window'][boundaries]
    timeline = make_full_timeline(windows, feature_frequency)
    return pd.Series(data=feature_timeseries, index=timeline)


def get_subject_sleep_stage(subject_id):
    start, end = get_subject_boundaries(h5_train, subject_id, ready_to_use=False)
    return y_train.loc[start:end] # because loc includes <end> (different behaviour than numpy arrays)
    



['accel_norm', 'eeg_1', 'eeg_2', 'eeg_3', 'eeg_4', 'eeg_5', 'eeg_6', 'eeg_7', 'pulse', 'speed_norm']


In [51]:
# Example
get_subject_feature_signals(h5_train, 1, "eeg_1", as_timeseries=False)

array([[-9.0287616e+02, -2.4733213e+04, -2.8913711e+04, ...,
         1.2473976e+01,  5.6019249e+00,  3.6001048e+00],
       [ 2.1024307e+01,  2.4208887e+01,  1.0953083e+01, ...,
        -3.6253862e+00,  2.5444579e+00,  1.5417756e+01],
       [ 1.3676975e+01,  1.2750147e+01, -4.2638946e+00, ...,
        -6.4895964e+00, -2.8554073e-01, -4.8893394e+01],
       ...,
       [-1.8695693e-01,  6.5529265e+00,  4.1397521e-01, ...,
        -5.0123764e+01, -3.3739128e+01, -5.1105301e+01],
       [-3.8710526e+01,  9.1726965e-01,  4.4646332e+01, ...,
        -1.9432636e+01, -1.1449772e+01, -3.6908641e+00],
       [ 2.2870004e+00, -6.5207219e+00,  4.5359030e+00, ...,
         9.2413536e+01, -3.1127655e+01, -1.7876479e+02]], dtype=float32)

In [52]:
# Example
get_subject_feature_signals(h5_train, 1, "eeg_1", as_timeseries=True)

0 days 00:00:00               -902.876160
0 days 00:00:00.020000015   -24733.212891
0 days 00:00:00.040000031   -28913.710938
0 days 00:00:00.060000047   -25399.294922
0 days 00:00:00.080000062   -23260.080078
                                 ...     
0 days 07:04:59.919999937      208.143906
0 days 07:04:59.939999952      142.460953
0 days 07:04:59.959999968       92.413536
0 days 07:04:59.979999984      -31.127655
0 days 07:05:00               -178.764786
Length: 1275000, dtype: float32

In [110]:
def _create_speed_and_acceleration(h5_file, overwrite=False, verbose=True):
    """
    a[t] = (v[t] - v[t-1]) / dt 
    ===> v[t] = sum_{s=0}^{t} a[s] (+ v[-1] = 0)
    """
    freq = 10
    dt = 1 / freq
    
    # Create datasets if required
    if "accel_norm" in h5_file.keys() and not overwrite:
        return None
    shape, dtype = h5_file["x"].shape, h5_file["x"].dtype
    for name in ["accel_norm", "speed_x", "speed_y", "speed_z", "speed_norm"]:
        try:
            h5_file.create_dataset(name, shape=shape, dtype=dtype)
        except:
            pass
    
    # Initiate subject id
    sid = -1
    for ix in range(shape[0]):
        if sid != h5_file["index"][ix]:
            sid = h5_file["index"][ix]
            speed = np.array([[0, 0, 0]])
            if verbose:
                print_bis(f"SUBJECT #{sid}")
        # acceleration
        accel = np.stack([h5_file[feat][ix] for feat in ("x", "y", "z")], axis=-1)
        h5_file["accel_norm"][ix] = np.linalg.norm(accel, ord=2, axis=1)
        # speed
        speed = speed + np.cumsum(accel, axis=0) * dt
        h5_file["speed_x"][ix] = speed[:, 0]
        h5_file["speed_y"][ix] = speed[:, 1]
        h5_file["speed_z"][ix] = speed[:, 2]
        h5_file["speed_norm"][ix] = np.linalg.norm(speed, ord=2, axis=1)
        # speed for next iteration
        speed = speed[[-1], :]
    return None
        

# Create speed and acceleration
_create_speed_and_acceleration(h5_train, overwrite=False, verbose=True)
_create_speed_and_acceleration(h5_test, overwrite=False, verbose=True)
FEATURES, FREQUENCIES = update_globals()
print(FEATURES)

['accel_norm', 'eeg_1', 'eeg_2', 'eeg_3', 'eeg_4', 'eeg_5', 'eeg_6', 'eeg_7', 'pulse', 'speed_norm']


| Name     | Frequency (Hz)|
| ---------| -----------|
| $\delta$ | 0-4 |
| $\theta$ | 4-8 |
| $\alpha$ | 8-13 |
| $\beta$  | 13-22 |
| $\gamma$ | 30-. |


In [271]:
# SIGNAL PROCESSING

# max frequency in our fourier transform: 25 Hz so no gamma
BANDS_FRONTIERS = [-1, 4, 8, 13, 22]
BANDS_LABELS = ['delta', 'theta', 'alpha', 'beta']

from scipy.fft import fft

"""
def get_spectrum(seq, fs):
    ft_modulus = np.abs(fft(seq))
    # The signal is real so the spectrum is symmetric 
    if len(seq) % 2 == 0:
        ft_modulus = ft_modulus[:len(seq) // 2]
    else:
        ft_modulus = ft_modulus[:len(seq) // 2 + 1]
    freqs = np.arange(0, len(ft_modulus)) * fs / len(seq) # frequencies of the spectrum
    return pd.Series(data=ft_modulus, index=freqs)


def get_energy_by_band(seq, fs):
    spectrum = get_spectrum(seq, fs)
    bands = pd.cut(spectrum.index,
                   bins=BANDS_FRONTIERS,
                   labels=BANDS_LABELS
                  )
    energy = spectrum.pow(2).groupby(bands).sum() # energy proportional to this
    # energy.clip(1e-10, None)
    return energy
"""

def get_spectrum_energy_chunk(sequences, sampling_freq):
    fourier_transform = fft(sequences, axis=1)
    energy = np.power(fourier_transform.real, 2) + np.power(fourier_transform.imag, 2) # proportional to energy
    energy = energy[:, :int(np.ceil(sequences.shape[1] / 2))] # Shannon
    frequencies = np.arange(0, energy.shape[1]) * sampling_freq / sequences.shape[1]
    bands = pd.cut(frequencies, bins=BANDS_FRONTIERS, labels=BANDS_LABELS)
    energy_by_band = pd.DataFrame(data=energy, columns=bands)
    energy_by_band = energy_by_band.groupby(energy_by_band.columns, axis=1).sum()
    return energy_by_band
    
    
def chunks_iterator(N, size): # with np.array convention 
    chunk_size = int(np.ceil(size / N))
    if chunk_size == 0:
        yield 0, size
    else:
        i = 0
        while i <= size:
            yield i, i + chunk_size
            i += chunk_size
    
    

def _create_log_energy(h5_file, n_chunks=10, overwrite=False, verbose=True):
    if (not overwrite) and ('alpha_eeg_1_logE' in h5_file.keys()):
        return None
    
    eegs = list(filter(lambda x: x.startswith('eeg'), FEATURES))
    shape = (h5_file["eeg_1"].shape[0], 1)
    dtype = h5_file["eeg_1"].dtype
    
    for band_name in BANDS_LABELS:
        for eeg in eegs:
            try:
                h5_file.create_dataset(f"{band_name}_{eeg}_logE", shape=shape, dtype=dtype)
            except:
                pass
    
    for chunk_num, (chunk_start, chunk_end) in enumerate(chunks_iterator(n_chunks, shape[0])):
        if verbose:
            print_bis(f"{chunk_num+1}/{n_chunks}")
        for eeg in eegs:
            energy = get_spectrum_energy_chunk(h5_file[eeg][chunk_start:chunk_end], FREQUENCIES[eeg])
            log_energy = np.log(energy)
            for band_name in BANDS_LABELS:
                h5_file[f"{band_name}_{eeg}_logE"][chunk_start:chunk_end] = log_energy[[band_name]].values
    return None
    
        

#def get_spectrum_maxima(seq, fs, thresh=0.1):
#    spectrum = get_spectrum(seq, fs)
#    delta_left = np.diff(spectrum, prepend=spectrum[0] - 1) > 0 # ascending
#    delta_right = np.diff(spectrum[::-1], prepend=spectrum[-1] - 1)[::-1] > 0 # descending
#    ix_keep = np.logical_and(delta_left, delta_right) # local maximum
#    spectrum_util = spectrum.loc[ix_keep]
#    spectrum_util = spectrum_util.loc[spectrum_util > spectrum_util.max() * thresh]
#    return spectrum_util

_create_log_energy(h5_train, n_chunks=100, overwrite=False, verbose=True)
_create_log_energy(h5_test, n_chunks=100, overwrite=False, verbose=True)
FEATURES, FREQUENCIES = update_globals()
print("FEATURES =", FEATURES)
print("FREQUENCIES =", FREQUENCIES)

FEATURES = ['accel_norm', 'alpha_eeg_1_logE', 'alpha_eeg_2_logE', 'alpha_eeg_3_logE', 'alpha_eeg_4_logE', 'alpha_eeg_5_logE', 'alpha_eeg_6_logE', 'alpha_eeg_7_logE', 'beta_eeg_1_logE', 'beta_eeg_2_logE', 'beta_eeg_3_logE', 'beta_eeg_4_logE', 'beta_eeg_5_logE', 'beta_eeg_6_logE', 'beta_eeg_7_logE', 'delta_eeg_1_logE', 'delta_eeg_2_logE', 'delta_eeg_3_logE', 'delta_eeg_4_logE', 'delta_eeg_5_logE', 'delta_eeg_6_logE', 'delta_eeg_7_logE', 'eeg_1', 'eeg_2', 'eeg_3', 'eeg_4', 'eeg_5', 'eeg_6', 'eeg_7', 'pulse', 'speed_norm', 'theta_eeg_1_logE', 'theta_eeg_2_logE', 'theta_eeg_3_logE', 'theta_eeg_4_logE', 'theta_eeg_5_logE', 'theta_eeg_6_logE', 'theta_eeg_7_logE']
FREQUENCIES = {'accel_norm': 10, 'alpha_eeg_1_logE': 0, 'alpha_eeg_2_logE': 0, 'alpha_eeg_3_logE': 0, 'alpha_eeg_4_logE': 0, 'alpha_eeg_5_logE': 0, 'alpha_eeg_6_logE': 0, 'alpha_eeg_7_logE': 0, 'beta_eeg_1_logE': 0, 'beta_eeg_2_logE': 0, 'beta_eeg_3_logE': 0, 'beta_eeg_4_logE': 0, 'beta_eeg_5_logE': 0, 'beta_eeg_6_logE': 0, 'beta_eeg

In [273]:
"""
def plot_subject_quantiles(subject_id, q_inf=0.025, q_sup=0.975, n_quantiles=20):
    sleep_states = get_subject_sleep_state(subject_id)
    qts = np.linspace(q_inf, q_sup, n_quantiles).round(3)
    for feature in FEATURES:
        signal = get_subject_feature_signals(h5_train, subject_id, feature, as_timeseries=False)
        size = signal[0].size
        signal_by_state = pd.Series(data=np.concatenate(signal, axis=0),
                                    index=np.repeat(sleep_states.values, size))
        qt_df = signal_by_state.groupby(signal_by_state.index).quantile(qts)
        qt_df.unstack(0).plot()
        # sns.heatmap(qt_df.unstack(0))
        plt.title(feature)
        plt.show()
    return None
"""
_=""

## VISUALIZATION

In [274]:
def robust_rescale(df):
    """
    X_rescaled = (X - MED(X)) / MED(|X - MED(X)|)
    """
    med = df.median()
    med_spread = (df - df.median()).abs().median()
    # df_rescaled = (df - med) / med_spread
    return (df - med) / med_spread

def min_max_rescale(df):
    min_ = df.min()
    max_ = df.max()
    return (df - min_) / (max_ - min_)
    
def z_rescale(df): 
    mean = df.mean()
    std = df.std()
    return (df - mean) / std

def get_fig_subjects():
    fig, axes = plt.subplots(10, 3, figsize=(10, 40))
    return fig, np.ravel(axes)

def title_with_subject_id(ax, subject_id):
    ax.set_title(f'SUBJECT #{subject_id}')
    return None

def save_feature_quantiles(feature,
                           inf_qt=0.025,
                           sup_qt=0.975,
                           n_quantiles=21,
                           robust_rescaling=False,
                           overwrite=False,
                           verbose=True):
    """
    See pictures/quantile_plots
    
    Can be improved (make robust and not robust qplots simultaneously)
    """
    # Make directory if it does not exist
    qplot_dir = os.path.join(PICTURES_FOLDER, f"quantile_plots")
    os.makedirs(qplot_dir, exist_ok=True)
    # Escape if not overwrite and already done
    qplot_fname = os.path.join(qplot_dir, f'{feature}{"--rescaled" if robust_rescaling else ""}.png')
    if (not overwrite) and os.path.exists(qplot_fname):
        return None
    # Otherwise,
    subject_ids = get_subject_ids(h5_train)
    quantiles = np.linspace(inf_qt, sup_qt, n_quantiles).round(3)
    subjects_quantiles = dict()
    for cnt, sid in enumerate(subject_ids):
        if verbose:
            print_bis(f"FEATURE #{FEATURES.index(feature)} SUBJECT {cnt+1}/{len(subject_ids)} (RESCALE = {str(robust_rescaling)})")
        # Robust representation of the signal
        signal = get_subject_feature_signals(h5_train, sid, feature, as_timeseries=False)
        size = signal[0].size
        signal = pd.Series(np.concatenate(signal))
        if robust_rescaling:
            signal = robust_rescale(signal)
        # Behaviour by sleep stage
        sleep_stages = get_subject_sleep_stage(sid).values
        signal_by_stage = signal.groupby(np.repeat(sleep_stages, size))
        subjects_quantiles[sid] = signal_by_stage.quantile(quantiles).unstack(0)
        
    fig, axes = get_fig_subjects()
    for ax, sid in zip(axes, subject_ids):
        subjects_quantiles[sid].plot(ax=ax, color=SLEEP_STAGES_COLORS)
        title_with_subject_id(ax, sid)
    plt.savefig(qplot_fname)
    plt.close(fig)
    return subjects_quantiles


# TO WRITE QUANTILE PLOTS IN pictures/quantile_plots
for i, feat in enumerate(FEATURES):
    # print_ter(f"========= FEATURE {i+1}/{len(FEATURES)} =========")
    save_feature_quantiles(feat, robust_rescaling=False, overwrite=False, verbose=True)
    save_feature_quantiles(feat, robust_rescaling=True, overwrite=False, verbose=True)


FEATURE #37 SUBJECT 31/31 (RESCALE = True)[1KK

In [275]:
'''
def do_nothing(x):
    return x

def aggregate_stat(stat_func, char, dataset, y_vals, chunksize=1000):
    """stat_func must have axis kwarg and take 2d arrays as arg"""
    chunks_ix = np.array_split(y_vals.index, len(y_vals) / chunksize)
    final = pd.Series([list() for _ in np.unique(y_vals)], index=np.unique(y_vals))
    for cnt, ix in enumerate(chunks_ix): 
        # print(cnt, '/', len(chunks_ix))
        tmp = pd.Series(stat_func(dataset[char][ix.tolist()], axis=1), index=y_vals.loc[ix])
        tmp = tmp.groupby(tmp.index).agg(list)
        final = final + tmp.reindex(final.index, fill_value=list())
    return final

def custom_group_mean(char, dataset, y_vals, chunksize=1000):
    chunks_ix = np.array_split(y_vals.index, len(y_vals) / chunksize)
    for cnt, ix in enumerate(chunks_ix): 
        # print(cnt, '/', len(chunks_ix))
        mean_tmp = pd.Series(np.mean(dataset[char][ix.tolist()], axis=1), index=y_vals.loc[ix])
        mean_tmp = mean_tmp.groupby(mean_tmp.index).agg(['mean', 'size'])
        if cnt == 0:
            mean_df = mean_tmp
            continue
        mean_df.loc[:, 'mean'] = (mean_df.prod(axis=1) + mean_tmp.prod(axis=1)) / (mean_df['size'] + mean_tmp['size'])
        mean_df.loc[:, 'size'] = mean_df['size'] + mean_df['size']
    return mean_df['mean']

# custom_group_mean('eeg_1', h5_train, y_train)
averages = pd.concat(map(lambda x: custom_group_mean(x, h5_train, y_train, 1000), FEATURES), axis=1, keys=FEATURES)
'''
_=""

In [108]:
def get_proba_transition(subject_id=None):
    if subject_id:
        start, end = get_subject_boundaries(h5_train, subject_id, ready_to_use=False)
        y = y_train.loc[start:end]
    else: # all subjects
        y = y_train.loc[:]
    transition_df = pd.DataFrame(data={"stage": y, "stage_after": y.shift(-1)})
    transition_df = transition_df.iloc[:-1] # NaN
    transition_df = transition_df.astype(int)
    counts = transition_df.groupby(["stage", "stage_after"]).size()
    counts = counts.unstack(1, fill_value=0)
    probas = counts.div(counts.sum(axis=1), axis=0)
    probas = probas.reindex(range(0, 5), axis=0, fill_value=0)
    probas = probas.reindex(range(0, 5), axis=1, fill_value=0)
    return probas

transition_plots_dir = os.path.join(PICTURES_FOLDER, "transition_plots")
os.makedirs(transition_plots_dir, exist_ok=True)

def save_transition_plots_by_subject(overwrite=False, verbose=True):
    fpath = os.path.join(transition_plots_dir, "transition_matrix_by_subject.png")
    if (not overwrite) and os.path.exists(fpath):
        return None
    subject_ids = get_subject_ids(h5_train)
    fig, axes = get_fig_subjects()
    for ax, sid in zip(axes, subject_ids):
        if verbose:
            print_bis(f"SUBJECT #{sid}")
        probas = get_proba_transition(subject_id=sid)
        sns.heatmap(probas, ax=ax, vmin=0, vmax=1, annot=True)
        title_with_subject_id(ax, sid)
    fig.tight_layout()
    fig.savefig(fpath)
    plt.close(fig)    
    return None

def save_transition_plot_global(overwrite=False):
    fpath = os.path.join(transition_plots_dir, "transition_matrix_global.png")
    if (not overwrite) and os.path.exists(fpath):
        return None
    proba_global = get_proba_transition()
    fig, ax = plt.subplots()
    sns.heatmap(proba_global, ax=ax, vmin=0, vmax=1, annot=True)
    fig.savefig(fpath)
    plt.close(fig)
    return None
    
save_transition_plots_by_subject(overwrite=False)
save_transition_plot_global(overwrite=False)

## MODELS

In [291]:
QUANTILES = [0.1, 0.5, 0.9]

def make_input_feature(h5_file, feature, n_chunks=100):
    print_bis(f"Feature #{FEATURES.index(feature)}")
    if FREQUENCIES[feature] == 0:
        return h5_file[feature][:]
    feature_array = np.empty(shape=(h5_file[feature].shape[0], len(QUANTILES)))
    for i, j in chunks_iterator(n_chunks, h5_file[feature].shape[0]):
        feature_array[i:j, :] = np.quantile(h5_file[feature][i:j], QUANTILES, axis=1).T
    return feature_array

def make_input(h5_file):
    return np.concatenate([make_input_feature(h5_file, feat) for feat in FEATURES], axis=1)
    


Feature #37[1K

In [292]:
X_train = make_input(h5_train)
y_train_ = y_train.values

X_test = make_input(h5_test)

Feature #37[1K

In [293]:
from sklearn.ensemble import RandomForestClassifier

basic_rf = RandomForestClassifier()
basic_rf.fit(X_train, y_train_)
y_test = basic_rf.predict(X_test)


RandomForestClassifier()

In [336]:
import re

SUBMISSION_FOLDER = "kaggle_data/submissions"
os.makedirs(SUBMISSION_FOLDER, exist_ok=True)

def serialize_for_submission(y, save=True):
    submission = pd.Series(data=y, index=h5_test["index_absolute"][:], name="sleep_stage")
    if not save:
        return submission
    submissions = os.listdir(SUBMISSION_FOLDER)
    if len(submissions) == 0:
        fname = os.path.join(SUBMISSION_FOLDER, "submission_1.csv")
    else:
        last = sorted(submissions)[-1]
        last_num = re.search("(\d+)\.csv", last).groups()[0]
        fname = os.path.join(SUBMISSION_FOLDER, f"submission_{int(last_num)+1}.csv")
    submission.to_csv(fname, index_label='index')
    
z = serialize_for_submission(y_test)


submission_4.csv
4


In [327]:
re.search("(\d+)\.csv", "submission_10.csv").group()

'10.csv'

In [61]:
h5_train.close()
h5_test.close()