In [1]:
import pandas as pd
import matplotlib as mpl
import seaborn as sb
import re
from os.path import join, dirname
from scipy.io import loadmat
from os import listdir
from os.path import isfile, join

%matplotlib inline

In [2]:
def convert_index_to_timedelta(index, sampling_rate=400):
    """Turn sample numbers into a pandas timedelta index.

    Each entry of ``index`` is divided by ``sampling_rate`` to obtain an
    offset in seconds, and the offsets are handed to ``pd.to_timedelta``
    with a unit of seconds.
    """
    seconds = []
    for sample_number in index:
        # multiply by 1.0 first so the division is always done in floats
        seconds.append(sample_number * 1.0 / sampling_rate)
    return pd.to_timedelta(seconds, 's')
    

def load_data(path,convert_index=True):
    """Load one iEEG recording from a MATLAB ``.mat`` file into pandas.

    Parameters:
        path(str): path to the .mat file; it must contain a ``dataStruct``
            struct with ``data``, ``channelIndices`` and
            ``iEEGsamplingRate`` fields
        convert_index(bool): if True, replace the integer sample index with
            a timedelta index derived from the sampling rate
    Returns:
        (data,sampling_rate,sequence):(pd.DataFrame, int, int)
        ``sequence`` is -1 when the file has no ``sequence`` field.
    """
    # load the matlab file and extract the data
    data = loadmat(path)['dataStruct']

    def scalar_field(name):
        # Struct fields are reached with [0][0], exactly like the trace and
        # channel fields below. .item() then pulls out the single value
        # explicitly instead of relying on int() to collapse a
        # multi-dimensional array, which NumPy has deprecated.
        return int(data[name][0][0].item())

    # get the sampling rate and cast to int
    sampling_rate = scalar_field('iEEGsamplingRate')

    # extract the iEEG traces and electrode channel data and place into a data frame
    traces = data['data'][0][0]
    channels = data['channelIndices'][0][0][0]
    df = pd.DataFrame(traces, columns = channels)

    if convert_index:
        df.index = convert_index_to_timedelta(df.index, sampling_rate)

    # get the sequence collection number if present (not present in test)
    sequence = -1
    if 'sequence' in data.dtype.names:
        sequence = scalar_field('sequence')

    return df, sampling_rate, sequence

In [3]:
import dask.multiprocessing
from dask import compute, delayed

def process_folder(train_path, function_name):
    """Run ``function_name`` on every file of every folder, in parallel.

    Parameters:
        train_path(list of str): folders to scan; entries inside a folder
            that are not regular files are skipped
        function_name(callable): called once per file with a single string
            argument, the folder and the file name joined by a backslash
    Returns:
        list: one tuple per folder, holding the values returned by
        ``function_name`` for the files of that folder
    """
    results = []
    print(train_path)
    for patient_path in train_path:
        file_names = [f for f in listdir(patient_path)
                      if isfile(join(patient_path, f))]
        # The backslash separator is kept on purpose: the worker functions
        # receive this string and take it apart again on '\\'.
        values = [delayed(function_name)('\\'.join([patient_path, f]))
                  for f in file_names]
        # Run the delayed calls on a pool of worker processes so that all
        # CPU cores are used. The scheduler is selected with `scheduler=`;
        # the older `get=` keyword is no longer accepted by dask.
        results.append(compute(*values, scheduler='processes'))
    return results

def calculate_median_variance(input_path_file):
    """Summarise every channel of one recording as a single-row DataFrame.

    Parameters:
        input_path_file(str): folder and file name of a recording, separated
            by '\\' or '/'. The file name is split on '_' and '.', and its
            first three pieces are stored as ``patient``, ``dataset_id`` and
            ``pre_ictal`` respectively.
    Returns:
        pd.DataFrame: one row with the columns ``<n>_mean``, ``<n>_std``,
        ``<n>_med`` and ``<n>_mad`` for every channel ``n`` (numbered from
        0), followed by ``dataset_id``, ``pre_ictal`` and ``patient``.
        An empty DataFrame is returned when the file cannot be processed;
        the reason is printed instead of being discarded.
    """
    # Only the last path component is the file name; accept either separator
    # so the folder part may itself contain any number of components.
    path, _sep, f = input_path_file.replace('\\', '/').rpartition('/')
    print('processing', path, f)
    try:
        df, _sampling_rate, _sequence = load_data(join(path, f))
        # Number the channels from 0 regardless of how the file labels them.
        df.columns = range(df.shape[1])
        split_string = f.replace('.', '_').split('_')
        patient, dataset_id, pre_ictal = split_string[:3]

        this_file_dict = {}
        for electrode in df.columns:
            trace = df[electrode]
            centre = trace.mean()
            this_file_dict[str(electrode)+'_mean'] = [centre]
            this_file_dict[str(electrode)+'_std'] = [trace.std()]
            this_file_dict[str(electrode)+'_med'] = [trace.median()]
            # Mean absolute deviation around the mean, written out by hand
            # because Series.mad() no longer exists in current pandas.
            this_file_dict[str(electrode)+'_mad'] = [(trace - centre).abs().mean()]

        this_file_df = pd.DataFrame(this_file_dict)
        this_file_df['dataset_id'] = dataset_id
        this_file_df['pre_ictal'] = pre_ictal
        this_file_df['patient'] = patient
        return this_file_df
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch,
        # but the failure has to be visible.
        print('failed to process', input_path_file, '-', repr(e))
        return pd.DataFrame()

In [4]:
def check_for_nulls(input_path_file):
    """Report whether a recording has samples where every channel is zero.

    Parameters:
        input_path_file(str): folder and file name of a recording, separated
            by '\\' or '/'
    Returns:
        ``input_path_file`` when at least one row of the recording is zero
        on all channels, 0 when there is no such row, and None when the
        file could not be processed (the reason is printed).
    """
    # Only the last path component is the file name; accept either separator.
    path, _sep, f = input_path_file.replace('\\', '/').rpartition('/')
    print('processing', path, f)
    try:
        df, _sampling_rate, _sequence = load_data(join(path, f))
        # True for each row in which all channels read exactly zero.
        all_channels_zero = (df == 0).all(axis=1)
        if all_channels_zero.any():
            return input_path_file
        return 0
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch,
        # but the failure has to be visible.
        print('failed to process', input_path_file, '-', repr(e))
        return None

In [5]:
# Locations of the three training folders inside the local data directory.
base = "D:/Users/Anthony/Documents/Kaggle-Competitions/Kaggle-EKK-Data"
train1_path, train2_path, train3_path = [
    join(base, 'train_%d' % folder_number) for folder_number in (1, 2, 3)
]

In [6]:
# Run check_for_nulls over every file of the train_2 and train_3 folders;
# the result holds one tuple of per-file return values for each folder.
no_contact_res = process_folder(
    train_path=[train2_path, train3_path],
    function_name=check_for_nulls,
)

['D:/Users/Anthony/Documents/Kaggle-Competitions/Kaggle-EKK-Data\\train_2', 'D:/Users/Anthony/Documents/Kaggle-Competitions/Kaggle-EKK-Data\\train_3']


In [7]:
# NOTE(review): the original statement had no right-hand side. The recorded
# output of this cell has the shape of no_contact_res (one tuple per scanned
# folder), so it is assumed to be what was bound here; confirm.
test = no_contact_res
test

[(), ()]