### In this notebook I use dask

This notebook demonstrates how to use dask to run our data processing in parallel.

In [None]:
import dask

In [None]:
### Load the data

In [None]:
# NOTE(review): parse_filename is imported here and then redefined in a later
# cell of this notebook, so the notebook-local definition shadows this import.
from data_processing import load_data, get_files, parse_filename
from feature_extraction import extract_fft_features, extract_time_domain_features
import pandas as pd
import dask
# NOTE(review): `do` is not used by any code visible in this notebook and
# appears to be the old name of `delayed`; recent dask releases may no longer
# export it, which would make this cell fail -- confirm before running.
from dask import do
from os.path import split
from dask import delayed, compute

In [None]:
# Keep only the first two entries returned by get_files() so the demo stays small.
files =get_files()[:2]

In [None]:
def parse_filename(filename, split_file=False):
    """Extract the integer fields encoded in a ``.mat`` file name.

    Parameters:
        filename(str): a name such as ``1_1_0.mat``; may include directories
            when split_file is True
        split_file(bool): if True, discard the directory part before parsing
    Returns:
        fields(list): the underscore-separated pieces of the name as ints
    """
    # keep only the last path component when the caller passed a full path
    name = split(filename)[1] if split_file else filename

    # remove the '.mat' marker, then convert every '_'-separated token
    tokens = name.replace('.mat', '').split('_')
    return list(map(int, tokens))

def map_functions(data, functions):
    """Apply every function to the same data and collect the outputs.

    Parameters:
        data: the value passed to each function
        functions(list): callables taking a single argument
    Returns:
        results(list): one result per function, in the order given
    """
    results = []
    for func in functions:
        results.append(func(data))
    return results


def process_data(file_name, functions=None):
    """Build the feature frame for a single recording file.

    Parameters:
        file_name(str): path of the file to load
        functions(list): feature extractors, each called with the loaded time
            series; defaults to the time-domain and FFT extractors
    Returns:
        res(pd.DataFrame): the extractor outputs joined column-wise behind
            Patient / TraceNumber / Condition columns, indexed by those same
            three values as a MultiIndex (expected to be a single row)
    """
    extractors = functions
    if extractors is None:
        extractors = [extract_time_domain_features, extract_fft_features]

    # load the signal first, then decode the identifiers from the file name
    time_series = load_data(file_name, True)[0]
    patient, number, condition = parse_filename(file_name, True)

    # the identifiers are stored twice: as the final index ...
    row_index = pd.MultiIndex.from_tuples(
        [(patient, number, condition)],
        names=['Patient', 'TraceNumber', 'Condition'],
    )
    # ... and as leading columns, placed on a temporary index of 0
    id_columns = pd.DataFrame(
        {'Patient': patient, 'TraceNumber': number, 'Condition': condition},
        index=[0],
    )

    # identifier columns first, then one frame per extractor, side by side
    frames = [id_columns] + map_functions(time_series, extractors)
    combined = pd.concat(frames, axis=1)
    combined.index = row_index
    return combined
    
def process_multiple_data(files):
    """Extract features from many files in parallel with dask.

    Parameters:
        files(iterable): file names; each one is handed to process_data
    Returns:
        features(pd.DataFrame): the frames returned by process_data,
            concatenated row-wise in the order the files were given
    Raises:
        ValueError: if files is empty
    """
    # materialise once so generators and other one-shot iterables work too
    files = list(files)
    if not files:
        # pd.concat would raise this same exception type on an empty list,
        # but with a message that does not point at the missing input
        raise ValueError('no files to process')

    # one lazy task per file; nothing runs until compute() is called
    tasks = [delayed(process_data)(file_) for file_ in files]

    # compute(*tasks) evaluates all tasks together and returns their results
    # as a tuple in task order, so no outer delayed wrapper or positional
    # indexing into the result is needed
    frames = compute(*tasks)

    return pd.concat(list(frames))

# Run the parallel pipeline on the selected files; the combined frame is kept in `features`.
features = process_multiple_data(files)

In [None]:
def map_functions(data, functions):
    """Call each function on data and return the outputs as a list.

    NOTE: this re-declares the map_functions defined in an earlier cell of
    this notebook with the same behaviour; running this cell replaces that
    earlier definition.
    """
    return list(map(lambda fun: fun(data), functions))

import numpy as np
# Quick sanity check: evaluate sin and cos at 1 through map_functions.
functions = [np.sin, np.cos]
map_functions(1,functions)

### Try the new code


In [1]:
from os import listdir
import pandas as pd
from os.path import join, split
from data_processing import Processor,get_data_files,load_data,interpolate_zeros,replace_outliers_with_zeros

In [2]:
# Build a Processor (imported from data_processing) with its default arguments.
processor = Processor()

In [3]:
# NOTE(review): hard-coded absolute path into one user's home directory; as
# written this cell only works on that machine. The path is wrapped in a list
# because it is passed as-is to processor.process_data below.
test_path =['/Users/crivera5/Desktop/test']

In [4]:
# Run the processor over the directory list; the result is kept in `res` for the next cell.
res = processor.process_data(test_path)

In [11]:
# Stack the frames in `res` row-wise for display.
# NOTE(review): presumably `res` is a list of per-file DataFrames -- confirm
# against Processor.process_data. The rendered output repeats index 0 for
# every row, so the index does not identify rows after concatenation.
pd.concat(res)

Unnamed: 0,Variance 1,Variance 2,Variance 3,Variance 4,Variance 5,Variance 6,Variance 7,Variance 8,Variance 9,Variance 10,...,SpectralEntropy 10,SpectralEntropy 11,SpectralEntropy 12,SpectralEntropy 13,SpectralEntropy 14,SpectralEntropy 15,SpectralEntropy 16,patient,dataset_id,pre_ictal
0,1108.285587,1173.037383,1490.013692,3146.195714,1246.922731,994.800913,2778.21601,1043.109055,1330.460439,3066.468131,...,-1.404093,-1.387194,-1.287744,-1.486006,-1.627189,-1.36642,-1.257787,1,1,0
0,1485.645763,1009.379968,1047.718752,1917.973616,1076.874141,750.613258,1775.984768,824.203218,1445.925582,1812.478019,...,-1.325729,-1.266203,-1.136935,-1.274524,-1.354159,-1.260215,-1.187721,1,1,1


In [8]:
base = test_path[0]
# os.listdir returns entries in arbitrary order and may include non-data
# files, so sort the '.mat' names to make "the first file" reproducible.
mat_files = sorted(name for name in listdir(base) if name.endswith('.mat'))
# NOTE(review): load_data's return value is indexed with [0] as in the
# original cell; presumably the first element is the data frame -- confirm.
data = load_data(join(base, mat_files[0]))[0]
mat_files

['1_1_0.mat', '1_1_1.mat']

In [None]:
import numpy as np
def interpolate_zeros(df):
    """Replaces zero values using linear interpolation

    Every cell equal to zero is treated as missing and then filled by
    ``DataFrame.interpolate(method='linear')`` down each column. With the
    default interpolation direction, zeros that come before the first
    non-zero value of a column have nothing to interpolate from and are
    returned as NaN; zeros after the last non-zero value take that last value.

    Parameters:
        df(pd.DataFrame): a data frame with numeric values; it is not modified
    Returns:
        df(pd.DataFrame): a new data frame with the zeros filled in
    """
    # mask() replaces every cell where the condition holds with NaN in a
    # single vectorised pass and returns a new frame, so neither an explicit
    # copy nor the element-wise applymap (deprecated in recent pandas) is needed
    without_zeros = df.mask(df == 0)
    return without_zeros.interpolate(method='linear')

In [None]:
# Fill the zeros in the loaded frame.
# NOTE(review): this rebinds `res`, which an earlier cell uses for the output
# of processor.process_data; that earlier value is lost after this cell runs.
res = interpolate_zeros(data)

NameError: name 'base' is not defined