### In this notebook I use dask

This notebook demonstrates how to use dask to parallelize our data processing.

In [1]:
import dask

In [2]:
### Load the data

In [3]:
from data_processing import load_data, get_files, parse_filename
from feature_extraction import extract_fft_features, extract_time_domain_features
import pandas as pd
import dask
from dask import do
from os.path import split
from dask import delayed, compute

In [4]:
# keep only the first two entries returned by get_files()
# (presumably to keep this demo quick -- widen the slice for a full run)
files =get_files()[:2]

In [5]:
def parse_filename(filename, split_file=False):
    """Extract the integer fields encoded in a ``.mat`` file name.

    Parameters:
        filename(str): the file name, or a full path when ``split_file`` is True
        split_file(bool): if True, drop the directory part of ``filename`` first
    Returns:
        fields(list): the underscore-separated parts of the name, each converted to int
    """
    # optionally keep only the last path component
    base_name = split(filename)[1] if split_file else filename

    # remove the '.mat' extension text, then break the remainder apart on underscores
    stem = base_name.replace('.mat', '')
    return list(map(int, stem.split('_')))

def map_functions(data, functions):
    """Apply every function in a list to the same piece of data.

    Parameters: 
        data: the input handed unchanged to each function
        functions(list): the functions to call, in order
    Returns: 
        results(list): one result per function, in the same order as ``functions``
    """
    results = []
    for fun in functions:
        results.append(fun(data))
    return results


def process_data(file_name, functions=None):
    """Extract the features of a single file.

    Parameters: 
        file_name(str): the file name (directory part is stripped before parsing)
        functions(list): feature extractors applied to the time series; when None,
            [extract_time_domain_features, extract_fft_features] is used
    Returns: 
        res(pd.DataFrame): the Patient/TraceNumber/Condition columns followed by the
            extractor outputs, indexed by (Patient, TraceNumber, Condition)
    """
    feature_extractors = functions
    if feature_extractors is None:
        feature_extractors = [extract_time_domain_features, extract_fft_features]

    # the time series is the first element of what load_data returns
    time_series = load_data(file_name, True)[0]
    # the file name carries the patient, trace number and condition
    patient, number, condition = parse_filename(file_name, True)

    # single-row frame repeating the identifying information as ordinary columns
    id_columns = pd.DataFrame({'Patient': patient,
                               'TraceNumber': number,
                               'Condition': condition},
                              index=[0])

    # put the extractor outputs side by side with the identifying columns
    # NOTE(review): assumes each extractor returns a DataFrame whose index lines up
    # with id_columns' index ([0]) -- confirm against feature_extraction
    frames = [id_columns]
    frames += map_functions(time_series, feature_extractors)
    res = pd.concat(frames, axis=1)

    # replace the positional index with the identifying multi-index
    res.index = pd.MultiIndex.from_tuples([(patient, number, condition)],
                                          names=['Patient', 'TraceNumber', 'Condition'])
    return res
    
def process_multiple_data(files):
    """Uses dask to process many files in parallel.

    Parameters:
        files(iterable): the file names to process; any iterable is accepted
            (list, tuple, generator, ...)
    Returns:
        res(pd.DataFrame): the per-file results of process_data stacked row-wise,
            in the order of ``files``; an empty DataFrame when no files are given
    """
    # materialize once so generators and other one-shot iterables are supported
    files = list(files)

    # nothing to schedule; pd.concat would raise on an empty list of frames
    if not files:
        return pd.DataFrame()

    # set up the compute graph: one delayed process_data call per file
    tasks = [delayed(process_data)(file_) for file_ in files]

    # compute all tasks together; compute returns a tuple with one result per task
    results = compute(*tasks)

    return pd.concat(list(results))

In [6]:
# build one delayed process_data task per selected file and compute them with dask
features = process_multiple_data(files)

  result = getattr(x, name)(y)


In [7]:
# display the combined feature table (one row per processed file)
features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Condition,Patient,TraceNumber,Variance 1,Variance 2,Variance 3,Variance 4,Variance 5,Variance 6,Variance 7,...,SpectralEntropy 7,SpectralEntropy 8,SpectralEntropy 9,SpectralEntropy 10,SpectralEntropy 11,SpectralEntropy 12,SpectralEntropy 13,SpectralEntropy 14,SpectralEntropy 15,SpectralEntropy 16
Patient,TraceNumber,Condition,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1000,0,0,1,1000,1460.317993,2089.150146,2707.595215,4025.93042,3462.147217,2225.89502,1405.077759,...,-1.342669,-1.33292,-1.243326,-1.410041,-1.322903,-1.250799,-1.4125,-1.499203,-1.378572,-1.244428
1,1001,0,0,1,1001,1274.685547,1789.522217,2519.115479,3820.594727,3596.031982,2023.835205,1382.331299,...,-1.34516,-1.354946,-1.226694,-1.428975,-1.323394,-1.264476,-1.478456,-1.563301,-1.40259,-1.247789


In [8]:
def map_functions(data, functions):
    """Call each function on ``data`` one after another (slowly) and return the results as a list."""
    return list(map(lambda fun: fun(data), functions))

In [9]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
import numpy as np
# small sanity check: apply sin and cos to the scalar 1 via map_functions
functions = [np.sin, np.cos]
map_functions(1,functions)

[0.8414709848078965, 0.54030230586813977]