# BEAT-PD Challenge

## Team JHU-CLSP 

- Marie-Philippe Gill
- Nanxin Chen
- Saurabhchand Bhati
- Sonal Joshi
- Laureano Moro-Velazquez

Team page : https://www.synapse.org/#!Team:3404266

## Useful Links

Challenge website : https://www.synapse.org/#!Synapse:syn20825169/wiki/596118

Data information : https://www.synapse.org/#!Synapse:syn20825169/wiki/600405


In [3]:
# Import required libraries

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display

# Imports for the high pass signal
from scipy.signal import butter, freqz, lfilter

# KFold
from sklearn.model_selection import KFold

# Import required modules
from sklearn.preprocessing import StandardScaler

import os.path

# To write WAV File
from scipy.io.wavfile import write

# To make derivative work on multiple CPUs
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import sys



%load_ext autoreload
%autoreload 2

# import transform_data
from transform_data import *
from create_graphs import *

# Extract initial data 

In [16]:
# Data paths
# Root folder that holds the downloaded challenge archives.
# The extraction cell builds its paths as `{data_dir}<file name>` with no
# separator in between, so this value must end with a trailing "/".
data_dir = "<your-path-to-data>" 

In [None]:
# NOTE: every `!` line is executed in its own temporary subshell, so this `cd`
# does not change the working directory seen by the following lines (`!pwd`
# still prints the notebook's directory). This is why each command below
# spells out {data_dir} in every path.
!cd {data_dir}
!pwd
# {data_dir} is glued directly to the file names, so it must end with "/".
# Each archive is unpacked into {data_dir} and the extracted folder is renamed
# right away with its database prefix (cis-pd. / real-pd.); presumably both
# databases unpack to the same generic folder name, hence the immediate rename.
# The `;` runs the `mv` even when `tar` failed.
!tar -xvf {data_dir}cis-pd.data_labels.tar.bz2 --directory {data_dir}; mv {data_dir}data_labels {data_dir}cis-pd.data_labels
!tar -xvf {data_dir}real-pd.data_labels.tar.bz2 --directory {data_dir}; mv {data_dir}data_labels {data_dir}real-pd.data_labels
!tar -xvf {data_dir}real-pd.training_data_updated.tar.bz2 --directory {data_dir}; mv {data_dir}training_data/ {data_dir}real-pd.training_data; 
!tar -xvf {data_dir}cis-pd.training_data.tar.bz2 --directory {data_dir}; mv {data_dir}training_data/ {data_dir}cis-pd.training_data; 
!tar -xvf {data_dir}cis-pd.ancillary_data.tar.bz2 --directory {data_dir}; mv {data_dir}ancillary_data/ {data_dir}cis-pd.ancillary_data;
!tar -xvf {data_dir}real-pd.ancillary_data_updated.tar.bz2 --directory {data_dir}; mv {data_dir}ancillary_data {data_dir}real-pd.ancillary_data;
!tar -xvf {data_dir}cis-pd.testing_data.tar.bz2 --directory {data_dir}; mv {data_dir}testing_data/ {data_dir}cis-pd.testing_data/;
!tar -xvf {data_dir}real-pd.testing_data_updated.tar.bz2 --directory {data_dir}; mv {data_dir}testing_data/ {data_dir}real-pd.testing_data/;
# Rename the test-ID CSV of each database to *_Test_Data_IDs_Labels.csv and
# move it into that database's data_labels folder.
!mv {data_dir}cis-pd.CIS-PD_Test_Data_IDs.csv {data_dir}CIS-PD_Test_Data_IDs_Labels.csv; mv {data_dir}CIS-PD_Test_Data_IDs_Labels.csv {data_dir}cis-pd.data_labels/;
!mv {data_dir}real-pd.REAL-PD_Test_Data_IDs.csv {data_dir}REAL-PD_Test_Data_IDs_Labels.csv; mv {data_dir}REAL-PD_Test_Data_IDs_Labels.csv {data_dir}real-pd.data_labels/


In [None]:
# !rm {data_dir}cis-pd.data_labels.tar.bz2
# !rm {data_dir}real-pd.data_labels.tar.bz2
# !rm {data_dir}real-pd.training_data_updated.tar.bz2;
# !rm {data_dir}cis-pd.training_data.tar.bz2;
# !rm {data_dir}cis-pd.ancillary_data.tar.bz2;
# !rm {data_dir}real-pd.ancillary_data_updated.tar.bz2;
# !rm {data_dir}cis-pd.testing_data.tar.bz2;
# !rm {data_dir}real-pd.testing_data_updated.tar.bz2;

# CIS-PD Database

### CIS-PD: Create High Pass Data

For the high pass filter, three parameters can be tuned, but we used these:

- `order` = 10
- `fs` = 50.0  # sample rate, Hz
- `cutoff` = 0.5  # desired cutoff frequency of the filter, Hz

In [None]:
from scipy import signal
import matplotlib.pyplot as plt

# Design an analog 10th-order Butterworth low-pass filter with its cutoff at
# 100 rad/s and compute its frequency response.
# NOTE(review): this plots an analog low-pass design, not the high-pass
# settings (order 10, fs 50 Hz, cutoff 0.5 Hz) listed in the text of this
# section - confirm this is the intended illustration.
filter_order = 10
cutoff_rad_per_s = 100
b, a = signal.butter(filter_order, cutoff_rad_per_s, 'low', analog=True)
w, h = signal.freqs(b, a)

# Magnitude in decibels, drawn on a logarithmic frequency axis
magnitude_db = 20 * np.log10(abs(h))
plt.semilogx(w, magnitude_db)
plt.title('Butterworth filter frequency response')
plt.xlabel('Frequency [radians / second]')
plt.ylabel('Amplitude [dB]')
plt.margins(0, 0.1)
plt.grid(which='both', axis='both')
plt.axvline(cutoff_rad_per_s, color='green')  # mark the cutoff frequency
plt.show()

In [None]:
# Output folders written by this cell:
#   cis-pd.training_data.high_pass/
#   cis-pd.ancillary_data.high_pass/
#   cis-pd.testing_data.high_pass/

data_type = "cis"

# Apply the high pass filter to each subset in turn
for data_subset in ('training_data', 'ancillary_data', 'testing_data'):
    # Source folder and label dataframe for this subset
    path_train_data, df_train_label = define_data_type(
        data_type, data_dir, data_subset
    )

    # Folder receiving the high-pass filtered copy of the subset
    high_pass_path = "{}/cis-pd.{}.high_pass/".format(data_dir, data_subset)

    high_pass_filter(
        df_train_label,
        high_pass_path,
        path_train_data,
        data_type,
    )

In [None]:
# Use the following code if one high pass file is empty because of a bug

# data_type = "cis"
# path_train_data, df_train_label = define_data_type(data_type, data_dir, data_subset)

# list_measurement_id = ["dc90dc36-b4e5-43ec-b3e8-47c39c763c71"]

# # Filter df_train_label according to the measurement_id we are most interested in
# df_train_label = interesting_patients(df_train_label=df_train_label, list_measurement_id=list_measurement_id)

# high_pass_filter(df_train_label, high_pass_path, path_train_data, data_type)

### CIS-PD: Create Masks for inactivity removal 

For the masks, two parameters can be tuned:

- `energy_threshold` : what percentage of the max energy do we consider as inactivity? The current masks generated have used the threshold of 5%

- `duration_threshold` : how long do we want to have inactivity before we remove it? For example, 3000 samples × 0.02 s (one sample at 50 Hz) = 60 s, so a candidate segment must last at least 1 min before it is considered inactivity and removed.

This code also creates a frequency response graph if needed.

In [None]:
# This will create the following folders: 
# cis-pd.training_data.high_pass_mask/ 
# cis-pd.ancillary_data.high_pass_mask/ 
# cis-pd.testing_data.high_pass_mask/ 

# data_dir is the folder configured in the "Data paths" cell; it is
# deliberately not overridden here with a machine-specific path.
data_type = "cis"

# data_subset is to switch between training_data, ancillary_data or testing_data
for data_subset in ['training_data', 'ancillary_data', 'testing_data']:
    path_train_data, df_train_label = define_data_type(data_type, data_dir, data_subset)

    # energy_threshold: percentage of the max energy regarded as inactivity.
    # duration_threshold: minimum number of consecutive samples of inactivity
    # before the segment is removed.
    # The masks go to the ".high_pass_mask/" folder, which is the folder the
    # later cells of this notebook read the masks from.
    remove_inactivity_highpass(
        df_train_label,
        path_train_data,
        data_type,
        energy_threshold=5,
        duration_threshold=3000,
        plot_frequency_response=True,
        mask_path=data_dir+'/cis-pd.'+data_subset+'.high_pass_mask/')

### Facultative : Create Combhpfnoinact

In [1]:
def save_removenoinact(measurement_id, path_train_data, mask_path, path_removeinact):
    """
    Remove inactivity from one measurement and save the result as a CSV file.

    The measurement is passed through `apply_mask` and the dataframe it returns
    is written to `path_removeinact + measurement_id + ".csv"` without the
    index column.

    Keyword arguments:
    - measurement_id: ID of the measurement; also the base name of the CSV
    - path_train_data: folder of the input data, forwarded to `apply_mask`
    - mask_path: folder of the inactivity masks, forwarded to `apply_mask`
    - path_removeinact: output folder, created when missing. It is joined to
      the file name by plain concatenation, so it must end with "/".
    """
    # apply_mask receives the data folder and the measurement ID itself, so the
    # CSV is not read separately here (the previous extra read was discarded).
    df_train_data = apply_mask(path_train_data, measurement_id, mask_path)

    # If the folder doesn't exist, we need to create it.
    # exist_ok=True because this function is mapped over several worker
    # processes: another worker may create the folder between the check and
    # the call, which must not raise FileExistsError.
    if not os.path.exists(path_removeinact):
        os.makedirs(path_removeinact, exist_ok=True)
        print('The folder was created : ', path_removeinact)

    # Save to the output folder
    df_train_data.to_csv(
        path_removeinact + measurement_id + ".csv",
        index=False
    )

In [None]:
# This will create the following folders: 
# cis-pd.training_data.combhpfnoinact/ 

# Set explicitly so the cell does not depend on variables left over from
# whichever cell ran before it.
data_type = "cis"
data_subset = "training_data"

path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   data_subset)

# Input: high-pass filtered data. Masks: inactivity masks of the same subset.
# All folders are derived from data_dir (which must end with "/") instead of
# machine-specific absolute paths.
do_work = partial(
        save_removenoinact, 
        path_train_data=data_dir + "cis-pd." + data_subset + ".high_pass/",
        mask_path=data_dir + "cis-pd." + data_subset + ".high_pass_mask/",
        path_removeinact=data_dir + "cis-pd." + data_subset + ".combhpfnoinact/"
    )

# Number of worker processes used simultaneously
num_jobs = 8
with ProcessPoolExecutor(num_jobs) as ex:
    results = list(ex.map(do_work, df_train_label['measurement_id']))

### Facultative : Create Orig No Inact 

In [None]:
# This will create the following folders: 
# cis-pd.training_data.orignoinact/ 

# Set explicitly so the cell does not depend on variables left over from
# whichever cell ran before it.
data_type = "cis"
data_subset = "training_data"

path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   data_subset)

# Input: original (unfiltered) data. Masks: inactivity masks of the same
# subset. All folders are derived from data_dir (which must end with "/")
# instead of machine-specific absolute paths.
do_work = partial(
        save_removenoinact, 
        path_train_data=data_dir + "cis-pd." + data_subset + "/",
        mask_path=data_dir + "cis-pd." + data_subset + ".high_pass_mask/",
        path_removeinact=data_dir + "cis-pd." + data_subset + ".orignoinact/"
    )

# Number of worker processes used simultaneously
num_jobs = 8
with ProcessPoolExecutor(num_jobs) as ex:
    results = list(ex.map(do_work, df_train_label['measurement_id']))

### Facultative : CIS-PD: Create first derivative

This step is facultative as it is something we experimented with, but never ended up using

In [None]:
# Output folders written by this cell:
#   cis-pd.training_data.derivative_original_data/
#   cis-pd.ancillary_data.derivative_original_data/
#   cis-pd.testing_data.derivative_original_data/

data_type = "cis"

# Number of CPUs (worker processes) to use simultaneously - edit as needed
num_jobs = 6

# Process the training data, the additional (ancillary) data and the test data
for data_subset in ("training_data", "ancillary_data", "testing_data"):
    path_train_data, df_train_label = define_data_type(data_type, data_dir, data_subset)

    # Common prefix of the folders belonging to this subset
    subset_prefix = data_dir + "cis-pd." + data_subset

    # Freeze every argument except the measurement ID, which the pool supplies
    do_work = partial(
        get_first_derivative,
        path_train_data=path_train_data,
        derivative_path=subset_prefix + ".derivative_original_data/",
        padding=True,
        mask_path=subset_prefix + ".high_pass_mask/",
    )

    with ProcessPoolExecutor(num_jobs) as executor:
        results = list(executor.map(do_work, df_train_label['measurement_id']))

path_train_data :  /export/b19/mpgill/BeatPD_data/cis-pd.training_data/


# REAL-PD Database

### REAL-PD: Create High-Pass data 

In [None]:
# Output folders written by this cell (one sub-folder per sensor subtype):
#   real-pd.training_data.high_pass/
#   real-pd.ancillary_data.high_pass/
#   real-pd.testing_data.high_pass/

data_type = "real"

real_subsets = ("training_data", "ancillary_data", "testing_data")
real_subtypes = (
    'smartphone_accelerometer',
    'smartwatch_accelerometer',
    'smartwatch_gyroscope',
)

for data_subset in real_subsets:
    for data_real_subtype in real_subtypes:
        # Source folder and label dataframe for this subset / subtype pair
        path_train_data, df_train_label = define_data_type(
            data_type, data_dir, data_subset, data_real_subtype
        )

        # Folder receiving the high-pass filtered files of this pair
        high_pass_path = "{}/real-pd.{}.high_pass/{}/".format(
            data_dir, data_subset, data_real_subtype
        )

        high_pass_filter(
            df_train_label,
            high_pass_path,
            path_train_data,
            data_type,
        )

### REAL-PD: Create Masks for inactivity removal for all subtypes

In [None]:
# Output folders written by this cell (one sub-folder per sensor subtype):
#   real-pd.training_data.high_pass_mask/
#   real-pd.ancillary_data.high_pass_mask/
#   real-pd.testing_data.high_pass_mask/

data_type = "real"

# Settings shared by every subset / subtype combination; no plots are drawn
mask_settings = dict(
    energy_threshold=5,
    duration_threshold=3000,
    plot_frequency_response=False,
    plot_accelerometer_after_removal=False,
)

for data_subset in ("training_data", "ancillary_data", "testing_data"):
    for data_real_subtype in ('smartphone_accelerometer',
                              'smartwatch_accelerometer',
                              'smartwatch_gyroscope'):
        path_train_data, df_train_label = define_data_type(
            data_type, data_dir, data_subset, data_real_subtype
        )

        # Folder receiving the masks of this subset / subtype pair
        real_mask_path = (
            data_dir + '/real-pd.' + data_subset
            + '.high_pass_mask/' + data_real_subtype + '/'
        )

        remove_inactivity_highpass(
            df_train_label,
            path_train_data=path_train_data,
            data_type=data_type,
            mask_path=real_mask_path,
            **mask_settings
        )

### Facultative : REAL-PD: Create first derivative for all subtypes 

This step is facultative as it is something we experimented with, but never ended up using

In [None]:
# This will create the following folders (one sub-folder per sensor subtype): 
# real-pd.training_data.derivative_original_data/<subtype>/ 
# real-pd.ancillary_data.derivative_original_data/<subtype>/ 
# real-pd.testing_data.derivative_original_data/<subtype>/ 

# Set explicitly so the cell does not depend on the cell that ran before it
data_type = "real"

# Number of CPUs (worker processes) to use simultaneously
num_jobs = 8

for data_real_subtype in ['smartphone_accelerometer','smartwatch_accelerometer','smartwatch_gyroscope']:
    for data_subset in ["training_data", "ancillary_data", "testing_data"]:
        path_train_data, df_train_label = define_data_type(data_type,
                                                           data_dir,
                                                           data_subset,
                                                           data_real_subtype)

        # Keep only the measurements whose CSV exists for this subtype.
        # Only the presence of the file is tested; the CSV is not parsed here.
        has_file = df_train_label["measurement_id"].map(
            lambda measurement_id: os.path.isfile(
                path_train_data + measurement_id + ".csv")
        ).astype(bool)

        for measurement_id in df_train_label.loc[~has_file, "measurement_id"]:
            print('Removing ' + measurement_id +
                  ' as it doesn\'t exist for ' +
                  data_real_subtype)
        df_train_label = df_train_label[has_file]

        # Number of measurements left for this subset / subtype pair
        print(len(df_train_label))

        # The derivative folder is placed under data_dir, like the CIS-PD
        # derivative folders and like the mask folder used in this same call.
        do_work = partial(
            get_first_derivative, 
            path_train_data=path_train_data,
            derivative_path=data_dir+"real-pd."+data_subset+".derivative_original_data/"+data_real_subtype+"/",
            padding=True, 
            mask_path=data_dir+"/real-pd."+data_subset+".high_pass_mask/"+data_real_subtype+"/"
        )

        with ProcessPoolExecutor(num_jobs) as ex:
            results = list(ex.map(do_work, df_train_label['measurement_id']))

# KFolds for CIS-PD and REAL-PD

KFolds are created with StratifiedKFold. They are balanced on only the on/off labels as this problem has a multilabel output. 

### Create the K-Fold files for the CIS database 

In [None]:
# Output folders written by this cell:
#   cis-pd.training_data.k_fold_v3
#   cis-pd.ancillary_data.k_fold_v3

# There are two databases; this cell handles CIS-PD
data_type = "cis"

# Arguments that are the same for every get_k_fold call of this cell
fold_options = dict(data_dir=data_dir, data_type=data_type, n_splits=5)

# FIXME: get_k_fold could be renamed to just create the folds, save them, not return anything
for data_subset in ('training_data', 'ancillary_data'):
    path_train_data, df_train_label = define_data_type(
        data_type, data_dir, data_subset
    )

    # One group per subject_id
    df_train_label_subject_id = df_train_label.groupby("subject_id")

    # K-fold the data of each subject; the returned lists are not used
    # any further in this cell
    for subject_id, subject_rows in df_train_label_subject_id:
        list_df_train_label, list_df_test_label = get_k_fold(
            df_train_label=df_train_label,
            subject_id=subject_id,
            data_subset=data_subset,
            **fold_options
        )

### Create the K-Fold Files for the REAL database

In [None]:
# Output folders written by this cell:
#   real-pd.training_data.k_fold_v2
#   real-pd.ancillary_data.k_fold_v2

data_type = "real"

# Arguments that are the same for every get_k_fold call of this cell
fold_options = dict(data_dir=data_dir, data_type=data_type, n_splits=5)

for data_subset in ('training_data', 'ancillary_data'):
    for data_real_subtype in ('smartphone_accelerometer',
                              'smartwatch_accelerometer',
                              'smartwatch_gyroscope'):

        path_train_data, df_train_label = define_data_type(
            data_type, data_dir, data_subset, data_real_subtype
        )

        # One group per subject_id
        df_train_label_subject_id = df_train_label.groupby("subject_id")

        # K-fold the data of each subject; the returned lists are not used
        # any further in this cell
        for subject_id, subject_rows in df_train_label_subject_id:
            list_df_train_label, list_df_test_label = get_k_fold(
                df_train_label=df_train_label,
                subject_id=subject_id,
                data_subset=data_subset,
                data_real_subtype=data_real_subtype,
                **fold_options
            )

# Data Preparation for MFCC Embeddings

### CIS-PD: Create WAV files

#### CIS-PD: Write Wav Files - Training Data - Original

In [None]:
# Output folders written by this cell:
#   cis-pd.training_data.wav_X : Wav files of the training data — the inactivity is NOT removed
#   cis-pd.training_data.wav_Y
#   cis-pd.training_data.wav_Z

data_subset = 'training_data'

# One call per accelerometer axis, in X, Y, Z order
for wav_axis in ("X", "Y", "Z"):
    create_cis_wav_files(data_subset, data_dir, sAxis=wav_axis, data_type="cis", bMask=False)

#### CIS-PD: Write Wav Files - Original Training Data - High Pass + Inactivity Removed 

In [None]:
# Output folders written by this cell:
#   cis-pd.training_data.high_pass_mask.wav_X : Original training data where inactivity is removed to wav files
#   cis-pd.training_data.high_pass_mask.wav_Y
#   cis-pd.training_data.high_pass_mask.wav_Z

data_subset = 'training_data'

# One call per accelerometer axis, in X, Y, Z order, with the mask enabled
for wav_axis in ("X", "Y", "Z"):
    create_cis_wav_files(data_subset, data_dir, sAxis=wav_axis, data_type="cis", bMask=True)

#### CIS-PD: Write Wav Files - Training Data - High Pass Filter Applied

In [None]:
# Output folders written by this cell:
#   cis-pd.training_data.high_pass.wav_X : High Pass filtered data to wav files (inactivity is not removed)
#   cis-pd.training_data.high_pass.wav_Y
#   cis-pd.training_data.high_pass.wav_Z

data_subset = 'training_data.high_pass'

# One call per accelerometer axis, in X, Y, Z order
for wav_axis in ("X", "Y", "Z"):
    create_cis_wav_files(data_subset, data_dir, sAxis=wav_axis, data_type="cis", bMask=False)

#### CIS-PD: Write Wav Files - Ancillary Data - Original

In [None]:
# Output folders written by this cell:
#   cis-pd.ancillary_data.wav_X : Original ancillary data where inactivity is NOT removed to wav files
#   cis-pd.ancillary_data.wav_Y
#   cis-pd.ancillary_data.wav_Z

data_subset = 'ancillary_data'

# One call per accelerometer axis, in X, Y, Z order
for wav_axis in ("X", "Y", "Z"):
    create_cis_wav_files(data_subset, data_dir, sAxis=wav_axis, data_type="cis", bMask=False)

#### CIS-PD: Write Wav Files - Ancillary Data - High Pass data + Inactivity Removed 

In [None]:
# Output folders written by this cell:
#   cis-pd.ancillary_data.high_pass_mask.wav_X : Original ancillary data where inactivity is removed to wav files
#   cis-pd.ancillary_data.high_pass_mask.wav_Y
#   cis-pd.ancillary_data.high_pass_mask.wav_Z

data_subset = 'ancillary_data'

# One call per accelerometer axis, in X, Y, Z order, with the mask enabled
for wav_axis in ("X", "Y", "Z"):
    create_cis_wav_files(data_subset, data_dir, sAxis=wav_axis, data_type="cis", bMask=True)

#### CIS-PD: Write Wav Files - Ancillary Data - High Pass Filter Applied

In [None]:
# Output folders written by this cell:
#   cis-pd.ancillary_data.high_pass.wav_X : High Pass filtered data to wav files (inactivity is not removed)
#   cis-pd.ancillary_data.high_pass.wav_Y
#   cis-pd.ancillary_data.high_pass.wav_Z

data_subset = 'ancillary_data.high_pass'

# One call per accelerometer axis, in X, Y, Z order
for wav_axis in ("X", "Y", "Z"):
    create_cis_wav_files(data_subset, data_dir, sAxis=wav_axis, data_type="cis", bMask=False)

### REAL-PD: Create WAV files for all subtypes

#### REAL-PD: Write Wav Files - Training Data - Original Data

In [None]:
# Output folders written by this cell:
#   real-pd.training_data.wav_X : Wav files of the training data — the inactivity is NOT removed
#   real-pd.training_data.wav_Y
#   real-pd.training_data.wav_Z

data_subset = "training_data"

# One call per axis, in X, Y, Z order
for wav_axis in ("X", "Y", "Z"):
    create_real_wav_files(data_subset, data_dir=data_dir, sAxis=wav_axis, data_type="real", bMask=False)

#### REAL-PD: Write Wav Files - Training Data - High Pass Data + Inactivity Removed

In [None]:
# Output folders written by this cell:
#   real-pd.training_data.high_pass_mask.wav_X : Original training data where inactivity is removed to wav files
#   real-pd.training_data.high_pass_mask.wav_Y
#   real-pd.training_data.high_pass_mask.wav_Z

data_subset = "training_data"

# One call per axis, in X, Y, Z order, with the mask enabled
for wav_axis in ("X", "Y", "Z"):
    create_real_wav_files(data_subset, data_dir=data_dir, sAxis=wav_axis, data_type="real", bMask=True)

#### REAL-PD: Write Wav Files - Ancillary Data - Original Data

In [None]:
# Output folders written by this cell:
#   real-pd.ancillary_data.wav_X : Original ancillary data where inactivity is NOT removed to wav files
#   real-pd.ancillary_data.wav_Y
#   real-pd.ancillary_data.wav_Z

data_subset = "ancillary_data"

# One call per axis, in X, Y, Z order
for wav_axis in ("X", "Y", "Z"):
    create_real_wav_files(data_subset, data_dir=data_dir, sAxis=wav_axis, data_type="real", bMask=False)

#### REAL-PD: Write Wav Files - Ancillary Data - High Pass Data + Inactivity Removed 

In [None]:
# Output folders written by this cell:
#   real-pd.ancillary_data.high_pass_mask.wav_X : Original ancillary data where inactivity is removed to wav files
#   real-pd.ancillary_data.high_pass_mask.wav_Y
#   real-pd.ancillary_data.high_pass_mask.wav_Z

data_subset = "ancillary_data"

# One call per axis, in X, Y, Z order, with the mask enabled
for wav_axis in ("X", "Y", "Z"):
    create_real_wav_files(data_subset, data_dir=data_dir, sAxis=wav_axis, data_type="real", bMask=True)