# Graphs for analysis

This notebook contains code to create graphs about the CIS-PD and REAL-PD datasets.

In [None]:
# Import required libraries

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display

# Imports for the high pass signal
from scipy.signal import butter, freqz, lfilter

# KFold
from sklearn.model_selection import KFold

# Import required modules
from sklearn.preprocessing import StandardScaler

import os.path

# To write WAV File
from scipy.io.wavfile import write

# To make derivative work on multiple CPUs
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import sys

# Confusion matrix
from sklearn import metrics

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from create_graphs import *
from transform_data import *

# Plot original accelerometers

In [None]:
# Dataset to inspect, where its raw files live, and where the figures are written.
data_type = "cis"
data_dir = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
path_save_accelerometer_plots = "/export/fs02/mpgill/plots/accelerometer_plots/"

# define_data_type (from transform_data) returns two values that the rest of the
# notebook uses as a directory prefix for "<measurement_id>.csv" files and as the
# label dataframe. NOTE(review): exact contract lives in transform_data — confirm there.
path_train_data, df_train_label = define_data_type(data_type=data_type, data_dir=data_dir)

# display(df_train_label)
# Earlier selection of measurement ids, kept for reference only
# list_measurement_id=[#'ab5287f4-8261-47ad-8ff2-22b5fe5d246e',
#'db2e053a-0fb8-4206-891a-6f079fb14e3a']#,
# 'ef5b1267-c212-46c5-aab0-4f4437bc6c67',
# '4ec74fb9-7347-435d-83dc-79ad74c3bc49',
# '8e8539ad-8841-476b-b15c-888ce3461989',
# '22b88456-fe8f-4138-af55-be12afca4b81',
# 'ad84583d-e5ae-4926-b077-531a0f7d08a9',
# 'eef56825-940a-4c3e-aebb-60838d60869e',
# 'e0441156-c4b8-467c-8f4f-3b532d594d8f',
# '464ac314-6c4b-4c4a-957c-28a2339150d6']

# Full list of interesting measurement ids.
# NOTE: this list is overwritten by the single-id assignment further down, so it
# currently has no effect; it is kept as a record of recordings worth looking at.
list_measurement_id = [
    "5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a",
    "cc7b822c-e310-46f0-a8ea-98c95fdb67a1",
    "5163afe8-a6b0-4ea4-b2ba-9b4501dd5912",
    "db2e053a-0fb8-4206-891a-6f079fb14e3a",
    "2d852742-10a9-4c56-9f38-779f2cd66879",
    "2e3a4c9c-ff01-4a28-bfcf-ce9b7633a39d",  # no inactivity should be removed
    "3cf49c01-0499-4bad-9167-67691711204a",  # no inactivity should be removed; not there??
    "3d0f965c-9d72-43d1-9369-1ea3acf963cc",  # not there ???
    "4b269cc2-8f0c-4816-adbf-10c0069b8833",
    "4bc51b90-bfce-4231-85e1-5de3b4bc0745",
    "4fc3c295-857f-4920-8fa5-f21bfdc7ab4f",
]  # last entry: bit of inactivity in the middle

# Alternative short selection, currently disabled
# list_measurement_id = [
#     "2d852742-10a9-4c56-9f38-779f2cd66879",
#     "4fc3c295-857f-4920-8fa5-f21bfdc7ab4f",
#     "db2e053a-0fb8-4206-891a-6f079fb14e3a",
# ]


# Active selection: only this one recording is plotted in the cells below
list_measurement_id = ["cc7b822c-e310-46f0-a8ea-98c95fdb67a1"]

# Filter df_train_label according to the measurement_id we are most interested in.
# The filtered frame replaces df_train_label and is reused by the following plot cells.
df_train_label = interesting_patients(
    df_train_label=df_train_label, list_measurement_id=list_measurement_id
)

# Display filtered df_train_label
display(df_train_label)

# path_no_inactivity_data = remove_inactivity_pct_change(df_train_label)

# Plot the accelerometer data read from the unmodified training directory;
# "original" is the filename tag passed to plot_accelerometer for this variant.
plot_accelerometer(df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots,
                   filename="original"
)

### Plot graph high pass

In [None]:
# Point path_train_data at the "high_pass" copy of the CIS-PD training data.
# df_train_label, data_type and path_save_accelerometer_plots come from the
# "Plot original accelerometers" cell, which must have been run first.
path_train_data = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.high_pass/"

# Plot the accelerometer data; "hpf" is the filename tag for this variant
plot_accelerometer(df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots,
                   filename="hpf"
)

### Plot Graph Orig + Inactivity Removed

In [None]:
# Read the original (unfiltered) CIS-PD training data again ...
path_train_data = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data/"

# ... and plot it with a mask_path supplied. The directory name suggests it holds the
# inactivity masks derived from the high-pass signal.
# NOTE(review): how plot_accelerometer applies the mask is defined in create_graphs — confirm.
plot_accelerometer(df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots,
                   filename="orignoinact",
                   mask_path='/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.high_pass_mask/'
)

### Plot Graph HPF + Inactivity Removed

In [None]:
# Point path_train_data at the "combhpfnoinact" copy of the training data
# (per the section title: high-pass filtered with inactivity removed).
path_train_data = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.combhpfnoinact/"

# Plot the accelerometer data; the filename tag matches the directory suffix
plot_accelerometer(df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots,
                   filename="combhpfnoinact"
)

### Plot Rotation

In [None]:
# Index of the rotation-augmented copy to visualise. It drives BOTH the input
# directory and the tag of the saved figure: previously the cell read
# "...rotate_5/" but saved the plot as "...rotate_4", mislabelling the output.
rotation_id = 5

list_measurement_id = ["cc7b822c-e310-46f0-a8ea-98c95fdb67a1"]

# Filter df_train_label according to the measurement_id we are most interested in
df_train_label = interesting_patients(
    df_train_label=df_train_label, list_measurement_id=list_measurement_id
)

path_train_data = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.combhpfnoinact.rotate_{0}/".format(rotation_id)

# Plot the accelerometer data
plot_accelerometer(df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots,
                   filename="combhpfnoinact.rotate_{0}".format(rotation_id)
)

# Which axis is more important? 

### CIS-PD 

In [None]:
# Define the data type as we have two databases
data_type = "cis"
training_or_ancillary = 'training_data'
data_dir = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
data_real_subtype="training_data"
path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary,
                                                   data_real_subtype)

# NOTE: plot_axis_on_top only shows 10 first subject_id 
plot_axis_on_top(df_train_label, path_train_data, highpass=False)

### Visualisation with inactivity removed, after applying a highpass filter

In [None]:
# Same plot as the previous cell (reuses its df_train_label and path_train_data),
# this time with the highpass option enabled.
plot_axis_on_top(df_train_label, path_train_data, highpass=True)

### Analyze the kfold distribution V1

In [None]:
# Plot the symptom-label occurrences of each k-fold (v1) train/test split.
data_type = "cis"
data_real_subtype=""

# NOTE: kfold_path is only defined for the CIS dataset; any other data_type
# would leave it undefined (or stale from an earlier cell).
if data_type == "cis":
    kfold_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.k_fold_v1/"

# data_path is only referenced by the commented-out read at the end of the loop
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
data_path = data_path + data_type + "-pd.training_data/" + data_real_subtype + "/"

# Debug toggle: the full fold list is immediately narrowed to fold 0
nb_folds = np.array([0,1,2,3,4])
nb_folds = np.array([0])

# Debug toggle: the full subject list is immediately narrowed to subject 1038
pids = np.array([1004,1006,1007,1019,1020,1023,1032,1034,1038,1039,1043,1044,1046,1048,1049,1051])
pids = np.array([1038])

for temp_pid in pids:
    for nb in nb_folds:
        for train_or_test in ['train','test']:
            # Fold files are named "<pid>_<train|test>_kfold_<fold>.csv"
            file_name = str(temp_pid) + '_'+train_or_test+'_kfold_' + str(nb) + '.csv'
            print(file_name)
            df_train_label = pd.read_csv(kfold_path+file_name)

            # Compute the occurences of each symptoms for each patient
            df_occurences, df_train_label_subject_id = compute_symptoms_occurences_dataframe(
                df_train_label=df_train_label
            )

            # Plot the graphs
            plot_symptoms_occurences(
                df_occurences=df_occurences, df_train_label_subject_id=df_train_label_subject_id
            )

            #print(df_train_label.values[:,1:])
            #temp_train_X = pd.read_csv(data_path+df_train_label["measurement_id"][idx] + '.csv')

## Histograms - ALL FOLDS - True Labels

In [None]:
# For every subject and split (train/test), pool the label rows of all k-fold
# (v3) files and plot how often each symptom label occurs.
data_type = "cis"
data_real_subtype=""

kfold_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.k_fold_v3/"

data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
data_path = data_path + data_type + "-pd.training_data/" + data_real_subtype + "/"

nb_folds = np.array([0,1,2,3,4])

pids = np.array([1004,1006,1007,1019,1020,1023,1032,1034,1038,1039,1043,1044,1046,1048,1049,1051])
# pids = np.array([1038])

for temp_pid in pids:
    for train_or_test in ['train','test']:

        # Collect the per-fold frames and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and copied the accumulated frame on every call.
        fold_frames = []

        for nb in nb_folds:
            # Fold files are named "<pid>_<train|test>_kfold_<fold>.csv"
            file_name = str(temp_pid) + '_'+train_or_test+'_kfold_' + str(nb) + '.csv'
            print(file_name)

            df_train_label = pd.read_csv(kfold_path+file_name)
            fold_frames.append(df_train_label)

        # Row indices of the individual files are kept (no ignore_index), as before
        glob_df_train_label = pd.concat(fold_frames)

        # Compute the occurences of each symptoms for each patient
        df_occurences, df_train_label_subject_id = compute_symptoms_occurences_dataframe(
            df_train_label=glob_df_train_label
        )

        # Plot the graphs
        plot_symptoms_occurences(
            df_occurences=df_occurences, df_train_label_subject_id=df_train_label_subject_id
        )


In [None]:
# Show the occurrence tables left over from the LAST iteration of the loop in the
# previous cell (last subject, 'test' split); requires that cell to have been run.
df_occurences, df_train_label_subject_id

### Bar plots of the training labels 

In [None]:
def plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title, additional_df=None,
                    save_dir='/export/fs02/mpgill/plots/'):
    """
    Plot (and save) a bar graph of how often each label value occurs in a CSV file.

    Keyword Arguments:
    - data_path: directory prefix of the CSV file; it is concatenated with
      sFileName as-is, so it must end with a path separator
    - sFileName: name of the CSV file to read
    - sSubchallenge: column whose values are counted (e.g. "on_off")
    - plot_title: title of the figure; with spaces replaced by underscores it is
      also the base name of the saved .pdf/.png files
    - additional_df: optional list of dataframes whose rows are added to the
      CSV rows before counting (default: none)
    - save_dir: directory the .pdf and .png figures are written to

    Missing labels in the CSV are counted under -1. Values are rounded before
    counting and the bars are always shown in the order -1, 0, 1, 2, 3, 4.
    """
    # A None default avoids sharing one mutable list between calls
    if additional_df is None:
        additional_df = []

    df_train_predictions = pd.read_csv(data_path+sFileName)
    # Only the CSV rows get the -1 placeholder; additional frames are added afterwards
    df_train_predictions = df_train_predictions.fillna(-1)

    # DataFrame.append was removed in pandas 2.0; concatenate everything once
    if len(additional_df) > 0:
        df_train_predictions = pd.concat([df_train_predictions] + list(additional_df))

    df_train_predictions_reorder = df_train_predictions[sSubchallenge].round().value_counts(sort=True)
    # Fixed label order; labels that never occur become NaN (no bar)
    order = [-1,0,1,2,3,4]
    df_train_predictions_reorder = df_train_predictions_reorder.reindex(order)

    plt.figure(figsize=(8,8))
    # `kind` must be passed by keyword: Series.plot rejects positional arguments
    ax = df_train_predictions_reorder.plot(kind='bar')

    plt.title(plot_title, fontdict = {'fontsize' : 15})
    plt.xlabel("Label",fontsize=15)
    plt.ylabel("Frequency",fontsize=15)

    # Write the count on top of each bar (helper from create_graphs)
    add_value_labels(ax)

    base_name = plot_title.replace(" ", "_")
    plt.savefig(fname=os.path.join(save_dir, '{0}.pdf'.format(base_name)), format='pdf')
    plt.savefig(fname=os.path.join(save_dir, '{0}.png'.format(base_name)), format='png')
    plt.show()
    

### With Data Augmentation

In [None]:
# Reload the full CIS-PD label table
data_type = "cis"
data_dir = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
training_or_ancillary='training_data'

path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary)

# Candidate subsets to add as extra rows: recordings where at least one symptom
# reaches the given severity and on_off is not 0. Only df_train_label_3 is used below.
df_train_label_1 = df_train_label[
    ((df_train_label.dyskinesia >= 1.0) | (df_train_label.on_off >= 1.0) | (df_train_label.tremor >= 1.0))
    & (df_train_label.on_off != 0)
]
df_train_label_2 = df_train_label[
    ((df_train_label.dyskinesia >= 2.0) | (df_train_label.on_off >= 2.0) | (df_train_label.tremor >= 2.0))
    & (df_train_label.on_off != 0)
]
df_train_label_3 = df_train_label[
    ((df_train_label.dyskinesia >= 3.0) | (df_train_label.on_off >= 3.0) | (df_train_label.tremor >= 3.0))
    & (df_train_label.on_off != 0)
]
df_train_label_4 = df_train_label[
    ((df_train_label.dyskinesia >= 4.0) | (df_train_label.on_off >= 4.0) | (df_train_label.tremor >= 4.0))
    & (df_train_label.on_off != 0)
]
# Dyskinesia-only variant of the severity-4 subset
df_train_label_5 = df_train_label[
    ((df_train_label.dyskinesia >= 4.0)) & (df_train_label.on_off != 0)
]
display(df_train_label_3)

# One bar plot per subchallenge, each with the severity-3 subset added on top of
# the original labels. Same label file for all three plots.
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.data_labels/"
sFileName = "CIS-PD_Training_Data_IDs_Labels.csv"

for sSubchallenge, plot_title in [
    ("on_off", "CIS-PD Training Data Labels - On Off"),
    ("dyskinesia", "CIS-PD Training Data Labels - Dyskinesia"),
    ("tremor", "CIS-PD Training Data Labels - tremor"),
]:
    plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title, additional_df=[df_train_label_3])


### Without Data Augmentation 

In [None]:
# Label distribution of the CIS-PD training labels for the on_off column,
# straight from the label CSV (no additional rows).
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.data_labels/"
sFileName = "CIS-PD_Training_Data_IDs_Labels.csv"
sSubchallenge = "on_off"
plot_title ="CIS-PD Training Data Labels - On Off"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

In [None]:
# Label distribution of the CIS-PD training labels for the dyskinesia column.
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.data_labels/"
sFileName = "CIS-PD_Training_Data_IDs_Labels.csv"
sSubchallenge = "dyskinesia"
plot_title ="CIS-PD Training Data Labels - Dyskinesia"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

In [None]:
# Label distribution of the CIS-PD training labels for the tremor column.
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.data_labels/"
sFileName = "CIS-PD_Training_Data_IDs_Labels.csv"
sSubchallenge = "tremor"
plot_title ="CIS-PD Training Data Labels - tremor"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

In [None]:
# Label distribution of the REAL-PD training labels for the on_off column.
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/real-pd.data_labels/"
sFileName = "REAL-PD_Training_Data_IDs_Labels.csv"
sSubchallenge = "on_off"
plot_title ="REAL-PD Training Data Labels - on_off"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

In [None]:
# Label distribution of the REAL-PD training labels for the tremor column.
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/real-pd.data_labels/"
sFileName = "REAL-PD_Training_Data_IDs_Labels.csv"
sSubchallenge = "tremor"
plot_title ="REAL-PD Training Data Labels - tremor"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

In [None]:
# Label distribution of the REAL-PD training labels for the dyskinesia column.
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/real-pd.data_labels/"
sFileName = "REAL-PD_Training_Data_IDs_Labels.csv"
sSubchallenge = "dyskinesia"
plot_title ="REAL-PD Training Data Labels - dyskinesia"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

### Histogram of the predictions

In [None]:
%load_ext autoreload
%autoreload 2
from pca_knn_bpd2 import *

def plot_bar_predictions_(df_predictions, plot_title):
    """
    Plot a bar graph of how often each (rounded) predicted label occurs.

    Keyword Arguments:
    - df_predictions: pandas Series of predicted values; they are rounded to the
      nearest integer before counting
    - plot_title: title shown above the figure

    Bars are always shown in the order -1, 0, 1, 2, 3, 4. The figure is only
    displayed, not saved.
    """

    df_predictions_reorder = df_predictions.round().value_counts(sort=True)
    # Fixed label order; labels that were never predicted become NaN (no bar)
    order = [-1,0,1,2,3,4]
    df_predictions_reorder = df_predictions_reorder.reindex(order)
    plt.figure(figsize=(5,5), dpi=80)
    # `kind` must be passed by keyword: Series.plot rejects positional arguments
    ax = df_predictions_reorder.plot(kind='bar')

    plt.title(plot_title)
    plt.xlabel("Label",fontsize=15)
    plt.ylabel("Frequency",fontsize=15)
    # Write the count on top of each bar (helper from create_graphs)
    add_value_labels(ax)
    plt.tight_layout()
    plt.show()

import pickle

# Subject whose predictions are inspected
pid = "1004"

# Tag of the SVR configuration. It is shared by the pickle file name and the plot
# titles so they cannot drift apart (titles previously said eps_0.01 while the
# run and the pickle file use eps 0.1).
config_tag = "objs_400_kernel_linear_c_0.2_eps_0.1"

# Test-set predictions accumulated over all folds
glob_test_pred = []

for fold in [0,1,2,3,4]:
    sFileTrai="/export/c08/lmorove1/kaldi/egs/beatPDivec/on_off_noinact_auto30_320fl/exp/ivec_550/ivectors_Training_Fold"+str(fold)+"/ivector.scp"
    sFileTest="/export/c08/lmorove1/kaldi/egs/beatPDivec/on_off_noinact_auto30_320fl/exp/ivec_550/ivectors_Testing_Fold"+str(fold)+"/ivector.scp"
    iComponents=400

    sOut="/home/mpgill/BeatPD/BeatPD-CLSP-JHU/ResiVecSVR_Fold"+str(fold)+"/"

    iNeighbors=None

    vTraiPCA, vLTrai, vTraiSubjectId, vTraiMeasurementId, vTestPCA, vLTest, vTestSubjectId, vTestMeasurementId = pca(sFileTrai, sFileTest, iComponents)

    # Runs the experiment for this fold with output directory sOut; the results
    # pickle for `pid` is read back from that same directory just below.
    # NOTE(review): fEpsilon is passed as the string '0.1' — confirm pca_knn_bpd2 expects that.
    pca_knn_bpd2(sFileTrai, sFileTest, sOut, iComponents, iNeighbors, sKernel='linear', fCValue=0.2, fEpsilon='0.1')


    fold_folder = "/home/mpgill/BeatPD/BeatPD-CLSP-JHU/ResiVecSVR_Fold"+str(fold)+"/"
    sFileName = pid+"_"+config_tag+".pkl"

    # Context manager closes the file handle (it used to be left open).
    # pickle executes arbitrary code on load: only read files produced by this pipeline.
    with open(fold_folder+sFileName, "rb") as pkl_file:
        [predictionsTrai,vLTrai_subjectid,predictions,vLTest_subjectid, vTraiMeasurementId, \
         mse_trai_subjectid, \
         mse_test_subjectid, \
         lTestMeasId_subjectid] = pickle.load(pkl_file)

    # Accumulate the TEST predictions: the variable name and the plot titles both
    # say "test", but the training predictions were being collected and plotted.
    glob_test_pred=np.append(glob_test_pred,predictions,axis=0)
#     print(predictionsTrai.round().astype(int))
#     print(vLTrai_subjectid)
#     print("EQUAL? : ", predictionsTrai.round().astype(int) == vLTrai_subjectid)

    print(predictions)
    print(vLTest_subjectid)
    print("EQUAL? : ", predictions.round().astype(int) == vLTest_subjectid)

    # Plot per fold 
    plot_bar_predictions_(pd.Series(predictions), (pid+"_"+config_tag+" - Fold "+str(fold)+" - Test Preds"))

plot_bar_predictions_(pd.Series(glob_test_pred), (pid+"_"+config_tag+" - All Folds - Test Preds"))
#     do_confusion_matrix(y_test=vLTest_subjectid, predictions=predictions.round().astype(int))
# plt.close('all')

### Bar Plot of Predictions for all folds, all patients 

In [None]:
# Bar plot of the on_off column of a predictions CSV taken from the
# "resiVecSVR_Fold_all" directory (all folds, per the section title).
# plot_bar_labels is reused here although the file holds predictions, not labels.
data_path = "/export/c08/lmorove1/kaldi/egs/beatPDivec/on_off_noinact_auto30/exp/ivec_450/resiVecSVR_Fold_all/"
sFileName = "objs_400_kernel_linear_c_0.2_eps_0.1.csv"
sSubchallenge = "on_off"
plot_title ="On/Off Predictions Labels"

plot_bar_labels(data_path, sFileName, sSubchallenge, plot_title)

In [None]:
# print(df_train_label.round())

# Load the tremor prediction CSV into df_train_label_trem.
# Nothing is plotted in this cell; the frames are only loaded.
data_path = "/export/c08/lmorove1/kaldi/egs/beatPDivec/trem_noinact_auto30/exp/ivec_450/resiVecSVR_Fold_all/"
sFileName = "objs_450_kernel_linear_c_0.02_eps_0.1.csv"
df_train_label_trem = pd.read_csv(data_path+sFileName)

# Load the dyskinesia CSV into df_train_label_dysk (data_path/sFileName are reused)
data_path = "/export/c08/lmorove1/kaldi/egs/beatPDivec/v1_dysk_auto/exp/ivec_500/resiVecSVR_Fold/"
sFileName = "Dyskinesia_testing.csv"
df_train_label_dysk = pd.read_csv(data_path+sFileName)

### Analyze the kfold distribution V2

In [None]:
# Plot the symptom-label occurrences of each k-fold (v2) train/test split.
data_type = "cis"
data_real_subtype=""

# NOTE: kfold_path is only (re)defined for the CIS dataset; for any other
# data_type the value from an earlier cell would silently be reused.
if data_type == "cis":
    kfold_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.k_fold_v2/"

# data_path is only referenced by the commented-out read at the end of the loop
data_path = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
data_path = data_path + data_type + "-pd.training_data/" + data_real_subtype + "/"

nb_folds = np.array([0,1,2,3,4])

# Only subject 1038 is analysed; swap with the commented line for all subjects
pids = np.array([1038])
#pids = np.array([1004,1006,1007,1019,1020,1023,1032,1034,1038,1039,1043,1044,1046,1048,1049,1051])

for temp_pid in pids:
    for nb in nb_folds:
        for train_or_test in ['train','test']:
            # Fold files are named "<pid>_<train|test>_kfold_<fold>.csv"
            file_name = str(temp_pid) + '_'+train_or_test+'_kfold_' + str(nb) + '.csv'
            print(file_name)
            df_train_label = pd.read_csv(kfold_path+file_name)
            # Compute the occurences of each symptoms for each patient

            df_occurences, df_train_label_subject_id = compute_symptoms_occurences_dataframe(
                df_train_label=df_train_label
            )

            # Plot the graphs
            plot_symptoms_occurences(
                df_occurences=df_occurences, df_train_label_subject_id=df_train_label_subject_id
            )

            #print(df_train_label.values[:,1:])
            #temp_train_X = pd.read_csv(data_path+df_train_label["measurement_id"][idx] + '.csv')

# CIS Database

## Distribution of the length of the files

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Histogram of the number of rows in each CIS-PD measurement file.
data_type = "cis"

# NOTE: data_dir, training_or_ancillary and data_real_subtype are NOT set in this
# cell; they must still be defined by earlier cells (hidden kernel state).
path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary,
                                                   data_real_subtype)
# One entry per measurement: the row count of "<measurement_id>.csv"
len_distribution = []
for idx in df_train_label.index:
        df_train_data = pd.read_csv(path_train_data + df_train_label["measurement_id"][idx] + ".csv")
        len_distribution.append(len(df_train_data))


num_bins = 10
n, bins, patches = plt.hist(len_distribution, num_bins, facecolor='blue', alpha=0.5)
plt.show()

# Shortest and longest file, in rows
print('min : ', min(len_distribution))
print('max : ', max(len_distribution))

### Remove inactivity with pct_change and plot the accelerometer data afterwards

In [None]:
# Plot one CIS-PD recording before and after removing inactivity with the
# pct_change-based method.
data_type = "cis"
data_dir = "/home/sjoshi/codes/python/BeatPD/data/BeatPD/"
path_save_accelerometer_plots = "/home/sjoshi/codes/python/BeatPD/code/accelerometer_plots/"
training_or_ancillary='training_data'

path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary)

# Recording to inspect
list_measurement_id = ["5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a"]

# Filter df_train_label according to the measurement_id we are most interested in
df_train_label = interesting_patients(df_train_label=df_train_label,
                                      list_measurement_id=list_measurement_id)

# "Before" plot: the recording as stored in path_train_data
plot_accelerometer(df_train_label=df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots)

# The return value is passed on below as path_inactivity.
# NOTE(review): presumably the directory holding the inactivity-removed files — confirm in transform_data.
path_no_inactivity_data = remove_inactivity_pct_change(df_train_label,
                                                       data_dir,
                                                       path_train_data,
                                                       data_type)

# "After" plot: same call with path_inactivity supplied
plot_accelerometer(df_train_label=df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots,
                   path_inactivity=path_no_inactivity_data)

### How to print accelerometers before/after and write a wav file for 1 file 

In [None]:
# Before/after view of one CIS-PD recording with high-pass based inactivity removal.
# NOTE: data_dir, training_or_ancillary and data_real_subtype must already be
# defined by earlier cells.
data_type = "cis"
path_save_accelerometer_plots = "/home/sjoshi/codes/python/BeatPD/code/accelerometer_plots/"
path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary,
                                                   data_real_subtype)

list_measurement_id = ['db2e053a-0fb8-4206-891a-6f079fb14e3a']

# Directory of the inactivity masks; used both when computing and when applying them
mask_path = '/home/sjoshi/codes/python/BeatPD/data/BeatPD/cis-pd.training_data.high_pass_mask/'

df_train_label = interesting_patients(df_train_label=df_train_label, list_measurement_id=list_measurement_id)

# Plot the accelerometer data before removal. The labels and the data path are
# passed explicitly, like every other plot_accelerometer call in this notebook;
# the call previously omitted both.
plot_accelerometer(df_train_label=df_train_label,
                   data_type=data_type,
                   path_train_data=path_train_data,
                   path_accelerometer_plots=path_save_accelerometer_plots)

remove_inactivity_highpass(
    df_train_label,
    energy_threshold=10,
    duration_threshold=3000,
    plot_frequency_response=True,
    plot_accelerometer_after_removal=True,
    mask_path=mask_path)


# Apply the mask to each selected recording and plot what remains
for idx in df_train_label.index:
    df_train_data = apply_mask(df_train_label["measurement_id"][idx],
                               mask_path=mask_path)
    great_title = get_plot_title(idx, df_train_label)

    print('AFTER REMOVAL')
    # Number of samples left after masking (was printed twice with the same value)
    print('len : ', len(df_train_data))
    # The time column is named "t" in REAL-PD files and "Timestamp" in CIS-PD files
    x_axis_data_type = "t" if data_type == "real" else "Timestamp"
    df_train_data.plot(
                    x=x_axis_data_type, legend=True, subplots=True, title=great_title
                )
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures
    plt.clf()
    plt.cla()
    plt.close()

# REAL-PD Database

In [None]:
# Symptom-label occurrences for one REAL-PD recording (smartphone accelerometer subtype).
# NOTE: data_dir is not set in this cell; it must be defined by an earlier cell.
data_type = "real"
data_real_subtype = 'smartphone_accelerometer'
training_or_ancillary='training_data' #training_data
path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary,
                                                   data_real_subtype)

# Recording to inspect
list_measurement_id=['b50d1b0c-2cd1-45f8-9097-0742e5cbbcc8']

# Filter df_train_label according to the measurement_id we are most interested in
df_train_label = interesting_patients(df_train_label=df_train_label, list_measurement_id=list_measurement_id)


# Compute the occurences of each symptoms for each patient
df_occurences, df_train_label_subject_id = compute_symptoms_occurences_dataframe(
    df_train_label=df_train_label
)

# Plot the graphs
plot_symptoms_occurences(
    df_occurences=df_occurences, df_train_label_subject_id=df_train_label_subject_id
)

## Distribution of the length of the files

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Histogram of the number of rows per measurement file, one histogram per
# REAL-PD sensor subtype.
# NOTE: data_dir is not set in this cell; it must be defined by an earlier cell.
data_type = "real"
training_or_ancillary='training_data'
# Immediately replaced by the loop variable below
data_real_subtype='smartphone_accelerometer'

for data_real_subtype in ['smartphone_accelerometer','smartwatch_accelerometer','smartwatch_gyroscope']:
    path_train_data, df_train_label = define_data_type(data_type,
                                                   data_dir,
                                                   training_or_ancillary,
                                                   data_real_subtype)
    len_distribution = []

    for idx in df_train_label.index:
        try:
            df_train_data = pd.read_csv(path_train_data + df_train_label["measurement_id"][idx] + ".csv")
            len_distribution.append(len(df_train_data))
        except FileNotFoundError:
            # Measurements without a file for this subtype are skipped on purpose
            continue
    print(data_real_subtype)
    num_bins = 10
    n, bins, patches = plt.hist(len_distribution, num_bins, facecolor='blue', alpha=0.5)
    plt.show()


# Confusion Matrix

In [None]:
def do_confusion_matrix(y_test, predictions):
    """
    Print and plot the confusion matrix of `predictions` against `y_test`.

    Keyword Arguments:
    - y_test: true labels
    - predictions: predicted labels (same length as y_test, already discretised)

    The raw counts are printed first, then the matrix normalised to the
    percentage of all test samples, which is also shown as a heat map.
    """
    print('y test : ', np.unique(y_test))
    # Rows/columns must cover every class seen in EITHER vector (sorted), otherwise
    # the tick labels do not line up with the matrix when a class is only predicted.
    LABELS_NEW = np.union1d(y_test, predictions)
    # Number of classes as an integer; np.arange needs a count, not the label array
    n_classes = len(LABELS_NEW)
    print("Confusion Matrix:")
    confusion_matrix = metrics.confusion_matrix(y_test, predictions, labels=LABELS_NEW)
    print(confusion_matrix)
    normalised_confusion_matrix = np.array(confusion_matrix, dtype=np.float32)/np.sum(confusion_matrix)*100

    print("")
    print("Confusion matrix (normalised to % of total test data):")
    print(normalised_confusion_matrix)

    # Plot Results:
    width = 12
    height = 12
    plt.figure(figsize=(width, height))
    plt.imshow(
        normalised_confusion_matrix,
        interpolation='nearest',
        cmap=plt.cm.rainbow
    )
    plt.title("Confusion matrix \n(normalised to % of total test data)")
    plt.colorbar()
    # One tick per class, labelled with the class value
    tick_marks = np.arange(n_classes)
    plt.xticks(tick_marks, LABELS_NEW, rotation=90)
    plt.yticks(tick_marks, LABELS_NEW)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()