<center><h1>Parkinson disease diagnostic using time series data</h1></center>
<center><h2>Data preprocessing</h2></center>

# Importation of librairies

In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Import all the txt files of each patient

In [2]:
folder_path = 'gait-in-parkinsons-disease-1.0.0'
txt_files = [file for file in os.listdir(folder_path) if file.endswith('.txt')][2:]
# get index of 'SHA256SUMS.txt' file
txt_files.index('SHA256SUMS.txt')
# remove the index from the list
txt_files.pop(txt_files.index('SHA256SUMS.txt'))

'SHA256SUMS.txt'

# Data preprocessing functions

In [None]:
def get_start_condition(foot_data, foot):
    """
    Returns a boolean mask indicating the start conditions for foot lifts off from the ground.

    Parameters:
    - foot_data: DataFrame containing foot sensor data.
    - foot: Name of the foot sensor.

    Returns:
    - Boolean mask indicating start conditions.
    """

    # Identify periods where foot sensor readings are zero for at least 6 consecutive samples
    mask = foot_data.loc[foot_data.eq(0).sum(axis=1) >= 6]
    # Define start conditions based on foot sensor readings and their shifts
    return (foot_data[foot] < mask[foot].max() + 10) & (foot_data[foot].shift(1) > mask[foot].max() + 10)

def get_start_foot_air(foot_data, foot):
    """
    Returns indices where the foot lifts off from the ground.

    Parameters:
    - foot_data: DataFrame containing foot sensor data.
    - foot: Name of the foot sensor.

    Returns:
    - Array of indices where foot lifts off from the ground.
    """
    # Obtain start conditions using the previous function
    start = get_start_condition(foot_data, foot)
    # Get indices where start conditions are met
    start = np.where(start == True)[0]
    return start

def get_end_conditions(foot_data, foot):
    """
    Returns a boolean mask indicating the end conditions for foot lifts off from the ground.

    Parameters:
    - foot_data: DataFrame containing foot sensor data.
    - foot: Name of the foot sensor.

    Returns:
    - Boolean mask indicating end conditions.
    """

    # Identify periods where foot sensor readings are zero for at least 6 consecutive samples
    mask = foot_data.loc[foot_data.eq(0).sum(axis=1) >= 6]
    # Define end conditions based on foot sensor readings and their shifts
    return (foot_data[foot] > mask[foot].max() + 10) & (foot_data[foot].shift(1) < mask[foot].max()  + 10)

def get_end_foot_air(foot_data, foot):
    """
    Returns indices where the foot lmakes contact with the ground.

    Parameters:
    - foot_data: DataFrame containing foot sensor data.
    - foot: Name of the foot sensor.

    Returns:
    - Array of indices where foot makes contact with the ground.
    """

    # Obtain end conditions using the previous function
    end = get_end_conditions(foot_data, foot)
    # Get indices where end conditions are met
    end = np.where(end == True)[0]
    return end

def get_stride_and_swing_data(foot_data, start, end):
    """
    Computes various parameters related to stride and swing times.

    Parameters:
    - foot_data: DataFrame containing foot sensor data.
    - start: Array of indices where foot lifts off from the ground.
    - end: Array of indices where foot makes contact with the ground.

    Returns:
    - Various parameters related to stride and swing times.
    """

    # Initialize empty lists to store stride and swing times
    stride_time = []
    swing_time = []

    # Initialize a variable to keep track of the current position in time
    x = 0
    
    for i in range(min(len(start), len(end))):
        # Calculate stride and swing times based on indices
        stride_time.append(foot_data["time"][end[i]] - foot_data["time"][x])
        swing_time.append(foot_data["time"][end[i]] - foot_data["time"][start[i]])
        x = end[i] + 1

    # Calculate standard deviations, max, min, coefficients of variation, ratios, etc.
    std_stride_time = np.std(stride_time)
    std_swing_time = np.std(swing_time)
    max_swing_time = max(swing_time)
    min_swing_time = min(swing_time)
    cv_stride_time = (100 * (std_stride_time / np.mean(stride_time)))
    cv_swing_time = (100 * (std_swing_time / np.mean(swing_time)))
    swing_stride_ratio = ((np.array(swing_time) / np.array(stride_time)) * 100)
    log_swing_stride_ratio = (100 * np.log(min_swing_time / max_swing_time))

    return stride_time, swing_time, std_stride_time, std_swing_time, max_swing_time, min_swing_time, cv_stride_time, cv_swing_time, swing_stride_ratio, log_swing_stride_ratio
            

def get_double_stance_time(df_p, start_left, end_left, start_right, end_right):
    """
    Computes the double stance time during walking.

    Parameters:
    - df_p: DataFrame containing relevant data.
    - start_left: Array of indices where left foot makes contact with the ground.
    - end_left: Array of indices where left foot lifts off from the ground.
    - start_right: Array of indices where right foot makes contact with the ground.
    - end_right: Array of indices where right foot lifts off from the ground.

    Returns:
    - Array containing double stance times.
    """
    # Initialize an empty list to store double stance times
    double = []
    # Create lists of pairs of indices representing air time for each foot
    right_air = [[x, y] for x, y in zip(start_right, end_right)]
    left_air = [[x, y] for x, y in zip(start_left, end_left)]

    # Initialize a variable to keep track of the current position in time
    x = 0 

    # Determine the minimum length between left and right foot air times
    length = min(len(right_air), len(left_air))

    # Loop over the common length of left and right foot air times
    for i in range(length):
        # Determine the minimum and maximum start and end times for the current step
        min_start = min(right_air[i][0], left_air[i][0])
        max_start = max(right_air[i][0], left_air[i][0])
        min_end = min(right_air[i][1], left_air[i][1])
        max_end = max(right_air[i][1], left_air[i][1])

        # Check for gaps between left and right foot contacts
        if x < min_start:
            # Compute and append the double stance time for the gap
            double.append(df_p['time'][min_start] - df_p['time'][x])

        # Check for overlapping periods of left and right foot contacts
        if min_end < max_start:
            # Compute and append the double stance time for the overlap
            double.append(df_p['time'][max_start] - df_p['time'][min_end])

        # Update the current position in time to the end of the current step
        x = max_end

    # Handle remaining contacts if one foot has more contacts than the other
    if len(right_air) > len(left_air):
        for i in range(len(left_air), len(right_air)):
            # Check for gaps in right foot contacts
            if x < right_air[i][0]:
                # Compute and append the double stance time for the gap
                double.append(df_p['time'][right_air[i][0]] - df_p['time'][x])
            # Update the current position in time to the end of the current right foot step
            x = right_air[i][1]
    else:
        for i in range(len(right_air), len(left_air)):
            # Check for gaps in left foot contacts
            if x < left_air[i][0]:
                # Compute and append the double stance time for the gap
                double.append(df_p['time'][left_air[i][0]] - df_p['time'][x])
            # Update the current position in time to the end of the current left foot step
            x = left_air[i][1]

    # Return the array containing double stance times
    return double

# Preprocess the data

In [33]:
def get_caracteristics(df_p):
    """
    Computes various characteristics related to gait analysis using foot sensor data.

    Parameters:
    - df_p: DataFrame containing relevant data.

    Returns:
    - List containing computed characteristics.
    """
    # Extract left and right foot sensor data from the DataFrame
    left_foot_data = df_p[['time', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L']]
    right_foot_data = df_p[['time', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R']]

    # Obtain indices of foot contacts and lifts for both left and right feet
    start_right = get_start_foot_air(right_foot_data, 'R')
    end_right = get_end_foot_air(right_foot_data, 'R')
    start_left = get_start_foot_air(left_foot_data, 'L')
    end_left = get_end_foot_air(left_foot_data, 'L')

    # Calculate stride and swing characteristics for both left and right feet
    stride_time_right, swing_time_right, std_stride_right, std_swing_right, max_swing_right, min_swing_right, \
    cv_stride_right, cv_swing_right, swing_stride_ratio_right, log_right = get_stride_and_swing_data(right_foot_data, start_right, end_right)

    stride_time_left, swing_time_left, std_stride_left, std_swing_left, max_swing_left, min_swing_left, \
    cv_stride_left, cv_swing_left, swing_stride_ratio_left, log_left = get_stride_and_swing_data(left_foot_data, start_left, end_left)

    # Calculate double stance time during walking
    double_stance_time = get_double_stance_time(df_p, start_left, end_left, start_right, end_right)

    # Return a list containing computed characteristics
    return [np.mean(stride_time_right), np.mean(swing_time_right), std_stride_right, std_swing_right, max_swing_right, min_swing_right, cv_stride_right, \
            cv_swing_right, np.mean(swing_stride_ratio_right), log_right, \
            np.mean(stride_time_left), np.mean(swing_time_left), std_stride_left, std_swing_left, max_swing_left, min_swing_left, cv_stride_left, \
            cv_swing_left, np.mean(swing_stride_ratio_left), log_left, np.mean(double_stance_time)]


# Save caracteristics of each patient in a dataframe

In [64]:
preprocessed_data = []
columns = ['ID', 'mean_stride_time_right', 'mean_swing_time_right', 'std_stride_right', 'std_swing_right', 'max_swing_right', 'min_swing_right', 'cv_stride_right', \
            'cv_swing_right', 'mean_swing_stride_ratio_right', 'log_right', \
            'mean_stride_time_left', 'mean_swing_time_left', 'std_stride_left', 'std_swing_left', 'max_swing_left', 'min_swing_left', 'cv_stride_left', \
            'cv_swing_left', 'mean_swing_stride_ratio_left', 'log_left', 'mean_double_stance_time']

for file in txt_files:
    if file[8] == '1': # We only take regular patients not the onew performing dual-task walking and those are the ones with a 1 in the 9th position of the file name
        # Read the data of the patient from the file
        df_p = pd.read_csv(folder_path + "/" + file, sep='\t')
        # Rename the columns for easier access
        df_p.columns = ['time', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'L', 'R']
        # get the caracteristics of the patient
        caracteristics = get_caracteristics(df_p)
        # add the ID of the patient to the list
        caracteristics.insert(0, file[:6])
        # append the caracteristics to the list of caracteristics
        preprocessed_data.append(caracteristics)

# Create a DataFrame from the list of caracteristics
df = pd.DataFrame(preprocessed_data, columns=columns)

df

Unnamed: 0,ID,mean_stride_time_right,mean_swing_time_right,std_stride_right,std_swing_right,max_swing_right,min_swing_right,cv_stride_right,cv_swing_right,mean_swing_stride_ratio_right,...,mean_swing_time_left,std_stride_left,std_swing_left,max_swing_left,min_swing_left,cv_stride_left,cv_swing_left,mean_swing_stride_ratio_left,log_left,mean_double_stance_time
0,GaCo01,1.232388,0.501716,0.090278,0.033513,0.5900,0.3600,7.325457,6.679663,40.800549,...,0.515820,0.103633,0.063560,0.5999,0.1000,8.541046,12.322112,42.485962,-179.159279,0.374014
1,GaCo02,1.099281,0.471161,0.157617,0.066424,0.8900,0.0100,14.338227,14.097944,43.031592,...,0.489966,0.120293,0.050259,0.7599,0.2200,10.993286,10.257646,45.317046,-123.955930,0.076399
2,GaCo03,1.248558,0.541105,0.116528,0.039434,0.7799,0.5000,9.332993,7.287708,43.458051,...,-0.780357,0.156130,0.111758,-0.6900,-1.3099,12.569836,-14.321416,-67.847389,64.101448,0.748853
3,GaCo04,1.237852,0.516249,0.089756,0.027285,0.5900,0.4100,7.250967,5.285171,41.942028,...,-0.746642,0.141206,0.057937,-0.6599,-0.9600,11.465549,-7.759619,-99.092476,37.484498,0.739070
4,GaCo05,1.039058,0.391710,0.049033,0.031493,0.4300,0.1900,4.718949,8.039892,37.716744,...,-0.668037,0.088362,0.051481,-0.6100,-0.9799,8.541963,-7.706329,-66.373725,47.399157,0.662649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,SiPt36,1.062049,0.438907,0.069181,0.022555,0.4900,0.3000,6.513928,5.138950,41.409734,...,0.481821,0.037910,0.023214,0.5800,0.3699,3.584462,4.818011,45.557965,-44.979540,0.075616
161,SiPt37,1.200018,-0.675003,0.072358,0.050045,-0.3899,-0.8300,6.029733,-7.414112,-56.446385,...,0.530571,0.072575,0.044216,0.6699,0.2400,6.017981,8.333575,43.984573,-102.648952,0.680154
162,SiPt38,1.148092,-0.678320,0.075983,0.025765,-0.5899,-0.8199,6.618169,-3.798358,-59.690842,...,0.481312,0.039348,0.018399,0.5599,0.4199,3.410994,3.822715,41.741058,-28.774161,0.680242
163,SiPt39,1.056712,-0.627277,0.076340,0.018663,-0.5900,-0.7300,7.224335,-2.975298,-60.194407,...,0.481468,0.033823,0.024071,0.5600,0.3500,3.185784,4.999468,45.381500,-47.000363,0.608625


# import the demographic dataset to merge it with the preprocessed time series data

In [3]:
demo = pd.read_excel('gait-in-parkinsons-disease-1.0.0\demographics.xls')

# Remove the patient with ID Juc010 because it is not in the time series data
demo = demo.drop(demo[demo['ID'] == 'Juc010'].index)

demo

Unnamed: 0,ID,Study,Group,Subjnum,Gender,Age,Height (meters),Weight (kg),HoehnYahr,UPDRS,UPDRSM,TUAG,Speed_01 (m/sec),Speed_10
0,GaPt03,Ga,PD,3,female,82,1.45,50.0,3.0,20.0,10.0,36.34,,0.778
1,GaPt04,Ga,PD,4,male,68,1.71,,2.5,25.0,8.0,11.00,0.642,0.818
2,GaPt05,Ga,PD,5,female,82,1.53,51.0,2.5,24.0,5.0,14.50,0.908,0.614
3,GaPt06,Ga,PD,6,male,72,1.70,82.0,2.0,16.0,13.0,10.47,0.848,0.937
4,GaPt07,Ga,PD,7,female,53,1.67,54.0,3.0,44.0,22.0,18.34,0.677,0.579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,SiCo26,Si,CO,26,male,60,1.74,72.0,,,,9.20,1.000,
162,SiCo27,Si,CO,27,male,67,1.90,95.0,,,,12.52,1.120,
163,SiCo28,Si,CO,28,male,61,1.70,72.0,,,,12.65,0.990,
164,SiCo29,Si,CO,29,male,53,1.70,87.0,,,,11.41,1.290,


## merge the two dataframes

In [111]:
merged_df = pd.merge(demo, df, on='ID')

#Replace NaN values with appropriate metrics
merged_df['Height (meters)'] = merged_df['Height (meters)'].fillna(merged_df['Height (meters)'].mean())
merged_df['Weight (kg)'] = merged_df['Weight (kg)'].fillna(merged_df['Weight (kg)'].mean())
merged_df['HoehnYahr'] = merged_df['HoehnYahr'].fillna(merged_df['HoehnYahr'].median())
merged_df['UPDRS'] = merged_df['UPDRS'].fillna(merged_df['UPDRS'].mean())
merged_df['UPDRSM'] = merged_df['UPDRSM'].fillna(merged_df['UPDRSM'].mean())
merged_df['TUAG'] = merged_df['TUAG'].fillna(merged_df['TUAG'].mean())
merged_df['Speed_01 (m/sec)'] = merged_df['Speed_01 (m/sec)'].fillna(merged_df['Speed_01 (m/sec)'].mean())
merged_df['Speed_10'] = merged_df['Speed_10'].fillna(merged_df['Speed_10'].mean())

merged_df.to_csv('preprocessed-data\preprocessed_data.csv', index=False)

merged_df

Unnamed: 0,ID,Study,Group,Subjnum,Gender,Age,Height (meters),Weight (kg),HoehnYahr,UPDRS,...,mean_swing_time_left,std_stride_left,std_swing_left,max_swing_left,min_swing_left,cv_stride_left,cv_swing_left,mean_swing_stride_ratio_left,log_left,mean_double_stance_time
0,GaPt03,Ga,PD,3,female,82,1.45,50.000000,3.0,20.000000,...,0.543326,0.250426,0.124893,0.7299,0.0100,16.856355,22.986800,36.693073,-429.032245,0.935088
1,GaPt04,Ga,PD,4,male,68,1.71,72.524691,2.5,25.000000,...,-0.859187,0.182767,0.161860,-0.7200,-1.8899,14.242531,-18.838775,-69.058996,96.502798,0.865798
2,GaPt05,Ga,PD,5,female,82,1.53,51.000000,2.5,24.000000,...,-0.609242,0.108519,0.044731,-0.5400,-0.8200,10.185169,-7.342067,-61.899132,41.773520,0.606564
3,GaPt06,Ga,PD,6,male,72,1.70,82.000000,2.0,16.000000,...,0.413048,0.051538,0.019363,0.4600,0.3000,4.494682,4.687939,36.047953,-42.744401,0.154679
4,GaPt07,Ga,PD,7,female,53,1.67,54.000000,3.0,44.000000,...,0.535093,0.535390,0.079130,0.7000,0.1800,36.544871,14.788033,38.060962,-135.812348,0.592562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,SiCo26,Si,CO,26,male,60,1.74,72.000000,2.0,21.626866,...,0.514308,0.042496,0.017759,0.5699,0.4300,3.520216,3.452996,42.638873,-28.167570,0.706111
161,SiCo27,Si,CO,27,male,67,1.90,95.000000,2.0,21.626866,...,0.483932,0.029749,0.016072,0.5299,0.4400,2.881987,3.321194,46.890049,-18.591358,0.054823
162,SiCo28,Si,CO,28,male,61,1.70,72.000000,2.0,21.626866,...,-0.742298,0.077380,0.034445,-0.6599,-0.8999,6.615770,-4.640265,-64.043164,31.019534,0.732547
163,SiCo29,Si,CO,29,male,53,1.70,87.000000,2.0,21.626866,...,-0.638376,0.080804,0.022878,-0.6099,-0.7699,7.772106,-3.583841,-62.603731,23.296563,0.634429


In [13]:
X = []
y = []
y_multi_class = []
for file in txt_files:
    patient_class = 0
    patient_adv_class = 0
    # Read the data of the patient from the file
    df_p = pd.read_csv(folder_path + "/" + file, sep='\t')
    # Rename the columns for easier access
    df_p.columns = ['time', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'L', 'R']
    if file[2:4] == 'Co':
        patient_class = 0
    elif file[2:4] == 'Pt':
        patient_class = 1

    patient_adv_class = demo[demo['ID'] == file[:6]]['HoehnYahr'].values[0]
    #if patient_adv_class is nan print the file name
    if np.isnan(patient_adv_class):
        patient_adv_class = 0
    elif patient_adv_class == 2:
        patient_adv_class = 1
    elif patient_adv_class == 2.5:
        patient_adv_class = 2

    df_np = df_p.iloc[1:].to_numpy()
    full_size = 100
    overlap = 0.5
    overlap_size = int(full_size * overlap / 2)
    entry_size = full_size - overlap_size

    for i in range(0, df_np.shape[0], entry_size):
        if df_np.shape[0] >= i + entry_size + overlap_size:
            y.append(patient_class)
            y_multi_class.append(patient_adv_class)
            X.append(df_np[i:i + entry_size + overlap_size, :])
    

        
X = np.array(X)
y = np.array(y)
y_multi_class = np.array(y_multi_class)
X.shape, y.shape, y_multi_class.shape

((44021, 100, 19), (44021,), (44021,))

In [8]:
# save X and y
np.save('preprocessed-data\X.npy', X)
np.save('preprocessed-data\y.npy', y)
np.save('preprocessed-data\y_multi_class.npy', y_multi_class)