In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from helper_functions import graph

# Parkinson's Disease Classification EDA

The dataset comes with almost no documentation beyond the study it stems from, so the goal of this notebook is to uncover as much as we can about the data directly.

In [3]:
# Location of the raw CSV. The PD_SPEECH_FEATURES_CSV environment variable
# overrides the original machine-specific default, so the notebook can be run
# on another machine without editing this cell.
RAW_DATA_PATH = os.environ.get(
    "PD_SPEECH_FEATURES_CSV",
    "/Users/marko/Parkinson_Classification/data/raw/pd_speech_features.csv",
)

# header=1 takes the column names from the second line of the file.
# NOTE(review): presumably the first line holds feature-group labels - confirm against the raw CSV.
raw_df = pd.read_csv(RAW_DATA_PATH, header=1)
raw_df.head()

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,...,1.562,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.178,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.846,6.265,4.0603,1
4,1,0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1


755 features! A cursory glance suggests that many of these features were one-hot encoded, or that time series were spread across separate columns, which is quite interesting. Anyhow, continuing with our analysis:

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of raw_df.
raw_df.describe()

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
count,756.0,756.0,756.0,756.0,756.0,756.0,756.0,756.0,756.0,756.0,...,756.0,756.0,756.0,756.0,756.0,756.0,756.0,756.0,756.0,756.0
mean,125.5,0.515873,0.746284,0.700414,0.489058,323.972222,322.678571,0.00636,0.000383,0.002324,...,26.237251,22.840337,18.587888,13.872018,12.218953,12.375335,14.79923,14.751559,31.48111,0.746032
std,72.793721,0.500079,0.169294,0.069718,0.137442,99.219059,99.402499,0.001826,0.000728,0.002628,...,42.220693,32.626464,25.537464,20.046029,17.783642,16.341665,15.722502,14.432979,34.230991,0.435568
min,0.0,0.0,0.041551,0.5435,0.1543,2.0,1.0,0.002107,1.1e-05,0.00021,...,1.5098,1.5317,1.5829,1.7472,1.7895,1.6287,1.8617,1.9559,2.364,0.0
25%,62.75,0.0,0.762833,0.647053,0.386537,251.0,250.0,0.005003,4.9e-05,0.00097,...,2.408675,3.4528,3.354825,3.07745,2.937025,3.114375,3.665925,3.741275,3.94875,0.0
50%,125.5,1.0,0.809655,0.700525,0.484355,317.0,316.0,0.006048,7.7e-05,0.001495,...,5.5863,7.06275,6.0774,4.77085,4.30045,4.74145,6.7257,7.33425,10.63725,1.0
75%,188.25,1.0,0.834315,0.754985,0.586515,384.25,383.25,0.007528,0.000171,0.00252,...,28.958075,29.83085,21.94405,13.188,10.87615,12.201325,21.92205,22.495175,61.125325,1.0
max,251.0,1.0,0.90766,0.85264,0.87123,907.0,905.0,0.012966,0.003483,0.02775,...,239.7888,203.3113,121.5429,102.207,85.5717,73.5322,62.0073,57.5443,156.4237,1.0


Alright, nothing particularly meaningful or interesting comes out of these routine checks. I will, however, be interested in looking at, say, the class of variables indexed by time to see if there's any discrepancy. The other thing worth doing is simply comparing the features most correlated with the outcome against the features cited in the paper. According to the paper (linked in the Sources section at the end), the most popular baseline features are: jitter, shimmer, fundamental frequency parameters, harmonicity parameters, RPDE, DFA, and PPE.

In [6]:
TOP_N_CORR = 50    # number of strongest correlations to plot
FIG_DIR = "./figs"  # output directory for saved figures

# Spearman rank correlation of every feature with the outcome column.
# "id" is a subject identifier rather than a measured feature, so it is
# excluded together with the target itself.
feature_df = raw_df.drop(columns=["class", "id"])
corr_mat = feature_df.apply(lambda col: col.corr(raw_df["class"], method="spearman"))

# TOP_N_CORR largest correlations in absolute value, displayed in ascending
# signed order. Undefined (NaN) correlations are dropped explicitly first, so
# the selection does not depend on how a positional argsort encodes NaN.
abs_corr = corr_mat.abs().dropna()
top_features = abs_corr.nlargest(TOP_N_CORR).index
largest_corr = corr_mat.loc[top_features].sort_values(ascending=True)

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(FIG_DIR, exist_ok=True)

graph.barplot(largest_corr,
              savefig=True,
              title=None,
              xlab='Spearman Correlation with Class')
# NOTE(review): graph.barplot is also called with savefig=True; its behavior is
# not visible here - confirm the figure is not being written twice.
plt.savefig(os.path.join(FIG_DIR, "spearman_corr_to_class.png"), bbox_inches="tight")

I see some of the features mentioned in the paper - jitter, fundamental frequency parameters, and quite a lot of the time-series elements are all negatively correlated with the class. However, we need to test the significance of time trends in the entropy, TKEO, and stdValue features to see if we can perhaps combine these to condense the data down.

In [21]:
def na_pct(col):
    """Return the fraction of missing (NaN/None) entries in a pandas Series.

    Parameters
    ----------
    col : pandas.Series
        The column to inspect.

    Returns
    -------
    float
        Share of entries for which ``isna()`` is true, between 0 and 1.
        An empty column yields NaN instead of raising ZeroDivisionError.
    """
    # The mean of the boolean mask is the missing share; no filtered copy needed.
    return col.isna().mean()

# Missing-value share of every column (apply works column-wise by default).
na_pcts = raw_df.apply(na_pct)
# Keep only the columns whose share is not zero.
na_pcts.loc[na_pcts.ne(0)]

Series([], dtype: float64)

Surprisingly, there is no missing data in any of the columns. We can also check for duplicate IDs.

In [23]:
# Number of rows present for each distinct value of "id".
raw_df["id"].value_counts()

0      3
173    3
160    3
161    3
162    3
      ..
88     3
89     3
90     3
91     3
251    3
Name: id, Length: 252, dtype: int64

Okay, so it appears as though each patient was recorded three times. Let us check whether the outcome is consistent across these recordings for all patients - it would be surprising if the outcome for some patients changed between measurements. Let us see:

In [31]:
# Variance of the outcome within each id; any entry that is not zero means the
# rows sharing that id disagree on the class label.
id_var = raw_df.groupby("id")["class"].var()
id_var.loc[id_var.ne(0)]

Series([], Name: class, dtype: float64)

That's a good sanity check - there's no variance in the outcomes for data with the same IDs. However, the variances for other features such as "PPE" are nonzero, so these columns are not immediately redundant. 

# Sources

The original paper is given here: https://www.sciencedirect.com/science/article/pii/S1568494618305799?via%3Dihub