In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the speech-features dataset from the CSV file (path is relative to the notebook)
data = pd.read_csv("Data/pd_speech_features.csv")

In [None]:
# Preview the first rows, limited to the first 10 columns
print(data.head().iloc[:, :10])

In [None]:
# General information of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
data.info()

In [None]:
# Count the missing values over the whole dataset (all rows and columns)
print(data.isna().to_numpy().sum())

In [None]:
# Plot variables
def plot_variables(df, n_rows, n_cols):
    """Draw one box plot per feature column, split by the ``class`` column.

    The feature columns are all columns after the first two (patient id and
    gender) except ``class`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``class`` column and the feature columns.
    n_rows : int
        Requested number of rows in the subplot grid. If the requested grid
        cannot hold every feature, the number of rows is increased until it
        can.
    n_cols : int
        Number of columns in the subplot grid (values below 1 are treated
        as 1).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    classe = df["class"].astype("category")

    # Features to plot: skip the two leading columns and the class label
    feature_columns = [column for column in df.columns[2:] if column != "class"]
    if not feature_columns:
        return

    # Grid for plots: grow the row count when the requested grid is too small
    # (ceil division without importing math)
    n_cols = max(1, n_cols)
    rows_needed = -(-len(feature_columns) // n_cols)
    n_rows = max(n_rows, rows_needed)

    # About 5 x 5.25 inches per subplot, i.e. roughly 50 x 400 for a 76 x 10 grid
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5.25 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    # Create plots; label each subplot through its own Axes object
    # (plt.xlabel / plt.ylabel would only touch the current, i.e. last, axes)
    for ax, column in zip(axes, feature_columns):
        sns.boxplot(x=classe, y=df[column], ax=ax)
        ax.set_xlabel("Class")
        ax.set_ylabel(column)

    # Hide the grid cells that did not receive a plot
    for ax in axes[len(feature_columns):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()


In [None]:
# Box plots of every raw feature, split by class
plot_variables(data, n_rows=76, n_cols=10)

In [None]:
# Descriptive statistics of the first 10 columns and the last one
print(data.iloc[:, [*range(10), -1]].describe())

In [None]:
# Gender vs. class counts. NOTE: this counts observations (rows), not patients;
# presumably there are 3 recordings per patient, so divide by 3 for patient counts - TODO confirm
sns.countplot(data=data, x="class", hue="gender")
plt.show()

From the first descriptive statistics obtained:
The dataset has 756 rows and 755 columns (749 float64 and 6 int64).
The first column is the patient ID, the second column is gender (an integer that should be replaced with a categorical variable) and the last column is class (also represented as an integer).

For the gender variable, the data is balanced between men and women.
For the class variable, the data is imbalanced: about 3/4 of the data come from Parkinson's patients and 1/4 from healthy subjects.

There is no null data.
The data requires normalization: the variable PPE has a minimum of around 0.045 and a maximum of around 0.9, with a mean of 0.74, whereas the variable numPulses has a minimum of 2 and a maximum of 907, with a mean of around 324.

## Standardization

For standardization, do NOT use MinMaxScaler. As the data shown previously indicates, there are many outliers that would distort the scaling.
Instead, RobustScaler can be used, since it is not sensitive to outliers and does not assume a Gaussian distribution.


In [None]:
from sklearn.preprocessing import RobustScaler

# RobustScaler normalization of the feature columns only
# (columns 0-1 are patient id and gender, column 754 is the class label)
normal_data = RobustScaler().fit_transform(data.iloc[:, 2:754])

# Reuse the original index: pd.concat(axis=1) aligns on the index, so a frame
# with a fresh RangeIndex would misalign rows if `data` were ever filtered or re-indexed
norm_df = pd.DataFrame(normal_data, columns=data.columns[2:754], index=data.index)

# Reassemble in the original column order: id, gender, scaled features, class
norm_data = pd.concat([data.iloc[:, :2], norm_df, data.iloc[:, 754]], axis=1)

# Scaling must not add or drop rows or columns
assert norm_data.shape == data.shape, "scaled frame does not match the original shape"
norm_data

In [None]:
# Create new file with normalization data
# norm_data.to_csv("Data/norm_data.csv", index=False)

In [None]:
# Box plots of every scaled feature. A 76 x 10 grid (the same one used for the
# raw data) is needed to hold the 752 feature plots; a 10 x 10 grid cannot.
plot_variables(norm_data, 76, 10)

In [None]:
# Column names of each feature group, selected by position in norm_data.
# Gender_Features and Class are single column names (strings); the rest are lists.
Gender_Features = norm_data.columns[1]
Baseline_Features = list(norm_data.columns[2:23])       # 21 columns
Time_Freq_Features = list(norm_data.columns[23:34])     # 11 columns
Vocal_Fold_Features = list(norm_data.columns[34:56])    # 22 columns
MFCC_Features = list(norm_data.columns[56:140])         # 84 columns
WT_FT_Features = list(norm_data.columns[140:322])       # 182 columns
TQWT_Features = list(norm_data.columns[322:754])        # 432 columns
Class = norm_data.columns[754]

In [None]:
# Correlation of different Features groups
# (only these four groups are plotted; WT_FT_Features and TQWT_Features are not
# included here - presumably because of their size, TODO confirm)

feature_groups = [Baseline_Features, Time_Freq_Features, 
                  Vocal_Fold_Features, MFCC_Features]
feature_names = ["Baseline_Features", "Time_Freq_Features", 
                 "Vocal_Fold_Features", "MFCC_Features"]
for group_name, features in zip(feature_names, feature_groups):
    selected_features = norm_data[features]
    correlation_matrix = selected_features.corr()
    fig, ax = plt.subplots(figsize=(8, 8))
    # Fix the colour scale to [-1, 1] so the neutral colour of the diverging
    # colormap means zero correlation and all heatmaps are directly comparable
    sns.heatmap(correlation_matrix, cmap="coolwarm", vmin=-1, vmax=1,
                square=True, ax=ax)
    ax.set_title(f"Heatmap of {group_name}")
    plt.show()

Some of the variables within each feature group are highly correlated. At some point it could be interesting to perform feature selection or feature extraction to reduce dimensionality and redundancy.