# Investigating the amount of Zero and NaN values for the GITA database 

# Imports

In [1]:
import os
import numpy as np
import pandas as pd


from functions import *
from definition_of_data_and_feature_groups import *
from constants import *


# Setup for the experiment 

### Things to choose: 

In [2]:
###########################################

# Seed number 
seed_number = 42
np.random.seed(seed_number) 

# Features 
features_for_model = all_features # phonation_all_features # [] or [articulation_all_features + phonation_all_features + prosody_all_features]

print(len(ZOF))
print(len(all_features))

# Utterances 
utterance_type =  [""] # [""] # ["Vowels", "modulated_vowels"] or etc. 
specific_utterance = [""] # ["a", "viaje"] or etc. 

feature_sheet_name = "GITA-all" # "GITA-all" # "EWA-100" # EWA-balanced-69 # Sheet name we want to get features from. 
fold_file = "kfold-groups-tsv.csv"

###########################################

213
619


# Data processing 
- Get features, metadata and fold info
- Change all ID columns to be on the same form as fold info --> Like: "ID" and "A0013 or AC0013"
- Add metadata columns to feature info 
- Restructure the data.

In [3]:

feature_path = personal_path_to_code + '/Features.xlsx'   # Defines path to feature folder to use. 
features = pd.read_excel(feature_path, sheet_name=feature_sheet_name, index_col=0)

metadata_path = os.path.join(personal_path_to_PC_GITA,'PCGITA_metadata.xlsx')
metadata = pd.read_excel(metadata_path)

fold_info_path = os.path.join(personal_path_to_balanced_folds, fold_file) # File where we have the fold distribution saved. 
fold_info = pd.read_csv(fold_info_path)

metadata = restructure_id(metadata)

metadata_columns =  ["SEX", "AGE"] 
fold_info_columns = ["Fold"]
feature_info_columns = ["ID", "Utterance", "Utterance type", "Group"]

# patient_info_columns will be used later on to remove all metadata and then also want to remove id. 
patient_info_columns = feature_info_columns + metadata_columns + fold_info_columns

# Add metadata and fold info: 
features = add_columns_to_dataframe(features, metadata, metadata_columns)
features = add_columns_to_dataframe(features, fold_info, fold_info_columns)


# Sorting out the data and features we want to use: 
if utterance_type != [""]: # If defined utterance sub groups (like vowels or words or combinations)
        features = features[features["Utterance type"].isin(utterance_type)]

if specific_utterance != [""]: # If defined specific utterances (like "a" or "viaje" or combinations)
        features = features[features["Utterance"].isin(specific_utterance)]

if features_for_model:
        features = features.loc[:, features_for_model + patient_info_columns]
        


In [4]:
# ZERO VALUES FOR HC AND PD: 

pd = features[features['Group'] == 1]
hc = features[features['Group'] == 0]

zero_analysis_pd = pd.loc[:, features_for_model]
zero_analysis_hc = hc.loc[:, features_for_model]

print(" PARKINSONS ")
print("Number of Zero values:", (zero_analysis_pd == 0).sum().sum())
print("Number of NaN values:", zero_analysis_pd.isna().sum().sum())
print("Total number of values from features: ", zero_analysis_pd.size)

print(" ")
print(" HEALTHY ")
print("Number of Zero values:", (zero_analysis_hc == 0).sum().sum())
print("Number of NaN values:", zero_analysis_hc.isna().sum().sum())
print("Total number of values from features: ", zero_analysis_hc.size)


 PARKINSONS 
Number of Zero values: 721225
Number of NaN values: 10168
Total number of values from features:  1948612
 
 HEALTHY 
Number of Zero values: 680015
Number of NaN values: 10254
Total number of values from features:  1948612


In [5]:
# zero analysis: 

zero_analysis = features.loc[:, features_for_model]

# print("Number of Zero values:", (f_test_nan == 0).sum().sum())
print("Number of Zero values:", (zero_analysis == 0).sum().sum())
print("Total number of values from features: ", zero_analysis.size)

columns_with_nan = zero_analysis.columns[~(zero_analysis == 0).any()].tolist()

print("Columns with Zero values:", (columns_with_nan))
print("Amount of diff columns with Zero: ", len(columns_with_nan))

Number of Zero values: 1401240
Total number of values from features:  3897224
Columns with Zero values: ['avg BBEoff_1', 'avg BBEoff_2', 'avg BBEoff_3', 'avg BBEoff_4', 'avg BBEoff_5', 'avg BBEoff_6', 'avg BBEoff_7', 'avg BBEoff_8', 'avg BBEoff_9', 'avg BBEoff_10', 'avg BBEoff_11', 'avg BBEoff_12', 'avg BBEoff_13', 'avg BBEoff_14', 'avg BBEoff_15', 'avg BBEoff_16', 'avg BBEoff_17', 'avg BBEoff_18', 'avg BBEoff_19', 'avg BBEoff_20', 'avg BBEoff_21', 'avg BBEoff_22', 'avg MFCCoff_1', 'avg MFCCoff_2', 'avg MFCCoff_3', 'avg MFCCoff_4', 'avg MFCCoff_5', 'avg MFCCoff_6', 'avg MFCCoff_7', 'avg MFCCoff_9', 'avg MFCCoff_10', 'avg MFCCoff_11', 'avg MFCCoff_12', 'avg F1', 'avg DF1', 'avg DDF1', 'avg F2', 'avg DF2', 'avg DDF2', 'std BBEoff_1', 'std BBEoff_2', 'std BBEoff_3', 'std BBEoff_4', 'std BBEoff_5', 'std BBEoff_6', 'std BBEoff_7', 'std BBEoff_8', 'std BBEoff_9', 'std BBEoff_10', 'std BBEoff_11', 'std BBEoff_12', 'std BBEoff_13', 'std BBEoff_14', 'std BBEoff_15', 'std BBEoff_16', 'std BBEoff

In [6]:
# Nan analysis: 

nan_analysis = features.loc[:, features_for_model]

# print("Number of Zero values:", (f_test_nan == 0).sum().sum())
print("Number of NaN values:", nan_analysis.isna().sum().sum())
print("Total number of values from features: ", nan_analysis.size)

f_test_nan = features.loc[:, features_for_model + ["Utterance type"]]


columns_with_nan = f_test_nan.columns[f_test_nan.isna().any()].tolist()

print("Columns with NaN values:", (columns_with_nan))
print("Amount of diff columns with nan: ", len(columns_with_nan))

nan_counts = f_test_nan.isna().sum()
print("Amount from each column number: ")
print(nan_counts[nan_counts > 500])

rows_with_nan = f_test_nan[f_test_nan.isna().any(axis=1)]

utterance_types_with_nan = list(np.array(rows_with_nan['Utterance type']))

unique_list = []
for item in utterance_types_with_nan:
    if item not in unique_list:
        unique_list.append(item)

print("Unique utterance types with NaN. ", unique_list)
print("Unique number of utterance types with NaN. ", len(unique_list))

Number of NaN values: 20422
Total number of values from features:  3897224
Columns with NaN values: ['skewness MFCCon_9', 'skewness MFCCon_12', 'skewness DMFCCon_9', 'skewness DMFCCon_12', 'skewness MFCCoff_1', 'skewness MFCCoff_2', 'skewness MFCCoff_3', 'skewness MFCCoff_4', 'skewness MFCCoff_5', 'skewness MFCCoff_6', 'skewness MFCCoff_7', 'skewness MFCCoff_8', 'skewness MFCCoff_9', 'skewness MFCCoff_10', 'skewness MFCCoff_11', 'skewness MFCCoff_12', 'skewness DMFCCoff_1', 'skewness DMFCCoff_2', 'skewness DMFCCoff_3', 'skewness DMFCCoff_4', 'skewness DMFCCoff_5', 'skewness DMFCCoff_6', 'skewness DMFCCoff_7', 'skewness DMFCCoff_8', 'skewness DMFCCoff_9', 'skewness DMFCCoff_10', 'skewness DMFCCoff_11', 'skewness DMFCCoff_12', 'skewness DDMFCCoff_9', 'kurtosis MFCCon_9', 'kurtosis MFCCon_12', 'kurtosis DMFCCon_9', 'kurtosis DMFCCon_12', 'kurtosis MFCCoff_1', 'kurtosis MFCCoff_2', 'kurtosis MFCCoff_3', 'kurtosis MFCCoff_4', 'kurtosis MFCCoff_5', 'kurtosis MFCCoff_6', 'kurtosis MFCCoff_7',

In [7]:
# Remove NaN
features = remove_NaN(features, patient_info_columns, print_out = True)

Number of Zero values: 1401240
Number of NaN values: 20422
Total number of values from features:  3897224


In [8]:
# ZERO VALUES FOR HC AND PD AFTER CHANGING NAN TO ZERO: 

pd = features[features['Group'] == 1]
hc = features[features['Group'] == 0]

zero_analysis_pd = pd.loc[:, features_for_model]
zero_analysis_hc = hc.loc[:, features_for_model]

print(" PARKINSONS ")
print("Number of Zero values:", (zero_analysis_pd == 0).sum().sum())
print("Number of NaN values:", zero_analysis_pd.isna().sum().sum())
print("Total number of values from features: ", zero_analysis_pd.size)

print(" ")
print(" HEALTHY ")
print("Number of Zero values:", (zero_analysis_hc == 0).sum().sum())
print("Number of NaN values:", zero_analysis_hc.isna().sum().sum())
print("Total number of values from features: ", zero_analysis_hc.size)


 PARKINSONS 
Number of Zero values: 731393
Number of NaN values: 0
Total number of values from features:  1948612
 
 HEALTHY 
Number of Zero values: 690269
Number of NaN values: 0
Total number of values from features:  1948612


In [9]:
# zero analysis after changing NaN values: 

zero_analysis = features.loc[:, features_for_model]

# print("Number of Zero values:", (f_test_nan == 0).sum().sum())
print("Number of Zero values:", (zero_analysis == 0).sum().sum())
print("Total number of values from features: ", zero_analysis.size)

columns_with_zeros_and_nan = zero_analysis.columns[(zero_analysis == 0).any()].tolist()
columns_without_zeros_and_nan = zero_analysis.columns[~(zero_analysis == 0).any()].tolist()


# print("Columns with Zero values:", (columns_with_zeros_and_nan))
# print("Amount of diff columns with Zero: ", len(columns_with_zeros_and_nan))

# print("Columns without Zero values:", (columns_without_zeros_and_nan))
# print("Amount of diff columns with out Zero: ", len(columns_without_zeros_and_nan))

Number of Zero values: 1421662
Total number of values from features:  3897224
