In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from xgboost import XGBClassifier
from sklearn import preprocessing
import joblib
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import skfuzzy as fuzz
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
import os
from copy import deepcopy
import warnings
import ls_functions as lsf
import shap
import scipy.stats as stats



# DATASET

In [2]:

dataStreamPath = os.getcwd() + "\\"
graphsStreamPath= os.getcwd() + "\\graphs\\"
shapvaluesStreamPath= os.getcwd() + "\\shapvalues\\"
oneheadmodelsStreamPath= os.getcwd() + "\\onehead_models\\"
baselinemodelsStreamPath= os.getcwd() + "\\baseline_models\\"
protoformsStreamPath= os.getcwd() + "\\protoforms_github\\"
new_shapvalues = os.getcwd() + "\\new_shapvalues\\"

In [3]:
protoformsStreamPath

'c:\\Users\\PRECISION 7X10\\Documents\\GitHub\\plenary\\protoforms_github\\'

In [4]:
file_info = [
    ("data_shap_base", "data_shap_baseline.csv"),
    ("shap_values_0_class_base", "shap_values_class_0_baseline_states.csv"),
    ("shap_values_1_class_base", "shap_values_class_1_baseline_states.csv"),
    ("shap_values_2_class_base", "shap_values_class_2_baseline_states.csv"),
    ("shap_values_3_class_base", "shap_values_class_3_baseline_states.csv"),
    ("data_shap_oh", "data_shap_onehead.csv"),
    ("shap_values_0_class_oh", "shap_values_class_0_onehead_states.csv"),
    ("shap_values_1_class_oh", "shap_values_class_1_onehead_states.csv"),
    ("shap_values_2_class_oh", "shap_values_class_2_onehead_states.csv"),
    ("shap_values_3_class_oh", "shap_values_class_3_onehead_states.csv"),
    ("shap_values_0_symptom_oh", "shap_values_class_0_onehead_symptoms.csv"),
    ("shap_values_1_symptom_oh", "shap_values_class_1_onehead_symptoms.csv"),
    ("shap_values_2_symptom_oh", "shap_values_class_2_onehead_symptoms.csv"),
    ("shap_values_3_symptom_oh", "shap_values_class_3_onehead_symptoms.csv"),
    ("shap_values_4_symptom_oh", "shap_values_class_4_onehead_symptoms.csv"),
    ("shap_values_5_symptom_oh", "shap_values_class_5_onehead_symptoms.csv"),
    ("shap_values_6_symptom_oh", "shap_values_class_6_onehead_symptoms.csv"),
    ("shap_values_7_symptom_oh", "shap_values_class_7_onehead_symptoms.csv"),
    ("shap_values_8_symptom_oh", "shap_values_class_8_onehead_symptoms.csv"),
    ("shap_values_9_symptom_oh", "shap_values_class_9_onehead_symptoms.csv")
]

# Loop through the list and read each file into a DataFrame with the correct name
for var_name, file_name in file_info:
    file_path = new_shapvalues + file_name
    globals()[var_name] = pd.read_csv(file_path)

## Groups of variables

In [5]:
#group of energy related variables 
acoustic_group_energy_label='energy'
acoustic_group_energy=['pcm_LOGenergy_sma',
'pcm_fftMag_fband0-250_sma',
'pcm_fftMag_fband0-650_sma',
'audspec_lengthl1norm_sma',
'audspecrasta_lengthl1norm_sma',
'pcm_rmsenergy_sma',
'audSpec_Rfilt_sma_compare_0_',
'audSpec_Rfilt_sma_compare_1_',
'audSpec_Rfilt_sma_compare_2_',
'audSpec_Rfilt_sma_compare_3_',
'audSpec_Rfilt_sma_compare_4_',
'audSpec_Rfilt_sma_compare_5_',
'audSpec_Rfilt_sma_compare_6_',
'audSpec_Rfilt_sma_compare_7_',
'audSpec_Rfilt_sma_compare_8_',
'audSpec_Rfilt_sma_compare_9_',
'audSpec_Rfilt_sma_compare_10_',
'audSpec_Rfilt_sma_compare_11_',
'audSpec_Rfilt_sma_compare_12_',
'audSpec_Rfilt_sma_compare_13_',
'audSpec_Rfilt_sma_compare_14_',
'audSpec_Rfilt_sma_compare_15_',
'audSpec_Rfilt_sma_compare_16_',
'audSpec_Rfilt_sma_compare_17_',
'audSpec_Rfilt_sma_compare_18_',
'audSpec_Rfilt_sma_compare_19_',
'audSpec_Rfilt_sma_compare_20_',
'audSpec_Rfilt_sma_compare_21_',
'audSpec_Rfilt_sma_compare_22_',
'audSpec_Rfilt_sma_compare_23_',
'audSpec_Rfilt_sma_compare_24_',
'audSpec_Rfilt_sma_compare_25_',
'pcm_fftMag_fband250-650_sma_compare',
'pcm_fftMag_fband1000-4000_sma_compare',
'loudness_sma3']

In [6]:
#group of pitch related variables 
acoustic_group_pitch_label='pitch'
acoustic_group_pitch=['voiceprob_sma',
       'f0_sma', 'f0env_sma',
      'f0final_sma', 'F0semitoneFrom27_5Hz_sma3nz',
      'f1frequency_sma3nz',
       'f1bandwidth_sma3nz', 'f1amplitudelogrelf0_sma3nz',
       'f2frequency_sma3nz', 'f2amplitudelogrelf0_sma3nz',
       'f3frequency_sma3nz', 'f3amplitudelogrelf0_sma3nz']

In [7]:

#group of spectral related variables 
acoustic_group_spectral_label='spectral'
acoustic_group_spectral=['pcm_fftMag_mfcc_0_',
                         #'pcm_fftMag_mfcc_1_',
                         #'pcm_fftMag_mfcc_2_',
                         #'pcm_fftMag_mfcc_3_',
                         #'pcm_fftMag_mfcc_4_',
                         #'pcm_fftMag_mfcc_5_',
                         #'pcm_fftMag_mfcc_6_',
                         #'pcm_fftMag_mfcc_7_',
                         #'pcm_fftMag_mfcc_8_',
                         #'pcm_fftMag_mfcc_9_',
                         #'pcm_fftMag_mfcc_10_',
                         #'pcm_fftMag_mfcc_11_',
                         #'pcm_fftMag_mfcc_12_',
                         #'pcm_fftMag_spectralRollOff25_0_sma',
                         #'pcm_fftMag_spectralRollOff50_0_sma',
                         #'pcm_fftMag_spectralRollOff75_0_sma',
                         #'pcm_fftMag_spectralRollOff90_0_sma',
                         'pcm_fftmag_spectralflux_sma',
                         'pcm_fftmag_spectralcentroid_sma',
                         #'pcm_fftmag_spectralmaxpos_sma',
                         #'pcm_fftmag_spectralminpos_sma',
                         'pcm_fftmag_spectralentropy_sma_compare',
                         'pcm_fftmag_spectralvariance_sma_compare',
                         'pcm_fftmag_spectralskewness_sma_compare',
                         'pcm_fftmag_spectralkurtosis_sma_compare',
                         'pcm_fftmag_psysharpness_sma_compare',
                         'pcm_fftmag_spectralharmonicity_sma_compare'       
                         #'alpharatio_sma3',
                         #'hammarbergindex_sma3',
                         #'slope0-500_sma3',
                         #'slope500-1500_sma3', 
                         #'logRelF0-H1-H2_sma3nz',
                         #'logRelF0-H1-A3_sma3nz'
                         ]

In [8]:

#group of quality related variables 
acoustic_group_quality_label='quality'
acoustic_group_quality=['voicingfinalunclipped_sma',
                        'jitterlocal_sma',
                        'jitterddp_sma',
                        'shimmerlocal_sma',
                        'loghnr_sma']

In [9]:
acoustic_group_labels = {acoustic_group_energy_label : acoustic_group_energy,
                             acoustic_group_pitch_label : acoustic_group_pitch ,
                             acoustic_group_spectral_label : acoustic_group_spectral,
                             acoustic_group_quality_label: acoustic_group_quality}

In [10]:
data_classes_base = {'0': shap_values_0_class_base ,
                '1': shap_values_0_class_base ,
                '2': shap_values_0_class_base ,
                '3': shap_values_0_class_base }

## BASELINE Model

### INDIVIDUAL MODEL FOR CLASSES

In [11]:
tab_ind = []

for class_, data_class in data_classes_base.items():
        df = lsf.ls_ind_params(data = data_class , shapdata = data_shap_base, classtoprint = class_)
        df["class"] = class_
        tab_ind.append(df)  
individual_classes_base = pd.concat(tab_ind)   

filename_individual_classes_base = protoformsStreamPath + "baseline_PROTOFORM_individual_classes.csv"
individual_classes_base.to_csv(filename_individual_classes_base)


invalid value encountered in double_scalars


### GROUP MODEL FOR CLASSES

In [12]:
tab_group = []

for class_, data_class in data_classes_base.items():
    for label, feature in acoustic_group_labels.items():
        print("Label:")
        print(label)
        print("Class:")
        print(class_)
        df = lsf.ls_group_params(acoustic_group = feature, acoustic_group_label = label,
                      data = data_class , shapdata = data_shap_oh, classtoprint = class_, model = "base")
        df["class"] = class_
        df["label"] = label  
        tab_group.append(df)  
base_group_classes = pd.concat(tab_group)    

filename_base_group_classes = protoformsStreamPath + "base_PROTOFORM_group_classes.csv"
base_group_classes.to_csv(filename_base_group_classes) 

Label:
energy
Class:
0
Label:
pitch
Class:
0
Label:
spectral
Class:
0
Label:
quality
Class:
0
Label:
energy
Class:
1
Label:
pitch
Class:
1
Label:
spectral
Class:
1
Label:
quality
Class:
1
Label:
energy
Class:
2
Label:
pitch
Class:
2
Label:
spectral
Class:
2
Label:
quality
Class:
2
Label:
energy
Class:
3
Label:
pitch
Class:
3
Label:
spectral
Class:
3
Label:
quality
Class:
3


## ONE HEAD Model

In [13]:
data_classes = {'0': shap_values_0_class_oh ,
                '1': shap_values_1_class_oh ,
                '2': shap_values_2_class_oh ,
                '3': shap_values_3_class_oh }

In [14]:
data_symptoms = {'0': shap_values_0_symptom_oh , #anxiety
                '1': shap_values_1_symptom_oh , #'decreased_activity'
                '2': shap_values_2_symptom_oh , 
                '3': shap_values_3_symptom_oh , 
                '4': shap_values_4_symptom_oh , #elevated activity
                '5': shap_values_5_symptom_oh , 
                '6': shap_values_6_symptom_oh , 
                '7': shap_values_7_symptom_oh , 
                '8': shap_values_8_symptom_oh , 
                '9': shap_values_9_symptom_oh
                } 


### individual model for classes

In [15]:
tab_ind = []

for class_, data_class in data_classes.items():
        df = lsf.ls_ind_params(data = data_class , shapdata = data_shap_oh, classtoprint = class_)
        df["class"] = class_
        tab_ind.append(df)  
individual_classes = pd.concat(tab_ind)   

filename_ind_class = protoformsStreamPath + "oh_PROTOFORM_individual_classes.csv"
individual_classes.to_csv(filename_ind_class)

invalid value encountered in double_scalars


### group model for classes

In [16]:
tab_group = []

for class_, data_class in data_classes.items():
    for label, feature in acoustic_group_labels.items():
        print("Label:")
        print(label)
        print("Class:")
        print(class_)
        df = lsf.ls_group_params(acoustic_group = feature, acoustic_group_label = label,
                      data = data_class , shapdata = data_shap_oh, classtoprint = class_, model = "oh")
        df["class"] = class_
        df["label"] = label  
        tab_group.append(df)  
group_classes = pd.concat(tab_group)    

filename_group_classes = protoformsStreamPath + "oh_PROTOFORM_group_classes.csv"
group_classes.to_csv(filename_group_classes) 

Label:
energy
Class:
0
Label:
pitch
Class:
0
Label:
spectral
Class:
0
Label:
quality
Class:
0
Label:
energy
Class:
1
Label:
pitch
Class:
1
Label:
spectral
Class:
1
Label:
quality
Class:
1
Label:
energy
Class:
2
Label:
pitch
Class:
2
Label:
spectral
Class:
2
Label:
quality
Class:
2
Label:
energy
Class:
3
Label:
pitch
Class:
3
Label:
spectral
Class:
3
Label:
quality
Class:
3


In [17]:
group_classes

Unnamed: 0,Id,protoform,DoT,DoS,DoF,class,label
0,0.0,Among records that contribute against predicti...,0.000000,0.601333,0.713199,0,energy
1,1.0,Among records that contribute against predicti...,0.000000,0.215429,0.713199,0,energy
2,2.0,Among records that contribute against predicti...,0.000000,0.596476,0.713199,0,energy
3,3.0,Among records that contribute around zero to p...,0.000000,0.191333,0.059406,0,energy
4,4.0,Among records that contribute around zero to p...,0.000000,0.062476,0.059406,0,energy
...,...,...,...,...,...,...,...
4,4.0,Among records that contribute around zero to p...,0.000000,0.006000,0.013188,3,quality
5,5.0,Among records that contribute around zero to p...,1.000000,0.022000,0.013188,3,quality
6,6.0,Among records that contribute positively to pr...,0.000000,0.121333,0.212947,3,quality
7,7.0,Among records that contribute positively to pr...,0.000000,0.092000,0.212947,3,quality


### individual model for symptoms

In [18]:

tab_ind = []

for symptom, data_symptom in data_symptoms.items():
        df = lsf.ls_ind_params(data = data_symptom , shapdata = data_shap_oh, classtoprint = symptom)
        df["symptom"] = symptom
        tab_ind.append(df)  
individual_symptoms = pd.concat(tab_ind)   

individual_symptoms.to_csv("oh_PROTOFORM_individual_symptoms.csv")

invalid value encountered in double_scalars


### grouped model for symptoms

In [19]:

tab_group = []

for symptom, data_symptom in data_symptoms.items():
    print(symptom)
    for label, feature in acoustic_group_labels.items():
        print(label)
        df = lsf.ls_group_params(acoustic_group = feature, acoustic_group_label = label,
                      data = data_symptom , shapdata = data_shap_oh, classtoprint = symptom, model = "oh")
        df["symptom"] = symptom
        df["label"] = label  
        tab_group.append(df)  
group_symptoms = pd.concat(tab_group)    

filename_group_symptoms = protoformsStreamPath + "oh_PROTOFORM_group_symptoms.csv"
group_symptoms.to_csv(filename_group_symptoms) 

0
energy
pitch
spectral
quality
1
energy
pitch
spectral
quality
2
energy
pitch
spectral
quality
3
energy
pitch
spectral
quality
4
energy
pitch
spectral
quality
5
energy
pitch
spectral
quality
6
energy
pitch
spectral
quality
7
energy
pitch
spectral
quality
8
energy
pitch
spectral
quality
9
energy
pitch
spectral
quality


# SUMMARY

In [22]:
base_group_classes.loc[base_group_classes['class'] == cv, d]
group_classes.loc[group_classes['class'] == cv, d]

0    0.713199
1    0.713199
2    0.713199
3    0.059406
4    0.059406
5    0.059406
6    0.224062
7    0.224062
8    0.224062
0    0.481612
1    0.481612
2    0.481612
3    0.031408
4    0.031408
5    0.031408
6    0.483646
7    0.483646
8    0.483646
0    0.559180
1    0.559180
2    0.559180
3    0.050400
4    0.050400
5    0.050400
6    0.387087
7    0.387087
8    0.387087
0    0.770532
1    0.770532
2    0.770532
3    0.013188
4    0.013188
5    0.013188
6    0.212947
7    0.212947
8    0.212947
Name: DoF, dtype: float64

In [25]:
definitions = ['DoT','DoS' ]
classes_values = ["0","1","2","3"]

test_results = []
i=0
for d in definitions :
    for cv in classes_values:

        stat = stats.wilcoxon(base_group_classes.loc[base_group_classes['class'] == cv, d],
                                           group_classes.loc[group_classes['class'] == cv, d])
        stat_df = pd.DataFrame(data={"definition" : d,
                             "W_statistics" : stat[0],
                             "p_value" : stat[1],
                             "Class" : cv}, index=[i])
        test_results.append(stat_df) 



Sample size too small for normal approximation.
