In [None]:
import pandas as pd
import numpy as np
from glob import glob
import ipywidgets as widgets
from IPython.display import clear_output, display
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
from pipe_scripts.input_parser import *
from pipe_scripts.feature_extraction import *
from pipe_scripts.widget_scripts import *

In [None]:
# Folder where the .xls files with the spirometric readings are stored.
# NOTE(review): the right-hand side below is only a comment placeholder, so the
# assignment does not parse as written - fill in the real folder path before running.

ground_truth_folder = #ground_truth
# Folder holding the recordings of the individual tasks.
# NOTE(review): same placeholder problem as above - a path has to be supplied.
tasks_folder =  #tasks

# Task name; used further down as the prefix of the exported .xlsx file names.
task_name = 'cough'

# Selected task names; assigned by the task-selection callback (extract_task_files).
tasks = None

# Maps each target variable to the spreadsheet cell that holds it in the .xls file.
cell_locations = {'FVC': 'B23', 'FEV1': 'B24', 'FEV1/FVC': 'B25'}

# Registry of recording tasks: task name -> {'folder': ..., 'suffix': ...}.
# Each row below is (task name, folder, suffix); the row order is the insertion
# order of the dictionary.
tasks_dict = {
    task: {'folder': folder, 'suffix': suffix}
    for task, folder, suffix in (
        ('Shallow_Breath', '10_3_shallowclean', 'shallowbreath'),
        ('Rainbow', '11_4_rainbow', 'rainbow'),
        ('Describe_Sth', '12_5_describe_sth', 'describesth'),
        ('Long', '13_6_long', 'long'),
        ('Short_1', '14_6_short1', 'short1'),
        ('Short_2', '15_6_short2', 'short2'),
        ('Describe_pic', '16_7_describe_pic', 'describepic'),
        ('Action', '17_8_action', 'action'),
        ('Non_Action', '18_8_nonaction', 'nonaction'),
        ('Cough', '1_1_cough', 'cough'),
        ('A_Single', '2_2_A_single', 'vowela'),
        ('E_Single', '3_2_E_single', 'vowele'),
        ('I_Single', '4_2_I_single', 'voweli'),
        ('O_Single', '5_2_O_single', 'vowelo'),
        ('O_Phone_Single', '6_2_Ophonation_single', 'vowelophonation'),
        ('U_Single', '7_2_U_Single', 'vowelu'),
        ('U_Phone_Single', '8_2_Uphonation_single', 'voweluphonation'),
        ('Deep_Breath', '9_3_deepbreath', 'deepbreaths'),
    )
}

In [None]:
# Task selection: widgets for choosing which tasks from tasks_dict to process.
#
# The selection grid has its own name here. A later cell assigns a different
# widget to the global `options_widget`; callbacks look globals up when the
# button is clicked, so sharing that name made the task buttons act on the
# other cell's checkboxes once that cell had been executed (e.g. after Run All).
task_options_widget = get_gridbox_from_list(tasks_dict.keys())
# Alias so that anything still reading the previous name after this cell keeps working.
options_widget = task_options_widget

# This button extracts task file paths, so it is labelled accordingly.
extract_button = widgets.Button(description= 'Extract task files')

select_all_button = widgets.Button(description= 'Select All')

# Filled in by extract_task_files once a selection has been confirmed.
dataset_df = None


def extract_task_files(b):
    """Button callback: store the selected tasks and extract their file paths.

    The selection returned by ``get_selected_checkboxes`` is written to the
    global ``tasks``. If it is empty, the selection grid and the button are
    shown again. Otherwise ``extract_file_paths`` is called and its result is
    stored in the global ``dataset_df``.

    Args:
        b: The clicked button; re-displayed when nothing was selected.
    """
    global tasks, dataset_df

    selected_options = get_selected_checkboxes(task_options_widget)

    tasks = selected_options
    clear_output(wait=False)
    if len(selected_options) == 0:
        print('No tasks selected, please choose an option')
        display(task_options_widget)
        display(b)
        return

    print(f'Extracting filepaths from tasks folder: {tasks_folder}...')
    # Extracting task filepaths based on choice
    dataset_df = extract_file_paths(ground_truth_folder, tasks_folder, tasks_dict, selected_options)
    print('Task filepaths extracted')


def select_all_boxes(b):
    """Button callback: set ``value = True`` on every child of the task grid.

    Args:
        b: The clicked button (unused).
    """
    for w in task_options_widget.children:
        w.value = True

extract_button.on_click(extract_task_files)
select_all_button.on_click(select_all_boxes)


print('Select the tasks:')
display(select_all_button)
display(task_options_widget)
display(extract_button)

In [None]:


# Names of the chosen target columns; assigned by the target-extraction callback.
target_names = None
# Names of the extracted feature columns; filled by the feature-extraction callback.
variables = None


In [None]:
# Widgets for choosing which target variables (the keys of cell_locations) to extract.
# NOTE(review): this rebinds the global `options_widget`, which the task-selection
# cell also assigns; callbacks that read that global will see this widget from here on.
options_widget = get_Hbox_from_list(cell_locations.keys())

display(options_widget)
button = widgets.Button(description= 'Extract target values')

def extract_targets(b):
    """Button callback: add the selected target variables as columns of ``dataset_df``.

    The selected target names are stored in the global ``target_names``. For
    every row of ``dataset_df``, ``get_target_columns`` is applied with the
    spreadsheet cell coordinates of the chosen targets (taken from
    ``cell_locations``), and the expanded result is written to columns named
    after those targets.

    If nothing is selected, or if ``dataset_df`` has not been created yet, a
    message is printed and the widgets are shown again instead of failing with
    an AttributeError on ``None``.

    Args:
        b: The clicked button; re-displayed when the extraction cannot run.
    """
    global target_names, dataset_df

    selected_options = get_selected_checkboxes(options_widget)
    clear_output(wait=False)
    if len(selected_options) == 0:
        print('No targets selected, please choose an option')
        display(options_widget)
        display(b)
        return
    if dataset_df is None:
        # dataset_df is only created by the task-selection callback.
        print('No task files extracted yet, please run the task selection first')
        display(options_widget)
        display(b)
        return

    print(f'Extracting {selected_options} values from ground truth folder: {ground_truth_folder}...')
    cell_coordinates = [cell_locations[x] for x in selected_options]

    target_names = selected_options

    # Extracting target variables based on choice
    dataset_df[selected_options] = dataset_df.apply(get_target_columns, axis=1, args=(cell_coordinates,), result_type='expand')

    print('Targets extracted')

# Register the target-extraction callback and show the button.
button.on_click(extract_targets)

display(button)

In [None]:
# Widgets for choosing which features and which aggregates of them to extract.
# Each widget is built from the keys of one option dictionary.

whole_signal_options_widget = get_Hbox_from_list(whole_signal_statistics_dictionary.keys())

f_options_pyAudio_widget = get_gridbox_from_list(feature_dictionary_pyAudio.keys())

a_options_widget_pyAudio = get_Hbox_from_list(aggregate_dictionary_pyAudio.keys())

coeff_options_widget = get_gridbox_from_list(spectral_coefficient_dictionary.keys())

f_options_wavelets_widget = get_gridbox_from_list(feature_dictionary_wavelets.keys())

# NOTE(review): the wavelet aggregates are listed from aggregate_dictionary_pyAudio -
# confirm this is intended and not a copy-paste of the pyAudio line.
a_options_widget_wavelets = get_gridbox_from_list(aggregate_dictionary_pyAudio.keys())

# Rebinds the global `button` used by the previous cell.
button = widgets.Button(description= 'Extract Features')

# Rebinds the global `select_all_button` used by the task-selection cell.
select_all_button = widgets.Button(description= 'Select all')

def extract_targets(b):
    """Button callback: extract the selected features for every selected task.

    Despite its name (shared with the target-extraction callback defined in an
    earlier cell), this function extracts *features*. For each task in the
    global ``tasks`` it applies ``extract_features_from_file`` to every row of
    ``dataset_df`` and stores the expanded result in columns named
    ``{task}_{feature_name}``. Those column names are collected in the global
    ``variables``.

    A column name is added to ``variables`` only once, so clicking the button
    again does not produce duplicate entries.

    Args:
        b: The clicked button (unused).
    """
    global variables

    selected_options = {'pyAudio': {
                                    'features': get_selected_checkboxes(f_options_pyAudio_widget), 'aggregates': get_selected_checkboxes(a_options_widget_pyAudio)
                                    },
                        'whole_signal': get_selected_checkboxes(whole_signal_options_widget),
                        'coefficients': get_selected_checkboxes(coeff_options_widget),
                        'wavelets': {
                            'features': get_selected_checkboxes(f_options_wavelets_widget), 'aggregates': get_selected_checkboxes(a_options_widget_wavelets)
                        }}

    clear_output(wait=False)

    if not tasks or dataset_df is None:
        # Both are only set by the task-selection callback.
        print('No task files extracted yet, please run the task selection first')
        return

    # The feature names depend only on the selection, not on the task.
    feature_names = get_feature_names(selected_options)

    if variables is None:
        variables = []

    for task in tasks:

        print(f'Extracting features of task: {task}')

        task_columns = [f'{task}_{feature_name}' for feature_name in feature_names]
        for column_name in task_columns:
            if column_name not in variables:
                variables.append(column_name)

        # Extracting feature columns based on choice
        dataset_df[task_columns] = dataset_df.progress_apply(extract_features_from_file, axis=1, args=(task, selected_options,), result_type='expand')

    print(f'Features extracted: {variables}')


def select_all_boxes(b):
    """Button callback: set ``value = True`` on every child of all six option widgets.

    Args:
        b: The clicked button (unused).
    """
    option_groups = (
        whole_signal_options_widget,
        f_options_pyAudio_widget,
        a_options_widget_pyAudio,
        coeff_options_widget,
        f_options_wavelets_widget,
        a_options_widget_wavelets,
    )
    # Same visiting order as listing the six widgets one after another.
    for group in option_groups:
        for checkbox in group.children:
            checkbox.value = True
        

# Register the callbacks defined in this cell on this cell's buttons.
button.on_click(extract_targets)
select_all_button.on_click(select_all_boxes)

# Show the option widgets, each preceded by a printed caption.
display(select_all_button)
print('Select Whole signal statistics:')
display(whole_signal_options_widget)


print('Select Short-term features:')
display(f_options_pyAudio_widget)
print('Select Aggregates for above features:')
display(a_options_widget_pyAudio)
print('Select coefficients (Ref: https://superkogito.github.io/spafe/v0.2.0/features/_features.html):')
display(coeff_options_widget)

print('Select features to be extracted from each sub-band after Discrete Wavelet Transform:')
display(f_options_wavelets_widget)
print('Select aggregates of features extracted from DWT signals:')
display(a_options_widget_wavelets)

display(button)

In [None]:
# Save the current dataset_df to an Excel file in the working directory;
# the file name is prefixed with task_name.
dataset_df.to_excel(F'{task_name}_features_testtaa.xlsx')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

import seaborn as sns
import matplotlib.pyplot as plt

from pipe_scripts.visualization import plot_metrics
from pipe_scripts.model_bank import model_bank

import eli5
from sklearn import linear_model

from numpy.linalg import eigh

In [None]:
#dataset_df.to_excel(F'{task_name}_features_all.xlsx')

### Scaling

In [None]:

# Feature matrix: the extracted feature columns of dataset_df.
# NOTE(review): this is a column selection of dataset_df, not an explicit copy;
# assigning into it later can raise pandas' SettingWithCopyWarning.
X_scaled = dataset_df[variables]



# Available scalers: label shown in the radio buttons -> scaler class (instantiated on use).
scaler_dict = {
    'MinMax': MinMaxScaler,
    'Standard': StandardScaler
}

# Radio buttons listing the keys of scaler_dict.
scaler_options = widgets.RadioButtons(
    options=list(scaler_dict.keys()),

    disabled=False
)

def scale_features(b):
    """Button callback: scale the feature columns with the selected scaler.

    The scaler class chosen in ``scaler_options`` is instantiated and fitted on
    ``dataset_df[variables]``. The result is stored in the global ``X_scaled``
    as a new DataFrame with the same column names and index.

    Building a fresh DataFrame avoids assigning into a column selection of
    ``dataset_df`` (chained assignment), and every click starts again from the
    unscaled values in ``dataset_df``.

    Args:
        b: The clicked button (unused).
    """
    global X_scaled
    scaler_key = scaler_options.get_interact_value()

    scaler = scaler_dict[scaler_key]()
    features = dataset_df[variables]
    X_scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    print(f'Features scaled using {scaler_key}')


# Button that runs scale_features, shown below the scaler choice.
scale_button = widgets.Button(description='Scale Features')
scale_button.on_click(scale_features)
display(widgets.Label('Select Scaler'))
display(scaler_options)
display(scale_button)

### Feature reduction

In [None]:

# Reduced feature matrix; assigned by run_PCA / run_Lasso, None until one of them has run.
reduced_X = None

def run_PCA():
    """Reduce ``X_scaled`` with PCA and plot the cumulative explained variance.

    PCA is fitted with ``n_components=0.80`` and the transformed data is stored
    in the global ``reduced_X`` as a DataFrame with columns ``PC1``, ``PC2``, ...
    and the index of ``X_scaled``. A DataFrame is used (rather than the bare
    array returned by ``fit_transform``) so that the later ``.head()`` calls on
    it work, as they do for the Lasso path.

    The plotted curve is computed from the eigenvalues of the covariance matrix
    of ``X_scaled`` (all of them, not only those of the retained components).
    """
    global reduced_X

    pca = PCA(n_components=0.80)
    components = pca.fit_transform(X_scaled)
    reduced_X = pd.DataFrame(
        components,
        columns=[f'PC{i + 1}' for i in range(components.shape[1])],
        index=X_scaled.index,
    )

    cov_matrix = np.cov(X_scaled, rowvar=False)

    # Determine eigenvalues and eigenvectors
    egnvalues, egnvectors = eigh(cov_matrix)

    # Determine explained variance: eigenvalues in descending order as a
    # fraction of their total, then accumulated.
    total_egnvalues = sum(egnvalues)
    var_exp = [(i/total_egnvalues) for i in sorted(egnvalues, reverse=True)]
    cum_sum_exp = np.cumsum(var_exp)

    # Round after converting to percent to avoid values such as 80.12300000000001.
    print ( "Components = ", pca.n_components_ , "\nTotal explained variance = ",
        round(pca.explained_variance_ratio_.sum()*100, 3),'%'  )

    plt.plot(cum_sum_exp)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()

def run_Lasso():
    """Select features with a Lasso regression fitted on the first target.

    A Lasso model is fitted on ``X_scaled`` against the first entry of
    ``target_names``. The weights table returned by ``eli5.explain_weights_df``
    is displayed, and the features it lists become the columns of the global
    ``reduced_X``.
    """
    global reduced_X

    # Running Lasso regression on first target variable
    lasso = linear_model.Lasso(
        alpha=0.0001,
        positive=True,
        fit_intercept=False,
        max_iter=100000,
        tol=0.00001,
    )
    lasso.fit(X_scaled, y=dataset_df[target_names[0]])

    print('Feature importance table')
    weights_table = eli5.explain_weights_df(lasso, top=-1, feature_names=variables)
    weights_table = weights_table.drop(['target'], axis=1)
    display(weights_table)

    # Keep only the columns named in the weights table.
    kept_columns = weights_table['feature'].to_list()
    reduced_X = X_scaled[kept_columns]


# Available reduction algorithms: label shown in the radio buttons -> function to run.
reduction_dict = {
    'PCA': run_PCA,
    'Lasso': run_Lasso
}


# Radio buttons listing the keys of reduction_dict.
reduction_options = widgets.RadioButtons(
    options=list(reduction_dict.keys()),
    disabled=False)

def reduce_features(b):
    """Button callback: run the reduction function chosen in ``reduction_options``.

    Args:
        b: The clicked button (unused).
    """
    chosen_algorithm = reduction_options.get_interact_value()

    # Look the function up in the registry, then call it.
    run_reduction = reduction_dict[chosen_algorithm]
    run_reduction()

    print(f'Features reducted using {chosen_algorithm}')

# Button that runs reduce_features. Its label names the action it performs;
# it used to read 'Scale Features', the label of the scaling button.
reduction_button = widgets.Button(description='Reduce Features')
reduction_button.on_click(reduce_features)
display(widgets.Label('Select Reduction Algorithm'))
display(reduction_options)
display(reduction_button)


In [None]:
# Preview the reduced feature matrix. reduced_X stays None until a reduction
# has been run with the button above, so run this cell only after that.
reduced_X.head()

In [None]:
# Feature matrix used by the model-training cells below (same object as reduced_X).
X = reduced_X

In [None]:
# Preview the feature matrix that will be passed to the models.
X.head()

### Leave one out

In [None]:
# Leave-one-out evaluation: model picker, metrics table and training callback.
#
# The checkbox grid has its own name here. The 80/20 cell below assigns another
# widget to the global `model_options_widget`; callbacks look globals up when the
# button is clicked, so sharing that name made this cell's button read the other
# cell's checkboxes once that cell had been executed (e.g. after Run All).
model_options = [widgets.Checkbox(description=statistic, value=False) for statistic in model_bank.keys()]
loo_model_options_widget = widgets.GridBox(model_options, layout=widgets.Layout(grid_template_columns="repeat(4, 250px)"))
# Alias so that code reading the previous name after this cell keeps working.
model_options_widget = loo_model_options_widget

button = widgets.Button(description= 'Train models')

# One row per (target, model) pair; rows are appended by train_models.
metrics_df = pd.DataFrame(columns=['Target', 'Model', 'MSE', 'MAPE', 'R_Squared'])

def train_models(b):
    """Button callback: tune each selected model and score it with leave-one-out CV.

    For every selected model and every name in ``target_names``:

    1. ``GridSearchCV`` tunes the model on a random half of the data, using the
       parameter grid from ``model_bank``.
    2. The best estimator is scored with ``cross_validate`` over ``LeaveOneOut``
       splits of the full data (MAPE and MSE).
    3. A row ``[target, model, mse, mape, r2]`` is appended to ``metrics_df``;
       ``r2`` is the placeholder ``-10`` because every test fold holds one sample.
    4. The distribution of the tuned model's predictions for all rows of ``X``
       is drawn together with the distribution of the actual target values.
       The predictions include rows the model was tuned on.

    Args:
        b: The clicked button (unused).
    """
    print('R2 values not calculated for Leave one out validation.')

    models = [w.description for w in loo_model_options_widget.children if w.value]
    if not models or not target_names:
        # plt.subplots cannot build a grid with zero rows or columns.
        print('Select at least one model and extract target values before training.')
        return

    # squeeze=False keeps `axs` two-dimensional for every grid size, so
    # axs[row, column] is valid for 1x1, 1xN, Nx1 and NxM alike.
    fig, axs = plt.subplots(ncols=len(target_names), nrows=len(models), figsize=(5*len(target_names), 5*len(models)), constrained_layout=True, squeeze=False)
    fig.suptitle(f'Prediction vs Actual distributions for {tasks}', fontsize=16)

    for row, model in tqdm(enumerate(models), leave=True):
        for column, target in tqdm(enumerate(target_names), leave=False):
            y = dataset_df[target]
            # Hyper-parameters are tuned on a random half of the data.
            X_train, _, y_train, _ = train_test_split(X, y, test_size=0.5)
            pipe = Pipeline([(model, model_bank[model]['model']())])
            params = {}
            for param in model_bank[model]['parameters'].keys():
                params[f'{model}__{param}'] = model_bank[model]['parameters'][param]
            search = GridSearchCV(pipe, params,n_jobs=3)

            search.fit(X_train, y_train)
            best_estimator = search.best_estimator_

            # define cross-validation method to use
            cv = LeaveOneOut()
            # use LOOCV to evaluate model
            scores = cross_validate(best_estimator, X, y, scoring=[ 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error'], cv=cv)
            y_pred = search.predict(X)

            # Drop NaN fold scores first, then average; the scores are negated
            # errors, hence the absolute value.
            mape = scores['test_neg_mean_absolute_percentage_error']
            mape = mape[~np.isnan(mape)]
            mape = np.abs(mape.mean())

            mse = scores['test_neg_mean_squared_error']
            mse = mse[~np.isnan(mse)]
            mse = np.abs(mse.mean())
            # As a null value since test data is of size 1 which is not good for r2 calculations
            r2 = -10

            metrics_df.loc[len(metrics_df.index)] = [target, model, mse, mape, r2]

            axis = axs[row, column]

            # plotting both distributions on the same axes; y_pred covers every
            # row of X, so it is compared with the full target column y
            sns.kdeplot(y_pred, fill=True, color="red", ax=axis)
            sns.kdeplot(y, fill=True, color="blue", ax=axis)

            axis.title.set_text(f'{model} on {target}')

            axis.legend(title='Legend', loc='upper right', labels=['Predicted', 'Actual'])

    plot_metrics(metrics_df, tasks, no_r2 = True)
    plt.show()

button.on_click(train_models)

print('Select models:')
display(loo_model_options_widget)
display(button)


In [None]:
# Show the metrics table currently bound to the global name metrics_df.
metrics_df

### 80/20 Train Test Split

In [None]:
# Model picker for the 80/20 evaluation: one checkbox per key of model_bank, laid out in a grid.
# NOTE(review): model_options, model_options_widget, button and metrics_df are all
# rebound here; the leave-one-out cell above assigns the same global names.
model_options = [widgets.Checkbox(description=statistic, value=False) for statistic in model_bank.keys()]
model_options_widget = widgets.GridBox(model_options, layout=widgets.Layout(grid_template_columns="repeat(4, 250px)"))

button = widgets.Button(description= 'Train models')

# Fresh, empty metrics table; rows are appended by train_models.
metrics_df = pd.DataFrame(columns=['Target', 'Model', 'MSE', 'MAPE', 'R_Squared'])

def train_models(b):
    """Button callback: tune and evaluate each selected model on an 80/20 split.

    For every selected model and every name in ``target_names``, the data is
    split 80/20, ``GridSearchCV`` tunes the model on the training part with the
    parameter grid from ``model_bank``, and the tuned model predicts the test
    part. MAPE, MSE and R2 on the test part are appended to ``metrics_df``, and
    the predicted and actual test distributions are drawn on one subplot per
    (model, target) pair.

    Args:
        b: The clicked button (unused).
    """
    models = [w.description for w in model_options_widget.children if w.value]
    if not models or not target_names:
        # plt.subplots cannot build a grid with zero rows or columns.
        print('Select at least one model and extract target values before training.')
        return

    # squeeze=False keeps `axs` two-dimensional for every grid size, so
    # axs[row, column] is valid for 1x1, 1xN, Nx1 and NxM alike.
    fig, axs = plt.subplots(ncols=len(target_names), nrows=len(models), figsize=(5*len(target_names), 5*len(models)), constrained_layout=True, squeeze=False)
    fig.suptitle(f'Prediction vs Actual distributions for {tasks}', fontsize=16)

    for row, model in tqdm(enumerate(models), leave=True):
        for column, target in tqdm(enumerate(target_names), leave=False):
            y = dataset_df[target]
            # Test train split is 20/80
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            pipe = Pipeline([ (model, model_bank[model]['model']())])
            params = {}
            for param in model_bank[model]['parameters'].keys():
                params[f'{model}__{param}'] = model_bank[model]['parameters'][param]
            search = GridSearchCV(pipe, params, n_jobs=3)
            search.fit(X_train, y_train)
            y_pred = search.predict(X_test)

            mape = mean_absolute_percentage_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            metrics_df.loc[len(metrics_df.index)] = [target, model, mse, mape, r2]

            axis = axs[row, column]

            # plotting both distributions on the same axes
            sns.kdeplot(y_pred, fill=True, color="red", ax=axis)
            sns.kdeplot(y_test, fill=True, color="blue", ax=axis)

            axis.title.set_text(f'{model} on {target}')

            axis.legend(title='Legend', loc='upper right', labels=['Predicted', 'Actual'])

    plot_metrics(metrics_df, tasks, no_r2 = False)
    plt.show()

# Register the 80/20 training callback and show the model picker with its button.
button.on_click(train_models)

print('Select models:')
display(model_options_widget)
display(button)


In [None]:
# Show the metrics table currently bound to the global name metrics_df.
metrics_df

In [None]:
# Repeat the 80/20 evaluation ten times and write one Excel file per repetition.
# The loop body repeats the steps of the 80/20 train_models callback, without plotting.
from datetime import datetime
x = range(10)
for n in x:


    # Start every repetition with an empty metrics table (rebinds the global metrics_df).
    metrics_df = pd.DataFrame(columns=['Target', 'Model', 'MSE', 'MAPE', 'R_Squared'])

    # Models ticked in the widget currently bound to model_options_widget.
    models = [w.description for w in model_options_widget.children if w.value]

    for row, model in tqdm(enumerate(models), leave=True):
        for column, target in tqdm(enumerate(target_names), leave=False):
            y = dataset_df[target]
            # Test train split is 20/80; no random_state is passed to train_test_split.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            pipe = Pipeline([ (model, model_bank[model]['model']())])
            # Prefix each parameter name with the pipeline step name for GridSearchCV.
            params = {}
            for param in model_bank[model]['parameters'].keys():
                params[f'{model}__{param}'] = model_bank[model]['parameters'][param]
            search = GridSearchCV(pipe, params, n_jobs=3)
            search.fit(X_train, y_train)
            y_pred = search.predict(X_test)

            mape = mean_absolute_percentage_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # metrics_df collects one result row per (target, model) pair
            metrics_df.loc[len(metrics_df.index)] = [target, model, mse, mape, r2]
            #dataframe_collection[n] = pd.DataFrame(metrics_df, columns=['Target', 'Model', 'MSE', 'MAPE', 'R_Squared'])
    # One file per repetition, named with task_name and a timestamp.
    # NOTE(review): the timestamp has one-second resolution, so two repetitions that
    # finish within the same second would write to the same file name.
    metrics_df.to_excel(f'./{task_name}_{datetime.now().strftime("%Y%m%d-%H%M%S")}.xlsx')


    #scores = [ {'model': 'LR', 'scores': [target, model, mse, mape, r2]}, {'model': 'LSTM', 'scores': [target, model, mse, mape, r2]} ]    
    
#     from datetime import datetime
      


In [None]:
# Load exported result files into a list of DataFrames.
df = metrics_df
df
# NOTE(review): this rebinds the name `glob` to the module; the top of the notebook
# imports the function with `from glob import glob`, so code calling glob(...) directly
# will fail after this cell has run.
import glob

# NOTE(review): the right-hand side is only a comment placeholder, so the assignment
# does not parse as written - supply the list of files to read.
files = #data files
excel_dfs = []
for f in files:
  # `df` is reused as the loop variable and no longer refers to metrics_df afterwards.
  df = pd.read_excel(f)
  excel_dfs.append(df)

excel_dfs
# NOTE(review): hard-coded position; raises IndexError when fewer than five files were read.
excel_dfs[4]
len(excel_dfs)
    

    
# Summarise the loaded tables row by row: rows that share the same position in
# their original file are grouped together.
combined_runs = (
    # combine dataframes into a single dataframe
    pd.concat(excel_dfs)
    # replace 0 values with nan to exclude them from the statistics
    .replace(0, np.nan)
    # expose the row position within the original dataframe as the column "index"
    .reset_index()
    # keep numeric columns only (this includes "index"); text columns cannot be
    # averaged and make the aggregations raise on recent pandas versions
    .select_dtypes(include='number')
)

# group by the row within the original dataframe; built once and reused below
runs_by_row = combined_runs.groupby("index")

# mean per row position
mdf = runs_by_row.mean()

print(mdf)

# standard deviation per row position
sdf = runs_by_row.std()

print(sdf)

# variance per row position
vdf = runs_by_row.var()

print(vdf)