# Machine learning pipeline

In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import time
from datetime import datetime
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import LeaveOneGroupOut, StratifiedGroupKFold, GroupKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import pickle
from sklearn.decomposition import PCA

In [2]:
def prepare_data(df):
    """
        Split a feature dataframe into the pieces used by the pipeline.

        Args:
            df: dataframe holding a 'label' column, a 'user' column and the
                feature columns, which run from column 'WC' to the last column.

        Returns:
            X: dataframe copy of every column from 'WC' onwards.
            y: numpy array built from the 'label' column.
            groups: list built from the 'user' column.
            filenames: numpy array built from the same 'user' column.
    """
    labels = np.array(df['label'].values)
    user_values = df['user'].values
    user_list = list(user_values)
    user_array = np.array(list(user_values))
    # Label-based slice: 'WC' itself is included.
    features = df.loc[:, 'WC':].copy()
    return features, labels, user_list, user_array

def pred_speaker(filenames, y_true, y_pred, threshold=0.5):
    """
        Aggregate per-segment predictions into one prediction per speaker.

        For every distinct value in `filenames`, the predictions of its
        segments are averaged; the speaker is assigned 1 when that mean is
        strictly greater than `threshold`, otherwise 0.

        Args:
            filenames: sequence with one speaker identifier per segment.
            y_true: indexable sequence of ground-truth labels, aligned with `filenames`.
            y_pred: sequence of predicted labels, aligned with `filenames`.
            threshold: cut-off applied to the mean prediction (default 0.5).

        Returns:
            (y_trues_speaker, y_preds_speaker, predmeans_speaker): three parallel
            lists with one entry per speaker, ordered by the first appearance of
            each speaker in `filenames`. The true label of a speaker is taken
            from its first segment.
    """
    filenames_arr = np.asarray(filenames)
    y_pred_arr = np.asarray(y_pred)

    y_trues_speaker = []
    y_preds_speaker = []
    predmeans_speaker = []
    # dict.fromkeys de-duplicates while keeping insertion order, so the output
    # order is reproducible (iterating a set of strings is not).
    for speaker in dict.fromkeys(filenames):
        indexes = np.where(filenames_arr == speaker)[0]
        predmean = np.mean(y_pred_arr[indexes])
        predmeans_speaker.append(predmean)
        y_preds_speaker.append(1 if predmean > threshold else 0)
        y_trues_speaker.append(y_true[indexes[0]])
    return y_trues_speaker, y_preds_speaker, predmeans_speaker

def unison_shuffled_copies(X, y, filenames, seed=13):
    """
        Shuffle X, y and filenames with one shared permutation, so rows stay paired.

        Args:
            X, y, filenames: equally long objects that support indexing with an
                integer array (e.g. numpy arrays).
            seed: seed of the permutation (default 13).

        Returns:
            (X[p], y[p], filenames[p]) for the permutation p.

        Raises:
            AssertionError: if the three inputs do not have the same length.
    """
    assert len(X) == len(y)
    assert len(y) == len(filenames)
    # A private RandomState produces the same permutation as seeding the global
    # generator with the same value, but leaves numpy's global RNG untouched.
    rng = np.random.RandomState(seed)
    p = rng.permutation(len(X))
    return X[p], y[p], filenames[p]

def makeDirIfNotExists(dir):
    """
        Create directory `dir` (and any missing parent directories) if it does not exist.

        Does nothing when the directory is already there.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(dir, exist_ok=True)
        
def load_combinations():
    """
        Build the classifiers, their hyper-parameter grids and the evaluation scorers.

        Returns:
            dict_classifiers: estimator instances keyed by "SVC" and "KNN".
            dict_parameters: for each of those keys, a grid of candidate values;
                every grid entry is named '<key>__<parameter>'.
            scoring: scorers keyed by 'kappa', 'acc' and 'mcc'.
    """
    svc_model = SVC(C=1, kernel='linear', gamma='scale')
    knn_model = KNN(n_neighbors=2, weights='uniform', metric='minkowski')
    dict_classifiers = {"SVC": svc_model, "KNN": knn_model}

    svc_grid = {
        'SVC__kernel': ['rbf', 'sigmoid', 'poly', 'linear'],
        'SVC__gamma': [0.001, 0.01, 0.1, 1, 'auto', 'scale'],
        'SVC__C': [1, 10, 100, 1000],
    }
    knn_grid = {
        'KNN__n_neighbors': [1, 2, 3, 5, 7],
        'KNN__weights': ['uniform', 'distance'],
        'KNN__metric': ['euclidean', 'manhattan', 'minkowski'],
    }
    dict_parameters = {"SVC": svc_grid, "KNN": knn_grid}

    kappa_scorer = make_scorer(cohen_kappa_score)
    mcc_scorer = make_scorer(matthews_corrcoef)
    scoring = {'kappa': kappa_scorer, 'acc': 'accuracy', 'mcc': mcc_scorer}

    return dict_classifiers, dict_parameters, scoring

def load_train(data_dir='C:/Users/lugoza/Documents/AnacondaFiles/Dementia/data/ADReSS-IS2020/train'):
    """
        Load the training metadata of both classes into one dataframe.

        Reads 'cc_meta_data.txt' and 'cd_meta_data.txt' (';'-separated) from
        `data_dir`, labels their rows 0 and 1 respectively, stacks them and
        normalises the padded column names 'ID   ', ' age' and ' gender '.

        Args:
            data_dir: directory containing the two metadata files. Defaults to
                the location originally hard-coded here; pass another path to
                run on a different machine.

        Returns:
            Dataframe with a fresh 0..n-1 index, a 'Label' column
            (0 for cc rows, 1 for cd rows) and a numeric 'mmse' column.
    """
    info_train_cc = pd.read_csv(os.path.join(data_dir, 'cc_meta_data.txt'), sep=';')
    info_train_cd = pd.read_csv(os.path.join(data_dir, 'cd_meta_data.txt'), sep=';')

    info_train_cc['Label'] = 0
    info_train_cd['Label'] = 1

    info_train = pd.concat([info_train_cc, info_train_cd]).reset_index(drop=True)

    info_train = info_train.rename(columns={'ID   ': 'ID', ' age': 'age', ' gender ': 'gender'})
    # The mmse of the first row is blanked before the numeric conversion.
    # NOTE(review): presumably that row holds a non-numeric placeholder - confirm.
    # NOTE(review): this assumes the column is named exactly 'mmse' (no padding);
    # otherwise the assignment below creates a new all-NaN column - verify against the files.
    info_train.loc[0, 'mmse'] = np.nan
    info_train["mmse"] = pd.to_numeric(info_train["mmse"])

    return info_train

def list_drop_subjects(info_train):
    """
        List the subjects labelled 1 whose mmse is above 23.

        Args:
            info_train: dataframe with 'ID', 'mmse' and 'Label' columns.

        Returns:
            List of the matching 'ID' values, each with its last character removed.
    """
    is_labelled_one = info_train.Label == 1
    has_high_mmse = info_train.mmse > 23
    raw_ids = info_train.loc[has_high_mmse & is_labelled_one, 'ID'].values
    # NOTE(review): the dropped last character is presumably trailing padding in the ID - confirm.
    return [raw_id[:-1] for raw_id in raw_ids]

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 3 decimals, or NaN when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return round(numerator / denominator, 3)


def compute_metrics(y_true, y_pred):
    """
        Compute classification metrics from ground-truth and predicted labels.
        (When per-subject labels are passed, the metrics are per subject.)
        https://towardsdatascience.com/should-i-look-at-precision-recall-or-specificity-sensitivity-3946158aace1

        Labels are binary: 0 = negative = non-AD (control), 1 = positive = AD (dementia).

        Args:
            y_true: ground-truth labels.
            y_pred: predicted labels, aligned with `y_true`.

        Returns:
            (acc, kappa, f1score, cm, sensitivity, specificity, precision) where
            acc is a percentage rounded to 2 decimals, cm is the 2x2 confusion
            matrix for labels [0, 1], and the remaining values are rounded to
            3 decimals. A ratio whose denominator is 0 is returned as NaN.
    """

    acc = round(accuracy_score(y_true, y_pred)*100,2)
    kappa = round(cohen_kappa_score(y_true,y_pred),3)
    # Fixing the label set keeps cm 2x2 even when only one class is present,
    # which would otherwise make the cell look-ups below fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sensitivity = _safe_ratio(tp, tp + fn) # = recall
    specificity = _safe_ratio(tn, tn + fp)
    precision = _safe_ratio(tp, tp + fp)
    recall = sensitivity
    # F1 is derived from the already-rounded precision and recall.
    f1score = _safe_ratio(2 * (precision * recall), precision + recall)

    return acc, kappa, f1score, cm, sensitivity, specificity, precision

# Run the experiments: feature ranking, cross-validation and test evaluation

In [6]:
# List the Excel feature files to process. os.listdir returns names in arbitrary
# order, so they are sorted (case-insensitively) to make the processing order reproducible.
files = os.listdir('0-publication/features/')
all_features = sorted((f for f in files if f.endswith('xlsx')), key=str.lower)
print(all_features)

['ASR_wav2vec2_noPunc_LIWC2015.xlsx', 'ASR_whisper_noPunc_LIWC2015.xlsx', 'ASR_whisper_noPunc_LIWC2015_pause.xlsx', 'ASR_whisper_Punc_LIWC2015.xlsx', 'ASR_whisper_Punc_LIWC2015_pause.xlsx', 'manual_noPunc_LIWC2015.xlsx', 'manual_noPunc_LIWC2015_pause.xlsx', 'manual_Punc_LIWC2015.xlsx', 'manual_Punc_LIWC2015_pause.xlsx']


In [1]:
# Experiment driver. For every feature file: rank the features with RFE, then, for each
# classifier and each top-k feature subset, run leave-one-group-out cross-validation on
# the 'train' partition and evaluate on the 'test' partition. One result row is produced
# per (feature file, classifier, k) and collected in df_results.
save = True
path_save = '0-publication/results_AASPAA/'
tic_total = time.time()
df_results = pd.DataFrame()
result_rows = []  # one dict per evaluated configuration; converted to df_results in bulk
# ['ASR_wav2vec2_noPunc_LIWC2015','ASR_whisper_noPunc_LIWC2015','ASR_whisper_noPunc_LIWC2015_pause','ASR_whisper_Punc_LIWC2015','ASR_whisper_Punc_LIWC2015_pause']
for cleaning_training in [False]:
    if save:
        makeDirIfNotExists(path_save)
    for name_features in all_features:
        # (1) Load features
        print('Loading features ...')
        df_features_all = pd.read_excel('0-publication/features/'+name_features)

        if cleaning_training:
            # Remove the users returned by list_drop_subjects before splitting.
            info_train = load_train()
            subjects_drop = list_drop_subjects(info_train)
            df_features = df_features_all[~df_features_all['user'].isin(subjects_drop)].copy()
            df_features.reset_index(inplace=True, drop=True)
            print('df_features:',df_features.shape)
        else:
            df_features = df_features_all.copy()

        df_dev = df_features.loc[df_features.partition == 'train'].copy()
        df_dev.reset_index(inplace=True, drop=True)
        X_dev, y_dev, groups_dev, filenames_dev = prepare_data(df_dev)
        df_test = df_features.loc[df_features.partition == 'test'].copy()
        df_test.reset_index(inplace=True, drop=True)
        X_test, y_test, groups_test, filenames_test = prepare_data(df_test)

        # (2) Feature selection
        print('Feature selection...')
        tic = time.time()
        # (2.1) Feature normalization => only for this step!
        std_scaler = StandardScaler()
        X_dev_norm = std_scaler.fit_transform(X_dev)
        # (2.2) Recursive Feature Elimination (RFE)
        # NOTE(review): the ranking is fitted on the whole 'train' partition, so the
        # cross-validation folds below have already influenced the feature order;
        # confirm this is acceptable for the reported CV metrics.
        selector = RFE(SVC(kernel="linear"), n_features_to_select=1, step=1, verbose=0)
        selector.fit(X_dev_norm, y_dev)
        sortedFeatures = sorted(zip(selector.ranking_, X_dev.columns))
        # Keep only the names of the already-sorted features
        bestFeatures = [feature_name for _, feature_name in sortedFeatures]
        toc = time.time()
        print('    Duration:',round((toc-tic)/60,2),'min')

        # (3) Training models
        print('Training models...')

        dict_classifiers = {
                    "SVC": SVC(C=1, kernel= 'linear', gamma = 'scale'),
                    "KNN": KNN(n_neighbors=5, weights='uniform', metric='minkowski'),
            }
        for clas in dict_classifiers.keys():
            tic_clas = time.time()
            print('   ', clas)
            dictionary_nr = dict()
            for nr in range(1,len(sortedFeatures)+1):
                topNrFeatures = bestFeatures[0:nr]
                seleccion_X_dev = np.array(X_dev.loc[:, topNrFeatures]) # X_dev not normalized yet; that is done per fold

                # (3.1) Cross-validation
                logo = LeaveOneGroupOut()
                y_pred_tots = []; y_true_tots = []
                for train_index, val_index in logo.split(seleccion_X_dev, y_dev, groups_dev):
                    X_train_i, X_val_i = seleccion_X_dev[train_index], seleccion_X_dev[val_index]
                    y_train_i, y_val_i = y_dev[train_index], y_dev[val_index]
                    filenames_train_i, filenames_val_i = filenames_dev[train_index], filenames_dev[val_index]

                    # Shuffle data
                    X_train, y_train, filenames_train = unison_shuffled_copies(X_train_i, y_train_i, filenames_train_i)
                    X_val, y_val, filenames_val = unison_shuffled_copies(X_val_i, y_val_i, filenames_val_i)

                    # Normalize data: the scaler is fitted on the training fold only
                    new_scaler = StandardScaler() #StandardScaler(), MinMaxScaler
                    X_train_norm = new_scaler.fit_transform(X_train)
                    X_val_norm = new_scaler.transform(X_val)

                    clf = dict_classifiers[clas]
                    clf.fit(X_train_norm, y_train)
                    cv_y_pred, cv_y_true = clf.predict(X_val_norm), y_val

                    # Only the first row of the held-out group is kept.
                    # NOTE(review): assumes one row per user - confirm.
                    y_true_tots.append(int(cv_y_true[0]))
                    y_pred_tots.append(int(cv_y_pred[0]))

                dictionary_nr[str(nr)] = {'subject_y_true': y_true_tots,
                                          'subject_y_pred': y_pred_tots}

            for nr in range(1,len(sortedFeatures)+1):

                y_true_tots = dictionary_nr[str(nr)]['subject_y_true']
                y_pred_tots = dictionary_nr[str(nr)]['subject_y_pred']

                # (3.2) Calculate metrics CV (per speaker)
                cv_acc, cv_kappa, cv_f1score, cv_cm, cv_sensitivity, cv_specificity, cv_precision = compute_metrics(y_true_tots, y_pred_tots)

                # (4) Testing
                topNrFeatures = bestFeatures[0:nr]
                seleccion_X_dev = np.array(X_dev.loc[:, topNrFeatures]) # X_dev not normalized yet; that is done just below
                seleccion_X_test = np.array(X_test.loc[:, topNrFeatures])

                std_scaler = StandardScaler() #StandardScaler(), MinMaxScaler
                X_dev_norm = std_scaler.fit_transform(seleccion_X_dev)
                X_test_norm = std_scaler.transform(seleccion_X_test)

                clf_test = dict_classifiers[clas]
                clf_test.fit(X_dev_norm, y_dev)
                test_y_true, test_y_pred = y_test, clf_test.predict(X_test_norm)
                test_acc, test_kappa, test_f1score, test_cm, test_sensitivity, test_specificity, test_precision = compute_metrics(test_y_true, test_y_pred)

                # Users whose test prediction is wrong
                errors = [groups_test[i] for i in range(len(groups_test))
                          if test_y_true[i] != test_y_pred[i]]

                new_row ={'features':name_features,'n_features':nr,
                          'model': clas,'params':clf_test.get_params(),
                          'cleaning_training':cleaning_training,

                          'cv_acc':cv_acc,'cv_kappa':cv_kappa, 'cv_f1score':cv_f1score,'cv_cm':cv_cm,
                          'cv_sensitivity':cv_sensitivity,'cv_specifity':cv_specificity,'cv_precision':cv_precision,

                          'test_acc':test_acc,'test_kappa':test_kappa,'test_f1score':test_f1score,'test_cm':test_cm,
                          'test_sensitivity':test_sensitivity,'test_specifity':test_specificity,'test_precision':test_precision,

                          'top_features':topNrFeatures, 'errors_test':errors
                         }
                result_rows.append(new_row)

            # Build the frame once per classifier instead of concatenating row by row.
            df_results = pd.DataFrame.from_records(result_rows)

            toc_clas = time.time()
            print('    Duration:',round((toc_clas-tic_clas)/60,2),'min')
            if save:
                # df_results holds every row produced so far (all feature files processed up to now).
                # name_features still carries its '.xlsx' suffix, so the names end in '.xlsx.csv' / '.xlsx.xlsx'.
                df_results.to_csv(path_save+'resultados_AASPAA_'+name_features+'.csv',index=False)
                df_results.to_excel(path_save+'resultados_AASPAA_'+name_features+'.xlsx',index=False)

toc_total = time.time()
print('Total duration:',round((toc_total - tic_total)/60,2),'min')

In [18]:
# Summary: for each feature file and classifier, keep the df_results row with the
# highest cross-validation accuracy.
all_features = ['ASR_wav2vec2_noPunc_LIWC2015.xlsx', 'ASR_whisper_noPunc_LIWC2015.xlsx',
                'ASR_whisper_noPunc_LIWC2015_pause.xlsx', 'ASR_whisper_Punc_LIWC2015.xlsx',
                'ASR_whisper_Punc_LIWC2015_pause.xlsx', 'manual_noPunc_LIWC2015.xlsx',
                'manual_noPunc_LIWC2015_pause.xlsx', 'manual_Punc_LIWC2015.xlsx', 'manual_Punc_LIWC2015_pause.xlsx']
path_save = '0-publication/results_AASPAA/'
best_rows = []
for features in all_features:
    for model in ['SVC','KNN']:
        for cleaning_training in [False]:
            df_select = df_results.loc[(df_results.features == features)&(df_results.model == model)&
                           (df_results.cleaning_training == cleaning_training)].copy()
            df_select.reset_index(inplace=True, drop=True)
            if df_select.empty:
                # Nothing was computed for this combination; skip it instead of failing on .iloc[0].
                print('No results for:', features, model, cleaning_training)
                continue
            # A stable sort keeps tied cv_acc rows in their df_results order, so the
            # selected row is well defined when several rows share the top accuracy.
            a = pd.DataFrame(df_select.sort_values(by='cv_acc',ascending=False,kind='mergesort').iloc[0]).T
            best_rows.append(a)
# Concatenate once at the end rather than growing the frame inside the loop.
if best_rows:
    new_df = pd.concat(best_rows)
else:
    new_df = pd.DataFrame(columns = df_results.columns)
# new_df.to_excel(path_save+'resultados_AASPAA_summary.xlsx',index=False)