In [1]:
#from collections import Counter
from scipy.stats import mode
import statistics
import pandas as pd
import numpy as np
import itertools
import statistics
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.metrics import specificity_score
from sklearn.utils import shuffle
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('../wer_manASR_feat_v30.csv')

In [3]:
#replace all +-inf with NaNs
df = df.replace([np.inf, -np.inf], np.nan)

In [4]:
#return list of columns that have only NaN values
df.columns[df.isnull().all()].tolist()

['diff_norm_pos_SPACE',
 'diff_norm_sub_coord_ratio',
 'diff_norm_tag_""',
 'diff_norm_tag_#',
 'diff_norm_tag_$',
 'diff_norm_tag_-PRB-',
 'diff_norm_tag_BES',
 'diff_norm_tag_GW',
 'diff_norm_tag_HVS',
 'diff_norm_tag_SP']

In [5]:
#delete the NaNs columns
df.drop(['diff_norm_pos_SPACE','diff_norm_sub_coord_ratio','diff_norm_tag_""', 'diff_norm_tag_#', 'diff_norm_tag_$', 'diff_norm_tag_-PRB-', 'diff_norm_tag_BES', 'diff_norm_tag_GW', 'diff_norm_tag_HVS', 'diff_norm_tag_SP'], axis=1, inplace=True)

In [6]:
#impute NaN for feature columns and store them in the new dataframe
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
clean_df = pd.DataFrame(imp.fit_transform(df.loc[:, "ADJP_->_JJ_x":"zcr_var_y"]), columns = list(df)[df.columns.get_loc("ADJP_->_JJ_x"):(df.columns.get_loc("zcr_var_y") + 1)])

In [7]:
#append new dataframe with selected columns
clean_df = clean_df.join([df['subject_id'], df['firstname'], df['age_at_record'], df['year_diag'], df['group_label_id']])

In [8]:
#get list of subject names
subj_names = df.iloc[:, -1].dropna().unique().tolist()

In [9]:
#select feature names
man_feat = list(df)[df.columns.get_loc("ADJP_->_JJ_x"):(df.columns.get_loc("zcr_var_x") + 1)]
asr_feat = list(df)[df.columns.get_loc("ADJP_->_JJ_y"):(df.columns.get_loc("zcr_var_y") + 1)]

In [10]:
#shuffle rows for better classification
clean_df = shuffle(clean_df)

In [11]:
#pick random forest for classifier
clf = RandomForestClassifier(n_estimators = 2, max_depth = 70)

In [12]:
#UPDATED - for SAMPLE - LOOCV function
def sample_transcript_loocv(subject_names, classifier_name, clf, df_clean, featureset):
    prediction = []
    y_test = []
    for name in subject_names:        
        df_data = df_clean.copy()
        df_data = df_data[df_data.firstname.isin(subject_names)]
        ind_train = [~df_data['firstname'].isin([name])]
        ind_test = [df_data['firstname'].isin([name])]
        df_data = df_data.sort_values('age_at_record')
        #for every train-test split, obtain means only from training HC group (so that model is completely agnostic to labels)
        X = df_data[featureset].values
        y_label = df_data['group_label_id'].values
        years = df_data['age_at_record'].values
        years_diag = df_data['year_diag'].values
        sub_id = df_data['subject_id'].values
        X = StandardScaler().fit_transform(X)
        sel = VarianceThreshold(threshold=(.99 * (1 - .99)))
        X = sel.fit_transform(X)
        ind_train = [~df_data['firstname'].isin([name])]
        ind_test = [df_data['firstname'].isin([name])] 
        X_train = X[tuple(ind_train)]
        X_test = X[tuple(ind_test)]
        y_train = y_label[tuple(ind_train)]
        y_test.append(y_label[tuple(ind_test)])
        X_train,y_train = SMOTE(random_state=1,k_neighbors=3).fit_sample(X_train, y_train)     
        clf.fit(X_train,y_train)
        prediction.append(clf.predict(X_test))
        
    y_test = [item for sublist in y_test for item in sublist]
    prediction = [item for sublist in prediction for item in sublist]
    
    #don't need lists anymore since metrics return one value
    sensitivity = recall_score(y_pred = prediction, y_true = y_test, average='macro')
    specificity = specificity_score(y_true = y_test, y_pred = prediction, average='macro')
    precision = precision_score(y_pred = prediction, y_true = y_test, average='macro')
    f1 = f1_score(y_pred = prediction, y_true = y_test, average='macro')  
    accuracy = accuracy_score(y_true = y_test, y_pred = prediction)
        
    return accuracy,precision,sensitivity,f1,specificity #don't need mean since metrics return one value

In [13]:
#function to return a set of modes
def mode_set(array):
    most = max(list(map(array.count, array)))
    return list(set(filter(lambda x: array.count(x) == most, array)))

In [19]:
#UPDATED - for SUBJECT - LOOCV function
def subject_transcript_loocv(subject_names, classifier_name, clf, df_clean, featureset):
    prediction_mode = []
    y_test_mode = []
    for name in subject_names:        
        df_data = df_clean.copy()
        df_data = df_data[df_data.firstname.isin(subject_names)]
        ind_train = [~df_data['firstname'].isin([name])]
        ind_test = [df_data['firstname'].isin([name])]
        df_data = df_data.sort_values('age_at_record')
        #for every train-test split, obtain means only from training HC group (so that model is completely agnostic to labels)
        X = df_data[featureset].values
        y_label = df_data['group_label_id'].values
        years = df_data['age_at_record'].values
        years_diag = df_data['year_diag'].values
        sub_id = df_data['subject_id'].values
        X = StandardScaler().fit_transform(X)
        sel = VarianceThreshold(threshold=(.99 * (1 - .99)))
        X = sel.fit_transform(X)
        ind_train = [~df_data['firstname'].isin([name])]
        ind_test = [df_data['firstname'].isin([name])] 
        X_train = X[tuple(ind_train)]
        X_test = X[tuple(ind_test)]
        y_train = y_label[tuple(ind_train)]
        y_test = y_label[tuple(ind_test)].tolist() #don't need to append anymore, can just store as an array converted to list
        X_train,y_train = SMOTE(random_state=1,k_neighbors=3).fit_sample(X_train, y_train)     
        clf.fit(X_train,y_train)
        prediction = clf.predict(X_test).tolist() #don't need to append anymore, can just store as an array converted to list
        
        temp = mode_set(prediction)    #store results from mode for prediction in temp values to test if there are multiple modes returned
        y_test_mode.append(mode_set(y_test))
        if len(temp) == 2:
            y_test_mode.append(mode_set(y_test)) #store one more mode into y_test so that total numbers of modes is the same in test and prediction lists
            a, b = temp         #store the two modes as separate values
            pred_one = []
            pred_two = []
            pred_one.append(a)         #convert to list
            pred_two.append(b)
            prediction_mode.append(pred_one)       #add list value to predictions
            prediction_mode.append(pred_two)
        else:
            prediction_mode.append(temp)
    
    #don't need 'fold_...' lists anymore since metrics return only one value
    sensitivity = recall_score(y_pred = prediction_mode, y_true = y_test_mode, average='macro')
    specificity = specificity_score(y_true = y_test_mode, y_pred = prediction_mode, average='macro')
    precision = precision_score(y_pred = prediction_mode, y_true = y_test_mode, average='macro')
    f1 = f1_score(y_pred = prediction_mode, y_true = y_test_mode, average='macro')  
    accuracy = accuracy_score(y_true = y_test_mode, y_pred = prediction_mode)
        
    return accuracy,precision,sensitivity,f1,specificity

In [28]:
#run sample and subject LOOCV on manual transcript features
sample_manual_tr_performance = []
subject_manual_tr_performance = []
for i in range(5):
    sample_manual_tr_performance.append([sample_transcript_loocv(subj_names, 'random_forest', clf, clean_df, man_feat)])
    subject_manual_tr_performance.append([subject_transcript_loocv(subj_names, 'random_forest', clf, clean_df, man_feat)])
sample_avg_manual_tr_performance = np.mean(sample_manual_tr_performance, axis = 0)
subject_avg_manual_tr_performance = np.mean(subject_manual_tr_performance, axis = 0)

print('Sample performance for manual transcripts', sample_avg_manual_tr_performance)
print('Subject performance for manual transcripts', subject_avg_manual_tr_performance)

Sample performance [[0.43298097 0.40572959 0.4272924  0.39330853 0.4272924 ]]
Subject performance [[0.43333333 0.23161765 0.43333333 0.30174359 0.43333333]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [29]:
#run sample and subject LOOCV on asr transcript features
sample_asr_tr_performance = []
subject_asr_tr_performance = []
for i in range(5):
    sample_asr_tr_performance.append([sample_transcript_loocv(subj_names, 'random_forest', clf, clean_df, asr_feat)])
    subject_asr_tr_performance.append([subject_transcript_loocv(subj_names, 'random_forest', clf, clean_df, asr_feat)])
sample_avg_asr_tr_performance = np.mean(sample_asr_tr_performance, axis = 0)
subject_avg_asr_tr_performance = np.mean(subject_asr_tr_performance, axis = 0)

print('Sample performance for asr transcripts', sample_avg_manual_tr_performance)
print('Subject performance for asr transcripts', subject_avg_manual_tr_performance)

Sample performance for asr transcripts [[0.43298097 0.40572959 0.4272924  0.39330853 0.4272924 ]]
Subject performance for asr transcripts [[0.43333333 0.23161765 0.43333333 0.30174359 0.43333333]]
