In [None]:
import sys
import warnings
import pandas as pd
import numpy as np
import itertools
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.metrics import specificity_score
from sklearn.utils import shuffle
#from sklearn.preprocessing import Imputer

from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the CSV that every later cell reads from (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='../wer_manASR_feat_v30.csv')

In [None]:
# Map +inf and -inf to NaN so the imputation cell below treats them as missing values
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [None]:
# Display the names of the columns in which every single value is NaN
df.columns[df.isna().all(axis=0)].tolist()

In [None]:
# Remove the NaN-only columns from df in place.
# The positional slices used by later cells (22:982, 22:502, 502:982) are taken
# after this drop, so the list below must stay in sync with them.
df.drop(
    columns=[
        'diff_norm_pos_SPACE',
        'diff_norm_sub_coord_ratio',
        'diff_norm_tag_""',
        'diff_norm_tag_#',
        'diff_norm_tag_$',
        'diff_norm_tag_-PRB-',
        'diff_norm_tag_BES',
        'diff_norm_tag_GW',
        'diff_norm_tag_HVS',
        'diff_norm_tag_SP',
    ],
    inplace=True,
)

In [None]:
# Fill NaNs in the feature columns (positions 22..981) with the column mean and
# keep the result, under the original column names, in a new dataframe.
# NOTE(review): the means are computed over all rows, including rows that are
# later held out during LOOCV - confirm this is acceptable.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
clean_df = pd.DataFrame(
    data=imp.fit_transform(df.iloc[:, 22:982]),
    columns=df.columns[22:982].tolist(),
)

In [None]:
# Attach the last five columns of df to the imputed features, joined on the row index
clean_df = clean_df.join([df.iloc[:, offset] for offset in range(-5, 0)])

In [None]:
# Distinct non-missing values of df's last column, in order of first appearance.
# NOTE(review): these are later matched against the 'firstname' column inside
# transcript_loocv - presumably the last column is 'firstname'; confirm.
subj_names = (
    df.iloc[:, -1]
    .dropna()
    .unique()
    .tolist()
)

In [None]:
# Feature names by column position: 22..501 for the manual set, 502..981 for the ASR set
man_feat = df.columns[22:502].tolist()
asr_feat = df.columns[502:982].tolist()

In [None]:
#shuffle rows for better classification
# random_state pins the permutation so that Restart & Run All reproduces the
# same row order instead of a different one on every execution.
clean_df = shuffle(clean_df, random_state=1)

In [None]:
# Random forest used for every LOOCV run below.
# No random_state is set, so repeated fits can differ; the cells below average
# five repeated runs.
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    n_jobs=2,
)

In [None]:
#LOOCV function
def transcript_loocv(subject_names, classifier_name, clf, df_clean, featureset):
    """Leave-one-subject-out cross-validation of ``clf`` on one feature set.

    Each name in ``subject_names`` is held out in turn: the scaler and the
    variance filter are fitted on the remaining rows only, the training rows
    are balanced with SMOTE, ``clf`` is refitted and the held-out rows are
    scored.

    Parameters
    ----------
    subject_names : list
        Values of the ``firstname`` column; one fold is run per entry. Rows
        whose ``firstname`` is not in this list are ignored.
    classifier_name : str
        Not used by the function; kept so existing calls keep working.
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``. It is refitted in every
        fold, so on return it is fitted on the last fold's training data.
    df_clean : pandas.DataFrame
        Must contain ``firstname``, ``age_at_record``, ``group_label_id`` and
        every column named in ``featureset``. It is not modified.
    featureset : list
        Names of the feature columns to use.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, specificity)``, each the mean over
        all folds. Everything except accuracy is macro-averaged within a fold.

    Notes
    -----
    Warnings are raised as errors while the function runs; the caller's
    warning filters are restored when it returns or raises.
    """
    accuracy_fold = []
    precision_fold = []
    recall_fold = []
    spec_fold = []
    f1_arr = []

    # catch_warnings scopes the 'error' filter to this call instead of changing
    # the warning behaviour of the whole kernel.
    with warnings.catch_warnings():
        warnings.simplefilter('error')

        # Fold-independent preparation is done once instead of once per subject.
        df_data = df_clean[df_clean['firstname'].isin(subject_names)]
        df_data = df_data.sort_values('age_at_record')
        X_all = df_data[featureset].values
        subject_col = df_data['firstname']

        # Label 2 is merged into label 0. np.where builds a new array, so the
        # frame's own data is never written to (arrays returned by .values can
        # be read-only views of the frame).
        raw_labels = df_data['group_label_id'].values
        y_label = np.where(raw_labels == 2, 0, raw_labels)

        for name in subject_names:
            is_test = subject_col.isin([name]).values
            is_train = ~is_test

            X_train = X_all[is_train]
            X_test = X_all[is_test]
            y_train = y_label[is_train]
            y_test = y_label[is_test]

            # Scaling statistics and the variance filter come from the training
            # rows only, so nothing about the held-out subject leaks into the
            # model; the held-out rows are merely transformed.
            scaler = StandardScaler().fit(X_train)
            # Drops features whose variance on the scaled training rows is
            # below .99 * (1 - .99), i.e. roughly 0.0099.
            sel = VarianceThreshold(threshold=(.99 * (1 - .99)))
            X_train = sel.fit_transform(scaler.transform(X_train))
            X_test = sel.transform(scaler.transform(X_test))

            # Oversample the training rows only. fit_resample is the current
            # name of the method formerly called fit_sample.
            X_train, y_train = SMOTE(random_state=1, k_neighbors=3).fit_resample(X_train, y_train)

            clf.fit(X_train, y_train)
            prediction = clf.predict(X_test)

            # NOTE(review): a fold contains a single subject. If all of its
            # rows share one label and the prediction contains another, the
            # macro-averaged scores may emit an undefined-metric warning, which
            # the 'error' filter turns into an exception - confirm intended.
            sensitivity = recall_score(y_pred = prediction, y_true = y_test, average='macro')
            specificity = specificity_score(y_true = y_test, y_pred = prediction, average='macro')
            precision = precision_score(y_pred = prediction, y_true = y_test, average='macro')
            f1 = f1_score(y_pred = prediction, y_true = y_test, average='macro')

            accuracy_fold.append(accuracy_score(y_true = y_test, y_pred = prediction))
            precision_fold.append(precision)
            recall_fold.append(sensitivity)
            f1_arr.append(f1)
            spec_fold.append(specificity)

        return np.mean(accuracy_fold),np.mean(precision_fold),np.mean(recall_fold),np.mean(f1_arr),np.mean(spec_fold)

In [None]:
# Repeat LOOCV on the manual-transcript features five times and average the repeats.
# Each repeat yields (accuracy, precision, recall, f1, specificity); the inner
# list gives the stacked results shape (5, 1, 5), so the average has shape (1, 5).
manual_tr_performance = [
    [transcript_loocv(subj_names, 'random_forest', clf, clean_df, man_feat)]
    for _ in range(5)
]
avg_manual_tr_performance = np.mean(manual_tr_performance, axis=0)

In [None]:
# Values are (accuracy, precision, recall, f1, specificity), averaged over the five repeats
print(avg_manual_tr_performance)

In [None]:
# Repeat LOOCV on the ASR-transcript features five times and average the repeats.
# Each repeat yields (accuracy, precision, recall, f1, specificity); the inner
# list gives the stacked results shape (5, 1, 5), so the average has shape (1, 5).
asr_tr_performance = [
    [transcript_loocv(subj_names, 'random_forest', clf, clean_df, asr_feat)]
    for _ in range(5)
]
avg_asr_tr_performance = np.mean(asr_tr_performance, axis=0)

In [None]:
# Values are (accuracy, precision, recall, f1, specificity), averaged over the five repeats
print(avg_asr_tr_performance)