In [None]:
#!/usr/bin/env python
# coding: utf-8

'''
feature selection
'''

In [None]:
# ** import package **
import os
import sys
import json
import pathlib
sys.path.append("..")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import traceback
from tqdm import tqdm
from datetime import timedelta
from _utils.preprocessing_xgboost import *
from _utils.customlogger import customlogger as CL

In [None]:
# ** loading config **
# Read the JSON configuration stored one level above the current working directory.
# The resulting dict is bound to the module-level name `cfg`, which the cells below read.
with open('./../{}'.format("config.json")) as file:
    cfg = json.load(file)

In [None]:
# ** loading info **
# Directory the notebook is executed from, and the directory that contains it.
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent
# Value of the "working_date" config entry; runTask uses it as a path component
# under <parent_dir>/data/ and <parent_dir>/result/.
current_date = cfg["working_date"]
# NOTE(review): os.path.abspath('') resolves to the current working directory, so this
# evaluates to the name of that directory (minus anything after its last dot), not the
# notebook's own file name - confirm that this is the intended log-file name.
curr_file_name = os.path.splitext(os.path.basename(os.path.abspath('')))[0]

In [None]:
# **create Logger**
# Instantiate the project's logger wrapper (imported as CL from _utils.customlogger).
log = CL("custom_logger")
# Make sure <parent_dir>/_log/ exists before a log file is requested inside it.
pathlib.Path.mkdir(pathlib.Path('{}/_log/'.format(parent_dir)), mode=0o777, parents=True, exist_ok=True)
# Rebind `log` to whatever create_logger returns; the code below calls .debug/.info/.error on it.
# Presumably this is a logger appending to ../_log/<curr_file_name>.log at DEBUG level -
# TODO confirm against _utils.customlogger.
log = log.create_logger(file_name="../_log/{}.log".format(curr_file_name), mode="a", level="DEBUG")  
log.debug('start {}'.format(curr_file_name))

In [None]:
def intersect(a, b):
    """Return the distinct elements present in both iterables as a list.

    Both arguments are converted to sets first, so duplicates disappear and
    the order of the returned list is unspecified.
    """
    left = set(a)
    right = set(b)
    shared = left.intersection(right)
    return list(shared)

def getPairedTTest(baseline_df, abnormal_df, concept_list, alpha=0.05, min_statistic=1):
    """Select concepts whose values are significantly higher in ``abnormal_df`` than in ``baseline_df``.

    Only rows with ``label == 1`` are compared. Despite the function name, the
    test that is run is Welch's independent two-sample t-test
    (``scipy.stats.ttest_ind`` with ``equal_var=False``); NaNs are omitted.

    Parameters
    ----------
    baseline_df, abnormal_df : pandas.DataFrame
        Frames holding a ``label`` column plus one column per concept.
    concept_list : iterable
        Candidate concept columns. Only those present in both frames are tested.
    alpha : float, default 0.05
        A concept is kept when its p-value is strictly below this value.
    min_statistic : float, default 1
        A concept is kept only when the t statistic (abnormal minus baseline)
        is strictly above this value.

    Returns
    -------
    pandas.DataFrame
        One row per selected concept with the columns ``concept_id``,
        ``pvalue``, ``label_1_before``, ``label_1_after`` (non-null counts) and
        ``label_1_before_mean``, ``label_1_after_mean``. The frame is empty
        (no columns) when nothing is selected.
    """
    import scipy.stats

    baseline_df = baseline_df[baseline_df['label'] == 1]
    abnormal_df = abnormal_df[abnormal_df['label'] == 1]

    # Test each candidate once, in the order given by the caller, so the output
    # row order is reproducible instead of depending on set iteration order.
    shared_columns = set(baseline_df.columns) & set(abnormal_df.columns)
    concepts = [concept for concept in dict.fromkeys(concept_list) if concept in shared_columns]

    # DataFrame.append was removed in pandas 2.0; collect plain dicts and build
    # the frame once at the end.
    selected_rows = []
    for concept in concepts:
        statistic, pvalue = scipy.stats.ttest_ind(
            abnormal_df[concept], baseline_df[concept], equal_var=False, nan_policy='omit')
        if statistic > min_statistic and pvalue < alpha:
            before = baseline_df[concept].dropna()
            after = abnormal_df[concept].dropna()
            selected_rows.append({
                'concept_id': concept,
                'pvalue': pvalue,
                'label_1_before': len(before),
                'label_1_after': len(after),
                'label_1_before_mean': before.mean(),
                'label_1_after_mean': after.mean(),
            })
    return pd.DataFrame(selected_rows)

def getMcnemarTest(baseline_df, abnormal_df, concept_list, alpha=0.05):
    """Select binary concepts whose occurrence counts differ between baseline and abnormal data.

    For every candidate concept a 2x2 table is built from the number of rows in
    which the concept equals 1::

                    baseline          abnormal
        label == 1  label_1_before    label_1_after
        label == 0  label_0_before    label_0_after

    and passed to ``mcnemar(table, exact=True)``. Concepts whose p-value is
    strictly below ``alpha`` are returned.

    ``mcnemar`` is not defined in this notebook; it is presumably provided by
    ``from _utils.preprocessing_xgboost import *`` (TODO confirm).
    NOTE(review): if that is statsmodels' McNemar test, only the off-diagonal
    cells of the table enter the statistic - verify that this table layout is
    the intended paired contingency table.

    Parameters
    ----------
    baseline_df, abnormal_df : pandas.DataFrame
        Frames holding a ``label`` column plus one column per concept.
    concept_list : iterable
        Candidate concept columns. Only those present in both frames are tested.
    alpha : float, default 0.05
        Significance threshold applied to the p-value.

    Returns
    -------
    pandas.DataFrame
        One row per selected concept with the columns ``concept_id``,
        ``pvalue``, ``label_0_before``, ``label_0_after``, ``label_1_before``
        and ``label_1_after``. Empty (no columns) when nothing is selected.
    """
    def count_present(df, label, concept):
        # Number of rows of the given label in which the concept is flagged as 1.
        return int(((df['label'] == label) & (df[concept] == 1)).sum())

    # Test each candidate once, in caller order, for a reproducible row order.
    shared_columns = set(baseline_df.columns) & set(abnormal_df.columns)
    concepts = [concept for concept in dict.fromkeys(concept_list) if concept in shared_columns]

    # DataFrame.append was removed in pandas 2.0; collect dicts and build once.
    selected_rows = []
    for concept in concepts:
        label_0_before = count_present(baseline_df, 0, concept)
        label_1_before = count_present(baseline_df, 1, concept)
        label_0_after = count_present(abnormal_df, 0, concept)
        label_1_after = count_present(abnormal_df, 1, concept)
        # Rows are labels (1 then 0), columns are periods (before then after).
        table = np.array([[label_1_before, label_1_after],
                          [label_0_before, label_0_after]])
        # Original author's note: when the sample count is < 25, use
        # mcnemar(table, exact=False, correction=True).
        result = mcnemar(table, exact=True)
        if result.pvalue < alpha:
            selected_rows.append({
                'concept_id': concept,
                'pvalue': result.pvalue,
                'label_0_before': label_0_before,
                'label_0_after': label_0_after,
                'label_1_before': label_1_before,
                'label_1_after': label_1_after,
            })
    return pd.DataFrame(selected_rows)

def average_duration_of_adverse_events(df):
    """Return the mean number of whole days between cohort start and first abnormal date.

    The frame is reduced to one row per distinct
    (``person_id``, ``cohort_start_date``, ``first_abnormal_date``) combination
    before averaging, so repeated concept rows of the same person do not weigh
    the mean. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``person_id``, ``cohort_start_date`` and
        ``first_abnormal_date``. The two date columns may hold datetimes or
        date strings.

    Returns
    -------
    int or float
        ``.days`` of the mean timedelta; NaN when no interval can be computed.
    """
    events = df[['person_id', 'cohort_start_date', 'first_abnormal_date']].drop_duplicates()
    # Coerce explicitly: columns that are still strings (as read from CSV)
    # cannot be subtracted, while datetime columns pass through unchanged.
    first_abnormal = pd.to_datetime(events['first_abnormal_date'])
    cohort_start = pd.to_datetime(events['cohort_start_date'])
    return (first_abnormal - cohort_start).mean().days

def make_pivot(df):
    """Turn long-format concept records into one wide row per person.

    For every (``person_id``, ``concept_id``) pair only the record with the
    latest ``concept_date`` is kept; its ``concept_value`` becomes the cell of
    the column named after the concept id. All columns other than the
    long-format ones (``concept_name``, ``concept_date``, ``concept_id``,
    ``concept_value``, ``concept_domain``) are used as the row key and come
    back as ordinary columns. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format records containing at least ``person_id``, ``concept_id``,
        ``concept_name``, ``concept_date``, ``concept_value``,
        ``concept_domain`` and ``first_abnormal_date``.

    Returns
    -------
    pandas.DataFrame
        Wide frame: key columns followed by one column per concept id.
    """
    print("person_id(count) : ", df.person_id.nunique(), "concept_name(count) : ", df.concept_name.nunique())
    key_cols = ['person_id', 'concept_id']
    ordered = df.sort_values(by=key_cols + ['concept_date'], axis=0, ascending=True)
    # pivot_table drops rows whose index values are missing, so a missing
    # first_abnormal_date is replaced by a sentinel date to keep those persons.
    ordered['first_abnormal_date'] = pd.to_datetime(ordered['first_abnormal_date']).fillna(pd.to_datetime('1900-01-01'))
    # Keep the last record of every (person, concept) pair. This replaces
    # groupby(...).apply(lambda x: x.iloc[-1]), which runs a Python callback per
    # group and relies on grouping columns being handed to the callback
    # (deprecated since pandas 2.2). Rows with a missing key are removed first
    # because groupby ignored them as well.
    last_record_df = (
        ordered.dropna(subset=key_cols)
        .drop_duplicates(subset=key_cols, keep='last')
        .reset_index(drop=True)
    )
    long_only_cols = {'concept_name', 'concept_date', 'concept_id', 'concept_value', 'concept_domain'}
    pivot_cols = [col for col in last_record_df.columns if col not in long_only_cols]
    pivot_df = pd.pivot_table(data=last_record_df, index=pivot_cols, columns='concept_id', values='concept_value').reset_index()
    return pivot_df

def impute_conditional_data(df, concept_ids):
    """Fill missing values of the given concept columns with each column's median.

    Only the concept ids that actually exist as columns of ``df`` are touched.
    ``df`` is modified in place and also returned.
    """
    target_cols = list(set(df.columns).intersection(set(concept_ids)))
    column_medians = df[target_cols].median()
    df[target_cols] = df[target_cols].fillna(column_medians)
    return df
    
def impute_binary_data(df, concept_ids):
    """Fill missing values of the given concept columns with 0.

    Only the concept ids that actually exist as columns of ``df`` are touched.
    ``df`` is modified in place and also returned.
    """
    target_cols = list(set(df.columns).intersection(set(concept_ids)))
    filled = df[target_cols].fillna(0)
    df[target_cols] = filled
    return df

def normalization_Robust(df):
    """Fit a scikit-learn ``RobustScaler`` on ``df`` and return the scaled data.

    The scaler is fitted on, and applied to, the same data; the transformed
    values are returned as produced by ``RobustScaler.transform``.
    """
    from sklearn.preprocessing import RobustScaler
    scaler = RobustScaler()
    return scaler.fit(df).transform(df)

def normalization_std(df):
    """Fit a scikit-learn ``StandardScaler`` on ``df`` and return the scaled data.

    The scaler is fitted on, and applied to, the same data; the transformed
    values are returned as produced by ``StandardScaler.transform``.
    """
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    return scaler.fit(df).transform(df)

def normalization_minmax(df):
    """Fit a scikit-learn ``MinMaxScaler`` on ``df`` and return the scaled data.

    The scaler is fitted on, and applied to, the same data; the transformed
    values are returned as produced by ``MinMaxScaler.transform``.
    """
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    return scaler.fit(df).transform(df)


In [None]:
# In[ ]:
def _save_selected_features(features_df, method, outcome_name, output_result_dir,
                            concept_id_name_dict, concept_id_domain_dict, domain_concept_ids):
    """Annotate the features chosen by one method, write them to CSV and print per-domain counts.

    ``features_df`` must carry a ``concept_id`` column. ``concept_name`` and
    ``concept_domain`` columns are added from the two lookup dicts; ids that are
    missing from them (non-concept columns such as ``age``) keep their own id as
    name and get the domain ``'common'``. The result is written to
    ``<output_result_dir>/<outcome_name>_<method>.csv``.

    Returns the annotated copy, or ``None`` (writing nothing) when the method
    selected no feature.
    """
    if features_df is None or features_df.empty or 'concept_id' not in features_df.columns:
        return None
    features_df = features_df.copy()
    concept_ids = list(features_df['concept_id'])
    features_df['concept_name'] = [concept_id_name_dict.get(cid, cid) for cid in concept_ids]
    features_df['concept_domain'] = [concept_id_domain_dict.get(cid, 'common') for cid in concept_ids]
    features_df.to_csv('{}/{}_{}.csv'.format(output_result_dir, outcome_name, method), index=False, float_format='%g')
    # Number of selected features per domain, in the order of domain_concept_ids.
    print(*(len(intersect(concept_ids, domain_ids)) for domain_ids in domain_concept_ids))
    return features_df


def runTask(outcome_name):
    """Run every feature-selection method for one outcome and write the results to CSV.

    Reads ``<outcome>_{meas,drug,proc,cond}_df.txt`` from
    ``<parent_dir>/data/<current_date>/importsql/<outcome>/``, removes the
    concepts listed in the config for the outcome definition, applies
    ``cohortConditionSetting`` and pivots the records into one row per person.
    The following selections are then written to
    ``<parent_dir>/result/<current_date>/feature_selection/<outcome>/``:

    * ``_summary.csv``     - per-column overview of the pivoted data
    * ``_statistics.csv``  - t-test (measurements) and McNemar (drug/procedure/condition)
    * ``_VT.csv``          - VarianceThreshold(0.1)
    * ``_percentile.csv``  - chi2, top 3 percent
    * ``_KBest.csv``       - chi2, best 50 (or all, when fewer exist)
    * ``_ExtraTrees.csv``  - SelectFromModel on ExtraTreesClassifier
    * ``_lasso_0_1.csv`` / ``_lasso_0_0_1.csv`` - non-zero Lasso coefficients
    * ``_mutual.csv``      - mutual information above 0.001
    * ``_all_methods.csv`` - all of the above selections stacked, tagged by ``method``

    A method that selects nothing writes no file.

    Parameters
    ----------
    outcome_name : str
        Key of ``cfg['drug']``; also used in every input and output path.
    """
    # scikit-learn is only needed here, so the imports stay local to the task.
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import (SelectFromModel, SelectKBest, SelectPercentile,
                                           VarianceThreshold, chi2, mutual_info_classif)
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import train_test_split

    # One seed for the split and every stochastic estimator, so reruns agree.
    random_state = 1

    log.debug("{}".format(outcome_name))

    # input file path
    importsql_output_dir = pathlib.Path('{}/data/{}/importsql/{}/'.format(parent_dir, current_date, outcome_name))
    # output file path
    output_dir = pathlib.Path('{}/data/{}/feature_selection/{}/'.format(parent_dir, current_date, outcome_name))
    output_dir.mkdir(mode=0o777, parents=True, exist_ok=True)
    # output file path (features)
    output_result_dir = pathlib.Path('{}/result/{}/feature_selection/{}/'.format(parent_dir, current_date, outcome_name))
    output_result_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

    # ---- load data ----
    meas_df = pd.read_csv('{}/{}_meas_df.txt'.format(importsql_output_dir, outcome_name), low_memory=False)
    drug_df = pd.read_csv('{}/{}_drug_df.txt'.format(importsql_output_dir, outcome_name), low_memory=False)
    proc_df = pd.read_csv('{}/{}_proc_df.txt'.format(importsql_output_dir, outcome_name), low_memory=False)
    cond_df = pd.read_csv('{}/{}_cond_df.txt'.format(importsql_output_dir, outcome_name), low_memory=False)

    # Drug / procedure / condition records are treated as binary "present" flags.
    drug_df['concept_value'] = 1  # temp code
    proc_df['concept_value'] = 1
    cond_df['concept_value'] = 1

    # use only necessary columns
    common_cols = ['person_id', 'age', 'sex', 'cohort_start_date', 'first_abnormal_date', 'concept_date', 'concept_id', 'concept_name', 'concept_value', 'concept_domain', 'label']
    meas_df = meas_df[common_cols]
    drug_df = drug_df[common_cols]
    proc_df = proc_df[common_cols]
    cond_df = cond_df[common_cols]

    log.info("[nData] m : {} d : {} p : {}  c : {} all : {}".format(len(meas_df), len(drug_df), len(proc_df), len(cond_df), (len(meas_df) + len(drug_df) + len(proc_df) + len(cond_df))))

    # ---- remove the concepts that were used to define the outcome ----
    drug_name = outcome_name
    drug_concept_ids_excluded = [int(cid) for cid in cfg['drug'][drug_name]['@drug_concept_set'].split(',')]
    drug_df = drug_df.loc[~drug_df.concept_id.isin(drug_concept_ids_excluded)]
    meas_concept_ids_excluded = [int(cfg['meas'][meas_name]['@meas_concept_id']) for meas_name in cfg['meas']]
    meas_df = meas_df.loc[~meas_df.concept_id.isin(meas_concept_ids_excluded)]

    # ---- cohort validity filtering (project helper; keyword spelling is the helper's) ----
    meas_df = cohortConditionSetting(meas_df, pre_observation_period=60, post_observation_peroid=60)
    drug_df = cohortConditionSetting(drug_df, pre_observation_period=60, post_observation_peroid=60)
    proc_df = cohortConditionSetting(proc_df, pre_observation_period=60, post_observation_peroid=60)
    cond_df = cohortConditionSetting(cond_df, pre_observation_period=60, post_observation_peroid=60)

    ndays = average_duration_of_adverse_events(cond_df)
    log.debug('average_duration_of_adverse_events : {}'.format(ndays))

    all_domain_df = pd.concat([meas_df, drug_df, proc_df, cond_df], axis=0, ignore_index=True)
    # Baseline = records dated on or before the cohort start.
    all_domain_baseline_df = all_domain_df.query('cohort_start_date >= concept_date')

    all_domain_pivot_df = make_pivot(all_domain_df)
    all_domain_pivot_baseline_df = make_pivot(all_domain_baseline_df)

    def resumetable(df):
        # Per-column dtype, missing count / rate and number of distinct values.
        print(f'data frame shape: {df.shape}')
        n_missing = df.isnull().sum().values
        summary = pd.DataFrame(df.dtypes, columns=['data_type'])
        summary = summary.reset_index()
        # NOTE(review): when df.columns carries a name (as after pivot_table) the
        # reset column takes that name instead of 'index', so this rename may be
        # a no-op - confirm which header the summary CSV is expected to have.
        summary = summary.rename(columns={'index': 'feature'})
        summary['n_missingvalues'] = n_missing
        summary['n_missingrate'] = n_missing / len(df)
        summary['n_eigenvalues'] = df.nunique().values
        return summary

    summary = resumetable(all_domain_pivot_df)
    summary.to_csv('{}/{}_summary.csv'.format(output_result_dir, outcome_name), index=False, float_format='%g')

    concept_id_name_dict = dict(zip(all_domain_df.concept_id, all_domain_df.concept_name))
    concept_id_domain_dict = dict(zip(all_domain_df.concept_id, all_domain_df.concept_domain))

    pivot_columns = set(all_domain_pivot_df.columns)
    meas_concept_ids = list(pivot_columns.intersection(set(meas_df.concept_id)))
    drug_concept_ids = list(pivot_columns.intersection(set(drug_df.concept_id)))
    proc_concept_ids = list(pivot_columns.intersection(set(proc_df.concept_id)))
    cond_concept_ids = list(pivot_columns.intersection(set(cond_df.concept_id)))
    # The selected features always come from all_domain_pivot_df, so the domain
    # lists derived from that frame are the ones used for every per-domain count.
    domain_concept_ids = (meas_concept_ids, drug_concept_ids, proc_concept_ids, cond_concept_ids)

    def save(features_df, method):
        return _save_selected_features(features_df, method, outcome_name, output_result_dir,
                                       concept_id_name_dict, concept_id_domain_dict, domain_concept_ids)

    # Annotated selections of this run, keyed by method name (None = nothing selected).
    results = {}

    # ---- 1. statistical feature selection (on the non-imputed pivots) ----
    selected_features_with_t_test_df = getPairedTTest(all_domain_pivot_baseline_df, all_domain_pivot_df, meas_concept_ids)
    selected_features_with_mcnemar_df = getMcnemarTest(all_domain_pivot_baseline_df, all_domain_pivot_df, drug_concept_ids + cond_concept_ids + proc_concept_ids)
    statistic_frames = [frame for frame in (selected_features_with_t_test_df, selected_features_with_mcnemar_df) if not frame.empty]
    if statistic_frames:
        selected_features_df = pd.concat(statistic_frames, axis=0)
        # The builtin `object` replaces np.object, which NumPy 1.24 removed.
        selected_features_df['concept_id'] = selected_features_df['concept_id'].astype(object)
        results['statistics'] = save(selected_features_df, 'statistics')
    else:
        log.debug('no feature passed the statistical tests : {}'.format(outcome_name))

    # ---- 2. imputation ----
    all_domain_pivot_df = impute_conditional_data(all_domain_pivot_df, meas_concept_ids)
    all_domain_pivot_df = impute_binary_data(all_domain_pivot_df, drug_concept_ids + proc_concept_ids + cond_concept_ids)

    # ---- 3. model matrix ----
    x_df = all_domain_pivot_df.drop(['person_id', 'cohort_start_date', 'first_abnormal_date', 'label'], axis=1)
    y_total = all_domain_pivot_df['label'].to_numpy()
    cols = x_df.columns
    feature_names = np.asarray(cols, dtype=object)
    # chi2 needs non-negative inputs, hence min-max scaling.
    # NOTE(review): the scaler is fitted before the train/test split, so test
    # rows influence the scaling - confirm this is acceptable for selection.
    X_total = normalization_minmax(x_df.to_numpy())

    X_train, X_test, y_train, y_test = train_test_split(X_total, y_total, test_size=0.3, random_state=random_state, stratify=y_total)

    if np.isnan(X_train).any() or np.isinf(X_train).any():
        log.warning('X_train contains NaN or infinite values : {}'.format(outcome_name))

    def save_names(selected_names, method):
        return save(pd.DataFrame(selected_names, columns=['concept_id']), method)

    # ---- 4. variance threshold ----
    selector = VarianceThreshold(1e-1)
    try:
        X_train_sel = selector.fit_transform(X_train)
    except ValueError:
        # Raised when no feature reaches the threshold; the other methods still run.
        log.warning('VarianceThreshold selected no feature : {}'.format(outcome_name))
    else:
        print(X_train.shape, X_train_sel.shape)
        results['VT'] = save_names(feature_names[selector.get_support()], 'VT')

    # ---- 5. chi2, top 3 percent of the features ----
    selector = SelectPercentile(chi2, percentile=3)
    X_train_sel = selector.fit_transform(X_train, y_train)
    print(X_train.shape, X_train_sel.shape)
    results['percentile'] = save_names(feature_names[selector.get_support()], 'percentile')

    # ---- 6. chi2, 50 best features (capped so small feature sets do not fail) ----
    selector = SelectKBest(score_func=chi2, k=min(50, X_train.shape[1]))
    X_train_sel = selector.fit_transform(X_train, y_train)
    print(X_train.shape, X_train_sel.shape)
    results['KBest'] = save_names(feature_names[selector.get_support()], 'KBest')

    # ---- 7. tree-based importance ----
    treebasedclf = ExtraTreesClassifier(n_estimators=50, random_state=random_state)
    treebasedclf = treebasedclf.fit(X_train, y_train)
    model = SelectFromModel(treebasedclf, prefit=True)
    X_train_sel = model.transform(X_train)
    print(X_train.shape, X_train_sel.shape)
    results['ExtraTrees'] = save_names(feature_names[model.get_support()], 'ExtraTrees')

    # ---- 8. Lasso: keep the features with a non-zero coefficient ----
    for alpha, method in ((0.1, 'lasso_0_1'), (0.01, 'lasso_0_0_1')):
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train, y_train)
        print(lasso.coef_)
        results[method] = save_names(feature_names[np.abs(lasso.coef_) > 0], method)

    # ---- 9. mutual information (computed on the full data set) ----
    importances = mutual_info_classif(X_total, y_total, discrete_features='auto', random_state=random_state)
    threshold = 0.001
    # Threshold the mutual-information scores themselves (not the Lasso coefficients).
    results['mutual'] = save_names(feature_names[importances > threshold], 'mutual')

    # ---- 10. combine the selections made in this run ----
    # Built from memory instead of re-reading the CSVs, so files left behind by
    # an earlier run (a method that selects nothing writes no file) are not merged in.
    methods = ['statistics', 'VT', 'KBest', 'percentile', 'ExtraTrees', 'lasso_0_1', 'lasso_0_0_1', 'mutual']
    method_frames = [results[method].assign(method=method) for method in methods if results.get(method) is not None]
    if method_frames:
        concat_df = pd.concat(method_frames, axis=0)
        concat_df.to_csv('{}/{}_all_methods.csv'.format(output_result_dir, outcome_name), index=False, float_format='%g')
   

In [None]:
# Run the feature selection for every outcome listed under cfg['drug'].
# A failure in one outcome is printed and logged, then the loop moves on to the next.
for outcome_name in tqdm(cfg['drug'].keys()):
    try:
        runTask(outcome_name)
    except Exception:
        # `except Exception` (not a bare `except`) keeps KeyboardInterrupt and
        # SystemExit working, so the batch can still be stopped by the user.
        traceback.print_exc()
        log.error(traceback.format_exc())