## 4. Preprocessing for XGBoost (baseline)
  * 1) Feature selection
  * 2) Convert to baseline data
  * 3) Normalization

---------

In [None]:
# In[ ]:
"""
    1) import package
"""
import os
import sys
import json
import pathlib
sys.path.append("..")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import traceback
from tqdm import tqdm
from datetime import timedelta
from _utils.preprocessing_xgboost import *
from _utils.customlogger import customlogger as CL

In [None]:
# In[ ]:
"""
    2) loading config
"""
# The settings file sits one directory above the current working directory;
# the path is relative, so the script has to be launched from its own folder.
with open('./../{}'.format("config.json")) as file:
    # Read the whole document first, then decode it into the `cfg` mapping
    # that the rest of the script indexes (e.g. cfg["working_date"]).
    cfg = json.loads(file.read())

In [None]:
# In[ ]:
"""
    3) load information 
"""
# Date tag from the config; used below as a path component for data/result
# directories.
current_date = cfg["working_date"]

# Directory layout: everything is resolved relative to the parent of the
# current working directory.
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent

# NOTE: abspath('') resolves to the current working directory, so this value
# is the *directory* name (minus anything that looks like an extension), not
# the name of this script. It is only used to name the log file.
curr_file_name = os.path.basename(os.path.abspath(''))
curr_file_name = os.path.splitext(curr_file_name)[0]

In [None]:
# In[ ]:
"""
    4) create Logger
"""
# Instantiate the project logger wrapper first (same order as before), then
# make sure the log directory exists before a log file is requested.
log = CL("custom_logger")
pathlib.Path('{}/_log/'.format(parent_dir)).mkdir(mode=0o777, parents=True, exist_ok=True)

# The file name is relative to the working directory; since parent_dir is the
# parent of the cwd, "../_log/" is the directory created just above.
# `log` is rebound from the wrapper to whatever create_logger returns.
log = log.create_logger(
    file_name="../_log/{}.log".format(curr_file_name),
    mode="a",
    level="DEBUG",
)
log.debug('start {}'.format(curr_file_name))

In [None]:
def runTask(outcome_name):
    """Build the baseline (XGBoost) input table for a single outcome.

    Steps, in order:
      (1) derive input/output directories from ``parent_dir``,
          ``current_date`` and ``outcome_name`` and create the output ones;
      (2) return early if ``all_domain_df.txt`` does not exist;
      (3) read it, print domain/label counts, and return early if fewer than
          20 distinct label-1 patients are present;
      (4) keep only the common columns;
      (5) remove the concepts that were used to define the outcome, run the
          per-domain variable selection helpers, save the selected features
          to ``<outcome>_feature.csv``, and keep only rows of selected
          concepts;
      (6) pivot, drop columns that are non-null in less than 30% of rows,
          keep the latest record on or before ``cohort_start_date`` for each
          person, fill missing values with 0;
      (7) normalize, rename concept-id columns to concept names, write
          ``<outcome>.txt`` and append one summary line to ``output.txt``.

    Parameters
    ----------
    outcome_name : str
        A key of ``cfg['drug']``; also used as a directory and file name.

    Returns
    -------
    None
        All results are written to disk; skipped outcomes are only logged.

    Notes
    -----
    Relies on the module-level ``cfg``, ``log``, ``parent_dir`` and
    ``current_date`` and on the helpers star-imported from
    ``_utils.preprocessing_xgboost``. Exceptions raised by pandas or by those
    helpers are not caught here.
    """
    domains = ('meas', 'drug', 'proc', 'cond')

    # (1) set path & make directory
    importsql_data_dir = pathlib.Path('{}/data/{}/importsql/{}/'.format(parent_dir, current_date, outcome_name))
    output_data_dir = pathlib.Path('{}/data/{}/preprocess_xgboost/{}/'.format(parent_dir, current_date, outcome_name))
    output_result_dir = pathlib.Path('{}/result/{}/preprocess_xgboost/{}/'.format(parent_dir, current_date, outcome_name))
    output_data_dir.mkdir(mode=0o777, parents=True, exist_ok=True)
    output_result_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

    # (2) Skip if there is no patient data
    input_path = importsql_data_dir.joinpath('all_domain_df.txt')
    if not input_path.exists():
        log.debug(f"{outcome_name} data frame is empty")
        return

    # (3) read data frame
    all_domain_df = pd.read_csv(input_path, low_memory=False)
    print(all_domain_df.concept_domain.value_counts())
    print('label 1 : ', len(all_domain_df[all_domain_df['label']==1].person_id.unique()))
    print('label 0 : ', len(all_domain_df[all_domain_df['label']==0].person_id.unique()))
    nCase = all_domain_df.loc[all_domain_df['label'] == 1].person_id.nunique()
    if nCase < 20:
        log.debug(f"{outcome_name} case is less than 20")
        return

    # (4) use only necessary columns
    common_cols = ['person_id', 'age', 'sex', 'cohort_start_date', 'first_abnormal_date', 'concept_date', 'concept_id', 'concept_name', 'concept_value', 'concept_domain', 'label']
    all_domain_df = all_domain_df[common_cols]

    # (5) Remove feature used in outcome define
    # The id collections are materialised as lists: a bare ``map`` object is
    # a one-shot iterator, and ``isin`` is documented for sets/list-likes.
    drug_name = outcome_name
    drug_concept_ids_excluded = [
        int(concept_id)
        for concept_id in cfg['drug'][drug_name]['@drug_concept_set'].split(',')
    ]
    all_domain_df = all_domain_df.loc[~all_domain_df.concept_id.isin(drug_concept_ids_excluded)]
    meas_concept_ids_excluded = [
        int(cfg['meas'][meas_name]['@meas_concept_id']) for meas_name in cfg['meas']
    ]
    all_domain_df = all_domain_df.loc[~all_domain_df.concept_id.isin(meas_concept_ids_excluded)]

    # Split the long table into one frame per concept domain.
    domain_dfs = {
        domain: all_domain_df.loc[all_domain_df.concept_domain == domain].reset_index(drop=True)
        for domain in domains
    }

    # @variable selection: measurements use the paired t-test helper, the
    # other domains use the McNemar helper; results are sorted by p-value.
    selectors = {
        'meas': variant_selection_paired_t_test,
        'drug': variant_selection_mcnemar,
        'proc': variant_selection_mcnemar,
        'cond': variant_selection_mcnemar,
    }
    vars_dfs = {}
    for domain in domains:
        selected_vars = selectors[domain](domain_dfs[domain])
        vars_dfs[domain] = selected_vars.sort_values(by='pvalue', ascending=True).reset_index(drop=True)

    log.info("[nVar] m : {} d : {} p : {}  c: {}".format(
        len(vars_dfs['meas']), len(vars_dfs['drug']), len(vars_dfs['proc']), len(vars_dfs['cond'])))

    for domain in domains:
        vars_dfs[domain]['concept_domain'] = domain

    # @variable selection (save) -- row order in the file is meas, drug,
    # cond, proc (kept as in the original implementation).
    all_domain_vars_df = pd.concat(
        [vars_dfs['meas'], vars_dfs['drug'], vars_dfs['cond'], vars_dfs['proc']],
        axis=0, ignore_index=True)
    all_domain_vars_df.to_csv('{}/{}_feature.csv'.format(output_result_dir, outcome_name), header=True, index=True)

    # @Extract only selected concepts from data frame
    selected_dfs = {}
    for domain in domains:
        domain_df = domain_dfs[domain]
        concept_id_list = vars_dfs[domain].concept_id.unique()
        extract_domain_df = domain_df[domain_df['concept_id'].isin(concept_id_list)]
        print(len(concept_id_list), len(domain_df), len(extract_domain_df))
        selected_dfs[domain] = extract_domain_df

    all_domain_df = pd.concat([selected_dfs[domain] for domain in domains], axis=0, ignore_index=True)

    # (6) pivot and clean
    pivot_data = pivotting(all_domain_df)

    # An empty pivot table previously surfaced as a ZeroDivisionError in the
    # ratio below; treat it as a skip, like the other "no data" cases.
    n_rows = len(pivot_data)
    if n_rows == 0:
        log.debug(f"{outcome_name} pivot data is empty")
        return

    # Drop columns that are populated in less than 30% of the rows. Counting
    # non-nulls directly avoids copying the whole frame once per column.
    drop_cols = [
        col for col in pivot_data.columns
        if pivot_data[col].notnull().sum() / n_rows < 0.3
    ]
    pivot_data = pivot_data.drop(drop_cols, axis='columns')

    # Keep, per person, the most recent row dated on or before cohort start:
    # filter, sort by date descending within person, keep the first row.
    pivot_data = pivot_data.query("concept_date <= cohort_start_date")
    pivot_data = pivot_data.sort_values(by=["person_id", "concept_date"], axis=0, ascending=[True, False]).reset_index(drop=True)
    pivot_data = pivot_data.drop_duplicates(subset=['person_id'], keep='first')
    pivot_data = pivot_data.fillna(0)

    # Selected concept ids per domain, minus the sparse columns dropped above.
    # NOTE(review): this assumes the pivot column labels for concepts have the
    # same type as `concept_id`; if drop_cols mixes strings and ints the
    # set difference may not remove anything -- confirm against `pivotting`.
    domain_ids = {
        domain: np.setdiff1d(selected_dfs[domain].concept_id.unique(), drop_cols)
        for domain in domains
    }
    print(len(domain_ids['meas']), len(domain_ids['drug']), len(domain_ids['proc']), len(domain_ids['cond']))

    # (7) Normalization (Min/Max Scalar), then drop columns containing NaN
    concat_df = normalization(pivot_data)
    concat_df = concat_df.dropna(axis=1)

    # columns name : concept_id > concept_name
    concept_dict = dict(zip(all_domain_df.concept_id, all_domain_df.concept_name))
    concat_df = concat_df.rename(concept_dict, axis='columns')

    # Save File
    concat_df.to_csv('{}/{}.txt'.format(output_data_dir, outcome_name), index=False, float_format='%g')

    # Summary counts: variables before selection vs. after selection and
    # sparsity filtering, plus patients per label in the final table.
    output = {}
    for domain in domains:
        output['{}_whole_var'.format(domain)] = len(domain_dfs[domain].concept_id.unique())
        output['{}_selected_var'.format(domain)] = len(domain_ids[domain])
    output['nPatient_label1'] = len(concat_df[concat_df["label"] == 1])
    output['nPatient_label0'] = len(concat_df[concat_df["label"] == 0])

    # print
    for domain in domains:
        print(output['{}_whole_var'.format(domain)], output['{}_selected_var'.format(domain)])

    # Append one '///'-separated summary line. The context manager guarantees
    # the handle is closed even if the write fails.
    summary_fields = [outcome_name]
    for domain in domains:
        summary_fields.append(output['{}_whole_var'.format(domain)])
        summary_fields.append(output['{}_selected_var'.format(domain)])
    summary_fields.append(output['nPatient_label1'])
    summary_fields.append(output['nPatient_label0'])
    with open('{}/output.txt'.format(output_result_dir), 'a') as out:
        out.write('///'.join(str(field) for field in summary_fields) + '\n')

In [None]:
# In[ ]:
"""
    For all drugs, perform the above tasks.
"""
# Process every drug listed in the config. A failure for one drug is printed
# and logged, then the loop moves on to the next one.
for outcome_name in tqdm(cfg['drug'].keys()):
    try:
        log.debug('drug : {}'.format(outcome_name))
        runTask(outcome_name)
    except Exception:
        # `except Exception` (instead of a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate, so the batch can still be
        # interrupted by the user.
        traceback.print_exc()
        log.error(traceback.format_exc())
