## 6. Predictive model training and validation (XGBoost)
  * Model training and evaluation of performance metrics
---------

In [None]:
# In[ ]:
"""
1) import package
"""
import os
import sys
import json
import pathlib
sys.path.append("..")

import traceback
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from _utils.model_estimation import *
from _utils.customlogger import customlogger as CL

In [None]:
# In[ ]:
# 2) loading config
# config.json is looked up in the parent of the current working directory.
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent
with (parent_dir / "config.json").open() as config_fp:
    cfg = json.load(config_fp)

In [None]:
# In[ ]:
# 3) load information
working_date = cfg["working_date"]
# NOTE: despite its name, this is derived from the current working directory:
# it is the directory's last path component with any ".ext" suffix removed.
curr_file_name = os.path.splitext(os.path.basename(os.getcwd()))[0]

In [None]:
# In[ ]:
# 4) create logger
log = CL("custom_logger")
# Make sure <parent_dir>/_log exists before create_logger is handed a path
# inside it ("../_log/..." is relative to the current working directory).
(parent_dir / "_log").mkdir(mode=0o777, parents=True, exist_ok=True)
log = log.create_logger(
    file_name=f"../_log/{curr_file_name}.log",
    mode="a",
    level="DEBUG",
)
log.debug(f"start {curr_file_name}")

In [None]:
# In[ ]:
def _scale_pos_weight(n_negative, n_positive):
    """Return the negative/positive sample ratio for XGBoost's ``scale_pos_weight``.

    The ratio is returned as a float. Truncating it to ``int`` would turn any
    ratio below 1 into 0 (giving the positive class no weight at all) and
    would discard the fractional part of every other ratio.

    Args:
        n_negative: number of rows with ``label == 0``.
        n_positive: number of rows with ``label == 1``.

    Returns:
        ``n_negative / n_positive`` as a float.

    Raises:
        ValueError: if ``n_positive`` is 0, because the ratio is undefined.
    """
    if n_positive == 0:
        raise ValueError(
            "no samples with label == 1; cannot compute scale_pos_weight"
        )
    return n_negative / n_positive


def runTask(outcome_name):
    """Train, evaluate and save an XGBoost binary classifier for one outcome.

    Reads ``<parent_dir>/data/<working_date>/preprocess_xgboost/<outcome_name>/<outcome_name>.txt``,
    splits it 70/30 (stratified on ``label``), trains with early stopping and
    hands the model and predictions to the plotting/reporting helpers together
    with ``<parent_dir>/result/<working_date>/xgboost/<outcome_name>/``
    (created here if missing).

    Args:
        outcome_name: name of the outcome; selects the input file and the
            output directory and is forwarded to every reporting helper.

    Raises:
        ValueError: if a date column does not match ``%Y-%m-%d %H:%M:%S`` or
            the data contains no ``label == 1`` rows.
    """
    import re  # kept local: `re` is not part of the file's import cell

    # (1) set path & make directory
    ps_data_dir = pathlib.Path('{}/data/{}/preprocess_xgboost/{}/'.format(parent_dir, working_date, outcome_name))
    output_result_dir = pathlib.Path('{}/result/{}/xgboost/{}/'.format(parent_dir, working_date, outcome_name))
    output_result_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

    # (2) read data
    filepath = ps_data_dir.joinpath('{}.txt'.format(outcome_name))
    concat_df = pd.read_csv(filepath, index_col=False)

    # The date columns are not used as features. They are parsed only so that
    # a malformed value still raises, then they are dropped with person_id.
    for date_col in ('cohort_start_date', 'first_abnormal_date', 'concept_date'):
        pd.to_datetime(concat_df[date_col], format='%Y-%m-%d %H:%M:%S', errors='raise')
    concat_df = concat_df.drop(['person_id', 'cohort_start_date', 'concept_date', 'first_abnormal_date'], axis=1)

    # (3) change column name ; if a JSON separator character is left in a
    #     column name, the plot cannot be drawn.
    concat_df.columns = concat_df.columns.str.translate(str.maketrans({"[": "(", "]": ")"}))
    concat_df = concat_df.rename(columns=lambda name: re.sub(r'[^A-Za-z0-9_ /()]+', '', name))

    # check n of patients
    labels = concat_df["label"]
    n_positive = int((labels == 1).sum())
    n_negative = int((labels == 0).sum())
    print("label_1 : ", n_positive)
    print("label_0 : ", n_negative)

    # (4) split x, y data
    y_data = labels.reset_index(drop=True)
    x_data = concat_df.drop('label', axis=1)

    # (5) train / test dataset
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.3, random_state=1, stratify=y_data
    )

    print("data  : ", x_data.shape, y_data.shape)
    print("train : ", x_train.shape, y_train.shape)
    print("test  : ", x_test.shape, y_test.shape)

    scale_weight = _scale_pos_weight(n_negative, n_positive)

    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dtest = xgb.DMatrix(data=x_test, label=y_test)

    # (6) training (hyper parameter / early stopping)
    params = {
        'max_depth': 5,
        'learning_rate': 0.01,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'scale_pos_weight': scale_weight,
    }
    num_rounds = 1000

    # NOTE(review): the early-stopping eval set is the same test split that is
    # used for the reported metrics below, so those metrics are optimistic.
    # Consider a separate validation split.
    wlist = [(dtrain, 'train'), (dtest, 'eval')]

    # Return value is not used; the call is kept because the helper receives
    # the output directory (presumably it writes a report there - TODO confirm).
    class_balance_weight(output_result_dir, outcome_name, y_train)

    xgb_model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_rounds,
        early_stopping_rounds=20,
        evals=wlist,
    )

    # (7) Evaluate on the test data set
    pred_probs = xgb_model.predict(dtest)

    # A predicted probability >= 0.5 is classified as 1, otherwise as 0.
    y_pred = [1 if prob >= 0.5 else 0 for prob in pred_probs]
    get_clf_eval(y_test, y_pred, pred_probs)

    # (8) save : plot tree & plot importance feature
    make_plot_tree(xgb_model, output_result_dir, outcome_name, rankdir=None)
    make_plot_tree(xgb_model, output_result_dir, outcome_name, rankdir='LR')
    make_plot_importance(xgb_model, output_result_dir, outcome_name)

    # (9) save : clf report & model estimation & confusion matrix & roc
    clf_report(y_test, y_pred, output_result_dir, outcome_name)
    model_performance_evaluation(y_test, y_pred, pred_probs, output_result_dir, outcome_name)
    confusion_matrix_figure(y_test, y_pred, output_result_dir, outcome_name)
    confusion_matrix_figure2(y_test, y_pred, output_result_dir, outcome_name)
    # NOTE(review): ROC_AUC is given the thresholded labels, not pred_probs;
    # confirm this is what the helper expects for a ROC curve.
    ROC_AUC(y_test, y_pred, output_result_dir, outcome_name)

    # (10) model save
    save_xgb_model_json(xgb_model, output_result_dir, outcome_name)

In [None]:
# In[ ]:
# For all drugs listed in the config, perform the above task.
for outcome_name in tqdm(cfg['drug'].keys()):
    try:
        log.debug('drug : {}'.format(outcome_name))
        runTask(outcome_name)
    except Exception:
        # Best effort: a failure for one drug is reported and logged, and the
        # loop moves on to the next one. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still stop the whole run.
        traceback.print_exc()
        log.error(traceback.format_exc())