In [46]:
from pathlib import Path
import numpy as np
import pandas as pd
from Classes.Func.KitTools import SaveGen
from Classes.Func.CalculatePart import PerfomAssess
from Classes.ORM.expr import PatientInfo, LabExtube, LabWean
from Classes.ORM.cate import ExtubePSV, ExtubeSumP12, WeanPSV, WeanSumP12

In [14]:
mode_ = 'Extube_SumP12_Nad_xgboost'  # run nad mode then
data_p = r'C:\Main\Data\_\Result\Form\20220515_18_Extube_SumP12_Nad'

mode_info = {
    'Extube': {
        'Lab': LabExtube,
        'PSV': ExtubePSV,
        'SumP12': ExtubeSumP12
    },
    'Wean': {
        'Lab': LabWean,
        'PSV': WeanPSV,
        'SumP12': WeanSumP12
    }
}

In [15]:
def DataLoader(load_path: str):
    p_i_l = []
    m_i_l = mode_.split('_')

    for path in Path(load_path).iterdir():
        if not path.is_file():
            pass
        else:
            p_r = pd.read_csv(path, index_col='method')
            p_info = path.name.split('_')
            p_r_ave = p_r.loc['ave'].to_dict()
            p_i_d = {'pid': p_info[0], 'end': int(p_info[1]), 'rid': p_info[2]}
            p_i_d.update(p_r_ave)
            p_i_l.append(p_i_d)

    df_basic = pd.DataFrame(p_i_l)

    src_0, src_1 = PatientInfo, mode_info[m_i_l[0]]['Lab']
    join_info = {'dest': src_0, 'on': src_0.pid == src_1.pid, 'attr': 'pinfo'}
    col_que = [src_1, src_0.age, src_0.sex, src_0.bmi]
    col_order = [src_1.pid]
    cond = src_1.pid.in_(df_basic.pid.to_list())
    que_l = src_1.select(*col_que).join(**join_info).where(cond).order_by(
        *col_order)

    df_que = pd.DataFrame(list(que_l.dicts()))
    df_que = df_que.drop('pid', axis=1)

    df_total = pd.concat([df_basic, df_que], axis=1)
    df_total = df_total.drop(['pid', 'rid'], axis=1)

    # drop featutre nan > 40%
    # df_total = DropByThreshold(df_total, 0.4, 1)
    # drop data nan > 80%
    # df_total = DropByThreshold(df_total, 0.8, 0)

    df_0 = df_total[df_total.end == 0]
    df_1 = df_total[df_total.end == 1]

    return df_0, df_1, df_total

def DropByThreshold(df: pd.DataFrame, per: float, ax_st: int):
    ax_set = int(not ax_st)
    threshold = df.shape[ax_set] * per
    len_raw = df.shape[ax_st]
    df = df.dropna(axis=ax_st, thresh=threshold)
    len_new = df.shape[ax_st]
    print('Drop col/row: {0}'.format(len_raw - len_new))
    return df

In [16]:
_,_, df = DataLoader(data_p)

In [62]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
def SingleLogReg(df, col_l):
    X = df.loc[:, df.columns != col_l]
    y = df.loc[:, df.columns == col_l].values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=0)
    model = LogisticRegression(C=1, max_iter=2000,class_weight='balanced')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    return auc

In [63]:
dl = []
for col in df.columns[1:]:
    dict_ = {}
    dict_['method'] = col
    df_tmp = df[['end', col]]
    df_tmp = df_tmp.dropna()
    perform = PerfomAssess(df_tmp['end'], df_tmp[col])
    p, rs_pos, rs_neg = perform.PAssess()
    auc, _, _ = perform.AucAssess()
    neg_n = len(df_tmp[df_tmp.end == 0])
    pos_n = len(df_tmp[df_tmp.end == 1])
    dict_['P'] = p
    dict_['AUC'] = auc
    dict_['LogReg'] = SingleLogReg(df_tmp, 'end')
    dict_['succ_rs'] = rs_neg
    dict_['succ_len'] = neg_n
    dict_['fail_rs'] = rs_pos
    dict_['fail_len'] = pos_n
    dl.append(dict_)
df_assess = pd.DataFrame(dl)



In [64]:
pd.DataFrame.to_csv(df_assess, 'assess.csv', index=False)

In [69]:
df_p = df_assess.sort_values('P')
df_p = df_p.reset_index(drop=True)
df_p = df_p[0:28]
df_p

Unnamed: 0,method,P,AUC,LogReg,succ_rs,succ_len,fail_rs,fail_len
0,b_hr,0.0018,0.314,0.665789,"82.75 (76.482, 91.405)",194,"75.61 (71.95, 82.005)",27
1,co2,0.0213,0.627,0.533333,"24.215 (21.952, 26.008)",200,"25.785 (23.875, 26.925)",32
2,ga,0.0338,0.383,0.475,"1.13 (1.09, 1.17)",200,"1.11 (1.08, 1.132)",32
3,hco3,0.0449,0.611,0.491667,"26.215 (23.475, 28.078)",200,"27.335 (25.442, 29.025)",32
4,na,0.0565,0.605,0.483333,"140.9 (136.988, 144.0)",200,"142.0 (138.832, 147.062)",32
5,be,0.0839,0.61,0.475,1.409 +- 3.707,200,2.65 +- 3.616,32
6,cl,0.0958,0.592,0.533333,"105.5 (102.0, 110.65)",200,"107.0 (103.675, 114.125)",32
7,sofa,0.1723,0.429,0.352273,6.675 +- 4.144,40,5.333 +- 1.944,9
8,osp,0.1751,0.574,0.533333,"284.775 (0.0, 294.812)",200,"288.25 (204.0, 302.875)",32
9,lac,0.2026,0.43,0.516667,"1.2 (0.9, 1.8)",200,"1.0 (0.8, 1.473)",32


In [71]:
df_auc = df_p.sort_values('LogReg', ascending=False)
df_auc = df_auc.reset_index(drop=True)
df_auc

Unnamed: 0,method,P,AUC,LogReg,succ_rs,succ_len,fail_rs,fail_len
0,npco2,0.3333,0.553,0.675,"40.0 (36.15, 43.618)",200,"40.875 (39.502, 43.025)",32
1,b_hr,0.0018,0.314,0.665789,"82.75 (76.482, 91.405)",194,"75.61 (71.95, 82.005)",27
2,bmi,0.3113,0.578,0.6,"22.893 (19.531, 25.352)",141,"23.512 (22.055, 25.172)",16
3,mp_jm_d,0.2244,0.574,0.593182,3.775 +- 1.957,204,4.218 +- 1.852,32
4,mp_jl_d,0.3915,0.547,0.567424,"0.48 (0.42, 0.532)",204,"0.49 (0.455, 0.54)",32
5,rhs,0.2653,0.561,0.566667,"0.6 (0.0, 1.25)",200,"0.835 (0.255, 1.362)",32
6,o2,0.495,0.538,0.541667,"99.05 (98.29, 99.607)",200,"99.1 (98.358, 99.587)",32
7,mp_jm_t,0.329,0.554,0.535606,"7.7 (6.61, 9.322)",204,"7.875 (7.198, 10.178)",32
8,cr,0.2161,0.404,0.534375,"66.0 (48.0, 112.0)",107,"57.0 (47.0, 72.5)",16
9,osp,0.1751,0.574,0.533333,"284.775 (0.0, 294.812)",200,"288.25 (204.0, 302.875)",32
