In [1]:
# !pip install cutkum
# !pip install tensorflow
# !pip install keras
# !pip install deepcut
# !pip install xgboost
# !pip install Pipeline

import pandas as pd
import numpy as np
import warnings
import deepcut

from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_excel ('E:\--- Job ---\--- งานนอก ---\อาจารย์พิมผกา (STATS)\Data\ex_df.xlsx', sheet_name="Sheet1")

df['Subject_ID'] = df['HN']

In [3]:
duplicate = df['Subject_ID'][df['Subject_ID'].duplicated()]

duplicate

41     649131.0
42     649131.0
67     647655.0
68     647655.0
69     647655.0
70     647655.0
74     129483.0
100    121410.0
108     14413.0
109     14413.0
124     65555.0
129    342992.0
149    648773.0
157    301633.0
170    535807.0
172     38608.0
173     38608.0
186     66071.0
192     15439.0
219    642321.0
236     46244.0
241    461995.0
246     66956.0
248    491497.0
271     99104.0
273    571589.0
314    266280.0
339    109588.0
380    649696.0
Name: Subject_ID, dtype: float64

# DATA EXTRACTION

In [4]:
# Personal Columns
df_personal = df[['Subject_ID', 'SEX_x', 'AGE']]

# ICD-10 Columns
df_icd10 = df[df.columns[np.r_[1, 208:422, 1288]]]

# DX Columns
df_dx = df[df.columns[np.r_[1, 33:45, 1288]]]

# Lab Columns
df_lab = df[df.columns[np.r_[1, 49:206, 1288]]]

# Drug
df_drug = df[df.columns[np.r_[1, 884:1256, 1288]]]

# Drug Details
df_drug_detail = df[df.columns[np.r_[1, 422:883, 1288]]]

# Clinic
df_clinic = df[['Subject_ID', 'Temp', 'Pulse', 'RR', 'O2Sat', 'Body Weight', 'Height', 'BMI', 'sbp1', 'dbp1', 'sbp2', 'dbp2']]

# Label
df_label = df[['Subject_ID', 'DM_label', 'HTN_label' ,'CKD_label', 'DLP_label']]

# DATA CLEANSING

In [5]:
df_report = pd.DataFrame(columns=['group', 'row', 'duplicated', 'subject id', 'new row'])

### Personal  

In [6]:
# Personal Columns

# Check dupliated data
df_personal_dup = df_personal[df_personal.duplicated()]
id_dup = df_personal_dup['Subject_ID'].drop_duplicates().values


# Drop duplicated data
df_personal_nodup = df_personal.drop_duplicates()


# Make a report
personal_report = ['Personal', df_personal.shape[0], df_personal_dup.shape[0], id_dup, df_personal_nodup.shape[0]]
df_personal_report = pd.Series(personal_report, index = df_report.columns)

### ICD-10

In [7]:
# ICD-10 Columns

# Fill NaN with 0
df_icd10.fillna(0, inplace=True)

# Check dupliated data
df_icd10_dup = df_icd10[df_icd10.duplicated()]
id_dup = df_icd10_dup['Subject_ID'].drop_duplicates().values

# Drop duplicated data
df_icd10_nodup = df_icd10.drop_duplicates()

# Get all columns name
variable = list(df_icd10.columns)
variable.remove('Subject_ID')

# Get MAX values all column with duplicated Subject_ID
df_icd10_nodup = df_icd10.groupby(['Subject_ID'])[variable].agg('max').reset_index()

# Make a report
icd10_report = ['ICD-10', df_icd10.shape[0], df_icd10_dup.shape[0], id_dup, df_icd10_nodup.shape[0]]
df_icd10_report = pd.Series(icd10_report, index = df_report.columns)

### Drug  

In [8]:
# Fill NaN with 0
df_drug.fillna(0, inplace=True)

# Check dupliated data
df_drug_dup = df_drug[df_drug.duplicated()]
id_dup = df_drug_dup['Subject_ID'].drop_duplicates().values

# Drop duplicated data
df_drug_nodup = df_drug.drop_duplicates()

# Get all columns name
variable = list(df_drug.columns)
variable.remove('Subject_ID')

# Get MAX values all column with duplicated Subject_ID
df_drug_nodup = df_drug.groupby(['Subject_ID'])[variable].agg('max').reset_index()

# Make a report
drug_report = ['Drug', df_drug.shape[0], df_drug_dup.shape[0], id_dup, df_drug_nodup.shape[0]]
df_drug_report = pd.Series(drug_report, index = df_report.columns)

In [9]:
# Drug Details

# Fill NaN with 0
df_drug_detail.fillna(0, inplace=True)

# Check dupliated data
df_drug_detail_dup = df_drug_detail[df_drug_detail.duplicated()]
id_dup = df_drug_detail_dup['Subject_ID'].drop_duplicates().values

# Drop duplicated data
df_drug_detail_nodup = df_drug_detail.drop_duplicates()

# Get all columns name
variable = list(df_drug_detail.columns)
variable.remove('Subject_ID')

# Get MAX values all column with duplicated Subject_ID
df_drug_detail_nodup = df_drug_detail.groupby(['Subject_ID'])[variable].agg('max').reset_index()

# Make a report
drug_detail_report = ['Drug Details', df_drug_detail.shape[0], df_drug_detail_dup.shape[0], id_dup, df_drug_detail_nodup.shape[0]]
df_drug_detail_report = pd.Series(drug_detail_report, index = df_report.columns)

### Clinic

In [10]:
# Clinic

# Fill 0 with NaN
df_clinic.replace(0, np.nan, inplace=True)

# Check dupliated data
df_clinic_dup = df_clinic[df_clinic.duplicated()]
id_dup = df_clinic_dup['Subject_ID'].drop_duplicates().values

# Get all columns name
variable = list(df_clinic.columns)
variable.remove('Subject_ID')

# Get MAX values all column with duplicated Subject_ID
df_clinic_groupby = df_clinic.groupby(['Subject_ID'])[variable].agg('max').reset_index()

# Drop duplicated data
df_clinic_nodup = df_clinic_groupby.drop_duplicates()
df_clinic_nodup.replace(0, np.nan, inplace=True)

# Get last BP
df_clinic_nodup['sbp'] = np.where(df_clinic_nodup['sbp2'].isnull(), df_clinic_nodup['sbp1'], df_clinic_nodup['sbp2'])
df_clinic_nodup['dbp'] = np.where(df_clinic_nodup['dbp2'].isnull(), df_clinic_nodup['dbp1'], df_clinic_nodup['dbp2'])

del df_clinic_nodup['sbp1']
del df_clinic_nodup['dbp1']
del df_clinic_nodup['sbp2']
del df_clinic_nodup['dbp2']

# Make a report
clinic_report = ['Clinic', df_clinic.shape[0], df_clinic_dup.shape[0], id_dup, df_clinic_nodup.shape[0]]
df_clinic_report = pd.Series(clinic_report, index = df_report.columns)

### Lab

In [11]:
def remove_repeated(x):
    if isinstance(x, str): 
        return int(x.rstrip('  (Repeated)'))
    else:
        return x

df_lab['AST'] = df_lab['AST'].map(remove_repeated)
df_lab.replace("Negative", 0, inplace=True)
df_lab.replace("2+", 2, inplace=True)
df_lab.replace("Adequate", 0, inplace=True)
df_lab.replace("Adequate (with Giant platelets)", 0, inplace=True)
df_lab.replace("Increase", 1, inplace=True)
df_lab.replace("Trace", 0, inplace=True)
df_lab.replace("Yellow", 0, inplace=True)
df_lab.replace("Pale yellow", 1, inplace=True)
df_lab.replace("Clear", 0, inplace=True)
df_lab.replace("Rare", 0, inplace=True)
df_lab.replace("Few", 1, inplace=True)
df_lab.replace("Moderate", 2, inplace=True)
df_lab.replace("0-1", 0, inplace=True)

In [12]:
important_lab = ['Subject_ID',
                 'Glucose',
                 'Glycohemoglobin (HbA1c)',
                 'eGFR',
                 'Creatinine',
                 'BUN',
                 'Albumin',
                 'Uric Acid',
                 'Total Protein',
                 'Cholesterol',
                 'Triglyceride',
                 'HDL-c',
                 'LDL-c']

df_lab = df_lab[important_lab]

In [13]:
# Check dupliated data
df_lab_dup = df_lab[df_lab.duplicated()]
id_dup = df_lab_dup['Subject_ID'].drop_duplicates().values

# Drop duplicated data
df_lab_nodup = df_lab.drop_duplicates()

# Get all columns name
variable = list(df_lab.columns)
variable.remove('Subject_ID')

# Get MAX values all column with duplicated Subject_ID
df_lab_nodup = df_lab.groupby(['Subject_ID'])[variable].agg('max').reset_index()

# Make a report
lab_report = ['Lab', df_lab.shape[0], df_lab_dup.shape[0], id_dup, df_lab_nodup.shape[0]]
df_lab_report = pd.Series(lab_report, index = df_report.columns)

### Label

In [14]:
# Check dupliated data
df_label_dup = df_label[df_label.duplicated()]
id_dup = df_label_dup['Subject_ID'].drop_duplicates().values

# Get all columns name
variable = list(df_label.columns)
variable.remove('Subject_ID')

# Get MAX values all column with duplicated Subject_ID
df_label_groupby = df_label.groupby(['Subject_ID'])[variable].agg('max').reset_index()

# Drop duplicated data
df_label_nodup = df_label_groupby.drop_duplicates()

# Make a report
label_report = ['Label', df_label.shape[0], df_label_dup.shape[0], id_dup, df_label_nodup.shape[0]]
df_label_report = pd.Series(label_report, index = df_report.columns)

## Report  

In [15]:
df_report = df_report.append([df_personal_report, 
                              df_icd10_report, 
                              df_drug_report, 
                              df_drug_detail_report, 
                              df_clinic_report,
                              df_lab_report,
                              df_label_report
                             ])

df_report

Unnamed: 0,group,row,duplicated,subject id,new row
0,Personal,428,29,"[649131.0, 647655.0, 129483.0, 121410.0, 14413...",399
1,ICD-10,428,16,"[649131.0, 647655.0, 14413.0, 65555.0, 342992....",399
2,Drug,428,9,"[647655.0, 14413.0, 342992.0, 535807.0, 66071....",399
3,Drug Details,428,6,"[647655.0, 14413.0, 342992.0, 642321.0, 461995.0]",399
4,Clinic,428,14,"[649131.0, 647655.0, 14413.0, 342992.0, 38608....",398
5,Lab,428,12,"[649131.0, 647655.0, 129483.0, 14413.0, 342992...",398
6,Label,428,19,"[647655.0, 129483.0, 14413.0, 65555.0, 342992....",398


# Text Analysis

## Doctor Chifef Complaint

In [16]:
df_doc_cc = df[['Subject_ID', 'Chifef Complaint', 'Present Illness', 'Medical Note']]

df_doc_cc["Doc CC"] = df_doc_cc["Chifef Complaint"].map(str) + " " + df_doc_cc["Present Illness"] + " " + df_doc_cc["Medical Note"].map(str)

df_doc_cc = df_doc_cc[['Subject_ID','Doc CC']]

df_doc_cc['Doc CC'] = df_doc_cc['Doc CC'].astype(str)

df_doc_cc_nodup = df_doc_cc.groupby(['Subject_ID']).agg({'Doc CC': ' '.join}).reset_index()

df_doc_cc_nodup['Doc CC'] = df_doc_cc_nodup['Doc CC'].str.lower()

In [17]:
keyword_list = ['DM', 'CKD', 'CPK', 'WNL', 'HT', 'HTN', 'DLP', 'CVD', 'TB', 'PID',
                'UTI', 'ARF', 'CRF', 'ESRD', 'ATN', 'ALS', 'CVA', 'ARF', 'CRF', 'T1DM', 'T2DM',
                'diabetes', 'high blood pressure', 'hypertension', 'obesity', 'statin', 
                'thyroid', 'thyrotoxicosis',
                'hypothyroid', 'hypoglycemia', 'hypogonadism', 'kidney stone', 'bisoprolol', 'thalassemia',
                'hypothyroidism', 'IGT', 'IFG', 'DYSLIPID']

keyword_list_lower = list(map(lambda x: x.lower(), keyword_list))


df_doc_cc_nodup['Doc CC List'] = ""

for key in keyword_list_lower:
    df_doc_cc_nodup[key+"_key"] = ""


for i in range (len(df_doc_cc_nodup)):
    
    df_doc_cc_nodup['Doc CC List'][i] = deepcut.tokenize(df_doc_cc_nodup['Doc CC'][i])
    
    # Remove " " in list
    df_doc_cc_nodup['Doc CC List'][i] = [ele for ele in df_doc_cc_nodup['Doc CC List'][i] if ele.strip()]
    
    # Remove nan in list
    df_doc_cc_nodup['Doc CC List'][i] = [v1 for v1 in df_doc_cc_nodup['Doc CC List'][i] if v1 != 'nan']
    
    
    for key in keyword_list_lower:
        
        df_doc_cc_nodup[key+"_key"][i] = np.where(len([ele for ele in df_doc_cc_nodup['Doc CC List'][i] if ele == key]) == 0, 0, 1)

del df_doc_cc_nodup['Doc CC']
del df_doc_cc_nodup['Doc CC List']

df_doc_cc_nodup = df_doc_cc_nodup.astype(int)

## Nurse Chifef Complaint

In [18]:
df_nurse_cc = df[['Subject_ID', 'Nurse Chief Complaint', 'Nurse Present Illness']]

df_nurse_cc["Nurse CC"] = df_nurse_cc["Nurse Chief Complaint"].map(str) + " " + df_nurse_cc["Nurse Present Illness"].map(str)

df_nurse_cc = df_nurse_cc[['Subject_ID','Nurse CC']]

df_nurse_cc['Nurse CC'] = df_nurse_cc['Nurse CC'].astype(str)

df_nurse_cc_nodup = df_nurse_cc.groupby(['Subject_ID']).agg({'Nurse CC': ' '.join}).reset_index()

In [19]:
df_nurse_cc_nodup['Nurse CC List'] = ""

for i in range (len(df_nurse_cc_nodup)):
    
    df_nurse_cc_nodup['Nurse CC List'][i] = deepcut.tokenize(df_nurse_cc_nodup['Nurse CC'][i])
    
    # Remove " " in list
    df_nurse_cc_nodup['Nurse CC List'][i] = [ele for ele in df_nurse_cc_nodup['Nurse CC List'][i] if ele.strip()]
    
    # Remove nan in list
    df_nurse_cc_nodup['Nurse CC List'][i] = [x for x in df_nurse_cc_nodup['Nurse CC List'][i] if x != 'nan']

# DATA MERGING

In [20]:
# Personal + ICD-10 + Drugs + Clinic + Label

df_final = df_personal_nodup.merge(df_icd10_nodup).merge(df_drug_nodup).merge(df_lab_nodup).merge(df_clinic_nodup).merge(df_doc_cc_nodup).merge(df_label_nodup)


df_final.replace("ช", 1, inplace=True)
df_final.replace("ญ", 0, inplace=True)


X = df_personal_nodup.merge(df_icd10_nodup).merge(df_drug_nodup).merge(df_drug_detail_nodup).merge(
    df_lab_nodup).merge(df_clinic_nodup).merge(df_doc_cc_nodup).merge(df_label_nodup)


df_final.shape

(398, 651)

In [21]:
# df_final['DM_1'] = X.apply(lambda x: (x['Glucose'] >= 126 
#                                or x['Glycohemoglobin (HbA1c)'] >= 6.5), 
#                    axis=1).astype(int)
# df_final['DM_2'] = X.apply(lambda x: (x['E109'] == True or 
#                                x['E112'] == True or 
#                                x['E118'] == True or 
#                                x['E119'] == True or 
#                                x['E132'] == True),
#                     axis=1).astype(int)
# df_final['DM_3'] = X.apply(lambda x: (x['Glipizide'] == True or 
#                                x['Insulin, Insulin aspart, Insulin aspart protamine'] == True or
#                                x['Insulin, Insulin lispro, Insulin lispro protamine'] == True or
#                                x['Insulin, Regular insulin'] == True or
#                                x['Metformin'] == True or 
#                                x['Sitagliptin'] == True or
#                                x['Pioglitazone'] == True or
#                                x['Alogliptin, Pioglitazone'] == True or
#                                x['Empagliflozin'] == True or
#                                x['Gliclazide'] == True or
#                                x['Glipizide'] == True or
#                                x['Teneligliptin'] == True),
#                     axis=1).astype(int)
# df_final['DM_4'] = X.apply(lambda x: (x['DM_1'] == True or
#                                x['DM_2'] == True or
#                                x['DM_3'] == True ),
#                     axis=1).astype(int)

# Classification

# DM

In [22]:
df_dm = df_final[df_final.columns[~df_final.columns.isin(['Subject_ID','HN'])]]

df_dm['Glucose'] = np.where(df_final['Glucose'] >= 126, 1, 0)
df_dm['Glycohemoglobin (HbA1c)'] = np.where(df_final['Glycohemoglobin (HbA1c)'] >= 6.5, 1, 0)
df_dm['Age'] = np.where(df_final['AGE'] >= 35, 1, 0)
df_dm['BMI'] = np.where(df_final['BMI'] >= 25, 1, 0)
# df_dm['Sex'] = df_final['SEX_x']
# df_dm['Temp'] = df_final['Temp']
# df_dm['Pulse'] = df_final['Pulse']
# df_dm['RR'] = df_final['RR']
# df_dm['O2Sat'] = df_final['O2Sat']

### XGBoost

In [23]:
important_features = list(df_dm.columns)
important_features.remove('DM_label')
important_features.remove('HTN_label')
important_features.remove('CKD_label')
important_features.remove('DLP_label')

train, test = train_test_split(df_dm, test_size = 0.25, random_state=111)

ytrain = train['DM_label']
ytest = test['DM_label']

Xtrain = train[important_features]
Xtest = test[important_features]

xgb = XGBClassifier()

parameters = {'scale_pos_weight': [9],
                'n_estimators': [30, 100, 200],
                'max_depth': [2, 3],
                'min_child_weight': [1, 10, 100]
             }
clf = GridSearchCV(xgb, parameters, scoring='f1', cv=5)
clf.fit(Xtrain,ytrain)

xgb.set_params(**clf.best_params_)
xgb.fit(Xtrain, ytrain)
ypred = xgb.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98        91
           1       0.80      0.89      0.84         9

    accuracy                           0.97       100
   macro avg       0.89      0.93      0.91       100
weighted avg       0.97      0.97      0.97       100



In [24]:
# show the feature importances computed by xgboost
Xtrain.columns[xgb.feature_importances_ != 0]

Index(['SEX_x', 'AGE', 'E112', 'E118', 'E119', 'I10', 'FOLT01', 'SIAT01',
       'Glycohemoglobin (HbA1c)', 'eGFR', 'Creatinine', 'BUN', 'Albumin',
       'Triglyceride', 'LDL-c', 'Temp', 'Pulse', 'RR', 'O2Sat', 'Body Weight',
       'Height', 'BMI', 'sbp', 'dbp', 'dm_key', 'ht_key', 'dlp_key'],
      dtype='object')

# HTN

In [25]:
df_htn = df_final[df_final.columns[~df_final.columns.isin(['Subject_ID','HN'])]]

# df_htn['sBP'] = np.where(df_final['sbp'] >= 140, 1, 0)
# df_htn['dBP'] = np.where(df_final['dbp'] >= 90, 1, 0)

# df_htn['Age'] = np.where(df_final['AGE'] >= 60, 1, 0)
# df_htn['BMI'] = np.where(df_final['BMI'] >= 25, 1, 0)
# df_htn['Sex'] = df_final['SEX_x']
# df_htn['Temp'] = df_final['Temp']
# df_htn['Pulse'] = df_final['Pulse']
# df_htn['RR'] = df_final['RR']
# df_htn['O2Sat'] = df_final['O2Sat']

### XGBoost

In [26]:
important_features = list(df_htn.columns)
important_features.remove('DM_label')
important_features.remove('HTN_label')
important_features.remove('CKD_label')
important_features.remove('DLP_label')

train, test = train_test_split(df_htn, test_size = 0.25, random_state=111)

ytrain = train['HTN_label']
ytest = test['HTN_label']

Xtrain = train[important_features]
Xtest = test[important_features]

xgb = XGBClassifier()

parameters = {'scale_pos_weight': [9],
                'n_estimators': [30, 100, 200],
                'max_depth': [2, 3],
                'min_child_weight': [1, 10, 100]
             }
clf = GridSearchCV(xgb, parameters, scoring='f1', cv=5)
clf.fit(Xtrain,ytrain)

xgb.set_params(**clf.best_params_)
xgb.fit(Xtrain, ytrain)
ypred = xgb.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        87
           1       1.00      0.69      0.82        13

    accuracy                           0.96       100
   macro avg       0.98      0.85      0.90       100
weighted avg       0.96      0.96      0.96       100



In [27]:
# show the feature importances computed by xgboost
Xtrain.columns[xgb.feature_importances_ != 0]

Index(['E119', 'I10', 'I482', 'EPOI00', 'HYPT00', 'LORT05', 'Glucose',
       'Glycohemoglobin (HbA1c)', 'eGFR', 'BUN', 'Albumin', 'Cholesterol',
       'LDL-c', 'Temp', 'Pulse', 'Body Weight', 'Height', 'BMI', 'sbp',
       'ht_key', 'htn_key', 'dlp_key', 'esrd_key'],
      dtype='object')

# CKD

In [28]:
df_ckd = df_final[df_final.columns[~df_final.columns.isin(['Subject_ID','HN'])]]

# df_ckd['eGFR'] = df_final['eGFR']
# df_ckd['Creatinine'] = df_final['Creatinine']
# df_ckd['BUN'] = df_final['BUN']
# df_ckd['Albumin'] = df_final['Albumin']
# df_ckd['Uric Acid'] = df_final['Uric Acid']
# df_ckd['Total Protein'] = df_final['Total Protein']

# df_ckd['Age'] = np.where(df_final['AGE'] >= 60, 1, 0)
# df_ckd['BMI'] = df_final['BMI']
# df_ckd['Sex'] = df_final['SEX_x']
# df_ckd['Temp'] = df_final['Temp']
# df_ckd['Pulse'] = df_final['Pulse']
# df_ckd['RR'] = df_final['RR']
# df_ckd['O2Sat'] = df_final['O2Sat']

### XGBoost

In [29]:
important_features = list(df_ckd.columns)
important_features.remove('DM_label')
important_features.remove('HTN_label')
important_features.remove('CKD_label')
important_features.remove('DLP_label')

train, test = train_test_split(df_ckd, test_size = 0.25, random_state=111)

ytrain = train['CKD_label']
ytest = test['CKD_label']

Xtrain = train[important_features]
Xtest = test[important_features]

xgb = XGBClassifier()

parameters = {'scale_pos_weight': [9],
                'n_estimators': [30, 100, 200],
                'max_depth': [2, 3],
                'min_child_weight': [1, 10, 100]
             }
clf = GridSearchCV(xgb, parameters, scoring='f1', cv=5)
clf.fit(Xtrain,ytrain)

xgb.set_params(**clf.best_params_)
xgb.fit(Xtrain, ytrain)
ypred = xgb.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00         4

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [30]:
# show the feature importances computed by xgboost
Xtrain.columns[xgb.feature_importances_ != 0]

Index(['D638', 'N832', 'FERT01', 'eGFR', 'Creatinine', 'BUN', 'Albumin',
       'Pulse', 'Height', 'BMI', 'sbp', 'dbp', 'ckd_key'],
      dtype='object')

# DLP

In [31]:
df_dlp = df_final[df_final.columns[~df_final.columns.isin(['Subject_ID','HN'])]]

# df_dlp['Cholesterol'] = np.where(df_final['Cholesterol'] >= 200, 1, 0)
# df_dlp['Triglyceride'] = np.where(df_final['Triglyceride'] >= 150, 1, 0)
# df_dlp['HDL-c'] = np.where(df_final['HDL-c'] <= 60, 1, 0)
# df_dlp['LDL-c'] = np.where(df_final['LDL-c'] >= 130, 1, 0)

# df_dlp['Age'] = np.where(df_final['AGE'] >= 55, 1, 0)
# df_dlp['BMI'] = df_final['BMI']
# df_dlp['Sex'] = df_final['SEX_x']
# df_dlp['Temp'] = df_final['Temp']
# df_dlp['Pulse'] = df_final['Pulse']
# df_dlp['RR'] = df_final['RR']
# df_dlp['O2Sat'] = df_final['O2Sat']

### XGBoost

In [32]:
important_features = list(df_dlp.columns)
important_features.remove('DM_label')
important_features.remove('HTN_label')
important_features.remove('CKD_label')
important_features.remove('DLP_label')

train, test = train_test_split(df_dlp, test_size = 0.25, random_state=111)

ytrain = train['DLP_label']
ytest = test['DLP_label']

Xtrain = train[important_features]
Xtest = test[important_features]

xgb = XGBClassifier()

parameters = {'scale_pos_weight': [9],
                'n_estimators': [30, 100, 200],
                'max_depth': [2, 3],
                'min_child_weight': [1, 10, 100]
             }
clf = GridSearchCV(xgb, parameters, scoring='f1', cv=5)
clf.fit(Xtrain,ytrain)

xgb.set_params(**clf.best_params_)
xgb.fit(Xtrain, ytrain)
ypred = xgb.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97        91
           1       0.75      0.67      0.71         9

    accuracy                           0.95       100
   macro avg       0.86      0.82      0.84       100
weighted avg       0.95      0.95      0.95       100



In [33]:
# show the feature importances computed by xgboost
Xtrain.columns[xgb.feature_importances_ != 0]

Index(['SEX_x', 'AGE', 'E112', 'E785', 'I10', 'EUTT01', 'HYDT02', 'MADT03',
       'VITT08', 'XART02', 'Glucose', 'eGFR', 'Cholesterol', 'HDL-c', 'LDL-c',
       'Temp', 'Pulse', 'O2Sat', 'Body Weight', 'Height', 'BMI', 'sbp', 'dbp',
       'dm_key', 'htn_key', 'dlp_key', 'hypertension_key', 'hypoglycemia_key'],
      dtype='object')