In [1]:
import optuna
from optuna.samplers import TPESampler
import warnings

import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier

# Blanket filter: with no category/module arguments this silences every warning
# raised from this point on.
# NOTE(review): this presumably also hides pandas diagnostics (e.g. dtype or
# copy warnings) emitted by later cells — confirm that is intended.
warnings.filterwarnings("ignore")

### Modifiable Params:

In [2]:
# Search term that selects the diagnosis codes of interest: it is matched
# case-insensitively against the `long_title` column of d_icd_diagnoses.csv.
disease = "hypertension"

### 0. Extract icd_code(s) for a specific disease (e.g. hypertension) from d_icd_diagnoses.csv

In [3]:
# Load the ICD dictionary table (columns: icd_code, icd_version, long_title).
# pandas and numpy are already imported in the first cell, so they are not
# re-imported here.
# `icd_code` is read explicitly as text: the codes mix digits and letters, and
# leaving the dtype to inference can produce mixed int/str values that never
# compare equal (the corresponding DtypeWarning is silenced in this notebook).
# NOTE(review): the preview below shows ICD-9 codes without leading zeros
# (e.g. 10 for "Cholera due to vibrio cholerae") — presumably the CSV was
# re-saved by a spreadsheet tool; confirm against the original export.
d_icd_diagnoses_df = pd.read_csv(
    "E:/Chrome Dls/MIMIC_IV_Core/hosp/d_icd_diagnoses.csv",
    dtype={"icd_code": str},
)

# Remove fully identical rows.
d_icd_diagnoses_df = d_icd_diagnoses_df.drop_duplicates()

In [4]:
# Preview the first rows of the ICD dictionary table.
d_icd_diagnoses_df.head()

Unnamed: 0,icd_code,icd_version,long_title
0,10,9,Cholera due to vibrio cholerae
1,11,9,Cholera due to vibrio cholerae el tor
2,19,9,"Cholera, unspecified"
3,20,9,Typhoid fever
4,21,9,Paratyphoid fever A


In [5]:
# Print column names, non-null counts and dtypes of the ICD dictionary table.
d_icd_diagnoses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112107 entries, 0 to 112106
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   icd_code     112107 non-null  object
 1   icd_version  112107 non-null  int64 
 2   long_title   112107 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.6+ MB


In [5]:
# Keep the dictionary rows whose 'long_title' mentions the configured `disease`
# (case-insensitive). regex=False makes `disease` a literal substring, so a
# term containing regex metacharacters such as "(" or "+" is matched as typed
# instead of being interpreted as a pattern.
# NOTE(review): a substring match also keeps titles that merely mention the
# term (e.g. "Screening for hypertension" in the output below) — confirm this
# breadth is intended for the label.
title_mentions_disease = d_icd_diagnoses_df['long_title'].str.contains(
    disease, case=False, na=False, regex=False
)
htn_codes = d_icd_diagnoses_df[title_mentions_disease]

# Show the matched codes next to their titles for inspection.
print(htn_codes[['icd_code', 'long_title']])


       icd_code                                         long_title
3524       3482                   Benign intracranial hypertension
3850      36504                                Ocular hypertension
4649       4010                   Malignant essential hypertension
4650       4011                      Benign essential hypertension
4651       4019                 Unspecified essential hypertension
...         ...                                                ...
39065      O169  Unspecified maternal hypertension, unspecified...
42059      P292                              Neonatal hypertension
42061     P2930                  Pulmonary hypertension of newborn
43372      R030  Elevated blood-pressure reading, without diagn...
103933     V811                         Screening for hypertension

[160 rows x 2 columns]


### 1. Identify patients (subject_id) with certain Diseases

In [6]:
# Load the diagnosis records (the cells below use its `subject_id` and
# `icd_code` columns).
# `icd_code` is read explicitly as text: the file is parsed in chunks and dtype
# inference can otherwise leave a mix of int and str values in the column,
# which would then fail to match the dictionary codes.
# NOTE(review): the shape printed below is 1,048,575 rows, i.e. 2**20 - 1 —
# exactly a spreadsheet's row limit minus the header line, so the file was
# presumably truncated by a spreadsheet export; confirm against the source.
diagnoses_icd_df = pd.read_csv(
    "E:/Chrome Dls/MIMIC_IV_Core/hosp/diagnoses_icd.csv",
    dtype={"icd_code": str},
)

# Remove fully identical rows.
diagnoses_icd_df = diagnoses_icd_df.drop_duplicates()

In [7]:
# Report (rows, columns) of the diagnoses table.
print(diagnoses_icd_df.shape)

(1048575, 5)


In [8]:
# Count the distinct subject IDs present in the diagnoses table.
n_diagnosed_patients = np.unique(diagnoses_icd_df['subject_id']).size
print("unique patients: ", n_diagnosed_patients)

unique patients:  36899


In [10]:
# Select the diagnosis rows whose code belongs to the disease code list.
# When both tables carry `icd_version`, rows are matched on
# (icd_code, icd_version) rather than on icd_code alone, because the same code
# string can denote different conditions in ICD-9 and ICD-10.
# NOTE(review): diagnoses_icd.csv is assumed to expose `icd_version` as in the
# MIMIC-IV schema; if it does not, matching falls back to icd_code only.
code_key = ['icd_code']
if 'icd_version' in diagnoses_icd_df.columns and 'icd_version' in htn_codes.columns:
    code_key.append('icd_version')

htn_patients_df = diagnoses_icd_df.merge(
    htn_codes[code_key].drop_duplicates(),  # one row per key: no row fan-out
    on=code_key,
    how='inner',
)

# Distinct patients with at least one matching diagnosis.
htn_patients = np.unique(htn_patients_df['subject_id'])
print(f"patients_of_{disease}: ", len(htn_patients))


patients_of_hypertension:  16608


### 2. Build Main Dataset

In [13]:
# Load the patient table and drop fully identical rows in one step
# (plain string literal: the path has no placeholders, so no f-string).
patients_df = pd.read_csv(
    "E:/Chrome Dls/MIMIC_IV_Core/hosp/patients.csv"
).drop_duplicates()

# Number of distinct subject IDs in the patient table.
print("unique patients: ", len(np.unique(patients_df['subject_id'])))

unique patients:  364627


In [14]:
# Drop the columns that are not used as features.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
patients_df = patients_df.drop(
    columns=['anchor_year', 'anchor_year_group', 'dod'],
    errors="ignore",
)


In [15]:
# Subject IDs that occur at least once in the diagnoses table (as a set for
# fast membership tests).
pat_set = set(np.unique(diagnoses_icd_df['subject_id']))

# Keep only patients that have some diagnosis record, so that label 0 assigned
# later means "has diagnoses, none of them for the disease" rather than
# "no diagnosis data at all".
# .copy() makes the subset an independent frame: a column is added to it in the
# next cell, which on a bare boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
# NOTE(review): `htn_patients_df` is rebound here to ALL patients with a
# diagnosis record (not only disease patients), replacing the earlier frame of
# that name; the name is kept so that any later reference keeps working.
has_diagnosis = patients_df['subject_id'].isin(pat_set)
htn_patients_df = patients_df[has_diagnosis].copy()

patients_df = htn_patients_df

In [16]:
# Subject IDs diagnosed with the disease, as a set for membership testing.
patients_set = set(htn_patients)

# Binary target: 1 when the patient is in that set, 0 otherwise.
has_disease = patients_df['subject_id'].isin(patients_set)
patients_df['label'] = has_disease.astype(int)

# Sanity checks: frame size and class balance, followed by a preview.
print(patients_df.shape)
label_share = patients_df['label'].value_counts(normalize=True)
print(label_share)
patients_df.head()

(36899, 4)
label
0    0.549907
1    0.450093
Name: proportion, dtype: float64


Unnamed: 0,subject_id,gender,anchor_age,label
0,10000032,F,52,1
3,10000068,F,19,0
4,10000084,M,72,0
6,10000108,M,25,0
8,10000117,F,48,0


In [17]:
# Persist the labelled patient table. No `index` argument is passed here, so
# the DataFrame index is written as the first, unnamed column.
patients_df.to_csv("patients.csv")

# Also save the IDs of every patient seen in the diagnoses table as a
# one-column CSV, this time without the index.
subject_ids = list(pat_set)
subject_id_df = pd.DataFrame({"subject_id": subject_ids})
subject_id_df.to_csv("patients_subject_id.csv", index=False)