In [None]:
import pandas as pd
import numpy as np 

# Preprocessing

1. static:
    - age in age groups
    - sex as binary
    - prescription as dosage values. 24 hours of each hospital admission. Normalized using min-max normalization. Missing values are imputed with 0.
    - diagnoses as binary. 24 hours of each hospital admission.
    
2. temporal:
    - vital signs: 24 hours of each hospital admission. At each hour the avg is used to represent the signal. 

    
    The signals are then normalized using min-max normalization. Missing values are imputed with 0.

In [None]:
# Load the diagnosis table and the combined patient/vitals table.
# Both paths are relative to the notebook's working directory.
diagnoses = pd.read_csv("diagnoses.csv")
# Prescription (drugs) loading is currently disabled.
#drugs  = pd.read_csv("C:\\Users\\Maria\\Desktop\\data\\drugs.csv")
patients = pd.read_csv("patients_vitals.csv")

In [None]:
# Discard the "Unnamed: 0" column from both loaded tables.
patients = patients.drop(columns="Unnamed: 0")
#drugs.drop("Unnamed: 0", axis=1, inplace=True)
diagnoses = diagnoses.drop(columns="Unnamed: 0")

In [None]:
# Remove columns that are not referenced by any later cell.
patients = patients.drop(columns=["DOB", "icu_length_of_stay"])
diagnoses = diagnoses.drop(columns=["icd9_title"])


In [None]:
# Preview the first rows of `patients` after the column drops.
patients.head()

In [None]:
# Identifier, demographic and mortality columns used for the static features.
admission_columns = ["subject_id", "hadm_id", "gender", "age", "mortality"]
adm_details = patients[admission_columns]

In [None]:
# Identifiers and timestamps, then the eight vital-sign signals, then the mortality column.
vitals_key_columns = ["subject_id", "hadm_id", "admittime", "charttime"]
vitals_signal_columns = ["HeartRate", "SysBP", "DiasBP", "MeanBP", "RespRate", "TempC", "SpO2", "Glucose"]
vitals = patients[vitals_key_columns + vitals_signal_columns + ["mortality"]]

In [None]:
# `patients` is not referenced again; only `adm_details` and `vitals` are used from here on.
del patients

## Preprocess admissions

In [None]:
# Work on an independent copy so the column assignment below does not write into a selection of another frame.
df_adm = adm_details.copy()

In [None]:
# The selection is no longer needed once `df_adm` holds its own copy.
del adm_details

In [None]:
# `df_adm` is selected from the vitals table, which can hold several rows
# (one per chart event) for the same admission. Count distinct admission ids
# rather than rows so the printed number matches its label.
print('Total admissions:', df_adm['hadm_id'].nunique())

In [None]:
### bin age group 

In [None]:
def bin_age(age):
    """Return the age-group label for a numeric age.

    Each group is bounded by an exclusive upper limit: an age strictly below
    25 maps to '18-25', below 45 to '25-45', below 65 to '45-65', below 89 to
    '65-89'. Anything else (including values for which every comparison is
    False, such as NaN) maps to '89+'.
    """
    upper_limits = (
        (25, '18-25'),
        (45, '25-45'),
        (65, '45-65'),
        (89, '65-89'),
    )
    for limit, label in upper_limits:
        if age < limit:
            return label
    return '89+'

In [None]:
# Replace the numeric age with its age-group label.
# Note: re-running this cell fails, because the column then already holds labels.
df_adm['age'] = df_adm['age'].map(bin_age)

In [None]:
# Check the binned `age` column.
df_adm.head()

In [None]:
# Order rows by admission id and renumber the index from 0.
adm_sorted = df_adm.sort_values(by=['hadm_id'])
df_adm = adm_sorted.reset_index(drop=True)

In [None]:
# Confirm the ordering by hadm_id.
df_adm.head()

In [None]:
# One row per (hadm_id, subject_id) and one column per observed gender /
# age-group label. After `unstack`, a cell holds the label itself when it
# applies to that admission and NaN otherwise.
gender = df_adm.groupby([df_adm.hadm_id,  df_adm.subject_id, 'gender'])['gender'].first().unstack()
age = df_adm.groupby([df_adm.hadm_id, df_adm.subject_id, 'age'])['age'].first().unstack()
demographics = pd.concat([gender, age], axis=1)
# Binary-encode presence: 1 where a label is set, 0 where it is missing.
# This replaces a value-by-value `replace` dict that relied on `np.NaN`
# (removed in NumPy 2.0) and had to enumerate every possible label.
demographics = demographics.notna().astype(int)
demographics

## Preprocessing ICD-9 diagnoses

In [None]:
# Display the diagnoses table before the codes are grouped.
diagnoses

In [None]:

def convert_icd_group(icd):
    """Map an ICD-9 diagnosis code to a coarse group id.

    Parameters
    ----------
    icd : str or int
        The ICD-9 code. It is converted with ``str`` first; only the leading
        letter (``V`` / ``E``) or the first three characters are inspected.

    Returns
    -------
    int or float
        19 for codes starting with ``V``, 20 for codes starting with ``E``,
        otherwise a group id between 1 and 18 chosen from the numeric value
        of the first three characters. Codes whose prefix lies in 760-779
        return ``np.nan``.

    Raises
    ------
    ValueError
        If the code does not start with ``V`` or ``E`` and its first three
        characters cannot be parsed as an integer.
    """
    icd = str(icd)
    if icd.startswith('V'):
        return 19
    if icd.startswith('E'):
        return 20

    prefix = int(icd[:3])

    # (inclusive upper bound of the 3-digit prefix, group id), in ascending order.
    # The 629 bound is inclusive: 629 is the last code of the 580-629 ICD-9
    # chapter. It was previously compared with `<`, which sent it to group 11.
    group_bounds = (
        (139, 1),
        (239, 2),
        (279, 3),
        (289, 4),
        (319, 5),
        (389, 6),
        (459, 7),
        (519, 8),
        (579, 9),
        (629, 10),
        (679, 11),
        (709, 12),
        (739, 13),
        (759, 14),
        (779, np.nan),  # 760-779 has no group id
        (789, 15),
        (796, 16),
        (799, 17),
    )
    for upper, group in group_bounds:
        if prefix <= upper:
            return group
    return 18

In [None]:
# Work on an independent copy of the diagnoses table.
df_icd = diagnoses.copy()

In [None]:
# `diagnoses` is not referenced again; `df_icd` holds the working copy.
del diagnoses

In [None]:
# Display the working copy before the codes are converted to group ids.
df_icd

In [None]:
# Lower-case the column names, then collapse each ICD-9 code into its group id.
df_icd.columns = [col.lower() for col in df_icd.columns]
df_icd['icd9_code'] = df_icd['icd9_code'].apply(convert_icd_group)
# dropna() removes the codes that convert_icd_group mapped to NaN (and any
# other row with a missing value); duplicates are then collapsed.
df_icd = df_icd.dropna().drop_duplicates().sort_values(['hadm_id', 'icd9_code'])

# One 0/1 indicator column per group id, named '1' .. '20'.
icd_group_cols = [str(group) for group in range(1, 21)]
for group in range(1, 21):
    df_icd[str(group)] = (df_icd['icd9_code'] == group).astype(int)

# Aggregate to one row per admission. The indicator columns are selected by
# name instead of dropping "whatever column comes first", and `max` keeps
# them binary even if an admission still has several rows for the same group.
df_icd = df_icd.groupby(['hadm_id', 'subject_id'])[icd_group_cols].max().reset_index()
df_icd = df_icd[df_icd.hadm_id.isin(df_adm.hadm_id)]

In [None]:
# Preview the per-admission diagnosis-group indicators.
df_icd.head()

## Preprocess vitals

In [None]:
# Admission ids of every row in df_adm, as a plain list.
# NOTE(review): `adm_ids` is not referenced by any later cell visible here — confirm it is still needed.
adm_ids = df_adm.hadm_id.tolist()

In [None]:
# Work on an independent copy; new columns are assigned to it below.
df_vitals = vitals.copy()

In [None]:
# Parse admission timestamps so they can be subtracted from charttime.
df_vitals['admittime'] = pd.to_datetime(df_vitals['admittime'])

In [None]:
# Parse chart timestamps; they are used for the elapsed-hours column and the hourly resampling.
df_vitals['charttime'] = pd.to_datetime(df_vitals['charttime'])

In [None]:
# Restrict the vitals to the first day of each admission and average them per hour.
# Hours elapsed between admission and each chart event (fractional).
elapsed = df_vitals['charttime'] - df_vitals['admittime']
df_vitals['hr'] = elapsed / np.timedelta64(1, 'h')
# Keep events with 0 <= hr <= 23 (both ends inclusive).
within_window = df_vitals['hr'].between(0, 23)
df_vitals = df_vitals[within_window]
# Per admission, bucket rows by clock hour of charttime and take the mean of every column.
# NOTE(review): recent pandas releases deprecate the 'H' alias in favour of 'h' — confirm the target version.
hourly = df_vitals.set_index('hadm_id').groupby('hadm_id').resample('H', on='charttime')
df_vitals = hourly.mean().reset_index()

In [None]:
## transform vitals to tabular format

In [None]:

# Rows per admission (the result is not displayed because it is not the last expression of the cell).
df_vitals['hadm_id'].value_counts()
# Keep the identifiers, the mortality column and the eight vital-sign signals.
# The same selection is repeated at the start of the next cell.
selected_vital_columns = ["hadm_id", "subject_id", "mortality"] + ["HeartRate", "SysBP", "DiasBP", "MeanBP", "RespRate", "TempC", "SpO2", "Glucose"]
df_vitals_sc = df_vitals[selected_vital_columns]

In [None]:
# NOTE(review): MinMaxScaler is imported but never used in this cell.
from sklearn.preprocessing import MinMaxScaler

# Identifiers, mortality column and the eight hourly-averaged vital signs.
df_vitals_sc = df_vitals[["hadm_id", "subject_id", "mortality", "HeartRate", "SysBP", "DiasBP", "MeanBP", "RespRate", "TempC", "SpO2", "Glucose"]]

## impute using 0
# fillna replaces `replace(np.NaN, 0)`; the `np.NaN` alias was removed in NumPy 2.0.
df_vitals_sc = df_vitals_sc.fillna(0)

# NOTE(review): despite its name, no min-max scaling is applied to this frame
# here — confirm whether normalisation happens in a later step.
df_vitals_scaled = df_vitals_sc.copy()
## Keep only the admissions that have at least 24 hourly rows.
## (Rows are not truncated to the first 24 here.)
# After the zero imputation 'Glucose' has no missing values, so its per-group
# count equals the number of hourly rows of the admission.
df_vitals_scaled['counts'] = df_vitals_scaled.groupby(['hadm_id'])['Glucose'].transform('count')
vitals_slice = df_vitals_scaled[df_vitals_scaled.counts >= 24]

vitals_slice.hadm_id.value_counts()



In [None]:
# Distribution of the mortality column. Counts are per hourly row, not per admission.
vitals_slice.mortality.value_counts()

In [None]:
## for all tables take the same ids

In [None]:
# Distinct subject ids that remain after the 24-row filter.
vitals_slice.subject_id.unique()

In [None]:
# Sanity check: rows without an admission id (expected to be empty).
# `hadm_id == None` is evaluated element-wise and is False for every row, so
# it could never flag anything; isna() actually detects missing ids.
vitals_slice[vitals_slice.hadm_id.isna()]

In [None]:
# Display the subject/admission id pairs of the retained rows.
vitals_slice[["subject_id", "hadm_id"]]

In [None]:
# Turn the (hadm_id, subject_id) index into ordinary columns.
# Note: this rebinding is not idempotent; re-running the cell adds an 'index' column.
demographics = demographics.reset_index()
# Keep only the admissions that survived the vitals filter.
retained_hadm_ids = list(vitals_slice.hadm_id.unique())
in_vitals = demographics.hadm_id.isin(retained_hadm_ids)
demos = demographics[in_vitals]

In [None]:
# Display the demographics restricted to the retained admissions.
demos

In [None]:
# Renumber the rows; the previous index is kept as a column by reset_index().
df_icd = df_icd.reset_index()
# Keep only the admissions that survived the vitals filter.
retained_hadm_ids = list(vitals_slice.hadm_id.unique())
in_vitals = df_icd.hadm_id.isin(retained_hadm_ids)
icd = df_icd[in_vitals]

In [None]:
# `icd` is a filtered selection of `df_icd`; an in-place drop on it can raise
# SettingWithCopyWarning. Rebind the name to a new frame without the
# leftover "index" column instead.
icd = icd.drop(columns=["index"])

In [None]:
# Combine diagnosis indicators and demographics; only admissions present in both are kept.
admission_keys = ["hadm_id", "subject_id"]
icd_demos = icd.merge(demos, how="inner", on=admission_keys)

In [None]:
# Attach the static features to every hourly vitals row of the same admission.
admission_keys = ["hadm_id", "subject_id"]
icd_demos_vitals = vitals_slice.merge(icd_demos, how="inner", on=admission_keys)

In [None]:
# Distribution of the mortality column after the merge. Counts are per hourly row, not per admission.
icd_demos_vitals.mortality.value_counts()

In [None]:
# Show the id pairs of the first 60 merged rows.
icd_demos_vitals[["hadm_id", "subject_id"]].head(60)

In [None]:
# Number of distinct subject ids in the merged table.
len(icd_demos_vitals.subject_id.unique())

In [None]:
# Number of distinct admission ids in the merged table.
len(icd_demos_vitals.hadm_id.unique())

In [None]:
# Persist the merged table in the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, which shows up as an
# "Unnamed: 0" column when the file is read back — confirm whether downstream
# readers expect that column before switching to index=False.
icd_demos_vitals.to_csv("icd_demos_vitals.csv")