In [1]:
import pandas as pd
import numpy as np 

# Preprocessing

1. static:
    - age in age groups
    - sex as binary
    - prescription as dosage values. 24 hours of each hospital admission. Normalized using mix max norm. Impute using 0. 
    - diagnoses as binary. 24 hours of each hospital admission.
    
2. temporal:
    - vital signs: 24 hours of each hospital admission. At each hour the avg is used to represent the signal. 

    
    Then normalized using mix-max norm. To handle missing values, we simply use “0” to impute

In [2]:
diagnoses = pd.read_csv("C:\\Users\\Maria\\Desktop\\data\\diagnoses.csv")
#drugs  = pd.read_csv("C:\\Users\\Maria\\Desktop\\data\\drugs.csv")
patients = pd.read_csv("C:\\Users\\Maria\\Desktop\\data\\patients_vitals.csv")

In [3]:
patients.drop("Unnamed: 0", axis=1, inplace=True)
#drugs.drop("Unnamed: 0", axis=1, inplace=True)
diagnoses.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
patients.drop(["DOB", "icu_length_of_stay"], axis=1, inplace=True)
diagnoses.drop(["icd9_title"], axis=1, inplace=True)


In [5]:
patients.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,admittime,age,mortality,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose
0,23,152223,227807,M,2153-09-03 07:15:00,71.32731,0,2153-09-03 12:20:00,,,,,12.0,,,
1,23,152223,227807,M,2153-09-03 07:15:00,71.32731,0,2153-09-03 12:30:00,90.0,106.0,62.0,77.0,7.0,35.0,100.0,
2,23,152223,227807,M,2153-09-03 07:15:00,71.32731,0,2153-09-03 12:45:00,90.0,109.0,63.0,79.0,,,100.0,
3,23,152223,227807,M,2153-09-03 07:15:00,71.32731,0,2153-09-03 13:00:00,90.0,92.0,57.0,69.0,13.0,,100.0,
4,23,152223,227807,M,2153-09-03 07:15:00,71.32731,0,2153-09-03 13:15:00,90.0,98.0,55.0,70.0,12.0,,100.0,


In [7]:
adm_details = patients[["subject_id", "hadm_id", "gender", "age", "mortality"]]

In [8]:
vitals = patients[["subject_id", "hadm_id", "admittime", "charttime",  "HeartRate", "SysBP", "DiasBP", "MeanBP", "RespRate", "TempC", "SpO2", "Glucose", "mortality"]]

In [9]:
del patients

## Preprocess admissions

In [10]:
df_adm = adm_details.copy()

In [11]:
del adm_details

In [12]:
print('Total admissions:', len(df_adm))

Total admissions: 6163296


In [13]:
### bin age group 

In [14]:
def bin_age(age):
    if age < 25:
        return '18-25'
    elif age < 45:
        return '25-45'
    elif age < 65:
        return '45-65'
    elif age < 89:
        return '65-89'
    else:
        return '89+'

In [15]:
df_adm['age'] = df_adm['age'].apply(bin_age)

In [16]:
df_adm.head()

Unnamed: 0,subject_id,hadm_id,gender,age,mortality
0,23,152223,M,65-89,0
1,23,152223,M,65-89,0
2,23,152223,M,65-89,0
3,23,152223,M,65-89,0
4,23,152223,M,65-89,0


In [17]:
df_adm = df_adm.sort_values(['hadm_id']).reset_index(drop=True)

In [18]:
df_adm.head()

Unnamed: 0,subject_id,hadm_id,gender,age,mortality
0,58526,100001,F,25-45,0
1,58526,100001,F,25-45,0
2,58526,100001,F,25-45,0
3,58526,100001,F,25-45,0
4,58526,100001,F,25-45,0


In [25]:
gender = df_adm.groupby([df_adm.hadm_id,  df_adm.subject_id, 'gender'])['gender'].first().unstack()
age = df_adm.groupby([df_adm.hadm_id, df_adm.subject_id, 'age'])['age'].first().unstack()
demographics = pd.concat([gender, age], axis=1)
demographics = demographics.replace({np.NaN: 0, 'F': 1, "M": 1, '18-25': 1, '25-45': 1, '45-65': 1, '65-89': 1, '89+': 1})
demographics

Unnamed: 0_level_0,Unnamed: 1_level_0,F,M,18-25,25-45,45-65,65-89,89+
hadm_id,subject_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100001,58526,1,0,0,1,0,0,0
100006,9895,1,0,0,0,1,0,0
100007,23018,1,0,0,0,0,1,0
100009,533,0,1,0,0,1,0,0
100010,55853,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
199988,25780,0,1,0,0,0,1,0
199993,20785,0,1,0,0,1,0,0
199995,19412,0,1,1,0,0,0,0
199998,27200,0,1,0,0,0,1,0


## Preprocessing ICD-10

In [None]:
diagnoses

In [26]:

def convert_icd_group(icd):
    icd = str(icd)
    if icd.startswith('V'):
        return 19
    if icd.startswith('E'):
        return 20
    icd = int(icd[:3])
    if icd <= 139:
        return 1
    elif icd <= 239:
        return 2
    elif icd <= 279:
        return 3
    elif icd <= 289:
        return 4
    elif icd <= 319:
        return 5
    elif icd <= 389:
        return 6
    elif icd <= 459:
        return 7
    elif icd <= 519:
        return 8
    elif icd <= 579:
        return 9
    elif icd < 629:
        return 10
    elif icd <= 679:
        return 11
    elif icd <= 709:
        return 12
    elif icd <= 739:
        return 13
    elif icd <= 759:
        return 14
    elif icd <= 779:
        return np.nan
    elif icd <= 789:
        return 15
    elif icd <= 796:
        return 16
    elif icd <= 799:
        return 17
    else:
        return 18

In [27]:
df_icd = diagnoses.copy()

In [28]:
del diagnoses

In [None]:
df_icd

In [29]:
df_icd.columns = map(str.lower, df_icd.columns)
df_icd['icd9_code'] = df_icd['icd9_code'].apply(convert_icd_group)
df_icd = df_icd.dropna().drop_duplicates().sort_values(['hadm_id', 'icd9_code'])
for x in range(20):
    x += 1
    df_icd[f'{x}'] = (df_icd['icd9_code'] == x).astype(int)
df_icd = df_icd.groupby(['hadm_id', 'subject_id']).sum()
df_icd = df_icd[df_icd.columns[1:]].reset_index()
df_icd = df_icd[df_icd.hadm_id.isin(df_adm.hadm_id)]

In [30]:
df_icd.head()

Unnamed: 0,hadm_id,subject_id,1,2,3,4,5,6,7,8,...,11,12,13,14,15,16,17,18,19,20
0,100001,58526,0,0,3,0,0,3,3,0,...,0,3,0,0,0,0,0,0,3,0
2,100006,9895,0,2,2,0,2,0,0,2,...,0,0,0,0,2,0,0,0,2,0
3,100007,23018,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,100009,533,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,0
5,100010,55853,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocess vitals

In [31]:
vitals

Unnamed: 0,subject_id,hadm_id,admittime,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose,mortality
0,23,152223,2153-09-03 07:15:00,2153-09-03 12:20:00,,,,,12.0,,,,0
1,23,152223,2153-09-03 07:15:00,2153-09-03 12:30:00,90.0,106.0,62.0,77.0,7.0,35.0,100.0,,0
2,23,152223,2153-09-03 07:15:00,2153-09-03 12:45:00,90.0,109.0,63.0,79.0,,,100.0,,0
3,23,152223,2153-09-03 07:15:00,2153-09-03 13:00:00,90.0,92.0,57.0,69.0,13.0,,100.0,,0
4,23,152223,2153-09-03 07:15:00,2153-09-03 13:15:00,90.0,98.0,55.0,70.0,12.0,,100.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163291,98797,105447,2132-12-24 20:06:00,2132-12-25 22:00:00,,,,,,,61.0,,1
6163292,98797,105447,2132-12-24 20:06:00,2132-12-25 22:01:00,,,,,,,,,1
6163293,98797,105447,2132-12-24 20:06:00,2132-12-25 22:03:00,48.0,,,,,,,,1
6163294,98797,105447,2132-12-24 20:06:00,2132-12-25 23:00:00,107.0,,,,34.0,,79.0,,1


In [32]:
adm_ids = df_adm.hadm_id.tolist()

In [33]:
df_vitals = vitals.copy()

In [34]:
df_vitals['admittime'] = pd.to_datetime(df_vitals['admittime'])

In [35]:
df_vitals['charttime'] = pd.to_datetime(df_vitals['charttime'])

In [36]:
#getting vitals 24 hours 
df_vitals['hr'] = (df_vitals.charttime - df_vitals.admittime) / np.timedelta64(1, 'h')
df_vitals = df_vitals[(df_vitals.hr <= 24) & (df_vitals.hr >= 0)]
df_vitals = df_vitals.set_index('hadm_id').groupby('hadm_id').resample('H', on='charttime').mean().reset_index()

In [37]:
df_vitals.head()

Unnamed: 0,hadm_id,charttime,subject_id,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose,mortality,hr
0,100001,2117-09-11 12:00:00,58526.0,122.0,,,,14.0,,,,0.0,1.183333
1,100001,2117-09-11 13:00:00,58526.0,118.0,192.0,100.0,122.0,22.0,36.666667,,,0.0,1.645833
2,100001,2117-09-11 14:00:00,58526.0,118.0,165.0,85.0,103.0,15.0,,,,0.0,2.233333
3,100001,2117-09-11 15:00:00,58526.0,110.0,119.0,64.0,76.0,15.0,,,,0.0,3.233333
4,100001,2117-09-11 16:00:00,58526.0,104.0,169.0,84.0,105.0,16.0,36.611111,100.0,,0.0,4.25


In [38]:
vitals.head()

Unnamed: 0,subject_id,hadm_id,admittime,charttime,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose,mortality
0,23,152223,2153-09-03 07:15:00,2153-09-03 12:20:00,,,,,12.0,,,,0
1,23,152223,2153-09-03 07:15:00,2153-09-03 12:30:00,90.0,106.0,62.0,77.0,7.0,35.0,100.0,,0
2,23,152223,2153-09-03 07:15:00,2153-09-03 12:45:00,90.0,109.0,63.0,79.0,,,100.0,,0
3,23,152223,2153-09-03 07:15:00,2153-09-03 13:00:00,90.0,92.0,57.0,69.0,13.0,,100.0,,0
4,23,152223,2153-09-03 07:15:00,2153-09-03 13:15:00,90.0,98.0,55.0,70.0,12.0,,100.0,,0


In [None]:
## transform vitals to tabular format

In [56]:

df_vitals.hadm_id.value_counts()
df_vitals_sc = df_vitals[["hadm_id", "subject_id", "mortality", "HeartRate", "SysBP", "DiasBP", "MeanBP", "RespRate", "TempC", "SpO2", "Glucose"]]

Unnamed: 0,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose
0,122.0,,,,14.0,,,
1,118.0,192.0,100.0,122.0,22.0,36.666667,,
2,118.0,165.0,85.0,103.0,15.0,,,
3,110.0,119.0,64.0,76.0,15.0,,,
4,104.0,169.0,84.0,105.0,16.0,36.611111,100.0,
...,...,...,...,...,...,...,...,...
550814,83.0,94.0,69.0,83.0,14.0,,,
550815,,,,,,,,
550816,84.0,95.0,68.0,81.0,13.0,36.222221,,
550817,,,,,,,,


In [58]:
from sklearn.preprocessing import MinMaxScaler
df_vitals.hadm_id.value_counts()
df_vitals_sc = df_vitals[["hadm_id", "subject_id", "mortality", "HeartRate", "SysBP", "DiasBP", "MeanBP", "RespRate", "TempC", "SpO2", "Glucose"]]

## impute using 0
df_vitals_sc = df_vitals_sc.replace(np.NaN, 0)

scaler = MinMaxScaler()

df_vitals_scaled = scaler.fit_transform(df_vitals_sc.iloc[:, 2:])

df_vitals_scaled = pd.DataFrame(df_vitals_scaled, columns=df_vitals_sc.columns[2:], index = df_vitals_sc.index)
df_vitals_scaled["hadm_id"] = df_vitals_sc.hadm_id
df_vitals_scaled["subject_id"] = df_vitals_sc.subject_id

## For the patients that have more than 24 hour intervals, we keep the first 24.
df_vitals_scaled['counts'] = df_vitals_scaled.groupby(['hadm_id'])['Glucose'].transform('count')
vitals_slice = df_vitals_scaled[df_vitals_scaled.counts >=24]

vitals_slice.hadm_id.value_counts()



116679    25
152694    25
185590    25
151926    25
171263    25
          ..
198396    24
116644    24
155607    24
102575    24
118842    24
Name: hadm_id, Length: 11075, dtype: int64

In [60]:
vitals_slice.mortality.value_counts()

0.0    227251
1.0     42120
Name: mortality, dtype: int64

In [61]:
vitals_slice.head()

Unnamed: 0,mortality,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose,hadm_id,subject_id,counts
0,0.0,0.435714,0.0,0.0,0.0,0.202899,0.0,0.0,0.0,100001,58526.0,24
1,0.0,0.421429,0.617363,0.37594,0.409396,0.318841,0.869565,0.0,0.0,100001,58526.0,24
2,0.0,0.421429,0.530547,0.319549,0.345638,0.217391,0.0,0.0,0.0,100001,58526.0,24
3,0.0,0.392857,0.382637,0.240602,0.255034,0.217391,0.0,0.0,0.0,100001,58526.0,24
4,0.0,0.371429,0.543408,0.315789,0.352349,0.231884,0.868248,1.0,0.0,100001,58526.0,24


In [64]:
## for all tables take the same ids

In [91]:
vitals_slice.hadm_id.unique()

array([100001, 100006, 100011, ..., 199981, 199984, 199993], dtype=int64)

In [92]:
demographics

Unnamed: 0,level_0,index,hadm_id,subject_id,F,M,18-25,25-45,45-65,65-89,89+
0,0,0,100001,58526,1,0,0,1,0,0,0
1,1,1,100006,9895,1,0,0,0,1,0,0
2,2,2,100007,23018,1,0,0,0,0,1,0
3,3,3,100009,533,0,1,0,0,1,0,0
4,4,4,100010,55853,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
33559,33559,33559,199988,25780,0,1,0,0,0,1,0
33560,33560,33560,199993,20785,0,1,0,0,1,0,0
33561,33561,33561,199995,19412,0,1,1,0,0,0,0
33562,33562,33562,199998,27200,0,1,0,0,0,1,0


In [94]:
#demographics = demographics.reset_index()
demos = demographics[demographics.hadm_id.isin(list(vitals_slice.hadm_id.unique()))]

In [95]:
demos.drop(['level_0', "index"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [96]:
demos

Unnamed: 0,hadm_id,subject_id,F,M,18-25,25-45,45-65,65-89,89+
0,100001,58526,1,0,0,1,0,0,0
1,100006,9895,1,0,0,0,1,0,0
5,100011,87977,0,1,1,0,0,0,0
11,100030,12803,0,1,0,1,0,0,0
16,100037,58947,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
33547,199961,20620,0,1,0,0,1,0,0
33554,199976,26198,0,1,0,0,0,1,0
33556,199981,28616,0,1,0,0,0,0,1
33557,199984,55617,1,0,0,0,0,0,1


In [97]:
df_icd = df_icd.reset_index()
icd = df_icd[df_icd.hadm_id.isin(list(vitals_slice.hadm_id.unique()))]

In [101]:
icd.drop(["index", "level_0"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [102]:
icd

Error: KeyboardInterrupt

In [107]:
icd_demos = pd.merge(icd, demos, how="inner", on=["hadm_id", "subject_id"])

In [108]:
icd_demos

Unnamed: 0,hadm_id,subject_id,1,2,3,4,5,6,7,8,...,18,19,20,F,M,18-25,25-45,45-65,65-89,89+
0,100001,58526,0,0,3,0,0,3,3,0,...,0,3,0,1,0,0,1,0,0,0
1,100006,9895,0,2,2,0,2,0,0,2,...,0,2,0,1,0,0,0,1,0,0
2,100011,87977,0,0,0,1,1,0,0,1,...,1,0,1,0,1,1,0,0,0,0
3,100030,12803,0,0,1,1,0,0,1,1,...,0,0,0,0,1,0,1,0,0,0
4,100037,58947,1,1,0,1,0,1,1,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11070,199961,20620,0,0,2,2,0,0,2,0,...,2,0,2,0,1,0,0,1,0,0
11071,199976,26198,1,0,0,0,0,0,0,1,...,1,0,1,0,1,0,0,0,1,0
11072,199981,28616,0,0,1,1,0,0,1,1,...,1,0,1,0,1,0,0,0,0,1
11073,199984,55617,0,0,1,0,0,1,1,1,...,0,1,0,1,0,0,0,0,0,1


In [109]:
icd_demos_vitals = pd.merge(vitals_slice, icd_demos, how="inner", on=["hadm_id", "subject_id"])

In [111]:
icd_demos_vitals ##final table hopefully 

Unnamed: 0,mortality,HeartRate,SysBP,DiasBP,MeanBP,RespRate,TempC,SpO2,Glucose,hadm_id,...,18,19,20,F,M,18-25,25-45,45-65,65-89,89+
0,0.0,0.435714,0.000000,0.000000,0.000000,0.202899,0.000000,0.00,0.0,100001,...,0,3,0,1,0,0,1,0,0,0
1,0.0,0.421429,0.617363,0.375940,0.409396,0.318841,0.869565,0.00,0.0,100001,...,0,3,0,1,0,0,1,0,0,0
2,0.0,0.421429,0.530547,0.319549,0.345638,0.217391,0.000000,0.00,0.0,100001,...,0,3,0,1,0,0,1,0,0,0
3,0.0,0.392857,0.382637,0.240602,0.255034,0.217391,0.000000,0.00,0.0,100001,...,0,3,0,1,0,0,1,0,0,0
4,0.0,0.371429,0.543408,0.315789,0.352349,0.231884,0.868248,1.00,0.0,100001,...,0,3,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258616,0.0,0.310714,0.302251,0.251880,0.275168,0.260870,0.866930,0.00,0.0,199993,...,0,0,0,0,1,0,0,1,0,0
258617,0.0,0.310714,0.334405,0.278195,0.302013,0.260870,0.000000,0.00,0.0,199993,...,0,0,0,0,1,0,0,1,0,0
258618,0.0,0.296429,0.302251,0.259398,0.278523,0.202899,0.000000,0.00,0.0,199993,...,0,0,0,0,1,0,0,1,0,0
258619,0.0,0.300000,0.305466,0.255639,0.271812,0.188406,0.859025,0.00,0.0,199993,...,0,0,0,0,1,0,0,1,0,0


In [112]:
icd_demos_vitals.mortality.value_counts()

0.0    216501
1.0     42120
Name: mortality, dtype: int64