# Data Filtering for Non-time-series modelling
Developed by Zion Knight (s3906411)

Important events file exists as a midpoint, so that the filtering can be done in chunks, necessary for low RAM.

In [1]:
importantEventsFile = 'data/importantChartEvents.csv'
sourceEventsFile = 'data/chartevents.csv'
finalEventsFile = 'data/chartEventsPred.csv'
itemIDsFile = 'importantItemIDs.txt'
itemLabelsFile = 'data/d_items.csv'

## Reading in important features
Read in item ids of important features

In [2]:
import pandas as pd

with open(itemIDsFile, 'r') as f:
    importantItemIDs = f.read().splitlines()

importantItemIDs = [int(x) for x in importantItemIDs]
print(importantItemIDs)

[220045, 220048, 220050, 220051, 220052, 220179, 220180, 220181, 220210, 220227, 220228, 220274, 220277, 220341, 220344, 220363, 220364, 220395, 220451, 220454, 220467, 220546, 220603, 220615, 220621, 220645, 220650, 223762, 224167, 224643, 225170, 225624, 225664, 225667, 225693, 225695, 225948, 226537, 226996, 227242, 227243, 227463, 227466, 228699, 229761, 230093]


Filter events file for only these features

In [3]:
with open(importantEventsFile, 'w') as f:
    f.write(
        'subject_id,charttime,itemid,valuenum\n'
    )

for chunk in pd.read_csv(sourceEventsFile, usecols=['subject_id', 'charttime', 'itemid', 'valuenum'], chunksize=100000):
    chunk = chunk[chunk['itemid'].isin(importantItemIDs)]
    chunk.to_csv(importantEventsFile, mode='a', header=False, index=False)

In [13]:
chartEvents = pd.read_csv(importantEventsFile)
chartEvents.head()

Unnamed: 0,subject_id,charttime,itemid,valuenum
0,10000032,2180-07-23 14:00:00,220048,
1,10000032,2180-07-23 14:11:00,220179,84.0
2,10000032,2180-07-23 14:11:00,220180,48.0
3,10000032,2180-07-23 14:11:00,220181,56.0
4,10000032,2180-07-23 14:12:00,220045,91.0


Keep only numerical data

In [14]:
chartEvents.rename(columns={'valuenum': 'value'}, inplace=True)
chartEvents['value'] = chartEvents['value'].astype(float)
chartEvents.dropna(inplace=True)

chartEvents.head()

Unnamed: 0,subject_id,charttime,itemid,value
1,10000032,2180-07-23 14:11:00,220179,84.0
2,10000032,2180-07-23 14:11:00,220180,48.0
3,10000032,2180-07-23 14:11:00,220181,56.0
4,10000032,2180-07-23 14:12:00,220045,91.0
5,10000032,2180-07-23 14:12:00,220210,24.0


In [15]:
chartEvents.isna().sum()

subject_id    0
charttime     0
itemid        0
value         0
dtype: int64

In [16]:
print(chartEvents.shape)

(57461927, 4)


Add feature labels

In [17]:
d_items = pd.read_csv(itemLabelsFile, usecols=['itemid', 'label'])

chartEvents = chartEvents.merge(d_items, on='itemid')
chartEvents.head()

Unnamed: 0,subject_id,charttime,itemid,value,label
0,10000032,2180-07-23 14:11:00,220179,84.0,Non Invasive Blood Pressure systolic
1,10000032,2180-07-23 14:30:00,220179,95.0,Non Invasive Blood Pressure systolic
2,10000032,2180-07-23 15:00:00,220179,88.0,Non Invasive Blood Pressure systolic
3,10000032,2180-07-23 16:01:00,220179,91.0,Non Invasive Blood Pressure systolic
4,10000032,2180-07-23 17:00:00,220179,95.0,Non Invasive Blood Pressure systolic


In [18]:
chartEvents = chartEvents[['subject_id', 'label', 'charttime', 'value']]
chartEvents = chartEvents.sort_values(['subject_id', 'label', 'charttime'])
chartEvents.to_csv(importantEventsFile, index=False)

In [19]:
chartEvents.head()

Unnamed: 0,subject_id,label,charttime,value
43887953,10000032,BUN,2180-07-23 21:45:00,33.0
42085090,10000032,Creatinine (serum),2180-07-23 21:45:00,0.5
42675956,10000032,Glucose (serum),2180-07-23 21:45:00,115.0
16129351,10000032,Heart Rate,2180-07-23 14:12:00,91.0
16129352,10000032,Heart Rate,2180-07-23 14:30:00,93.0


Checkpoint if previously ran the above

In [2]:
import pandas as pd
chartEvents = pd.read_csv('data/importantChartEvents.csv')

Filter the labels for common/the same vital under different names

In [20]:
print(chartEvents['label'].unique())

['BUN' 'Creatinine (serum)' 'Glucose (serum)' 'Heart Rate'
 'Non Invasive Blood Pressure diastolic'
 'Non Invasive Blood Pressure mean' 'Non Invasive Blood Pressure systolic'
 'O2 saturation pulseoxymetry' 'Respiratory Rate' 'Sodium (serum)'
 'Glucose finger stick (range 70-100)' 'Hemoglobin' 'WBC' 'PTT'
 'PH (Venous)' 'Uric Acid' 'Arterial O2 Saturation'
 'Glucose (whole blood)' 'Ionized Calcium' 'Temperature Celsius'
 'Arterial Blood Pressure diastolic' 'Arterial Blood Pressure mean'
 'Arterial Blood Pressure systolic' 'Triglyceride'
 'Manual Blood Pressure Diastolic Left'
 'Manual Blood Pressure Systolic Left' 'Cholesterol'
 'Creatinine (whole blood)' 'Total Protein'
 'Manual Blood Pressure Diastolic Right'
 'Manual Blood Pressure Systolic Right' 'Cortisol' 'APS']


In [22]:
chartEvents['label'] = chartEvents['label'].replace({
    'Creatinine (serum)': 'Creatinine',
    'Glucose (serum)': 'Glucose',
    'Non Invasive Blood Pressure diastolic': 'Diastolic Blood Pressure',
    'Non Invasive Blood Pressure mean': 'Mean Blood Pressure',
    'Non Invasive Blood Pressure systolic': 'Systolic Blood Pressure',
    'Sodium (serum)': 'Sodium',
    'Glucose finger stick (range 70-100)': 'Glucose',
    'O2 saturation pulseoxymetry': 'O2 Saturation',
    'Arterial O2 Saturation': 'O2 Saturation',
    'Glucose (whole blood)': 'Glucose',
    'Arterial Blood Pressure diastolic': 'Diastolic Blood Pressure',
    'Arterial Blood Pressure mean': 'Mean Blood Pressure',
    'Arterial Blood Pressure systolic': 'Systolic Blood Pressure',
    'Triglyceride': 'WBC',
    'Manual Blood Pressure Diastolic Left': 'Diastolic Blood Pressure',
    'Manual Blood Pressure Systolic Left': 'Systolic Blood Pressure',
    'Creatinine (whole blood)': 'Creatinine',
    'Manual Blood Pressure Diastolic Right': 'Diastolic Blood Pressure',
    'Manual Blood Pressure Systolic Right': 'Systolic Blood Pressure'
})

print(chartEvents['label'].unique())

['BUN' 'Creatinine' 'Glucose' 'Heart Rate' 'Diastolic Blood Pressure'
 'Mean Blood Pressure' 'Systolic Blood Pressure' 'O2 Saturation'
 'Respiratory Rate' 'Sodium' 'Hemoglobin' 'WBC' 'PTT' 'PH (Venous)'
 'Uric Acid' 'Ionized Calcium' 'Temperature Celsius' 'Cholesterol'
 'Total Protein' 'Cortisol' 'APS']


In [23]:
chartEvents.head()

Unnamed: 0,subject_id,label,charttime,value
43887953,10000032,BUN,2180-07-23 21:45:00,33.0
42085090,10000032,Creatinine,2180-07-23 21:45:00,0.5
42675956,10000032,Glucose,2180-07-23 21:45:00,115.0
16129351,10000032,Heart Rate,2180-07-23 14:12:00,91.0
16129352,10000032,Heart Rate,2180-07-23 14:30:00,93.0


## Blood Glucose Time-series capture
Separate glucose values for time series processing

In [24]:
glucose_df = chartEvents[chartEvents['label'] == 'Glucose']
glucose_df = glucose_df.sort_values(by=['subject_id', 'charttime'])
glucose_df.head()

Unnamed: 0,subject_id,label,charttime,value
42675956,10000032,Glucose,2180-07-23 21:45:00,115.0
42675957,10000690,Glucose,2150-11-03 02:56:00,77.0
42675958,10000690,Glucose,2150-11-04 03:03:00,84.0
44477405,10000690,Glucose,2150-11-04 10:00:00,117.0
42675959,10000690,Glucose,2150-11-04 17:54:00,120.0


In [25]:
glucose_df.describe()

Unnamed: 0,subject_id,value
count,1814462.0,1814462.0
mean,15001620.0,209.1426
std,2893341.0,7727.934
min,10000030.0,-124.0
25%,12492850.0,110.0
50%,15019290.0,135.0
75%,17516320.0,172.0
max,19999990.0,1653550.0


In [26]:
glucose_df = glucose_df[glucose_df['value'] <= 1000]
glucose_df = glucose_df[glucose_df['value'] >= 20]

for each glucose reading, find the proceeding reading as the prediction value

In [27]:
# Shift value to get the "next" reading
glucose_df['next_glucose'] = glucose_df.groupby('subject_id')['value'].shift(-1)
glucose_df.head()

Unnamed: 0,subject_id,label,charttime,value,next_glucose
42675956,10000032,Glucose,2180-07-23 21:45:00,115.0,
42675957,10000690,Glucose,2150-11-03 02:56:00,77.0,84.0
42675958,10000690,Glucose,2150-11-04 03:03:00,84.0,117.0
44477405,10000690,Glucose,2150-11-04 10:00:00,117.0,120.0
42675959,10000690,Glucose,2150-11-04 17:54:00,120.0,107.0


In [28]:
print(glucose_df.isnull().sum())
print(glucose_df.shape)

subject_id          0
label               0
charttime           0
value               0
next_glucose    64330
dtype: int64
(1813711, 5)


Assign it a classification label based on final glucose

In [29]:
# Classify next glucose value
def classify_glucose(val):
    if pd.isna(val):
        return None
    if val < 70:
        return 'hypo'
    elif val > 180:
        return 'hyper'
    else:
        return 'normal'

glucose_df['label'] = glucose_df['next_glucose'].apply(classify_glucose)
glucose_df.head()

Unnamed: 0,subject_id,label,charttime,value,next_glucose
42675956,10000032,,2180-07-23 21:45:00,115.0,
42675957,10000690,normal,2150-11-03 02:56:00,77.0,84.0
42675958,10000690,normal,2150-11-04 03:03:00,84.0,117.0
44477405,10000690,normal,2150-11-04 10:00:00,117.0,120.0
42675959,10000690,normal,2150-11-04 17:54:00,120.0,107.0


Derive statistic characteristics from the previous 3 glucose (before the current and prediction)

In [30]:
# Example: calculate rolling features per subject
glucose_df['mean_last3'] = glucose_df.groupby('subject_id')['value'].rolling(3, min_periods=1).mean().reset_index(level=0, drop=True)
glucose_df['std_last3'] = glucose_df.groupby('subject_id')['value'].rolling(3, min_periods=1).std().reset_index(level=0, drop=True)
glucose_df['trend'] = glucose_df.groupby('subject_id')['value'].diff()  # slope-ish
glucose_df.head()

Unnamed: 0,subject_id,label,charttime,value,next_glucose,mean_last3,std_last3,trend
42675956,10000032,,2180-07-23 21:45:00,115.0,,115.0,,
42675957,10000690,normal,2150-11-03 02:56:00,77.0,84.0,77.0,,
42675958,10000690,normal,2150-11-04 03:03:00,84.0,117.0,80.5,4.949747,7.0
44477405,10000690,normal,2150-11-04 10:00:00,117.0,120.0,92.666667,21.36196,33.0
42675959,10000690,normal,2150-11-04 17:54:00,120.0,107.0,107.0,19.974984,3.0


In [31]:
glucose_df = glucose_df.dropna(subset=['label'])
glucose_df.head()

Unnamed: 0,subject_id,label,charttime,value,next_glucose,mean_last3,std_last3,trend
42675957,10000690,normal,2150-11-03 02:56:00,77.0,84.0,77.0,,
42675958,10000690,normal,2150-11-04 03:03:00,84.0,117.0,80.5,4.949747,7.0
44477405,10000690,normal,2150-11-04 10:00:00,117.0,120.0,92.666667,21.36196,33.0
42675959,10000690,normal,2150-11-04 17:54:00,120.0,107.0,107.0,19.974984,3.0
42675960,10000690,normal,2150-11-05 05:36:00,107.0,97.0,114.666667,6.806859,-13.0


In [32]:
glucose_df.dropna(inplace=True)
glucose_df.isnull().sum()

subject_id      0
label           0
charttime       0
value           0
next_glucose    0
mean_last3      0
std_last3       0
trend           0
dtype: int64

In [33]:
glucose_df.to_csv('data/glucose.csv', index=False)

## Patient vitals
Average out and pivot the rest of the patient's vital information

In [53]:
pivot_df = chartEvents.pivot_table(
    index='subject_id',
    columns='label',
    values='value',
    aggfunc='mean'
)

pivot_df.head()

label,APS,BUN,Cholesterol,Cortisol,Creatinine,Diastolic Blood Pressure,Glucose,Heart Rate,Hemoglobin,Ionized Calcium,...,O2 Saturation,PH (Venous),PTT,Respiratory Rate,Sodium,Systolic Blood Pressure,Temperature Celsius,Total Protein,Uric Acid,WBC
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000032,,33.0,,,0.5,54.1,115.0,96.5,,,...,96.3,,,20.7,132.0,88.9,,,,
10000690,,20.2,,,0.82,60.361702,100.333333,84.072917,10.175,,...,95.702128,,,22.557895,135.2,122.893617,,,,6.35
10000980,,,,,,83.272727,109.5,73.636364,,,...,98.909091,,,20.545455,,142.454545,,,,
10001217,,9.5,,,0.45,77.72,123.666667,86.711538,11.75,,...,95.019231,,32.7,19.038462,138.5,126.9,,,,13.7
10001725,,17.0,,,0.8,61.1875,153.0,79.15625,13.25,,...,98.225806,,30.6,17.53125,139.0,100.40625,,,,18.55


In [54]:
print(pivot_df.shape)
print(pivot_df.isnull().sum())

(65366, 21)
label
APS                         65359
BUN                          1543
Cholesterol                 58947
Cortisol                    61099
Creatinine                   1526
Diastolic Blood Pressure       61
Glucose                      1035
Heart Rate                      1
Hemoglobin                   1713
Ionized Calcium             32072
Mean Blood Pressure            85
O2 Saturation                  38
PH (Venous)                 41336
PTT                          7452
Respiratory Rate               64
Sodium                       1519
Systolic Blood Pressure        60
Temperature Celsius         56041
Total Protein               62527
Uric Acid                   63249
WBC                          1707
dtype: int64


Dropped the averaged glucose (to be replaced with more in depth readings)

In [55]:
pivot_df.drop(columns=['Glucose'], inplace=True)
pivot_df.head()

label,APS,BUN,Cholesterol,Cortisol,Creatinine,Diastolic Blood Pressure,Heart Rate,Hemoglobin,Ionized Calcium,Mean Blood Pressure,O2 Saturation,PH (Venous),PTT,Respiratory Rate,Sodium,Systolic Blood Pressure,Temperature Celsius,Total Protein,Uric Acid,WBC
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10000032,,33.0,,,0.5,54.1,96.5,,,62.3,96.3,,,20.7,132.0,88.9,,,,
10000690,,20.2,,,0.82,60.361702,84.072917,10.175,,74.93617,95.702128,,,22.557895,135.2,122.893617,,,,6.35
10000980,,,,,,83.272727,73.636364,,,97.545455,98.909091,,,20.545455,,142.454545,,,,
10001217,,9.5,,,0.45,77.72,86.711538,11.75,,88.673469,95.019231,,32.7,19.038462,138.5,126.9,,,,13.7
10001725,,17.0,,,0.8,61.1875,79.15625,13.25,,70.0625,98.225806,,30.6,17.53125,139.0,100.40625,,,,18.55


Remove outliers from the patient vitals

In [56]:
# Step 1: Drop columns with more than 5% missing values
pivot_df = pivot_df.loc[:, pivot_df.isnull().mean() <= 0.15]
print(pivot_df.shape)

# Step 2: Drop the remaining missing values in the retained columns with column means
pivot_df.dropna(pivot_df.mean(), inplace=True)

# Check resulting shape
print(pivot_df.shape)

(65366, 11)
(65366, 11)


In [57]:
print(pivot_df.isnull().sum())

label
BUN                         0
Creatinine                  0
Diastolic Blood Pressure    0
Heart Rate                  0
Hemoglobin                  0
Mean Blood Pressure         0
O2 Saturation               0
Respiratory Rate            0
Sodium                      0
Systolic Blood Pressure     0
WBC                         0
dtype: int64


In [58]:
def remove_outliers_iqr(df, multiplier=1.5):
    numeric_cols = df.select_dtypes(include='number').columns
    mask = pd.Series(True, index=df.index)  # start with all True
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        col_mask = (df[col] >= (q1 - multiplier * iqr)) & (df[col] <= (q3 + multiplier * iqr))
        mask &= col_mask
    return df[mask]

pivot_df = remove_outliers_iqr(pivot_df)

In [59]:
print(pivot_df.shape)

(46622, 11)


Add in the earlier found glucose readings along with patient vital information

In [60]:
merged_df = pd.merge(glucose_df, pivot_df, on='subject_id')
merged_df.head()

Unnamed: 0,subject_id,label,charttime,value,next_glucose,mean_last3,std_last3,trend,BUN,Creatinine,Diastolic Blood Pressure,Heart Rate,Hemoglobin,Mean Blood Pressure,O2 Saturation,Respiratory Rate,Sodium,Systolic Blood Pressure,WBC
0,10000690,normal,2150-11-04 03:03:00,84.0,117.0,80.5,4.949747,7.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
1,10000690,normal,2150-11-04 10:00:00,117.0,120.0,92.666667,21.36196,33.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
2,10000690,normal,2150-11-04 17:54:00,120.0,107.0,107.0,19.974984,3.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
3,10000690,normal,2150-11-05 05:36:00,107.0,97.0,114.666667,6.806859,-13.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
4,10001217,normal,2157-12-19 22:00:00,145.0,113.0,129.0,22.627417,32.0,9.5,0.45,77.72,86.711538,11.75,88.673469,95.019231,19.038462,138.5,126.9,13.7


In [61]:
print(merged_df.isnull().sum())

subject_id                  0
label                       0
charttime                   0
value                       0
next_glucose                0
mean_last3                  0
std_last3                   0
trend                       0
BUN                         0
Creatinine                  0
Diastolic Blood Pressure    0
Heart Rate                  0
Hemoglobin                  0
Mean Blood Pressure         0
O2 Saturation               0
Respiratory Rate            0
Sodium                      0
Systolic Blood Pressure     0
WBC                         0
dtype: int64


In [62]:
merged_df.rename(columns={'value': 'glucose'}, inplace=True)
merged_df.head()

Unnamed: 0,subject_id,label,charttime,glucose,next_glucose,mean_last3,std_last3,trend,BUN,Creatinine,Diastolic Blood Pressure,Heart Rate,Hemoglobin,Mean Blood Pressure,O2 Saturation,Respiratory Rate,Sodium,Systolic Blood Pressure,WBC
0,10000690,normal,2150-11-04 03:03:00,84.0,117.0,80.5,4.949747,7.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
1,10000690,normal,2150-11-04 10:00:00,117.0,120.0,92.666667,21.36196,33.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
2,10000690,normal,2150-11-04 17:54:00,120.0,107.0,107.0,19.974984,3.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
3,10000690,normal,2150-11-05 05:36:00,107.0,97.0,114.666667,6.806859,-13.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
4,10001217,normal,2157-12-19 22:00:00,145.0,113.0,129.0,22.627417,32.0,9.5,0.45,77.72,86.711538,11.75,88.673469,95.019231,19.038462,138.5,126.9,13.7


## Saving data files
File of all patient glucose readings

In [27]:
patients = merged_df[['subject_id', 'charttime', 'label', 'next_glucose']]
patients.head()

Unnamed: 0,subject_id,charttime,label,next_glucose
0,10000690,2150-11-04 03:03:00,normal,120.0
1,10000690,2150-11-04 17:54:00,normal,107.0
2,10000690,2150-11-05 05:36:00,normal,97.0
3,10001884,2131-01-12 03:34:00,hyper,199.0
4,10001884,2131-01-13 04:29:00,normal,177.0


In [28]:
patients.to_csv('data/patients.csv', index=False)

File for only dysglycemic patients

In [31]:
dys_patients = merged_df[(merged_df['label'] == 'hypo') | (merged_df['label'] == 'hyper')]
dys_patients.head()

Unnamed: 0,subject_id,charttime,glucose,label,next_glucose,mean_last3,std_last3,trend,Anion gap,BP Diastolic,...,Creatinine (serum),Heart Rate,Hematocrit (serum),Hemoglobin,Magnesium,O2 saturation pulseoxymetry,Respiratory Rate,Sodium (serum),Temperature Fahrenheit,WBC
3,10001884,2131-01-12 03:34:00,145.0,hyper,199.0,146.5,2.12132,-3.0,10.214286,74.669683,...,0.885714,70.8,27.226667,8.670588,2.228571,97.648649,19.044444,136.5,98.98913,12.78
11,10001884,2131-01-16 04:02:00,135.0,hyper,203.0,136.0,33.511192,-35.0,10.214286,74.669683,...,0.885714,70.8,27.226667,8.670588,2.228571,97.648649,19.044444,136.5,98.98913,12.78
19,10002114,2162-02-19 13:28:00,78.0,hypo,66.0,84.0,7.211103,-4.0,10.428571,82.169231,...,1.785714,80.85,30.7375,10.6375,2.271429,98.210526,16.746835,130.285714,97.557143,6.5
30,10002155,2131-03-09 23:02:00,166.0,hyper,235.0,132.333333,29.871949,44.0,13.285714,52.501873,...,1.507143,89.754266,28.588235,9.816667,2.0,93.701342,17.276451,134.857143,97.308571,6.3
31,10002155,2131-03-10 02:04:00,235.0,hyper,192.0,174.333333,56.95905,69.0,13.285714,52.501873,...,1.507143,89.754266,28.588235,9.816667,2.0,93.701342,17.276451,134.857143,97.308571,6.3


In [30]:
dys_patients.to_csv('data/dys_patients.csv', index=False)

Final deidentified dataset

In [63]:
merged_df.to_csv(finalEventsFile, index=False)
merged_df.head()

Unnamed: 0,subject_id,label,charttime,glucose,next_glucose,mean_last3,std_last3,trend,BUN,Creatinine,Diastolic Blood Pressure,Heart Rate,Hemoglobin,Mean Blood Pressure,O2 Saturation,Respiratory Rate,Sodium,Systolic Blood Pressure,WBC
0,10000690,normal,2150-11-04 03:03:00,84.0,117.0,80.5,4.949747,7.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
1,10000690,normal,2150-11-04 10:00:00,117.0,120.0,92.666667,21.36196,33.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
2,10000690,normal,2150-11-04 17:54:00,120.0,107.0,107.0,19.974984,3.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
3,10000690,normal,2150-11-05 05:36:00,107.0,97.0,114.666667,6.806859,-13.0,20.2,0.82,60.361702,84.072917,10.175,74.93617,95.702128,22.557895,135.2,122.893617,6.35
4,10001217,normal,2157-12-19 22:00:00,145.0,113.0,129.0,22.627417,32.0,9.5,0.45,77.72,86.711538,11.75,88.673469,95.019231,19.038462,138.5,126.9,13.7
