## 1 - Windows

In [1]:
!mkdir -p data_win

In [2]:
import numpy as np
import pandas as pd

In [3]:
years = [2020] 

In [4]:
df = pd.read_csv('/data/covid/acute_dyspnea_data/2020_0701/encounters-master-2020.csv')
df['hosp_id'] = df['hosp_id'].astype(int)
df = df.set_index('hosp_id').sort_index()
df = df[df['year'].isin(years)]

In [5]:
# time column is the original t=0, which is the first vital sign measurement
# convert that into number of minutes after 00:00
df['TOD_zero'] = df['time'].apply(lambda s: int(s.split(':')[0])*60 + int(s.split(':')[1]))

In [None]:
df

- 4h non-overlapping prediction windows: 
    - 1:00-4:59, 5:00-8:59, 9:00-12:59, 13:00-16:59, 17:00-20:59, 21:00-24:59
- Define new **TIME ZERO** as the nearest previous prediction time
    - for example, 11:14 would be mapped to 9:00

In [7]:
TOD_first_h = 1
window_size_h = 4
prediction_times_h = np.array(range(TOD_first_h, 25, window_size_h))
prediction_times = prediction_times_h * 60

In [8]:
prediction_times, prediction_times_h

(array([  60,  300,  540,  780, 1020, 1260]), array([ 1,  5,  9, 13, 17, 21]))

In [9]:
df['TOD_zero']

hosp_id
63990    1345
63991     601
63992    1125
63993     751
63994    1434
         ... 
74164     842
74165    1327
74166      71
74167    1301
74168    1223
Name: TOD_zero, Length: 10178, dtype: int64

In [10]:
# Find out which 4h window the current time zero belongs to 
# and set the left edge as the new time zero
TOD_zero_h = df['TOD_zero'] / 60
earlier_pred_time_h = ((TOD_zero_h - 1) // 4) * 4 + 1

In [11]:
df['new_TOD_zero_h'] = earlier_pred_time_h
df['new_TOD_zero'] = earlier_pred_time_h * 60

In [12]:
# this is the num of minutes to add to all timstamps for each hosp_id
df['time_zero_offset'] = (df['TOD_zero'].astype(int) - df['new_TOD_zero'].astype(int))

In [13]:
# this offset is <4h because we took the nearest previous prediction time
assert (df['time_zero_offset'] < (window_size_h*60)).all()

In [14]:
df['new_discharge'] = df['length_min'] + df['time_zero_offset']

In [None]:
df.head(10)

In [16]:
df.to_csv('data_win/enc.csv')

In [17]:
df_windows = df.apply(lambda row: range(0, int(row['new_discharge']), window_size_h*60), axis=1)

## DONT remove yet!!
## remove window containing event and one window prior to event
# df_windows = df_windows.apply(lambda row: row[:-2] if len(row) > 2 else [])

df_windows = pd.DataFrame(df_windows.explode().rename('window_start'))

In [18]:
df_windows['window_id'] = df_windows['window_start'] / (window_size_h*60)
df_windows['window_end'] = df_windows['window_start'] + (window_size_h*60)

In [19]:
df_windows

Unnamed: 0_level_0,window_start,window_id,window_end
hosp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
63990,0,0,240
63990,240,1,480
63990,480,2,720
63990,720,3,960
63990,960,4,1200
...,...,...,...
74168,72720,303,72960
74168,72960,304,73200
74168,73200,305,73440
74168,73440,306,73680


In [20]:
df_windows['window_id'] = df_windows['window_id'].astype(int)

In [21]:
df_windows['ID'] = df_windows.apply(lambda row: '%s-%s' % (str(row.name).rjust(5,'_'), str(row['window_id']).rjust(5,'_')), axis=1)

In [22]:
df_windows[['window_id', 'window_start', 'window_end']].to_csv('data_win/windows.csv')

In [23]:
df_windows[['window_id', 'window_start', 'window_end', 'ID']].to_csv('data_win/windows_map.csv')

In [24]:
df_windows

Unnamed: 0_level_0,window_start,window_id,window_end,ID
hosp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
63990,0,0,240,63990-____0
63990,240,1,480,63990-____1
63990,480,2,720,63990-____2
63990,720,3,960,63990-____3
63990,960,4,1200,63990-____4
...,...,...,...,...
74168,72720,303,72960,74168-__303
74168,72960,304,73200,74168-__304
74168,73200,305,73440,74168-__305
74168,73440,306,73680,74168-__306


## 2 - Labels

In [25]:
!mkdir -p labels

In [26]:
import numpy as np
import pandas as pd

In [27]:
df = pd.read_csv('data_win/enc.csv').set_index('hosp_id')

In [28]:
window_size_h = 4
years = ['2020_0701']

In [30]:
for year in years:
    df_flow = pd.read_pickle('/data/covid/data_pkl/_flow%s.pkl' % year)
    
    ## HHFNC: heated high-flow nasal cannula
    df_HHFNC = df_flow[(df_flow['observationtermid'] == 307923) & (df_flow['result'] == 'Nasal Cannula - Heated High Flow')]
    df_HHFNC = df_HHFNC.drop_duplicates(subset=['hosp_id'], keep='first')
    
    df_HHFNC = df_HHFNC.set_index('hosp_id').join(df[['time_zero_offset']])
    df_HHFNC['t'] = df_HHFNC['min'] + df_HHFNC['time_zero_offset']
    df_HHFNC['window_id'] = df_HHFNC['t'] // (window_size_h*60)
    df_HHFNC['window_t'] = df_HHFNC['t'] % (window_size_h*60)
    
    df_HHFNC = df_HHFNC.rename(columns={'t': 'HHFNC_time'})
    df_HHFNC.to_csv('labels/HHFNC_{}.csv'.format(year))

    ## Invasive mechanical ventilation
    df_MechVent = df_flow[(
        # Start of mechanical ventilation
        ((df_flow['observationtermid'] == 313141) & (df_flow['result'] == 'Start'))

        # A few specific modes of mechanical ventilation
        | ((df_flow['observationtermid'] == 315640) & (df_flow['result'].isin([
            "PC-IMV+/BiLevel", "APRV", "PC/AC", "PC/IMV", "PC/PSV", "Tube compensation", 
            "VC+/AC", "VC+/IMV", "VC/AC", "VC/IMV", "VC/MMV", "PAV/PPS"])))

        # ED
        | ((df_flow['observationtermid'] == 307923) & (df_flow['result'].isin([
            'Ventilator - Emergency Department', 'Mechanical Ventilation - UH/CVC'])))
    )]
    df_MechVent = df_MechVent.drop_duplicates(subset=['hosp_id'], keep='first')
    
    df_MechVent = df_MechVent.set_index('hosp_id').join(df[['time_zero_offset']])
    df_MechVent['t'] = df_MechVent['min'] + df_MechVent['time_zero_offset']
    df_MechVent['window_id'] = df_MechVent['t'] // (window_size_h*60)
    df_MechVent['window_t'] = df_MechVent['t'] % (window_size_h*60)
    
    df_MechVent = df_MechVent.rename(columns={'t': 'MechVent_time'})
    df_MechVent.to_csv('labels/MechVent_{}.csv'.format(year))

In [31]:
vaso_ids = pd.read_csv('/data/covid/acute_dyspnea_data/vasopressors.csv')['medicationtermid'].values
len(vaso_ids)

76

In [32]:
for year in years:
    df_meds = pd.read_pickle('/data/covid/data_pkl/_meds%s.pkl' % year)
    
    # Must be given with non-zero dose
    df_meds = df_meds[(df_meds['ReasonNotGiven'].isnull()) & (df_meds['dose'] > 0)]
    
    ## IV vasopressors
    df_VasoIV = df_meds[df_meds['medicationtermid'].isin(vaso_ids) & (df_meds['route'] == 'IV')]
    df_VasoIV = df_meds[df_meds['medicationtermid'].isin(vaso_ids)]
    df_VasoIV = df_VasoIV.drop_duplicates(subset=['hosp_id'], keep='first')
    
    df_VasoIV = df_VasoIV.set_index('hosp_id').join(df[['time_zero_offset']])
    df_VasoIV['t'] = df_VasoIV['min'] + df_VasoIV['time_zero_offset']
    df_VasoIV['window_id'] = df_VasoIV['t'] // (window_size_h*60)
    df_VasoIV['window_t'] = df_VasoIV['t'] % (window_size_h*60)
    
    df_VasoIV = df_VasoIV.rename(columns={'t': 'VasoIV_time'})
    df_VasoIV.to_csv('labels/VasoIV_{}.csv'.format(year))

### 2.1 - post-OR flags

In [33]:
for year in ['2020_0701']:
    for label in ['HHFNC', 'MechVent', 'VasoIV']:  
        try: 
            df_loc = pd.read_csv('/data/covid/acute_dyspnea_data/{}/location.csv'.format(year))
        except: 
            df_loc = pd.read_csv('/data/covid/acute_dyspnea_data/{}/location-new.csv'.format(year))
            print(year)
        df_proc = df_loc.loc[df_loc['location']=='Procedure']
        df_labels = pd.read_csv('labels/{}_{}.csv'.format(label, year))

        in_OR_within_24_hrs_wrong = {}
        times_in_OR_str = {}

        in_OR_within_24_hrs = {}
        times_in_OR_str = {}

        for hosp_id in df_proc['hosp_id']:
            if hosp_id in df_labels['hosp_id'].values:
                times_in_OR = df_proc.loc[df_proc['hosp_id']==hosp_id]['min']
                times_in_OR_str[hosp_id] = str(times_in_OR.values)
                for time in times_in_OR:
                    mv_time = (df_labels.loc[df_labels['hosp_id']==hosp_id]['{}_time'.format(label)]).values.item()
                    if time <= mv_time <= (time + (24*60)):
                        in_OR_within_24_hrs[hosp_id] = 1
                        break
                if hosp_id not in in_OR_within_24_hrs: 
                    in_OR_within_24_hrs[hosp_id] = 0

        assert( len(in_OR_within_24_hrs) == len(set(df_proc['hosp_id']).intersection(df_labels['hosp_id'])) )

        in_OR_within_24_hrs_col = []
        times_in_OR_str_col = []
        for hosp_id in df_labels['hosp_id']:
            if hosp_id in in_OR_within_24_hrs:
                in_OR_within_24_hrs_col.append(in_OR_within_24_hrs[hosp_id])
                times_in_OR_str_col.append(times_in_OR_str[hosp_id])
            else:
                in_OR_within_24_hrs_col.append(0)
                times_in_OR_str_col.append('no OR record')        

        df_labels['in_OR_within_24_hrs'] = in_OR_within_24_hrs_col
        df_labels['times_in_OR_str'] = times_in_OR_str_col
        df_labels.to_csv('labels/{}_{}_with_OR_flags-FIXED.csv'.format(label, year))

## 3 - Window data

In [34]:
window_size_h = 4

In [35]:
import numpy as np
import pandas as pd

In [36]:
years = ['2020_0701']

In [37]:
df_enc = pd.read_csv('data_win/enc.csv').set_index('hosp_id')
df_win = pd.read_csv('data_win/windows.csv').set_index('hosp_id')

In [38]:
for year in years:
    df_a = pd.read_pickle('/data/covid/data_pkl/meds-%s.pkl' % year)

    # Filter outside the window records
    df_a_ = df_a.set_index('hosp_id').join(df_enc[['time_zero_offset', 'new_discharge']], how='left')
    df_a_['t'] = df_a_['min'] + df_a_['time_zero_offset']
    df_a__ = df_a_[(df_a_['t'] >= 0) & (df_a_['t'] < df_a_['new_discharge'])]

    # Must be given with non-zero dose
    df_a__ = df_a__[(df_a__['ReasonNotGiven'].isnull()) & (df_a__['dose'] > 0)]
    
    # Calculate window index and t within window
    df_a__['window_id'] = df_a__['t'] // (window_size_h*60)
    df_a__['window_t'] = df_a__['t'] % (window_size_h*60)

    # Save
    df_a___ = df_a__.reset_index()[['hosp_id', 'window_id', 'window_t', 't', 'min', 'medicationtermid', 'dose', 'units', 'route']].rename(columns={'min':'raw_t'})
    df_a___.to_pickle('data_win/meds-%s-win.p'%year)

In [39]:
for year in years:
    df_a = pd.read_pickle('/data/covid/data_pkl/labs-%s.pkl' % year)

    # Filter outside the window records
    df_a_ = df_a.set_index('hosp_id').join(df_enc[['time_zero_offset', 'new_discharge']], how='left')
    df_a_['t'] = df_a_['obsmin'] + df_a_['time_zero_offset']
    df_a__ = df_a_[(df_a_['t'] >= 0) & (df_a_['t'] < df_a_['new_discharge'])]

    # Calculate window index and t within window
    df_a__['window_id'] = df_a__['t'] // (window_size_h*60)
    df_a__['window_t'] = df_a__['t'] % (window_size_h*60)

    # Save
    df_a___ = df_a__.reset_index()[['hosp_id', 'window_id', 'window_t', 't', 'obsmin', 'result_termid', 'value', 'hilonormal']].rename(columns={'obsmin':'raw_t', 'hilonormal': 'hilonormal_flag'})
    df_a___.to_pickle('data_win/labs-%s-win.p'%year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [40]:
for year in years:
    df_flow = pd.read_pickle('/data/covid/data_pkl/flow-%s.pkl' % year)

    # Filter outside the window records
    df_flow_ = df_flow.set_index('hosp_id').join(df_enc[['time_zero_offset', 'new_discharge']], how='left')
    df_flow_['t'] = df_flow_['min'] + df_flow_['time_zero_offset']
    df_flow__ = df_flow_[(df_flow_['t'] >= 0) & (df_flow_['t'] < df_flow_['new_discharge'])]

    # Calculate window index and t within window
    df_flow__['window_id'] = df_flow__['t'] // (window_size_h*60)
    df_flow__['window_t'] = df_flow__['t'] % (window_size_h*60)

    # Save
    df_flow___ = df_flow__.reset_index()[['hosp_id', 'window_id', 'window_t', 't', 'min', 'observationtermid', 'result']].rename(columns={'min':'raw_t'})
    df_flow___.to_pickle('data_win/flow-%s-win.p'%year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
