In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from pre_processing.db_tools import *

In [2]:
timestamp_format = "%Y-%m-%d %H:%M:%S"
PERIOD_SECONDS = 4 * 3600

In [3]:
sepsis_admissions = pd.read_sql("select * from sepsis_admissions_data", connection)
sepsis_admissions

Unnamed: 0,admittime,hadm_id
0,2166-08-10 00:28:00,185910
1,2198-08-02 04:49:00,145674
2,2198-11-01 22:36:00,122609
3,2122-12-13 19:30:00,142807
4,2128-03-17 17:11:00,160481
...,...,...
4132,2188-10-18 02:00:00,134977
4133,2143-08-22 16:01:00,186754
4134,2127-06-10 19:22:00,168288
4135,2127-10-21 12:43:00,153703


In [4]:
sepsis_inputevents_mv = pd.read_sql("select * from sepsis_inputevents_mv", connection)
sepsis_inputevents_mv

Unnamed: 0,hadm_id,starttime,endtime,itemid,amount,amountuom,rate,rateuom,orderid,linkorderid
0,166019,2178-04-23 16:47:00,2178-04-24 01:01:00,225942,2.479999,mg,301.214412,mcg/hour,7686797,7686797
1,166019,2178-04-23 16:47:00,2178-04-24 01:01:00,225943,49.600001,ml,6.024292,mL/hour,7686797,7686797
2,166019,2178-04-20 15:25:00,2178-04-20 20:50:00,225942,2.499999,mg,461.538366,mcg/hour,7687717,7687717
3,166019,2178-04-20 15:25:00,2178-04-20 20:50:00,225943,50.000002,ml,9.230770,mL/hour,7687717,7687717
4,178140,2152-05-02 08:08:00,2152-05-02 08:09:00,220949,200.000000,ml,,,788410,788410
...,...,...,...,...,...,...,...,...,...,...
700905,166019,2178-04-20 04:36:00,2178-04-20 09:58:00,225943,49.999999,ml,9.316770,mL/hour,7539016,7539016
700906,166019,2178-04-16 20:32:00,2178-04-16 20:33:00,225855,1.000000,dose,,,7570233,7570233
700907,166019,2178-04-16 20:32:00,2178-04-16 20:33:00,220949,50.000000,ml,,,7570233,7570233
700908,166019,2178-04-16 18:34:00,2178-04-16 21:03:00,222168,862.500089,mg,65.186838,mcg/kg/min,7652382,669855


In [5]:
item_ids = pd.read_sql("select itemid, label from d_items", connection)
item_ids

Unnamed: 0,itemid,label
0,497,Patient controlled analgesia (PCA) [Inject]
1,498,PCA Lockout (Min)
2,499,PCA Medication
3,500,PCA Total Dose
4,501,PCV Exh Vt (Obser)
...,...,...
12482,226757,GCSMotorApacheIIValue
12483,226758,GCSVerbalApacheIIValue
12484,226759,HCO3ApacheIIValue
12485,226760,HCO3Score


## Clean data


## Vasopressin

Get Vasopressin intake in four-hour chunks

In [6]:
vasopressin_ids = np.array([1136, 2445, 30051, 222315])
vasopressin_events = sepsis_inputevents_mv.loc[sepsis_inputevents_mv['itemid'].isin(vasopressin_ids)]
vasopressin_events

Unnamed: 0,hadm_id,starttime,endtime,itemid,amount,amountuom,rate,rateuom,orderid,linkorderid
99,170225,2105-04-30 07:43:00,2105-04-30 20:30:00,222315,30.686589,units,2.400515,units/hour,5677917,4342388
239,138598,2175-05-05 10:24:00,2175-05-05 19:42:00,222315,22.319999,units,2.400000,units/hour,3151116,3151116
502,132488,2150-07-10 10:43:00,2150-07-12 04:03:00,222315,99.999996,units,2.419355,units/hour,8139223,8139223
690,176834,2187-08-22 06:36:00,2187-08-23 07:15:00,222315,88.720113,units,3.599193,units/hour,2227782,2227782
736,176834,2187-08-25 18:09:00,2187-08-26 06:50:00,222315,30.439999,units,2.400000,units/hour,4815294,4815294
...,...,...,...,...,...,...,...,...,...,...
698807,127431,2195-07-17 04:21:00,2195-07-18 22:00:00,222315,99.959998,units,2.400000,units/hour,9227493,9227493
699872,166088,2149-07-03 23:15:00,2149-07-04 17:06:00,222315,42.839999,units,2.400000,units/hour,4772268,4772268
700044,167226,2170-06-17 10:45:00,2170-06-17 22:23:00,222315,13.960000,units,1.200000,units/hour,230104,230104
700253,143269,2122-05-31 00:00:00,2122-05-31 03:30:00,222315,8.400000,units,2.400000,units/hour,2567771,2567771


In [7]:
def time_norm(df:pd.DataFrame, 
              modifier_df:pd.DataFrame, 
              groupby_header:str, 
              modifier_header:str) -> pd.DataFrame:
    """
    To be passed to groupby.apply
    Subtracts the value of `modifier_df` at index specified by the current grouped value from the entire dataframe 
    group passed as df.
    
    :param modifier_df: Must have one column titled the same as `modifier header` and one the same as `groupby_header`
                        Values under `groupby_header` must be unique or an `AssertionError` will be raised.
    :param df: The parameter passed by the pandas `groupby` function
    :param modifier_header: The header of the modifier to be applied to `df`
    :param groupby_header: The header by which `df` was grouped
    :return: `df` minus the modifier value
    """
    hadm_id = df[groupby_header]
    assert hadm_id.nunique() == 1  # Something has gone terribly wrong if this asserts
    hadm_id = hadm_id.array[0]
    
    modifier = modifier_df.loc[modifier_df[groupby_header] == hadm_id, modifier_header].array
    if not len(modifier) == 1:
        raise AssertionError(
            f"The length of the modifier for group {df[groupby_header]} of header {groupby_header} equals {len(modifier)}"
        )
    return df - modifier[0]

In [8]:
gr = vasopressin_events.groupby('hadm_id')[['hadm_id', 'starttime', 'endtime']]

In [9]:
gr.apply(time_norm,
         modifier_df=sepsis_admissions,
         groupby_header='hadm_id',
         modifier_header='admittime')

TypeError: unsupported operand type(s) for -: 'numpy.ndarray' and 'Timestamp'