In [1]:
from pathlib import Path
from typing import List

import pandas as pd
import numpy as np
from pydantic import BaseModel

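**NOTE**: this notebook must run with `./src` as the working directory so that the local `mock` package can be imported. The next cell changes into it with a relative `%cd ../..`, so run that cell only once per kernel session.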

In [2]:
%cd ../..
%pwd

/Users/steve/code/hylode/HyUi/src


'/Users/steve/code/hylode/HyUi/src'

In [3]:
# Fail fast unless the working directory ends in HyUi/src; the import of the
# local `mock` package in the next cell relies on it.
assert Path.cwd().parts[-2:] == ('HyUi', 'src')

In [4]:
from mock import mock  # hence the need to get the directory set up

In [5]:
# Ask the project's `mock` helper for the path of the "perrt" HDF file and
# load it into a DataFrame.
vitals_file = mock.path_to_hdf_file("perrt")
df = pd.read_hdf(vitals_file)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4166 entries, 0 to 4165
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   visit_observation_id    4166 non-null   int64         
 1   date_of_birth           4166 non-null   datetime64[ns]
 2   lastname                4166 non-null   object        
 3   firstname               4166 non-null   object        
 4   mrn                     4166 non-null   object        
 5   ob_tail_i               4166 non-null   int64         
 6   observation_datetime    4166 non-null   datetime64[ns]
 7   id_in_application       4166 non-null   object        
 8   value_as_real           273 non-null    float64       
 9   value_as_text           2815 non-null   object        
 10  unit                    33 non-null     object        
 11  sex                     4166 non-null   object        
 12  bed_admit_dt            4166 non-null   datetime

In [7]:
df.head()

Unnamed: 0,visit_observation_id,date_of_birth,lastname,firstname,mrn,ob_tail_i,observation_datetime,id_in_application,value_as_real,value_as_text,unit,sex,bed_admit_dt,dept_name,room_name,bed_hl7,perrt_consult_datetime
0,0,1976-03-20,Adams,Suzanne,8887833,1,2022-06-26 10:54:02,3040109304,,Room air,,M,2022-06-30 00:41:47,UCH EMERGENCY DEPT,BY06,UTC TZ,NaT
1,1,1963-01-31,Scott,Stephen,55582361,1,2022-06-26 09:38:49,5,,,,M,2022-06-25 17:12:23,UCH T02 VASCULAR ANGIO,OTF,SR04-04,NaT
2,2,1925-05-19,Love,Willie,23882699,1,2022-06-26 09:48:55,5,,130/72,,M,2022-07-11 06:05:08,UCH T08 SOUTH (T08S),SR08,BY05-36,NaT
3,3,2009-03-21,Rice,Adam,29488187,1,2022-06-26 08:52:51,9,,,,F,2022-06-20 21:40:10,UCH T16 NORTH (T16N),SR34,SR32-32,2022-06-18 02:58:00
4,4,2018-05-12,Kline,Donald,65051604,1,2022-06-26 07:35:18,6,,,,M,2022-07-04 00:13:53,UCH T02 DAY SURG THR,BY03,BY05-24,NaT


In [8]:
# NOTE(review): `engine` is not referenced again in the cells visible here —
# presumably an in-memory mock database handle kept for later use; confirm or remove.
engine = mock.make_mock_db_in_memory("perrt")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4166 entries, 0 to 4165
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   visit_observation_id    4166 non-null   int64         
 1   date_of_birth           4166 non-null   datetime64[ns]
 2   lastname                4166 non-null   object        
 3   firstname               4166 non-null   object        
 4   mrn                     4166 non-null   object        
 5   ob_tail_i               4166 non-null   int64         
 6   observation_datetime    4166 non-null   datetime64[ns]
 7   id_in_application       4166 non-null   object        
 8   value_as_real           273 non-null    float64       
 9   value_as_text           2815 non-null   object        
 10  unit                    33 non-null     object        
 11  sex                     4166 non-null   object        
 12  bed_admit_dt            4166 non-null   datetime

In [10]:
df['id_in_application'].value_counts()

6466          1508
3040109304     858
28315          411
5              390
8              370
6              310
9              223
28316           63
10              33
Name: id_in_application, dtype: int64

In [11]:
# Sanity check: the data is expected to contain exactly nine distinct
# observation type ids.
assert df['id_in_application'].nunique() == 9

In [12]:
# Readable observation name -> `id_in_application` code (codes given as strings).
obs_types = {
    'SpO2': '10',
    'BP': '5',
    'air_or_o2': '3040109304',
    'Temp': '6',
    'Pulse': '8',
    'Resp': '9',
    'AVPU': '6466',
    'NEWS_scale_1': '28315',
    'NEWS_scale_2': '28316',
}
# Reverse lookup (code -> readable name), used to relabel the raw column.
obs_types_inverse = dict(zip(obs_types.values(), obs_types.keys()))

In [13]:
# Swap the raw observation codes for their readable names (updates `df` itself).
df['id_in_application'] = df['id_in_application'].replace(obs_types_inverse)

In [14]:
df.head()

Unnamed: 0,visit_observation_id,date_of_birth,lastname,firstname,mrn,ob_tail_i,observation_datetime,id_in_application,value_as_real,value_as_text,unit,sex,bed_admit_dt,dept_name,room_name,bed_hl7,perrt_consult_datetime
0,0,1976-03-20,Adams,Suzanne,8887833,1,2022-06-26 10:54:02,air_or_o2,,Room air,,M,2022-06-30 00:41:47,UCH EMERGENCY DEPT,BY06,UTC TZ,NaT
1,1,1963-01-31,Scott,Stephen,55582361,1,2022-06-26 09:38:49,BP,,,,M,2022-06-25 17:12:23,UCH T02 VASCULAR ANGIO,OTF,SR04-04,NaT
2,2,1925-05-19,Love,Willie,23882699,1,2022-06-26 09:48:55,BP,,130/72,,M,2022-07-11 06:05:08,UCH T08 SOUTH (T08S),SR08,BY05-36,NaT
3,3,2009-03-21,Rice,Adam,29488187,1,2022-06-26 08:52:51,Resp,,,,F,2022-06-20 21:40:10,UCH T16 NORTH (T16N),SR34,SR32-32,2022-06-18 02:58:00
4,4,2018-05-12,Kline,Donald,65051604,1,2022-06-26 07:35:18,Temp,,,,M,2022-07-04 00:13:53,UCH T02 DAY SURG THR,BY03,BY05-24,NaT


In [202]:
# Start again from the raw mock file rather than from `df` (already relabelled
# in place above) and keep only the columns needed for the numeric conversion.
# The explicit `.copy()` makes `dft` an independent frame, so adding columns to
# it later cannot raise SettingWithCopyWarning.
vitals_file = mock.path_to_hdf_file("perrt")
dft = pd.read_hdf(vitals_file)
dft = dft[['mrn', 'observation_datetime', 'id_in_application', 'value_as_real', 'value_as_text']].copy()
# Relabel the observation codes with readable names; `replace` returns a new
# frame, so nothing is mutated in place.
dft = dft.replace({'id_in_application': obs_types_inverse})


In [203]:
# Seed the unified `value` column with the readings that are already numeric;
# the *_as_int helpers defined below fill it in for observations recorded as text.
dft['value'] = dft['value_as_real']

In [204]:
# dft.loc[dft['id_in_application']=='Temp']

In [205]:
def air_or_o2_as_int(df):
    """Encode 'air_or_o2' observations as 0/1 codes in the ``value`` column.

    Rows whose ``id_in_application`` is ``'air_or_o2'`` get ``0`` when
    ``value_as_text`` is ``'Room air'`` and ``1`` when it is
    ``'Supplemental Oxygen'``. Every other row keeps its current ``value``.

    The frame is modified in place and also returned.
    """
    is_air_or_o2 = df['id_in_application'] == 'air_or_o2'
    text = df['value_as_text']
    # Text label -> numeric code; dict order fixes the order np.select tests them in.
    encoding = {'Room air': 0, 'Supplemental Oxygen': 1}

    condlist = [is_air_or_o2 & (text == label) for label in encoding]
    choicelist = list(encoding.values())

    df['value'] = np.select(condlist, choicelist, default=df['value'])
    return df

In [206]:
def avpu_as_int(df):
    """Encode 'AVPU' observations as numeric codes in the ``value`` column.

    Rows whose ``id_in_application`` is ``'AVPU'`` are mapped from their
    ``value_as_text`` letter to a code: ``A`` -> 0, ``C`` -> 1, ``V`` -> 2,
    ``P`` -> 3, ``U`` -> 4. Every other row (including AVPU rows with any
    other text) keeps its current ``value``.

    The frame is modified in place and also returned.
    """
    is_avpu = df['id_in_application'] == 'AVPU'
    text = df['value_as_text']
    # A letter's position in this tuple is the code it is encoded as.
    levels = ('A', 'C', 'V', 'P', 'U')

    condlist = [is_avpu & (text == level) for level in levels]
    choicelist = list(range(len(levels)))

    df['value'] = np.select(condlist, choicelist, default=df['value'])
    return df

In [207]:
dft[:49]

Unnamed: 0,mrn,observation_datetime,id_in_application,value_as_real,value_as_text,value
0,8887833,2022-06-26 10:54:02,air_or_o2,,Room air,
1,55582361,2022-06-26 09:38:49,BP,,,
2,23882699,2022-06-26 09:48:55,BP,,130/72,
3,29488187,2022-06-26 08:52:51,Resp,,,
4,65051604,2022-06-26 07:35:18,Temp,,,
5,77402084,2022-06-26 09:11:01,Resp,,,
6,84556332,2022-06-26 10:52:22,NEWS_scale_1,,0,
7,81572660,2022-06-26 10:01:43,air_or_o2,,Room air,
8,81294471,2022-06-26 08:09:46,AVPU,,A,
9,16165950,2022-06-26 09:29:14,air_or_o2,,Room air,


In [208]:
def bp_as_int(df, bp_label: str = 'BP'):
    """Store the first number of each blood-pressure reading in ``value``.

    For rows whose ``id_in_application`` equals ``bp_label``, the part of
    ``value_as_text`` before the first ``'/'`` (e.g. ``130`` from ``'130/72'``)
    is parsed as a number and written to ``value``; text that cannot be parsed
    becomes NaN. All other rows keep their current ``value``.

    Args:
        df: frame with ``id_in_application``, ``value_as_text`` and ``value``
            columns.
        bp_label: label that identifies blood-pressure rows.

    Returns:
        The same frame, modified in place.
    """
    mask = df['id_in_application'] == bp_label
    # Blank out non-BP rows rather than staging the result in a scratch column:
    # a scratch 'tmp' column would overwrite (and then drop) any existing
    # column of that name in the caller's frame.
    first_part = df['value_as_text'].where(mask).str.split('/').str[0]
    parsed = pd.to_numeric(first_part, errors='coerce')
    df['value'] = np.where(mask, parsed, df['value'])
    return df

In [211]:
def news_as_int(df, news_labels: list[str] = ['NEWS_scale_1', 'NEWS_scale_2']):
    """Parse NEWS observations recorded as text into the ``value`` column.

    For each label in ``news_labels``, rows whose ``id_in_application`` equals
    that label get ``value_as_text`` parsed as a number and written to
    ``value``; text that cannot be parsed becomes NaN. All other rows keep
    their current ``value``.

    Args:
        df: frame with ``id_in_application``, ``value_as_text`` and ``value``
            columns.
        news_labels: labels that identify NEWS rows. The default list is only
            read, never mutated.

    Returns:
        The same frame, modified in place.
    """
    for label in news_labels:
        mask = df['id_in_application'] == label
        # Parse only this label's rows (others are blanked first). No scratch
        # column is used, so an existing 'tmp' column in the caller's frame
        # is left untouched.
        parsed = pd.to_numeric(df['value_as_text'].where(mask), errors='coerce')
        df['value'] = np.where(mask, parsed, df['value'])
    return df

In [218]:
# Show every column name of the full frame as a plain list.
list(df.columns)

['visit_observation_id',
 'date_of_birth',
 'lastname',
 'firstname',
 'mrn',
 'ob_tail_i',
 'observation_datetime',
 'id_in_application',
 'value_as_real',
 'value_as_text',
 'unit',
 'sex',
 'bed_admit_dt',
 'dept_name',
 'room_name',
 'bed_hl7',
 'perrt_consult_datetime']

In [210]:
dft.id_in_application.value_counts()

AVPU            1508
air_or_o2        858
NEWS_scale_1     411
BP               390
Pulse            370
Temp             310
Resp             223
NEWS_scale_2      63
SpO2              33
Name: id_in_application, dtype: int64

In [None]:
# Single-label scratch version of what `news_as_int` does in general.
# It works on `dft` (the frame that has a `value` column) and reads the text
# from `value_as_text`, parsed to numbers so that `value` stays numeric.
mask = dft['id_in_application'] == 'NEWS_scale_1'
dft['value'] = np.where(
    mask,
    pd.to_numeric(dft['value_as_text'].where(mask), errors='coerce'),
    dft['value'],
)

In [212]:
# Run each text-to-number converter over `dft` in turn: NEWS scores, blood
# pressure, air/oxygen, then AVPU.
dft = (
    dft
    .pipe(news_as_int)
    .pipe(bp_as_int)
    .pipe(air_or_o2_as_int)
    .pipe(avpu_as_int)
)

In [213]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4166 entries, 0 to 4165
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   mrn                   4166 non-null   object        
 1   observation_datetime  4166 non-null   datetime64[ns]
 2   id_in_application     4166 non-null   object        
 3   value_as_real         273 non-null    float64       
 4   value_as_text         2815 non-null   object        
 5   value                 3088 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(3)
memory usage: 227.8+ KB


In [214]:
dft[-49:]

Unnamed: 0,mrn,observation_datetime,id_in_application,value_as_real,value_as_text,value
4117,57407728,2022-06-26 09:49:56,BP,,132/83,132.0
4118,41700048,2022-06-26 09:04:14,BP,,103/70,103.0
4119,17741924,2022-06-26 08:12:48,AVPU,,A,0.0
4120,63725702,2022-06-26 09:57:32,air_or_o2,,Room air,0.0
4121,64150473,2022-06-26 10:13:08,AVPU,,A,0.0
4122,35109764,2022-06-26 10:49:05,air_or_o2,,Room air,0.0
4123,3845994,2022-06-26 07:31:13,air_or_o2,,Room air,0.0
4124,30613430,2022-06-26 10:13:32,air_or_o2,,Room air,0.0
4125,354325,2022-06-26 11:18:05,AVPU,,A,0.0
4126,19484620,2022-06-26 11:52:14,AVPU,,A,0.0


In [220]:
# Highest and lowest converted `value` for every (mrn, observation type) pair.
dft.groupby(['mrn', 'id_in_application']).agg(
    f_max=pd.NamedAgg(column='value', aggfunc='max'),
    f_min=pd.NamedAgg(column='value', aggfunc='min'),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,f_max,f_min
mrn,id_in_application,Unnamed: 2_level_1,Unnamed: 3_level_1
56263,Pulse,,
61188,AVPU,0.0,0.0
69613,AVPU,0.0,0.0
165695,air_or_o2,0.0,0.0
166677,AVPU,0.0,0.0
...,...,...,...
99860766,NEWS_scale_1,0.0,0.0
99862198,AVPU,0.0,0.0
99876841,Resp,,
99926920,NEWS_scale_2,2.0,2.0


In [108]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4166 entries, 0 to 4165
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   visit_observation_id  4166 non-null   int64         
 1   observation_datetime  4166 non-null   datetime64[ns]
 2   id_in_application     4166 non-null   object        
 3   value_as_real         273 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 162.7+ KB


In [105]:
dft['value_as_real'].describe()

count    273.000000
mean      76.560440
std       24.004034
min       10.000000
25%       62.000000
50%       80.000000
75%       97.200000
max      130.000000
Name: value_as_real, dtype: float64

In [110]:
# `dft` no longer carries `visit_observation_id` (it was narrowed to the columns
# needed for conversion), so pivot the relabelled full frame `df`, which still
# has it: one row per observation id, one column per observation type.
dft_real = df.pivot(index='visit_observation_id',
          columns='id_in_application',
          values='value_as_real')
# Summarise the numeric readings per observation type.
dft_real.describe()

id_in_application,AVPU,BP,NEWS_scale_1,NEWS_scale_2,Pulse,Resp,SpO2,Temp,air_or_o2
count,0.0,0.0,0.0,0.0,176.0,26.0,17.0,54.0,0.0
mean,,,,,75.659091,26.192308,91.411765,99.074074,
std,,,,,17.583035,14.232412,3.410624,1.666109,
min,,,,,40.0,10.0,84.0,95.0,
25%,,,,,63.0,10.0,89.0,97.9,
50%,,,,,74.0,27.5,92.0,99.4,
75%,,,,,87.25,38.25,93.0,100.4,
max,,,,,130.0,55.0,99.0,101.8,
