# Notes

Check patient notes and create features:

## Dispo to 24hr

- count how many RN vs MD notes (author)
- count total all notes from anyone

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# view data frame

# adjust this to hide ID columns for posting to github
forrepo=True

def view_df(df):
    hidecols = []
    if forrepo:
        hidecols=['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded'] # these are hidden columns
        
    return(df.drop(hidecols, axis=1, errors='ignore').head())

# Data

Load in the data

Grabbed the adt table with the following code:

In [3]:
# read in data files
datadir = "../../DataTD/shc2021/"
savedir = "../../OutputTD/shc2021/"


adt_file = datadir + "cohort3_adt_2021.csv"
cohort_file = savedir + "7_cohort4_3hr_labels_noOR.csv"


full_adt = pd.read_csv(adt_file)
cohort = pd.read_csv(cohort_file)

In [4]:
print(full_adt.pat_enc_csn_id_coded.nunique())
print(cohort.pat_enc_csn_id_coded.nunique())

61176
52532


In [5]:
full_adt.dtypes

anon_id                         object
pat_enc_csn_id_coded             int64
effective_time_jittered_utc     object
seq_num_in_enc                   int64
pat_class                       object
base_pat_class_c               float64
pat_lvl_of_care_c              float64
pat_lv_of_care                  object
event_type                      object
pat_service                     object
department_id                  float64
dtype: object

In [6]:
cohort.dtypes

anon_id                                 object
pat_enc_csn_id_coded                     int64
admit_time                              object
label_max3                               int64
label_3hr_recent                         int64
admit_label                            float64
has_admit_label                          int64
died_within_24hrs                        int64
death_3hr_max_label                      int64
death_3hr_recent_label                   int64
first_label                              int64
first_label_minutes_since_admit          int64
acute_to_critical_label_recent_3hr       int64
critical_to_acute_label_recent_3hr       int64
acute_to_critical_label_max_3hr          int64
critical_to_acute_label_max_3hr          int64
label_max6                               int64
label_6hr_recent                         int64
death_6hr_max_label                      int64
death_6hr_recent_label                   int64
acute_to_critical_label_recent_6hr       int64
critical_to_a

# Bring in the ADT Table

We can use the ADT table to look at 

- pat_class
- pat_lv_of_care

In [7]:
# filter adt table down to test cohort
adt = full_adt[full_adt.pat_enc_csn_id_coded.isin(cohort.pat_enc_csn_id_coded)]

# these should match - it does!
print(adt.pat_enc_csn_id_coded.nunique())
print(cohort.pat_enc_csn_id_coded.nunique())

view_df(adt)

52532
52532


Unnamed: 0,effective_time_jittered_utc,seq_num_in_enc,pat_class,base_pat_class_c,pat_lvl_of_care_c,pat_lv_of_care,event_type,pat_service,department_id
0,2021-08-06 06:41:00+00:00,49,Inpatient,,8.0,Critical Care,Patient Update,Pulmonary,120201006.0
1,2015-12-04 21:58:00+00:00,49,Inpatient,,6.0,Intermediate Care - With Cardiac Monitor,Patient Update,Pulmonary Hypertension,2000254.0
2,2020-10-20 23:00:00+00:00,10,Inpatient,,5.0,Acute Care (Assessment or intervention q4-8),Discharge,General Medicine (T),120201003.0
3,2021-08-08 20:54:00+00:00,13,Inpatient,,5.0,Acute Care (Assessment or intervention q4-8),Patient Update,Nephrology,2000262.0
4,2020-10-19 00:29:00+00:00,3,Inpatient,1.0,5.0,Acute Care (Assessment or intervention q4-8),Patient Update,General Medicine (T),120201020.0


In [8]:
# add the admit time column from the prediction results csv to the adt csv
adt_admit = adt.merge(cohort[['pat_enc_csn_id_coded', 'admit_time']], how='left', on=['pat_enc_csn_id_coded'])

adt_admit.columns
view_df(adt_admit[['pat_enc_csn_id_coded', 'effective_time_jittered_utc', 
                 'pat_class', 'admit_time']])

Unnamed: 0,effective_time_jittered_utc,pat_class,admit_time
0,2021-08-06 06:41:00+00:00,Inpatient,2021-07-06 01:53:00
1,2015-12-04 21:58:00+00:00,Inpatient,2015-10-30 01:30:00
2,2020-10-20 23:00:00+00:00,Inpatient,2020-10-19 00:29:00
3,2021-08-08 20:54:00+00:00,Inpatient,2021-08-06 20:45:00
4,2020-10-19 00:29:00+00:00,Inpatient,2020-10-19 00:29:00


In [9]:
adt_admit.columns

Index(['anon_id', 'pat_enc_csn_id_coded', 'effective_time_jittered_utc',
       'seq_num_in_enc', 'pat_class', 'base_pat_class_c', 'pat_lvl_of_care_c',
       'pat_lv_of_care', 'event_type', 'pat_service', 'department_id',
       'admit_time'],
      dtype='object')

In [10]:
# get time of first emergency services event
first_ED = adt_admit[(adt_admit.pat_class == 'Emergency Services')].sort_values(['pat_enc_csn_id_coded', 'seq_num_in_enc']).groupby('pat_enc_csn_id_coded').first().reset_index()
print(first_ED.seq_num_in_enc.unique())

view_df(first_ED)

[1 3]


Unnamed: 0,effective_time_jittered_utc,seq_num_in_enc,pat_class,base_pat_class_c,pat_lvl_of_care_c,pat_lv_of_care,event_type,pat_service,department_id,admit_time
0,2015-01-01 18:40:00+00:00,1,Emergency Services,3.0,,,Admission,Emergency,2001002.0,2015-01-02 03:48:00
1,2015-01-02 01:56:00+00:00,1,Emergency Services,3.0,,,Admission,Emergency,2001002.0,2015-01-02 05:53:00
2,2015-01-04 18:13:00+00:00,1,Emergency Services,3.0,,,Admission,Emergency,2001002.0,2015-01-05 03:20:00
3,2015-01-06 09:04:00+00:00,1,Emergency Services,3.0,,,Admission,Emergency,2001002.0,2015-01-06 14:47:00
4,2015-01-03 14:51:00+00:00,1,Emergency Services,3.0,,,Admission,Emergency,2001002.0,2015-01-03 21:24:00


In [11]:
print(adt_admit.pat_enc_csn_id_coded.nunique())
print(first_ED.pat_enc_csn_id_coded.nunique())

set(adt_admit.pat_enc_csn_id_coded) - set(first_ED.pat_enc_csn_id_coded)

print("\n", first_ED.pat_service.value_counts())
print("\n", first_ED.event_type.value_counts())

first_ED['first_ed_time'] = first_ED.effective_time_jittered_utc

first_ED_time = first_ED[['pat_enc_csn_id_coded', 'first_ed_time']].drop_duplicates()
first_ED_time.head()

adt_ed = adt_admit.merge(first_ED_time, how='left')
view_df(adt_ed)

52532
52532

 Emergency                        52519
Emergency Medicine                   7
General Medicine (University)        2
Psychiatry                           1
General Medicine (PAMF)              1
Gynecology                           1
Hepatology                           1
Name: pat_service, dtype: int64

 Admission         52531
Patient Update        1
Name: event_type, dtype: int64


Unnamed: 0,effective_time_jittered_utc,seq_num_in_enc,pat_class,base_pat_class_c,pat_lvl_of_care_c,pat_lv_of_care,event_type,pat_service,department_id,admit_time,first_ed_time
0,2021-08-06 06:41:00+00:00,49,Inpatient,,8.0,Critical Care,Patient Update,Pulmonary,120201006.0,2021-07-06 01:53:00,2021-07-06 00:37:00+00:00
1,2015-12-04 21:58:00+00:00,49,Inpatient,,6.0,Intermediate Care - With Cardiac Monitor,Patient Update,Pulmonary Hypertension,2000254.0,2015-10-30 01:30:00,2015-10-29 22:22:00+00:00
2,2020-10-20 23:00:00+00:00,10,Inpatient,,5.0,Acute Care (Assessment or intervention q4-8),Discharge,General Medicine (T),120201003.0,2020-10-19 00:29:00,2020-10-18 20:10:00+00:00
3,2021-08-08 20:54:00+00:00,13,Inpatient,,5.0,Acute Care (Assessment or intervention q4-8),Patient Update,Nephrology,2000262.0,2021-08-06 20:45:00,2021-08-06 18:16:00+00:00
4,2020-10-19 00:29:00+00:00,3,Inpatient,1.0,5.0,Acute Care (Assessment or intervention q4-8),Patient Update,General Medicine (T),120201020.0,2020-10-19 00:29:00,2020-10-18 20:10:00+00:00


In [13]:
# save some of these columns to upload to BQ so we can query the notes table
sub_first_ED = first_ED[['anon_id', 'admit_time', 'first_ed_time']].drop_duplicates()
# sub_first_ED.to_csv("{}/10_cohort4_3hr_noOR_first_ed_time.csv".format(savedir), index=False)

# Pull the notes from BQ

I used the "7_cohort4_3hr_noOR_first_ED_time.csv" file to query the notes table on BQ. We need to do this since CSN does not match between the cohort and the notes table. I used the following code to get all notes that occur for the anon_id between door to 24hr after admit.

In [14]:
# read the notes in
notes_file = "{}/cohort4_3hr_noOR_notes_2021.csv".format(datadir)

notes = pd.read_csv(notes_file)

# check anon id since csns no longer match
print(adt_admit.anon_id.nunique()) #35144
print(notes.anon_id.nunique()) #35144

35144
35144


In [15]:
notes.head(3)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,author_prov_map_id,auth_lnked_prov_map_id,dept_id,ambulatory,note_status_c,note_status,note_type,note_date_jittered_utc,filing_date_jittered_utc,admit_time,first_ed_time
0,JC1784947,891879702,SS0202503,SS0202503,120201005.0,N,2.0,Signed,ED Note,2018-07-12 05:40:00+00:00,2018-07-12 05:40:00+00:00,2018-07-12 06:56:00+00:00,2018-07-12 05:15:00+00:00
1,JC2039424,941393474,SS0201720,SS0201720,120201005.0,N,2.0,Signed,ED Note,2018-12-06 18:12:00+00:00,2018-12-06 18:13:00+00:00,2018-12-06 21:12:00+00:00,2018-12-06 18:10:00+00:00
2,JC2942611,1804549067,SS0173500,SS0173500,110100004.0,N,2.0,Signed,ED Note,2021-05-16 05:03:00+00:00,2021-05-16 05:03:00+00:00,2021-05-16 06:47:00+00:00,2021-05-16 04:49:00+00:00


In [16]:
## -- takes a while to run -- ##

# change the effective time to datetime since read in from csv
notes.note_date_jittered_utc = pd.to_datetime(notes.note_date_jittered_utc)

# change admit time
notes.admit_time = pd.to_datetime(notes.admit_time, utc=True)

# Remove unwanted notes

Remove notes according to the following:

- remove "Letter" and "Telephone Encounter", "Progress Note, Outpatient"
- remove Ambulatory == Y

In [17]:
# remove specific note types
print(notes.value_counts('note_type'))

# removing from these categories
remove_types = ['Letter', 'Telephone Encounter', 'Progress Note, Outpatient']

filtered_notes = notes[-(notes.note_type.isin(remove_types))]

print("\n", filtered_notes.value_counts('note_type'))

note_type
ED Note                                416210
Progress Note, Inpatient               115757
Consultation Note                      102935
Progress/Discharge/Transfer Summary     79767
History and Physical                    58896
Other Note                              48926
Nursing Sign Out Note                   32307
Operative/Procedure Report              11397
Discharge/Transfer Summary               4241
Letter                                   1704
Progress Note, Outpatient                 279
Telephone Encounter                       102
dtype: int64

 note_type
ED Note                                416210
Progress Note, Inpatient               115757
Consultation Note                      102935
Progress/Discharge/Transfer Summary     79767
History and Physical                    58896
Other Note                              48926
Nursing Sign Out Note                   32307
Operative/Procedure Report              11397
Discharge/Transfer Summary               4241

In [18]:
# remove ambulatory == Y
print("\n", filtered_notes.value_counts('ambulatory')) 

filtered_notes_noAmbulatory = filtered_notes[filtered_notes.ambulatory != 'Y']

print("\n", filtered_notes_noAmbulatory.value_counts('ambulatory'))


 ambulatory
N    860499
Y      9937
dtype: int64

 ambulatory
N    860499
dtype: int64


In [19]:
# adjust the prover id columns

# assign auth_lnked_prov_map_id, 
# use auth_prov_map_id only if auth_lnked_prov_map_id is null
filtered_notes_noAmbulatory['prov_map_id'] = np.where(filtered_notes_noAmbulatory['auth_lnked_prov_map_id'].isnull(),
                                                     filtered_notes_noAmbulatory['author_prov_map_id'],
                                                     filtered_notes_noAmbulatory['auth_lnked_prov_map_id'])


print(filtered_notes_noAmbulatory.auth_lnked_prov_map_id.isna().sum())
print(filtered_notes_noAmbulatory.author_prov_map_id.isna().sum())
print(filtered_notes_noAmbulatory.prov_map_id.isna().sum()) # 841 notes do not have an author id

print(filtered_notes_noAmbulatory.prov_map_id.nunique())

1743
958
841
12354


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


# Treatment team

Bring in the treatment team data so we know classify notes as written by MD or RN.

I used the following SQL to query the treatment team table in BQ to get those that overlap with out notes.

In [20]:
# read in the treatment team file
treatment_team_file = "{}/cohort4_3hr_noOR_team_2021.csv".format(datadir)
team = pd.read_csv(treatment_team_file)# [['name', 'prov_map_id']]
print(team.shape)

team.head()

(19501, 2)


Unnamed: 0,name,prov_map_id
0,Physical Therapist Assistant,SS0078625
1,Wound/Ostomy/Continence RN,SS0101939
2,Licensed Vocational Nurse,SS0002065
3,Wound/Ostomy/Continence RN,SS0053201
4,Clinical Pharmacist,SS0319860


In [21]:
# connect the treatment team to the notes table
notes_team = filtered_notes_noAmbulatory.merge(team, how='left')
print(notes_team.name.value_counts())

Registered Nurse           505772
Primary Team               146029
Consulting Attending        94711
Emergency Resident          90873
Co-Attending                87649
                            ...  
Survivorship Provider           3
Activity Therapist              1
Case Manager Intern             1
Consulting Hematologist         1
Transplant Coordinator          1
Name: name, Length: 118, dtype: int64


In [22]:
notes_team.head(3)

Unnamed: 0,anon_id,pat_enc_csn_id_coded,author_prov_map_id,auth_lnked_prov_map_id,dept_id,ambulatory,note_status_c,note_status,note_type,note_date_jittered_utc,filing_date_jittered_utc,admit_time,first_ed_time,prov_map_id,name
0,JC1784947,891879702,SS0202503,SS0202503,120201005.0,N,2.0,Signed,ED Note,2018-07-12 05:40:00+00:00,2018-07-12 05:40:00+00:00,2018-07-12 06:56:00+00:00,2018-07-12 05:15:00+00:00,SS0202503,Registered Nurse
1,JC2039424,941393474,SS0201720,SS0201720,120201005.0,N,2.0,Signed,ED Note,2018-12-06 18:12:00+00:00,2018-12-06 18:13:00+00:00,2018-12-06 21:12:00+00:00,2018-12-06 18:10:00+00:00,SS0201720,
2,JC2942611,1804549067,SS0173500,SS0173500,110100004.0,N,2.0,Signed,ED Note,2021-05-16 05:03:00+00:00,2021-05-16 05:03:00+00:00,2021-05-16 06:47:00+00:00,2021-05-16 04:49:00+00:00,SS0173500,Registered Nurse


In [23]:
# add columns for the author name
rn = ['Registered Nurse', 'Resource Nurse', 'Float Nurse', 
      'Triage Nurse', 'Licensed Vocational Nurse', 'Post-Transplant Nurse', 
      'Nursery Nurse', 'Delivery Nurse', 'Specialty Nurse']
md = ['Primary Team', 'Emergency Resident', 'Primary Resident', 'Primary Intern', 'Senior Resident', 
        'Primary Advanced Practice Provider', 'Nurse Practitioner', 'Intern', 'Resident', 'Physician Assistant', 
         'Cardiologist', 'Primary Fellow', 'Pulmonologist', 'Fellow', 'Primary Sub-intern', 'Sub-Intern', 
         'Gastroenterologist', 'Infectious Disease', 'Chief Resident', 'Co-Attending', 'Primary Care Physician',  
         'Hematologist', 'Internist', 'Surgeon', 'Medical Oncologist', 'Dermatologist', 'Surgical Oncologist', 
         'Endocrinologist', 'Nephrologist', 'Trauma Attending', 'Diabetes MD', 'Trauma Resident', 
         'BMT Attending Provider', 'Urologist', 'Transplant Pulmonologist', 'Neurologist', 'Anesthesiologist', 
         'Primary Physician Assistant', 'Hepatologist', 'Transplant Surgeon', 'Obstetrician', 
         'Family Practitioner', 'Heart Failure Cardiologist', 'Radiation Oncologist', 'Referring Pulmonologist', 'Transplant Nephrologist',
         'Cross Cover Intern', 'Cross Cover Sub-Intern', 'Cross Cover Attending', 'Cross Cover Resident', 
         'Cross Cover Fellow', 'Cross Cover Advanced Practice Provider',
         'Consulting Service', 'Consulting Attending', 'Consulting Fellow', 'Consulting Resident', 'Consulting Intern', 
         'Psychologist', 'Consulting Medical Oncologist', 'Consulting Hematologist', 'Consulting Surgical Oncologist', 'Post-Transplant Nephrologist']


# assign the author type
notes_team['author_type'] = np.where(notes_team['name'].isin(rn), 'rn', None)
notes_team['author_type'] = np.where(notes_team['name'].isin(md), 'md', notes_team['author_type'])


print(notes_team.author_type.value_counts()) # md 789781, rn 540917

md    789781
rn    540917
Name: author_type, dtype: int64


In [24]:
first_ED.columns

Index(['pat_enc_csn_id_coded', 'anon_id', 'effective_time_jittered_utc',
       'seq_num_in_enc', 'pat_class', 'base_pat_class_c', 'pat_lvl_of_care_c',
       'pat_lv_of_care', 'event_type', 'pat_service', 'department_id',
       'admit_time', 'first_ed_time'],
      dtype='object')

# Connect cohort CSN to notes

Connect the cohort CSN to the notes.

In [25]:
# get the csn and important columns from first ED table
cols = ['pat_enc_csn_id_coded', 'anon_id', 'admit_time', 'first_ed_time']
csns = first_ED[cols]

# chage admit time to datetime
csns.admit_time = pd.to_datetime(csns.admit_time, utc=True)
csns.head()

# join the notes table by anon_id
csn_notes = csns.merge(notes_team, how='left', on=['anon_id', 'first_ed_time', 'admit_time'], suffixes=('_cohort', '_notes'))

# change first ED time to datetime
csn_notes.first_ed_time = pd.to_datetime(csn_notes.first_ed_time, utc=True)

csn_notes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,pat_enc_csn_id_coded_cohort,anon_id,admit_time,first_ed_time,pat_enc_csn_id_coded_notes,author_prov_map_id,auth_lnked_prov_map_id,dept_id,ambulatory,note_status_c,note_status,note_type,note_date_jittered_utc,filing_date_jittered_utc,prov_map_id,name,author_type
0,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Co-Attending,md
1,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Primary Team,md
2,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Primary Resident,md
3,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Consulting Resident,md
4,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373222819,SS0099302,SS0099302,2000237.0,N,2.0,Signed,Progress/Discharge/Transfer Summary,2015-01-02 10:08:00+00:00,2015-01-02 10:08:00+00:00,SS0099302,Nurse Coordinator,


In [26]:
# make sure that the note's time is between first_ED_time and admit_time+24hours

filtered_csn_notes = csn_notes[(csn_notes.note_date_jittered_utc >= csn_notes.first_ed_time) &
                              (csn_notes.note_date_jittered_utc <= csn_notes.admit_time+timedelta(hours=24))]

# rename the csn that we're actually using
filtered_csn_notes = filtered_csn_notes.rename({'pat_enc_csn_id_coded_cohort':'pat_enc_csn_id_coded'}, axis=1)

filtered_csn_notes.pat_enc_csn_id_coded.nunique() # we have all of our csns

52532

# Door to dispo

Get the notes that occur prior to admission

In [27]:
door_to_dispo_notes = filtered_csn_notes[filtered_csn_notes.note_date_jittered_utc < filtered_csn_notes.admit_time]

# some anon_ids do not have notes before admit (door_to_dispo window)
print(filtered_csn_notes.anon_id.nunique())
print(door_to_dispo_notes.anon_id.nunique())

# less than half of the notes occur before admit
print(filtered_csn_notes.shape) #(1567752, 17)
print(door_to_dispo_notes.shape) #(526164, 17)


35144
34842
(1567752, 17)
(526164, 17)


In [28]:
door_to_dispo_notes.head(3)

Unnamed: 0,pat_enc_csn_id_coded,anon_id,admit_time,first_ed_time,pat_enc_csn_id_coded_notes,author_prov_map_id,auth_lnked_prov_map_id,dept_id,ambulatory,note_status_c,note_status,note_type,note_date_jittered_utc,filing_date_jittered_utc,prov_map_id,name,author_type
0,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Co-Attending,md
1,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Primary Team,md
2,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373488588,SS0037120,SS0037120,2000237.0,N,3.0,Addendum,History and Physical,2015-01-02 00:15:00+00:00,2015-01-03 04:58:00+00:00,SS0037120,Primary Resident,md


In [29]:
# get the counts for all notes, rn notes, and md notes for this time window
def get_counts(window_notes, window):
    all_counts = window_notes.groupby(['pat_enc_csn_id_coded']).size().reset_index(name='{}_all_notes'.format(window))
    all_counts.head()

    # get the rn counts
    rn_counts = window_notes[window_notes.author_type == 'rn'].groupby(['pat_enc_csn_id_coded']).size().reset_index(name='{}_rn_notes'.format(window))
    rn_counts.head()

    # get the md counts
    md_counts = window_notes[window_notes.author_type == 'md'].groupby(['pat_enc_csn_id_coded']).size().reset_index(name='{}_md_notes'.format(window))
    md_counts.head()

    counts = all_counts.merge(rn_counts.merge(md_counts, how='outer'), how='outer')
    counts.head()
    
    return(counts)

In [30]:
# get door to dispo counts
door_to_dispo_counts = get_counts(door_to_dispo_notes, "door_to_dispo")
door_to_dispo_counts.head()

Unnamed: 0,pat_enc_csn_id_coded,door_to_dispo_all_notes,door_to_dispo_rn_notes,door_to_dispo_md_notes
0,131062572931,38,13.0,16.0
1,131062745090,8,3.0,3.0
2,131062927111,24,11.0,11.0
3,131063006922,13,6.0,6.0
4,131063022232,31,9.0,16.0


# Dispo to 24hr counts

Get the counts for dispo to 24hr: 

- effective_time >= admit_time

In [31]:
dispo_to_24hr_notes = filtered_csn_notes[filtered_csn_notes.note_date_jittered_utc >= filtered_csn_notes.admit_time]

# almost all anon_ids have notes during this window
print(filtered_csn_notes.anon_id.nunique())
print(dispo_to_24hr_notes.anon_id.nunique())

# majority of notes happen after admit
print(filtered_csn_notes.shape) #1567752
print(dispo_to_24hr_notes.shape) #1041588

35144
35133
(1567752, 17)
(1041588, 17)


In [32]:
dispo_to_24hr_counts = get_counts(dispo_to_24hr_notes, "dispo_to_24hr")
dispo_to_24hr_counts.head()

Unnamed: 0,pat_enc_csn_id_coded,dispo_to_24hr_all_notes,dispo_to_24hr_rn_notes,dispo_to_24hr_md_notes
0,131062572931,24,11.0,6.0
1,131062745090,8,2.0,3.0
2,131062927111,28,6.0,13.0
3,131063006922,26,8.0,4.0
4,131063022232,5,1.0,3.0


In [33]:
# join the two window counts together
all_counts = door_to_dispo_counts.merge(dispo_to_24hr_counts, how='outer')
all_counts.head()

Unnamed: 0,pat_enc_csn_id_coded,door_to_dispo_all_notes,door_to_dispo_rn_notes,door_to_dispo_md_notes,dispo_to_24hr_all_notes,dispo_to_24hr_rn_notes,dispo_to_24hr_md_notes
0,131062572931,38.0,13.0,16.0,24.0,11.0,6.0
1,131062745090,8.0,3.0,3.0,8.0,2.0,3.0
2,131062927111,24.0,11.0,11.0,28.0,6.0,13.0
3,131063006922,13.0,6.0,6.0,26.0,8.0,4.0
4,131063022232,31.0,9.0,16.0,5.0,1.0,3.0


In [34]:
# print(all_counts['door_to_dispo_rn_notes'].describe())
# print(all_counts['door_to_dispo_md_notes'].describe())
# print(all_counts['dispo_to_24hr_rn_notes'].describe())
# print(all_counts['dispo_to_24hr_md_notes'].describe())
all_counts.describe()

Unnamed: 0,pat_enc_csn_id_coded,door_to_dispo_all_notes,door_to_dispo_rn_notes,door_to_dispo_md_notes,dispo_to_24hr_all_notes,dispo_to_24hr_rn_notes,dispo_to_24hr_md_notes
count,52532.0,51984.0,46850.0,44885.0,52510.0,51564.0,49792.0
mean,131240400000.0,10.121653,4.987876,5.22346,19.835993,5.958324,11.152916
std,66838410.0,10.597627,6.186237,4.958397,13.146692,4.424066,10.80341
min,131062600000.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,131211300000.0,4.0,2.0,2.0,12.0,3.0,4.0
50%,131259700000.0,8.0,3.0,4.0,17.0,5.0,8.0
75%,131289300000.0,13.0,6.0,7.0,24.0,8.0,14.0
max,131320900000.0,265.0,181.0,156.0,543.0,108.0,533.0


In [87]:
savedir

'../../OutputTD/shc2021/'