# Notes

Check patient notes and create features:

## Dispo to 24hr

- count how many RN vs MD notes (author)
- count total all notes from anyone

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# view data frame

# adjust this to hide ID columns for posting to github
forrepo=False

def view_df(df):
    hidecols = []
    if forrepo:
        hidecols=['anon_id', 'pat_enc_csn_id_coded', 'inpatient_data_id_coded'] # these are hidden columns
        
    return(df.drop(hidecols, axis=1, errors='ignore').head())

# Data

Load in the data

Grabbed the adt table with the following code:

In [3]:
# read in data files
datadir = "../../DataTD/shc2021/"
savedir = "../../OutputTD/shc2021/"


adt_file = datadir + "7_cohort4_3hr_noOR_adt.csv"
cohort_file = savedir + "7_cohort4_3hr_labels_noOR.csv"


full_adt = pd.read_csv(adt_file)
cohort = pd.read_csv(cohort_file)

In [4]:
full_adt.pat_enc_csn_id_coded.nunique()

52532

# Bring in the ADT Table

We can use the ADT table to look at 

- pat_class
- pat_lv_of_care

In [5]:
# filter adt table down to test cohort
adt = full_adt[full_adt.pat_enc_csn_id_coded.isin(cohort.pat_enc_csn_id_coded)]

# these should match - it does!
print(adt.pat_enc_csn_id_coded.nunique())
print(cohort.pat_enc_csn_id_coded.nunique())

view_df(adt)

52532
52532


Unnamed: 0,anon_id,pat_enc_csn_id_coded,event_id_coded,event_type_c,event_type,event_subtype_c,event_subtype,department_id,pat_class_c,pat_class,...,seq_num_in_enc,seq_num_in_bed_min,labor_status_c,first_ip_in_ip_yn,loa_reason_c,data_source,effective_time_jittered,event_time_jittered,effective_time_jittered_utc,event_time_jittered_utc
0,JC927965,131193903413,20318159,3,Transfer In,3,Update,2000231.0,166,Bedded Outpatient (corrections only),...,10,1.0,,,,CLARITY_SHC,2016-07-23 18:40:00,2016-07-24 15:36:00,2016-07-24 01:40:00 UTC,2016-07-24 22:36:00 UTC
1,JC2784058,131293610978,38869902,3,Transfer In,3,Update,110100016.0,166,Bedded Outpatient (corrections only),...,5,1.0,,,,CLARITY_SHC,2020-09-23 18:30:00,2020-09-24 15:02:00,2020-09-24 01:30:00 UTC,2020-09-24 22:02:00 UTC
2,JC2854999,131287855697,37210117,3,Transfer In,3,Update,2000252.0,166,Bedded Outpatient (corrections only),...,8,1.0,,,,CLARITY_SHC,2020-05-06 02:53:00,2020-05-07 10:52:00,2020-05-06 09:53:00 UTC,2020-05-07 17:52:00 UTC
3,JC860091,131295227813,39280862,6,Census,3,Update,110100017.0,166,Bedded Outpatient (corrections only),...,6,1.0,,,,CLARITY_SHC,2020-10-16 23:59:00,2020-10-17 15:58:00,2020-10-17 06:59:00 UTC,2020-10-17 22:58:00 UTC
4,JC2112785,131251315002,27375439,2,Discharge,1,Original,2000273.0,126,Inpatient,...,19,1.0,,,,CLARITY_SHC,2018-04-29 19:09:00,2018-04-29 19:09:00,2018-04-30 02:09:00 UTC,2018-04-30 02:09:00 UTC


In [6]:
# add the admit time column from the prediction results csv to the adt csv
adt_admit = adt.merge(cohort[['pat_enc_csn_id_coded', 'admit_time']], how='left', on=['pat_enc_csn_id_coded'])


adt_admit.columns
view_df(adt_admit[['pat_enc_csn_id_coded', 'effective_time_jittered_utc', 
                 'pat_class', 'admit_time']])

Unnamed: 0,pat_enc_csn_id_coded,effective_time_jittered_utc,pat_class,admit_time
0,131193903413,2016-07-24 01:40:00 UTC,Bedded Outpatient (corrections only),2016-07-23 23:20:00
1,131293610978,2020-09-24 01:30:00 UTC,Bedded Outpatient (corrections only),2020-09-24 00:04:00
2,131287855697,2020-05-06 09:53:00 UTC,Bedded Outpatient (corrections only),2020-05-06 08:11:00
3,131295227813,2020-10-17 06:59:00 UTC,Bedded Outpatient (corrections only),2020-10-17 04:22:00
4,131251315002,2018-04-30 02:09:00 UTC,Inpatient,2018-04-26 06:30:00


In [8]:
# get time of first emergency services event
first_ED = adt_admit[(adt_admit.pat_class == 'Emergency Services')].sort_values(['pat_enc_csn_id_coded', 'seq_num_in_enc']).groupby('pat_enc_csn_id_coded').first().reset_index()
first_ED.head()

Unnamed: 0,pat_enc_csn_id_coded,anon_id,event_id_coded,event_type_c,event_type,event_subtype_c,event_subtype,department_id,pat_class_c,pat_class,...,seq_num_in_bed_min,labor_status_c,first_ip_in_ip_yn,loa_reason_c,data_source,effective_time_jittered,event_time_jittered,effective_time_jittered_utc,event_time_jittered_utc,admit_time
0,131062572931,JC1170548,15250994,1,Admission,1,Original,2001002.0,112,Emergency Services,...,1.0,,,,CLARITY_SHC,2015-01-01 10:40:00,2015-01-01 10:40:00,2015-01-01 18:40:00 UTC,2015-01-01 18:40:00 UTC,2015-01-02 03:48:00
1,131062745090,JC913990,15263453,1,Admission,1,Original,2001002.0,112,Emergency Services,...,1.0,,,,CLARITY_SHC,2015-01-01 17:56:00,2015-01-01 17:56:00,2015-01-02 01:56:00 UTC,2015-01-02 01:56:00 UTC,2015-01-02 05:53:00
2,131062927111,JC529112,15279339,1,Admission,1,Original,2001002.0,112,Emergency Services,...,1.0,,,,CLARITY_SHC,2015-01-04 10:13:00,2015-01-04 10:13:00,2015-01-04 18:13:00 UTC,2015-01-04 18:13:00 UTC,2015-01-05 03:20:00
3,131063006922,JC1702404,15286969,1,Admission,1,Original,2001002.0,112,Emergency Services,...,1.0,,,,CLARITY_SHC,2015-01-06 01:04:00,2015-01-06 01:04:00,2015-01-06 09:04:00 UTC,2015-01-06 09:04:00 UTC,2015-01-06 14:47:00
4,131063022232,JC523028,15288226,1,Admission,1,Original,2001002.0,112,Emergency Services,...,1.0,,,,CLARITY_SHC,2015-01-03 06:51:00,2015-01-03 06:51:00,2015-01-03 14:51:00 UTC,2015-01-03 14:51:00 UTC,2015-01-03 21:24:00


In [9]:
print(adt_admit.pat_enc_csn_id_coded.nunique())
print(first_ED.pat_enc_csn_id_coded.nunique())

set(adt_admit.pat_enc_csn_id_coded) - set(first_ED.pat_enc_csn_id_coded)

print("\n", first_ED.pat_service.value_counts())
print("\n", first_ED.event_type.value_counts())

first_ED['first_ED_time'] = first_ED.effective_time_jittered_utc

first_ED_time = first_ED[['pat_enc_csn_id_coded', 'first_ED_time']].drop_duplicates()
first_ED_time.head()

adt_ed = adt_admit.merge(first_ED_time, how='left')
view_df(adt_ed)

52532
52532

 Emergency                        52519
Emergency Medicine                   7
General Medicine (University)        2
Gynecology                           1
General Medicine (PAMF)              1
Hepatology                           1
Psychiatry                           1
Name: pat_service, dtype: int64

 Admission         52531
Patient Update        1
Name: event_type, dtype: int64


Unnamed: 0,anon_id,pat_enc_csn_id_coded,event_id_coded,event_type_c,event_type,event_subtype_c,event_subtype,department_id,pat_class_c,pat_class,...,labor_status_c,first_ip_in_ip_yn,loa_reason_c,data_source,effective_time_jittered,event_time_jittered,effective_time_jittered_utc,event_time_jittered_utc,admit_time,first_ED_time
0,JC927965,131193903413,20318159,3,Transfer In,3,Update,2000231.0,166,Bedded Outpatient (corrections only),...,,,,CLARITY_SHC,2016-07-23 18:40:00,2016-07-24 15:36:00,2016-07-24 01:40:00 UTC,2016-07-24 22:36:00 UTC,2016-07-23 23:20:00,2016-07-23 20:38:00 UTC
1,JC2784058,131293610978,38869902,3,Transfer In,3,Update,110100016.0,166,Bedded Outpatient (corrections only),...,,,,CLARITY_SHC,2020-09-23 18:30:00,2020-09-24 15:02:00,2020-09-24 01:30:00 UTC,2020-09-24 22:02:00 UTC,2020-09-24 00:04:00,2020-09-23 22:53:00 UTC
2,JC2854999,131287855697,37210117,3,Transfer In,3,Update,2000252.0,166,Bedded Outpatient (corrections only),...,,,,CLARITY_SHC,2020-05-06 02:53:00,2020-05-07 10:52:00,2020-05-06 09:53:00 UTC,2020-05-07 17:52:00 UTC,2020-05-06 08:11:00,2020-05-06 06:15:00 UTC
3,JC860091,131295227813,39280862,6,Census,3,Update,110100017.0,166,Bedded Outpatient (corrections only),...,,,,CLARITY_SHC,2020-10-16 23:59:00,2020-10-17 15:58:00,2020-10-17 06:59:00 UTC,2020-10-17 22:58:00 UTC,2020-10-17 04:22:00,2020-10-16 21:37:00 UTC
4,JC2112785,131251315002,27375439,2,Discharge,1,Original,2000273.0,126,Inpatient,...,,,,CLARITY_SHC,2018-04-29 19:09:00,2018-04-29 19:09:00,2018-04-30 02:09:00 UTC,2018-04-30 02:09:00 UTC,2018-04-26 06:30:00,2018-04-26 04:26:00 UTC


In [15]:
# save some of these columns to upload to BQ so we can query the notes table
sub_first_ED = first_ED[['anon_id', 'admit_time', 'first_ED_time']].drop_duplicates()
sub_first_ED.to_csv("{}/7_cohort4_3hr_noOR_first_ED_time.csv".format(savedir), index=False)

# Pull the notes from BQ

I used the "7_cohort4_3hr_noOR_first_ED_time.csv" file to query the notes table on BQ. We need to do this since CSN does not match between the cohort and the notes table. I used the following code to get all notes that occur for the anon_id between door to 24hr after admit.

In [21]:
# read the notes in
notes_file = "{}/7_cohort4_3hr_noOR_notes.csv".format(datadir)

notes = pd.read_csv(notes_file)

# check anon id since csns no longer match
print(adt_admit.anon_id.nunique())
print(notes.anon_id.nunique())

notes.head()

35144
35144


Unnamed: 0,anon_id,pat_enc_csn_id_coded,author_prov_map_id,effective_dept_id,note_status_c,note_status,ambulatory,ltr_status_c,letter_status,note_type,...,filing_date_jittered,note_date_jittered,activity_date_jittered,effective_time_jittered,filing_date_jittered_utc,note_date_jittered_utc,activity_date_jittered_utc,effective_time_jittered_utc,admit_time,first_ED_time
0,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-20 13:37:00,2021-08-19 16:00:00,,2021-08-19 16:00:00,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00 UTC,2021-08-19 01:44:00 UTC,2021-08-18 23:03:00 UTC
1,JC1834385,1287516779,SS0002883,2000238.0,2.0,Signed,N,,,Progress/Discharge/Transfer Summary,...,2020-02-19 13:35:00,2020-02-19 13:25:00,,2020-02-19 13:25:00,2020-02-19 21:35:00 UTC,2020-02-19 21:25:00 UTC,,2020-02-19 21:25:00 UTC,2020-02-19 04:54:00 UTC,2020-02-19 01:14:00 UTC
2,JC1049247,738133874,SS0076624,2000250.0,2.0,Signed,N,,,ED Note,...,2017-07-05 22:26:00,2017-07-05 22:25:00,,2017-07-05 22:25:00,2017-07-06 05:26:00 UTC,2017-07-06 05:25:00 UTC,,2017-07-06 05:25:00 UTC,2017-07-06 05:50:00 UTC,2017-07-06 02:40:00 UTC
3,JC1765121,396665876,SS0039146,2000250.0,2.0,Signed,N,,,ED Note,...,2015-03-11 17:46:00,2015-03-11 17:46:00,,2015-03-11 17:46:00,2015-03-12 00:46:00 UTC,2015-03-12 00:46:00 UTC,,2015-03-12 00:46:00 UTC,2015-03-12 03:47:00 UTC,2015-03-11 21:39:00 UTC
4,JC1298129,509786137,SS0098429,2000250.0,2.0,Signed,N,,,ED Note,...,2016-02-04 03:47:00,2016-02-04 03:47:00,,2016-02-04 03:47:00,2016-02-04 11:47:00 UTC,2016-02-04 11:47:00 UTC,,2016-02-04 11:47:00 UTC,2016-02-04 10:22:00 UTC,2016-02-04 04:28:00 UTC


In [22]:
## -- takes a while to run -- ##

# change the effective time to datetime since read in from csv
notes.effective_time_jittered_utc = pd.to_datetime(notes.effective_time_jittered_utc)

# change admit time
notes.admit_time = pd.to_datetime(notes.admit_time, utc=True)

# Remove unwanted notes

Remove notes according to the following:

- remove "Letter" and "Telephone Encounter", "Progress Note, Outpatient"
- remove Ambulatory == Y

In [32]:
# remove specific note types
print(notes.value_counts('note_type'))

# removing from these categories
remove_types = ['Letter', 'Telephone Encounter', 'Progress Note, Outpatient']

filtered_notes = notes[-(notes.note_type.isin(remove_types))]

print("\n", filtered_notes.value_counts('note_type'))

note_type
ED Note                                416210
Progress Note, Inpatient               115757
Consultation Note                      102935
Progress/Discharge/Transfer Summary     79767
History and Physical                    58896
Other Note                              48926
Nursing Sign Out Note                   32307
Operative/Procedure Report              11397
Discharge/Transfer Summary               4241
Letter                                   1704
Progress Note, Outpatient                 279
Telephone Encounter                       102
dtype: int64

 note_type
ED Note                                416210
Progress Note, Inpatient               115757
Consultation Note                      102935
Progress/Discharge/Transfer Summary     79767
History and Physical                    58896
Other Note                              48926
Nursing Sign Out Note                   32307
Operative/Procedure Report              11397
Discharge/Transfer Summary               4241

In [34]:
# remove ambulatory == Y
print("\n", filtered_notes.value_counts('ambulatory')) 

filtered_notes_noAmbulatory = filtered_notes[filtered_notes.ambulatory != 'Y']

print("\n", filtered_notes_noAmbulatory.value_counts('ambulatory'))


 ambulatory
N    860499
Y      9937
dtype: int64

 ambulatory
N    860499
dtype: int64


In [59]:
# adjust the prover id columns

# assign auth_lnked_prov_map_id, 
# use auth_prov_map_id only if auth_lnked_prov_map_id is null
filtered_notes_noAmbulatory['prov_map_id'] = np.where(filtered_notes_noAmbulatory['auth_lnked_prov_map_id'].isnull(),
                                                     filtered_notes_noAmbulatory['author_prov_map_id'],
                                                     filtered_notes_noAmbulatory['auth_lnked_prov_map_id'])


print(filtered_notes_noAmbulatory.auth_lnked_prov_map_id.isna().sum())
print(filtered_notes_noAmbulatory.author_prov_map_id.isna().sum())
print(filtered_notes_noAmbulatory.prov_map_id.isna().sum()) # 841 notes do not have an author id

1743
958
841


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_notes_noAmbulatory['prov_map_id'] = np.where(filtered_notes_noAmbulatory['auth_lnked_prov_map_id'].isnull(),


In [61]:
filtered_notes_noAmbulatory.prov_map_id.nunique()

12354

# Treatment team

Bring in the treatment team data so we know classify notes as written by MD or RN.

I used the following SQL to query the treatment team table in BQ to get those that overlap with out notes.

In [64]:
# read in the treatment team file
treatment_team_file = "{}/7_cohort4_3hr_noOR_treatment_team.csv".format(datadir)
team = pd.read_csv(treatment_team_file)

team.head()

Unnamed: 0,name,prov_map_id
0,Aging Adult Services Coordinator,SS0198481
1,Clinical Pharmacist,SS0173067
2,Physical Therapist Assistant,SS0005101
3,Clinical Pharmacist,SS0290707
4,Physical Therapist Assistant,SS0187134


In [66]:
# connect the treatment team to the notes table
notes_team = filtered_notes_noAmbulatory.merge(team, how='left')

print(notes_team.name.value_counts())

notes_team.head()

Registered Nurse                     505772
Primary Team                         146029
Consulting Attending                  94711
Emergency Resident                    90873
Co-Attending                          87649
                                      ...  
Consulting Reconstructive Surgeon         3
Consulting Hematologist                   1
Transplant Coordinator                    1
Case Manager Intern                       1
Activity Therapist                        1
Name: name, Length: 118, dtype: int64


Unnamed: 0,anon_id,pat_enc_csn_id_coded,author_prov_map_id,effective_dept_id,note_status_c,note_status,ambulatory,ltr_status_c,letter_status,note_type,...,effective_time_jittered,filing_date_jittered_utc,note_date_jittered_utc,activity_date_jittered_utc,effective_time_jittered_utc,admit_time,first_ED_time,prov_id,prov_map_id,name
0,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-19 16:00:00,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00+00:00,2021-08-19 01:44:00+00:00,2021-08-18 23:03:00 UTC,SS0056768,SS0056768,Primary Sub-Intern
1,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-19 16:00:00,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00+00:00,2021-08-19 01:44:00+00:00,2021-08-18 23:03:00 UTC,SS0056768,SS0056768,Primary Med Student
2,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-19 16:00:00,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00+00:00,2021-08-19 01:44:00+00:00,2021-08-18 23:03:00 UTC,SS0056768,SS0056768,Consulting Service
3,JC1834385,1287516779,SS0002883,2000238.0,2.0,Signed,N,,,Progress/Discharge/Transfer Summary,...,2020-02-19 13:25:00,2020-02-19 21:35:00 UTC,2020-02-19 21:25:00 UTC,,2020-02-19 21:25:00+00:00,2020-02-19 04:54:00+00:00,2020-02-19 01:14:00 UTC,SS0002883,SS0002883,Registered Nurse
4,JC1049247,738133874,SS0076624,2000250.0,2.0,Signed,N,,,ED Note,...,2017-07-05 22:25:00,2017-07-06 05:26:00 UTC,2017-07-06 05:25:00 UTC,,2017-07-06 05:25:00+00:00,2017-07-06 05:50:00+00:00,2017-07-06 02:40:00 UTC,SS0076624,SS0076624,Registered Nurse


In [69]:
# add columns for the author name
rn = ['Registered Nurse', 'Resource Nurse', 'Float Nurse', 
      'Triage Nurse', 'Licensed Vocational Nurse', 'Post-Transplant Nurse', 
      'Nursery Nurse', 'Delivery Nurse', 'Specialty Nurse']
md = ['Primary Team', 'Emergency Resident', 'Primary Resident', 'Primary Intern', 'Senior Resident', 
        'Primary Advanced Practice Provider', 'Nurse Practitioner', 'Intern', 'Resident', 'Physician Assistant', 
         'Cardiologist', 'Primary Fellow', 'Pulmonologist', 'Fellow', 'Primary Sub-intern', 'Sub-Intern', 
         'Gastroenterologist', 'Infectious Disease', 'Chief Resident', 'Co-Attending', 'Primary Care Physician',  
         'Hematologist', 'Internist', 'Surgeon', 'Medical Oncologist', 'Dermatologist', 'Surgical Oncologist', 
         'Endocrinologist', 'Nephrologist', 'Trauma Attending', 'Diabetes MD', 'Trauma Resident', 
         'BMT Attending Provider', 'Urologist', 'Transplant Pulmonologist', 'Neurologist', 'Anesthesiologist', 
         'Primary Physician Assistant', 'Hepatologist', 'Transplant Surgeon', 'Obstetrician', 
         'Family Practitioner', 'Heart Failure Cardiologist', 'Radiation Oncologist', 'Referring Pulmonologist', 'Transplant Nephrologist',
         'Cross Cover Intern', 'Cross Cover Sub-Intern', 'Cross Cover Attending', 'Cross Cover Resident', 
         'Cross Cover Fellow', 'Cross Cover Advanced Practice Provider',
         'Consulting Service', 'Consulting Attending', 'Consulting Fellow', 'Consulting Resident', 'Consulting Intern', 
         'Psychologist', 'Consulting Medical Oncologist', 'Consulting Hematologist', 'Consulting Surgical Oncologist', 'Post-Transplant Nephrologist']


# assign the author type
notes_team['author_type'] = np.where(notes_team['name'].isin(rn), 'rn', None)
notes_team['author_type'] = np.where(notes_team['name'].isin(md), 'md', notes_team['author_type'])


print(notes_team.author_type.value_counts())

notes_team.head()

md    789781
rn    540917
Name: author_type, dtype: int64


Unnamed: 0,anon_id,pat_enc_csn_id_coded,author_prov_map_id,effective_dept_id,note_status_c,note_status,ambulatory,ltr_status_c,letter_status,note_type,...,filing_date_jittered_utc,note_date_jittered_utc,activity_date_jittered_utc,effective_time_jittered_utc,admit_time,first_ED_time,prov_id,prov_map_id,name,author_type
0,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00+00:00,2021-08-19 01:44:00+00:00,2021-08-18 23:03:00 UTC,SS0056768,SS0056768,Primary Sub-Intern,
1,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00+00:00,2021-08-19 01:44:00+00:00,2021-08-18 23:03:00 UTC,SS0056768,SS0056768,Primary Med Student,
2,JC1856510,1886223149,SS0056768,2000238.0,3.0,Addendum,N,,,"Progress Note, Inpatient",...,2021-08-20 20:37:00 UTC,2021-08-19 23:00:00 UTC,,2021-08-19 23:00:00+00:00,2021-08-19 01:44:00+00:00,2021-08-18 23:03:00 UTC,SS0056768,SS0056768,Consulting Service,md
3,JC1834385,1287516779,SS0002883,2000238.0,2.0,Signed,N,,,Progress/Discharge/Transfer Summary,...,2020-02-19 21:35:00 UTC,2020-02-19 21:25:00 UTC,,2020-02-19 21:25:00+00:00,2020-02-19 04:54:00+00:00,2020-02-19 01:14:00 UTC,SS0002883,SS0002883,Registered Nurse,rn
4,JC1049247,738133874,SS0076624,2000250.0,2.0,Signed,N,,,ED Note,...,2017-07-06 05:26:00 UTC,2017-07-06 05:25:00 UTC,,2017-07-06 05:25:00+00:00,2017-07-06 05:50:00+00:00,2017-07-06 02:40:00 UTC,SS0076624,SS0076624,Registered Nurse,rn


# Connect cohort CSN to notes

Connect the cohort CSN to the notes.

In [91]:
# get the csn and important columns from first ED table
cols = ['pat_enc_csn_id_coded', 'anon_id', 'admit_time', 'first_ED_time']
csns = first_ED[cols]

# chage admit time to datetime
csns.admit_time = pd.to_datetime(csns.admit_time, utc=True)
csns.head()

# join the notes table by anon_id
csn_notes = csns.merge(notes_team, how='left', on=['anon_id', 'first_ED_time', 'admit_time'], suffixes=('_cohort', '_notes'))

# change first ED time to datetime
csn_notes.first_ED_time = pd.to_datetime(csn_notes.first_ED_time, utc=True)

csn_notes.head()

Unnamed: 0,pat_enc_csn_id_coded_cohort,anon_id,admit_time,first_ED_time,pat_enc_csn_id_coded_notes,author_prov_map_id,effective_dept_id,note_status_c,note_status,ambulatory,...,activity_date_jittered,effective_time_jittered,filing_date_jittered_utc,note_date_jittered_utc,activity_date_jittered_utc,effective_time_jittered_utc,prov_id,prov_map_id,name,author_type
0,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373222818,SS0099302,2000237.0,2.0,Signed,N,...,,2015-01-02 02:08:00,2015-01-02 10:08:00 UTC,2015-01-02 10:08:00 UTC,,2015-01-02 10:08:00+00:00,SS0099302,SS0099302,Nurse Coordinator,
1,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373222818,SS0099302,2000237.0,2.0,Signed,N,...,,2015-01-02 02:08:00,2015-01-02 10:08:00 UTC,2015-01-02 10:08:00 UTC,,2015-01-02 10:08:00+00:00,SS0099302,SS0099302,Registered Nurse,rn
2,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373021881,SS0046296,2000237.0,2.0,Signed,N,...,,2015-01-01 11:20:00,2015-01-01 19:21:00 UTC,2015-01-01 19:20:00 UTC,,2015-01-01 19:20:00+00:00,SS0046296,SS0046296,Registered Nurse,rn
3,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373021881,SS0046296,2000237.0,2.0,Signed,N,...,,2015-01-01 11:20:00,2015-01-01 19:21:00 UTC,2015-01-01 19:20:00 UTC,,2015-01-01 19:20:00+00:00,SS0046296,SS0046296,Emergency Resident,md
4,131062572931,JC1170548,2015-01-02 03:48:00+00:00,2015-01-01 18:40:00+00:00,373186373,SS0106030,2000237.0,2.0,Signed,N,...,,2015-01-01 17:31:00,2015-01-29 20:53:00 UTC,2015-01-02 01:31:00 UTC,,2015-01-02 01:31:00+00:00,SS0106030,SS0106030,Primary Team,md


In [95]:
# make sure that the note's time is between first_ED_time and admit_time+24hours

filtered_csn_notes = csn_notes[(csn_notes.effective_time_jittered_utc >= csn_notes.first_ED_time) &
                              (csn_notes.effective_time_jittered_utc <= csn_notes.admit_time+timedelta(hours=24))]

# rename the csn that we're actually using
filtered_csn_notes = filtered_csn_notes.rename({'pat_enc_csn_id_coded_cohort':'pat_enc_csn_id_coded'}, axis=1)

filtered_csn_notes.pat_enc_csn_id_coded.nunique() # we have all of our csns

52532

# Door to dispo

Get the notes that occur prior to admission

In [96]:
door_to_dispo_notes = filtered_csn_notes[filtered_csn_notes.effective_time_jittered_utc < filtered_csn_notes.admit_time]

# some anon_ids do not have notes before admit (door_to_dispo window)
print(filtered_csn_notes.anon_id.nunique())
print(door_to_dispo_notes.anon_id.nunique())

# less than half of the notes occur before admit
print(filtered_csn_notes.shape)
print(door_to_dispo_notes.shape)

35144
34842
(1567752, 28)
(526164, 28)


In [71]:
door_to_dispo_notes.head()

Index(['anon_id', 'pat_enc_csn_id_coded', 'author_prov_map_id',
       'effective_dept_id', 'note_status_c', 'note_status', 'ambulatory',
       'ltr_status_c', 'letter_status', 'note_type', 'auth_lnked_prov_map_id',
       'cosign_prov_map_id', 'data_source', 'filing_date_jittered',
       'note_date_jittered', 'activity_date_jittered',
       'effective_time_jittered', 'filing_date_jittered_utc',
       'note_date_jittered_utc', 'activity_date_jittered_utc',
       'effective_time_jittered_utc', 'admit_time', 'first_ED_time', 'prov_id',
       'prov_map_id', 'name', 'author_type'],
      dtype='object')


Unnamed: 0,anon_id,pat_enc_csn_id_coded,author_prov_map_id,effective_dept_id,note_status_c,note_status,ambulatory,ltr_status_c,letter_status,note_type,...,filing_date_jittered_utc,note_date_jittered_utc,activity_date_jittered_utc,effective_time_jittered_utc,admit_time,first_ED_time,prov_id,prov_map_id,name,author_type
4,JC1049247,738133874,SS0076624,2000250.0,2.0,Signed,N,,,ED Note,...,2017-07-06 05:26:00 UTC,2017-07-06 05:25:00 UTC,,2017-07-06 05:25:00+00:00,2017-07-06 05:50:00+00:00,2017-07-06 02:40:00 UTC,SS0076624,SS0076624,Registered Nurse,rn
5,JC1765121,396665876,SS0039146,2000250.0,2.0,Signed,N,,,ED Note,...,2015-03-12 00:46:00 UTC,2015-03-12 00:46:00 UTC,,2015-03-12 00:46:00+00:00,2015-03-12 03:47:00+00:00,2015-03-11 21:39:00 UTC,SS0039146,SS0039146,Registered Nurse,rn
11,JC794612,916060753,SS0288815,2000251.0,2.0,Signed,N,,,ED Note,...,2018-09-20 23:30:00 UTC,2018-09-20 23:30:00 UTC,,2018-09-20 23:30:00+00:00,2018-09-21 00:02:00+00:00,2018-09-20 22:19:00 UTC,SS0288815,SS0288815,Registered Nurse,rn
12,JC794612,916060753,SS0288815,2000251.0,2.0,Signed,N,,,ED Note,...,2018-09-20 23:30:00 UTC,2018-09-20 23:30:00 UTC,,2018-09-20 23:30:00+00:00,2018-09-21 00:02:00+00:00,2018-09-20 22:19:00 UTC,SS0288815,SS0288815,Consulting Service,md
13,JC794612,916060753,SS0288815,2000251.0,2.0,Signed,N,,,ED Note,...,2018-09-20 23:30:00 UTC,2018-09-20 23:30:00 UTC,,2018-09-20 23:30:00+00:00,2018-09-21 00:02:00+00:00,2018-09-20 22:19:00 UTC,SS0288815,SS0288815,Emergency Resident,md


In [105]:
# get the counts for all notes, rn notes, and md notes for this time window
def get_counts(window_notes, window):
    all_counts = window_notes.groupby(['pat_enc_csn_id_coded']).size().reset_index(name='{}_all_notes'.format(window))
    all_counts.head()

    # get the rn counts
    rn_counts = window_notes[window_notes.author_type == 'rn'].groupby(['pat_enc_csn_id_coded']).size().reset_index(name='{}_rn_notes'.format(window))
    rn_counts.head()

    # get the md counts
    md_counts = window_notes[window_notes.author_type == 'md'].groupby(['pat_enc_csn_id_coded']).size().reset_index(name='{}_md_notes'.format(window))
    md_counts.head()

    counts = all_counts.merge(rn_counts.merge(md_counts, how='outer'), how='outer')
    counts.head()
    
    return(counts)

In [107]:
# get door to dispo counts
door_to_dispo_counts = get_counts(door_to_dispo_notes, "door_to_dispo")
door_to_dispo_counts.head()

Unnamed: 0,pat_enc_csn_id_coded,door_to_dispo_all_notes,door_to_dispo_rn_notes,door_to_dispo_md_notes
0,131062572931,38,13.0,16.0
1,131062745090,8,3.0,3.0
2,131062927111,24,11.0,11.0
3,131063006922,13,6.0,6.0
4,131063022232,31,9.0,16.0


# Dispo to 24hr counts

Get the counts for dispo to 24hr: 

- effective_time >= admit_time

In [108]:
dispo_to_24hr_notes = filtered_csn_notes[filtered_csn_notes.effective_time_jittered_utc >= filtered_csn_notes.admit_time]

# almost all anon_ids have notes during this window
print(filtered_csn_notes.anon_id.nunique())
print(dispo_to_24hr_notes.anon_id.nunique())

# majority of notes happen after admit
print(filtered_csn_notes.shape)
print(dispo_to_24hr_notes.shape)

35144
35133
(1567752, 28)
(1041588, 28)


In [109]:
dispo_to_24hr_counts = get_counts(dispo_to_24hr_notes, "dispo_to_24hr")
dispo_to_24hr_counts.head()

Unnamed: 0,pat_enc_csn_id_coded,dispo_to_24hr_all_notes,dispo_to_24hr_rn_notes,dispo_to_24hr_md_notes
0,131062572931,24,11.0,6.0
1,131062745090,8,2.0,3.0
2,131062927111,28,6.0,13.0
3,131063006922,26,8.0,4.0
4,131063022232,5,1.0,3.0


In [111]:
# join the two window counts together
all_counts = door_to_dispo_counts.merge(dispo_to_24hr_counts, how='outer')
all_counts.head()

Unnamed: 0,pat_enc_csn_id_coded,door_to_dispo_all_notes,door_to_dispo_rn_notes,door_to_dispo_md_notes,dispo_to_24hr_all_notes,dispo_to_24hr_rn_notes,dispo_to_24hr_md_notes
0,131062572931,38.0,13.0,16.0,24.0,11.0,6.0
1,131062745090,8.0,3.0,3.0,8.0,2.0,3.0
2,131062927111,24.0,11.0,11.0,28.0,6.0,13.0
3,131063006922,13.0,6.0,6.0,26.0,8.0,4.0
4,131063022232,31.0,9.0,16.0,5.0,1.0,3.0


In [112]:
savedir

'../../OutputTD/shc2021/'

In [113]:
# save the output
savefile = "8_cohort4_3hr_notes_counts.csv"
all_counts.to_csv(savefile, index=False)