# Exploratory analysis of caregiver note types over time

In [None]:
import os
from importlib_metadata import version
import psycopg2
from sqlalchemy import create_engine
import sys

import numpy as np
import pandas as pd
from datetime import datetime

## plotting
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from sklearn.metrics import roc_curve, auc,  classification_report
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()

In [None]:
# Connection settings come from environment variables; os.environ.get returns
# None for any variable that is not set.
PASSWORD = os.environ.get("PASSWORD")  # not referenced again in this section
USERNAME = os.environ.get("USERNAME")  # not referenced again in this section
POSTGRES_CONNECT = os.environ.get("POSTGRES_CONNECT")  # passed to psycopg2.connect below
POSTGRES_ENGINE = os.environ.get("POSTGRES_ENGINE")  # passed to create_engine below

# Raw psycopg2 connection.
conn = psycopg2.connect(POSTGRES_CONNECT)

cur = conn.cursor();
# Point this connection's schema search path at mimiciii.
cur.execute("""SET search_path = mimiciii;""")

# SQLAlchemy engine; this is what the pd.read_sql calls below use.
engine = create_engine(POSTGRES_ENGINE)

In [None]:
# Prefix prepended to every CSV/PNG file name written below; the empty string
# leaves the file names as given.
path = ''

In [None]:
# Record when and where the notebook was last run, plus library versions.
libraries = ['pandas','sqlalchemy','psycopg2','tqdm','numpy','matplotlib']
print('last ran: ',datetime.now() )
# sys.version starts with the version number followed by a space, so take the
# first whitespace-separated token instead of a fixed-width slice.
print("Python Version:", sys.version.split()[0])
print( "operating system:", sys.platform)

for lib in libraries:
    try:
        print(lib + ' version: ' + version(lib))
    except ModuleNotFoundError:
        # PackageNotFoundError subclasses ModuleNotFoundError. A distribution
        # that is missing, or installed under another name (for example
        # psycopg2-binary), should not abort the whole report.
        print(lib + ' version: not installed')

## Combine note/cg descriptions, times

In [None]:
# read in hadm_ids and dates
# (one admit time per hadm_id is what the lookup built later relies on)
hep = pd.read_sql("""select hadm_id, true_admittime from mimiciii.time_study_notes""", engine)

# read in note types
# NOTE: this pulls every row of noteevents; the filtering to the study's
# admissions happens in pandas below.
types = pd.read_sql("""SELECT category, description, hadm_id, cgid FROM mimiciii.noteevents""", engine)
types.dropna(subset = ['hadm_id'], inplace =  True) # drop null hadm_ids
types['hadm_id'] = types['hadm_id'].astype(int)
types = types[types['hadm_id'].isin(hep['hadm_id'])] # filter to IDs in heparin
# fillna(0) applies to every remaining column (category, description, cgid),
# so a null in any of them becomes the value 0.
types.fillna(0, inplace = True) # this will result in a category of '0' for null values

# read in caregiver types
cgs = pd.read_sql("""SELECT cgid, label, description FROM mimiciii.caregivers""",engine)
cgs['cgid'] = cgs['cgid'].astype(int)

In [None]:
# add times to note types
# hadm_id -> true_admittime lookup; if a hadm_id appears more than once in hep,
# dict() keeps the last value seen.
datedict = dict(zip(list(hep['hadm_id']), list(hep['true_admittime'])))
types['true_admittime'] = types['hadm_id'].map(datedict) # mapping times
# 'YYYY-MM' strings sort chronologically, which the later string comparisons
# and plots rely on.
types['yearmo'] = [x.strftime('%Y-%m') for x in types['true_admittime']] # make column for month-year

# add caregiver types to note types
# cgid -> caregivers.description lookup; cgids with no match map to NaN.
cgid_dict = dict(zip(cgs['cgid'], cgs['description']))
types['cg_type'] = types['cgid'].map(cgid_dict)

In [None]:
# Preview the first rows of the annotated notes table.
types.head()

In [None]:
# aggregate by hadmid
# Number of notes per (admission, note category) pair; the result is indexed by
# both keys and has a single column, still named 'category'.
agg = types.groupby(['hadm_id', 'category']).agg({'category': "count"})

In [None]:
# The aggregated column holds counts, so name it 'count'.
agg.rename(columns = {'category' : 'count'}, inplace = True)

In [None]:
# Move the group keys out of the index and back into ordinary columns.
agg.reset_index(inplace = True)

In [None]:
# Reshape to one row per admission and one column per note category.
agg = agg.pivot_table(values = 'count', index = 'hadm_id', columns = 'category')
# Category/admission pairs with no notes come out of the pivot as NaN; they
# are zero counts. astype(int) replaces the deprecated element-wise applymap.
agg = agg.fillna(0).astype(int)
try:
    # Use integer column labels when every label is numeric.
    agg.columns = agg.columns.map(int)
except (ValueError, TypeError):
    # Text labels (e.g. category names) cannot be converted; keep them as-is.
    pass
agg = agg.reset_index()

In [None]:
# (rows, columns) of the per-admission table.
agg.shape

In [None]:
# Attach each admission's admit time using the hadm_id -> true_admittime lookup.
agg['true_admittime'] = agg['hadm_id'].map(datedict)

## Aggregate Over Time

In [None]:
def agg_df(df, agg_col):
    '''
    Count rows per month for each distinct value of one column.

    Args:
        df: DataFrame with a 'yearmo' column and the column named by agg_col.
        agg_col: name of the column to aggregate
            (could be 'cg_type', 'cgid', 'category', 'description').

    Returns:
        (wide, cols): `wide` has a 'yearmo' column followed by one integer
        count column per distinct value of agg_col (0 where a value does not
        occur in a month); `cols` is the list of those count-column labels.

    Rows whose 'yearmo' or agg_col value is null are not counted, because
    groupby drops null keys by default.
    '''
    # One count per (month, value) pair, spread into a month x value table.
    # Pairs that never occur are filled with 0, so the table stays integer.
    wide = df.groupby(['yearmo', agg_col]).size().unstack(fill_value = 0)

    try:
        # Numeric labels (e.g. float-typed IDs) become plain ints.
        wide.columns = wide.columns.map(int)
    except (ValueError, TypeError):
        # Labels that are not numeric (e.g. category names) are left untouched.
        pass

    wide = wide.reset_index()

    # get columns for plotting: everything after the leading 'yearmo' column
    cols = list(wide.columns)[1:]

    return wide, cols

In [None]:
def perc_agg(agg, cols):
    '''
    Convert per-period counts into proportions of the period total.

    Args:
        agg: aggregated DataFrame with one row per period and one count
            column per label in cols.
        cols: list of the count-column labels to sum and convert.

    Returns:
        A new DataFrame with the columns of `agg` plus:
          'sum'         - row total of the count columns,
          'max_perc'    - largest proportion among the columns other than 0,
          'max_perc_id' - label of the column holding that largest proportion,
          'unique_vals' - number of columns other than 0 with a nonzero value.
        Each column listed in cols holds count / sum.

    Neither `agg` nor `cols` is modified; the work is done on copies.
    '''
    out = agg.copy()
    cols = list(cols)

    # making each column a proportion of the total
    out['sum'] = out[cols].sum(axis = 1)
    for col in cols:
        out[col] = out[col] / out['sum']

    # The 0 column represents null values from the original df, so it is left
    # out of the max / unique statistics (but stays in the output).
    nonnull_cols = [c for c in cols if c != 0] if 0 in out.columns else cols

    max_perc = out[nonnull_cols].max(axis = 1)       # max percentage of total
    max_perc_id = out[nonnull_cols].idxmax(axis = 1) # column holding that max

    # getting number of unique IDs with nonzero values in a time period
    unique_vals = out[nonnull_cols].ne(0).sum(axis = 1)

    out['max_perc'] = max_perc
    out['max_perc_id'] = max_perc_id
    out['unique_vals'] = unique_vals

    return out

## Visualizations

### Line Plot

In [None]:
def make_lineplot(agg, cols, title, save_name, path = path, zoom_07 = False, prop=6):
    '''
    Plot one line per column of an aggregated df over time and save it as a PNG.

    Args:
        agg: aggregated df with a 'yearmo' column plus the columns in cols.
        cols: list of column labels to draw, one line each.
        title: plot title.
        save_name: file name (without extension) of the saved figure.
        path: prefix prepended to save_name; the default is the value of the
            module-level `path` at the time this function was defined.
        zoom_07: if true, keep only rows with yearmo >= '2007-01', fix the
            y-axis to [0, max plotted value] and tick every period.
        prop: legend font size.

    Returns:
        The legend object created by plt.legend.
    '''
    plt.figure(figsize=(20,10), dpi = 300)

    if zoom_07:
        agg = agg[agg['yearmo'] >= '2007-01']
        plt.ylim(bottom=0, top=agg[cols].max().max())
        plt.xticks(np.arange(0, agg.shape[0], 1), rotation=20)
    else:
        # tick every sixth period, starting from the fifth
        plt.xticks(np.arange(4, agg.shape[0], 6), rotation=20)

    for col in cols:
        plt.plot(agg['yearmo'], agg[col], label = str(col), linewidth=3)

    plt.axvline(x = '2007-07') # plot vertical line for July 2007
    plt.xlabel('Time', fontsize = 20)
    plt.ylabel('Counts', fontsize = 20)
    plt.tick_params(labelsize=18)
    plt.title(title, fontsize = 20)

    # The legend has to exist before the figure is written, otherwise the saved
    # PNG has no legend. It is anchored outside the axes, so bbox_inches='tight'
    # is needed to keep it inside the saved image.
    legend = plt.legend(bbox_to_anchor=(1.1, 1.05), prop={'size': prop}, framealpha = 1)
    plt.savefig(path + save_name + '.png', bbox_inches='tight')

    return legend

##### Caregiver Types Over Time

In [None]:
# Monthly note counts per caregiver type, saved to CSV.
# `cols` (the count-column labels) is reused by the plotting cells that follow.
cgtype_agg, cols = agg_df(types, 'cg_type')
cgtype_agg.to_csv(path + 'cgtype_overtime.csv')

In [None]:
# Full-range line plot; written to '<path>cg_types_overtime.png'.
make_lineplot(cgtype_agg, cols, "Caregiver Types Over Time", "cg_types_overtime")

In [None]:
# zoomed in version: only months from 2007-01 onward
make_lineplot(cgtype_agg, cols, "Caregiver Types Over Time: 2007-After", "cg_types_overtime_07", zoom_07 = True)

In [None]:
# Convert the monthly caregiver-type counts to proportions of each month's
# total and save them to CSV.
cgtype_perc = perc_agg(cgtype_agg, cols)
cgtype_perc.to_csv(path + 'cgtype_perc_overtime.csv')

In [None]:
# Display the proportions table.
cgtype_perc

##### Note Types Over Time

In [None]:
# Number of notes in each category.
types['category'].value_counts()

In [None]:
# Restrict to the five note categories that are compared in the plots below.
types3 = types[types['category'].isin(
    ['Nursing/other', 'Radiology', 'ECG', 'Discharge summary', 'Echo'])]

In [None]:
# Monthly note counts per category, restricted to the categories kept in types3.
# (The CSV export below is left disabled.)
notetype_agg, cols =  agg_df(types3, 'category')
#notetype_agg.to_csv(path + 'notetypes_overtime.csv')

In [None]:
# Full-range line plot with a larger legend font (prop=20).
make_lineplot(notetype_agg, cols, "Note Categories Over Time", "note_categories_overtime", prop=20)

In [None]:
# zoomed in version: only months from 2007-01 onward
make_lineplot(notetype_agg, cols, "Note Categories Over Time: 2007-After", 
              "note_categories_overtime_07", zoom_07 = True)

In [None]:
# Convert the monthly note-category counts to proportions of each month's
# total and save them to CSV.
notetype_perc = perc_agg(notetype_agg, cols)
notetype_perc.to_csv(path + 'notetype_perc_overtime.csv')

### Gantt Chart

In [None]:
# list of 14 unique colors (hex strings) for plotting cgids in gantt chart;
# used as the default `colors` argument of make_gantt
colors = ["#000000", "#7ab643", "#8b58cb", "#c1a446", "#6c71c2", "#da8032", "#669ed7",
          "#ce4a36", "#3dbbb8", "#fffc00", "#59b078", "#cc7dd0", "#647832", "#808080"]

In [None]:
def make_gantt(agg, cols, colors = colors, agg_cgid = False):
    '''
    Plot an aggregated df as a gantt chart of when each column is active.

    Each column in cols becomes one bar that runs from the first period in
    which the column is nonzero to the last period in which it is nonzero.

    Args:
        agg: aggregated df with a 'yearmo' column plus the columns in cols.
        cols: list of column labels to plot.
        colors: list of colors passed to plotly when agg_cgid is true.
        agg_cgid: if plotting caregiver IDs over time, True, else False. When
            true, bars are coloured by the caregiver description looked up in
            the module-level cgid_dict, and the caregiver IDs on the task axis
            are replaced by integer row labels.

    Returns:
        Whatever iplot returns for the figure.

    Raises:
        IndexError: if a column in cols has no nonzero value.
    '''
    # organizing into new df for Plotly: one 'task' per aggregated column
    tasks = []
    for col in cols:
        active = agg.loc[agg[col] != 0, 'yearmo']  # periods where the column is nonzero
        tasks.append({'Task': col, 'Start': active.iloc[0], 'Finish': active.iloc[-1]})

    mydf = pd.DataFrame(tasks)

    # If plotting caregiver IDs, need to do some more organizing (because lots of data)
    if agg_cgid:
        # Assign the filled column back rather than calling fillna(inplace=True)
        # on a column selection, which does not update mydf under pandas
        # Copy-on-Write.
        mydf['Resource'] = mydf['Task'].map(cgid_dict).fillna('None')
        mydf = mydf.sort_values(by = ['Resource','Start','Finish'])

        # Replace the caregiver IDs with the frame's integer row labels (the
        # labels the rows had before sorting); rows stay in sorted order.
        mydf = (mydf.reset_index()
                    .drop(columns = 'Task')
                    .rename(columns = {'index': 'Task'}))

        # gantt chart settings
        fig = ff.create_gantt(mydf, colors = colors, index_col = 'Resource', show_colorbar = True)

    else:
        mydf = mydf.sort_values(by = ['Start','Finish'])
        fig = ff.create_gantt(mydf) # no other args

    return iplot(fig)  # plot gantt chart

##### Caregiver IDs Over Time

In [None]:
# Number of notes per caregiver type (value_counts leaves out nulls by default).
types['cg_type'].value_counts()

In [None]:
# Label notes whose caregiver has no type as 'Other'. Assign the result back:
# an inplace fillna on a column selection does not update `types` under pandas
# Copy-on-Write.
types['cg_type'] = types['cg_type'].fillna('Other')

In [None]:
# Keep only notes written by the 'Other', 'RN' and 'Respiratory' caregiver types.
types2 = types[types['cg_type'].isin(['Other', 'RN', 'Respiratory'])]

In [None]:
#agg_df(types, 'cg_type')

In [None]:
# Monthly note counts per individual caregiver ID, restricted to types2.
cg_agg, cols = agg_df(types2, 'cgid')

In [None]:
# Preview the per-caregiver monthly counts.
cg_agg.head()

In [None]:
# Gantt chart with one bar per caregiver ID; agg_cgid=True colours the bars by
# caregiver type.
make_gantt(cg_agg, cols, colors = colors, agg_cgid = True)