In [1]:
#run this cell when online
!pip install xlrd

[33mYou are using pip version 10.0.1, however version 19.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## install dependencies and set data paths

In [1]:
import pandas as pd
import numpy as np

import os
import re
from collections import Counter

In [12]:
#use this cell when working online
path = '/floyd/home/ed-triage'
data_path = '/floyd/home/data'

In [2]:
#use this cell when working from home
path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage'
data_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/data/ED triage project/bch'

In [3]:
#loading in the entire spreadsheet as a dataframe
clin = pd.read_excel(data_path + '/BCH ED visits FY1819_with additions.xlsx')

In [4]:
len(clin)

136993

In [5]:
clin.columns

Index(['ID', 'ChartNumber', 'EncounterNumber', 'TriageLevel', 'AgeNumber',
       'AgeInYrs', 'GenderDesc', 'Triage Date & Time', 'Reg Date & Time',
       'PIA Date & Time', 'Disposition Date & Time', 'DischargeDisposition',
       'DischargeDispositionDesc', 'Left ED Date & Time',
       'PresentingComplaint', 'PresentingComplaintDesc', 'MainDiagnosisCode',
       'MainDiagnosisCodeDesc', 'AdmitLocation', 'PatientService',
       'SubjectiveNotes', 'InfectionControlScreening', 'MedicalHistory',
       'BloodPressure_LastEDReading', 'O2Saturation_LastEDReading',
       'Pulse_LastEDReading', 'Temperature_LastEDReading'],
      dtype='object')

## The next few cells are for getting a feel of the data in the different columns and how they might be useful

In [9]:
discharge_count = Counter(clin['DischargeDisposition']); discharge_count

Counter({17: 70703,
         62: 762,
         7: 11199,
         6: 374,
         16: 493,
         40: 226,
         8: 242,
         72: 113,
         63: 250,
         30: 346,
         64: 171,
         12: 242,
         71: 3,
         14: 3,
         61: 12,
         90: 10,
         13: 3,
         9: 2})

In [10]:
discharge_count2 = Counter(clin['DischargeDispositionDesc']); discharge_count2

Counter({'Discharge to private home, condo, apt without support service/referral': 70703,
         'Left at his/her own risk post-initial treatment': 762,
         'Admit to reporting facility as inpatient to another unit from amb care': 11199,
         'Admit to reporting facility as inpatient to SCU or OR from amb care': 374,
         'Discharge to private home, condo, apt with support service/referral': 493,
         'Transfer to Group/supportive living': 226,
         'Transfer to another acute care facility directly from amb care': 242,
         'Died in Facility': 113,
         'Left After Triage': 250,
         'Transfer to Residential care': 346,
         'Left After Initial Assessment': 171,
         'Intra-facility transfer to day surgery': 242,
         'Dead on arrival': 3,
         'Intra-facility transfer to clinic': 3,
         'Left at his/her own risk following registration': 12,
         'Transfer to correctional facility': 10,
         'Intra-facility transfer to ED'

In [11]:
comp_count = Counter(clin['PatientService']); comp_count

Counter({nan: 73660,
         'Cardiology': 1550,
         'ALC General Medicine': 201,
         'General Medicine': 4512,
         'INTENSIVE CARE UNIT': 253,
         'Respirology': 852,
         'General Surgery': 650,
         'Paediatrics': 882,
         'Mental Health': 1198,
         'Orthopaedics': 430,
         'ALC Neurology': 64,
         'Genitourinary': 182,
         'Neurology': 456,
         'Palliative': 65,
         'ALC Respirology': 28,
         'Oncology': 38,
         'ALC Orthopaedics': 23,
         'ALC Cardiology': 38,
         'AD PALLIATIVE': 45,
         'AD General Medicine': 2,
         'Obstetrics': 17,
         'AD Respirology': 1,
         'AD Oncology': 1,
         'ALC General Surgery': 2,
         'Neonatal Retro Transfers': 1,
         'ALC Oncology': 1,
         'AD Neurology': 1,
         'Newborn': 1})

## refactoring some of the cells so they are clean and useful

In [6]:
o2sat = [item if item[-1] != '%' else item[:-1] for item in clin['O2Saturation_LastEDReading'].astype('str')]
o2sat = [np.nan if item == 'nan' else float(item) for item in o2sat] 
clin['o2sat'] = o2sat

In [7]:
pulse = clin['Pulse_LastEDReading']
pulse = [re.sub("[^0-9]", "", str(item)) for item in pulse]
pulse = [np.nan if item == '' else float(item) for item in pulse] 
#set(pulse)
clin['pulse'] = pulse

In [8]:
temp = clin['Temperature_LastEDReading']
temp = [re.sub("[^0-9]", "", str(item)) for item in temp]
temp = [np.nan if item == '' else float(item) for item in temp] 
temp = [item/10 if item > 100 else item for item in temp]
temp = [np.nan if item > 40  else item for item in temp]
#set(temp)
clin['temp'] = temp

In [9]:
def bptrans(bp):
    if pd.isnull(bp) or len(bp.split('/')) !=2:
        return [np.nan, np.nan]
    res = []
    for x in bp.split('/'):
        try:
            float(x)
            res =  [float(x) for x in bp.split('/')]
        except: 
            res =  [np.nan, np.nan]
        return res


In [14]:
#this is to clean the text in the subjective notes column (a little bit)
def fixencode(s):
    if pd.isnull(s):
        s = "None"
    s = s[14:]
    s=s.replace('<LT><LF>','')
    s= s.replace('<LT>LF>','')
    s = s.lower()
    s=s.replace('Pt.', 'patient')
    s=s.replace('Pt', 'patient')
    s= s.replace('pt.', 'patient')
    s=s.replace('pt', 'patient')
    s = s.replace('y/o', 'year old')
    s = s.replace('c/o', 'complains of')
    s = s.replace('C/o', 'complains of')
    #s = s.replace('rt', 'right')
    #s = s.replace('lt', 'left')
    s = s.replace('sob', 'shortness of breath')
    s = s.replace('c/p', 'chest pain')
    s = s.replace('hrs', 'hours')
    s = s.replace('hx', 'history')
    s = s.replace('n/v', 'nausea and/or vomiting')
    s = s.replace('a/e', 'air entry')
    s = s.replace('a/o', 'alert and oriented')
    s = s.replace('a&o', 'alert and oriented')
    s = s.replace('d/c', 'discharge')
    s = s.replace('u/s', 'ultrasound')
    s = s.replace('yrs', 'years')
    s = s.replace('lmp', 'last menstrual period')
    s = s.replace('w/', 'with')
    #others to deal with cp, rt, lt
    
    #note that what's missing here is removal of punctuation
    return s

In [15]:
clin['CleanSubjectiveNotes'] = clin.SubjectiveNotes.map(fixencode)

In [16]:
for item in clin['CleanSubjectiveNotes'].iloc[20000:20010]:
    print (item)


patient complains of fever since yesterday. has history lt breast cancer, lumpectomy domy on sepatient 2017, radiation done , last chemo on aug 2017. at present patient complains of swelling and red since 2 days from armpi towards her breast. last tylenol taken at 1600 hours.

patient had a cast placed on yesterday for a fractured elbow. returning today due to swelling in the left hand. fingers warm to touch. patient denies any pain, but was reporting pain to the mother earlier.
patient had hemorrhagic stroke sepatientember last year with no residual deficit. since the stroke, patient felt some something pulling on top of head, radiates down to neck. patient also feels pins and needles to all over body for nine months. sympatientoms have been worse. patient seen before at bch, given toradol with relief in sympatientoms.

chest pain for 4 years  under investigation no findings, today  having chest pounding  while at rest  no shortness of breath no fever.
patient reports of shortness of

In [17]:
clin['BP'] = clin.BloodPressure_LastEDReading.map(bptrans)

clin[['systolic', 'diastolic']] = pd.DataFrame(clin.BP.tolist(), index= clin.index)
clin["Gender"] = clin.GenderDesc.map(lambda x: 1 if x == "Male" else 0)

## split the screening questions
- result is a df called: screening_df

In [18]:
qlist = ['Are you feeling feverish or have had shakes or chills in the last 24 hours?',
         'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?',
        'Do you have a new Rash?',
        'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?',
         'Have you travelled outside of Canada/USA in the last 3 weeks?',
         'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?',
         'Have you received Health Care in another country in the last 2 years?',
        'Do you have a new/worse cough or shortness of breath?',
         'If so, select all countries that apply',
        'If so, select all infectious diseases that apply']

In [19]:
#making a dict so that the indect of an item on the list, matches the question later
q_dict = {i:q for i,q in enumerate(qlist)}

In [20]:
q_dict

{0: 'Are you feeling feverish or have had shakes or chills in the last 24 hours?',
 1: 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?',
 2: 'Do you have a new Rash?',
 3: 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?',
 4: 'Have you travelled outside of Canada/USA in the last 3 weeks?',
 5: 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?',
 6: 'Have you received Health Care in another country in the last 2 years?',
 7: 'Do you have a new/worse cough or shortness of breath?',
 8: 'If so, select all countries that apply',
 9: 'If so, select all infectious diseases that apply'}

In [21]:
#this is going to be a list made of up all the entries in the infection control screening column
#casting the results as string to manipulate later
qans = list(clin['InfectionControlScreening'].str[14:].astype('str'))

In [22]:
#this is going to give me a way to split the questions by finding where they occur in the field
split_points = []
for item in qans:
    temp_points = []
    for word in qlist:
        temp_points.append(item.find(word))
    split_points.append(temp_points)
    

In [23]:
#this is going to give me list of places to split each entry (using only the questions that are actually present)
real_points = []
for item in split_points:
    real_points.append([idx for idx in sorted(item) if idx > 0])

In [24]:
split_points[10000:10010]

[[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [290, -1, 449, 374, 0, 119, -1, 228, 70, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1]]

In [25]:
#anything from the list above which would have been empty in the original data
#gives a list of -1's in split_points and nothing in real_points
real_points[10000:10010]

[[],
 [70, 179, 241, 325, 400],
 [70, 119, 228, 290, 374, 449],
 [70, 179, 241, 325, 400],
 [70, 179, 241, 325, 400],
 [70, 179, 241, 325, 400],
 [70, 179, 241, 325, 400],
 [],
 [70, 179, 241, 325, 400],
 [70, 179, 241, 325, 400]]

In [26]:
print(qans[1000])
print()
print (split_points[1000])
print ()
print(real_points[1000])

Have you travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>Do you have a new/worse cough or shortness of breath? N<LT>LF>Are you feeling feverish or have had shakes or chills in the last 24 hours? N<LT>LF>Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N<LT>LF>Do you have a new Rash? N

[241, -1, 400, 325, 0, 70, -1, 179, -1, -1]

[70, 179, 241, 325, 400]


In [27]:
#this should mutate qans so each item is now a list of q and a's
for idx in range(len(qans)):
    qans[idx] = [qans[idx][i : j] for i, j in zip([0] + real_points[idx], real_points[idx] + [None])]
    

In [28]:
qans[1000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>',
 'Do you have a new/worse cough or shortness of breath? N<LT>LF>',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N<LT>LF>',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N<LT>LF>',
 'Do you have a new Rash? N']

In [29]:
qans[10000]

['nan']

In [30]:
#this will mutate each item of each list in qans and drop the <LT>LF> if present
for q in qans:
    for idx in range(len(q)):
        if q[idx].find('<LT>LF>') != -1:
            q[idx] = q[idx][:-7]
        

In [31]:
qans[1000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N',
 'Do you have a new/worse cough or shortness of breath? N',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N',
 'Do you have a new Rash? N']

In [32]:
qans[10000]

['nan']

In [33]:
qans[1000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N',
 'Do you have a new/worse cough or shortness of breath? N',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N',
 'Do you have a new Rash? N']

In [34]:
seg_list = []
for q in qans:
    #print ('initial:', q)
    test_question = q
    res = []
    for item in test_question:
        #print (item)
        for qq in qlist:
            if item.find(qq) != -1:
                #print ('question:',item[:len(qq)])
                #print ('answer:',item[len(qq)+1:])
                res.append([item[:len(qq)],item[len(qq)+1:] ])
    seg_list.append(res)
    #print ('\nresult:',res, '\n\n')

In [35]:
len(seg_list)

136993

In [36]:
seg_list2 = []
for q in qans:
    #print ('initial:', q)
    test_question = q
    res = {}
    for item in test_question:
        #print (item)
        for qq in qlist:
            if item.find(qq) != -1:
                question = item[:len(qq)]
                answer = item[len(qq)+1:]
                res[question] = answer
    seg_list2.append(res)

In [37]:
final_list = []
for q in range(len(split_points)):
    temp = []
    for i in range(len(split_points[q])):
        if split_points[q][i] == -1:
            #print('nan')
            temp.append('nan')
        else:
            #print (brief_list2[q][q_dict[i]])  # q_dict[i], i, brief_splits[q][i],
            temp.append(seg_list2[q][q_dict[i]])
    final_list.append(temp)
    #print ('\n next item \n')
            

In [38]:
len(final_list)

136993

In [39]:
final_list[100:110]

[['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']]

In [40]:
final_list[1000:1020]

[['N', 'nan', 'N', 'N', 'N', 'N', 'nan', 'N', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['N', 'nan', 'N', 'N', 'N', 'N', 'nan', 'Y', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 [

In [41]:
screening_df = pd.DataFrame(final_list, columns = qlist)
screening_df.head()

Unnamed: 0,Are you feeling feverish or have had shakes or chills in the last 24 hours?,Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?,Do you have a new Rash?,Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?,Have you travelled outside of Canada/USA in the last 3 weeks?,Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?,Have you received Health Care in another country in the last 2 years?,Do you have a new/worse cough or shortness of breath?,"If so, select all countries that apply","If so, select all infectious diseases that apply"
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,


In [42]:
screening_df.shape

(136993, 10)

In [43]:
screening_df.iloc[80000:80010]

Unnamed: 0,Are you feeling feverish or have had shakes or chills in the last 24 hours?,Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?,Do you have a new Rash?,Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?,Have you travelled outside of Canada/USA in the last 3 weeks?,Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?,Have you received Health Care in another country in the last 2 years?,Do you have a new/worse cough or shortness of breath?,"If so, select all countries that apply","If so, select all infectious diseases that apply"
80000,N,,N,N,N,N,,N,,
80001,N,,N,N,N,N,,N,,
80002,N,,N,N,N,N,,N,,
80003,N,,N,N,N,N,,N,,
80004,N,,N,N,N,N,,N,,
80005,N,N,N,N,N,N,N,N,,
80006,,,,,,,,,,
80007,N,,N,N,Y,,,N,IND,
80008,N,,,,N,,,N,,
80009,N,,N,N,N,N,,N,,


In [44]:
screening_df.replace('nan',np.NaN, inplace = True)

In [45]:
screening_df.iloc[80000:80010]

Unnamed: 0,Are you feeling feverish or have had shakes or chills in the last 24 hours?,Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?,Do you have a new Rash?,Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?,Have you travelled outside of Canada/USA in the last 3 weeks?,Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?,Have you received Health Care in another country in the last 2 years?,Do you have a new/worse cough or shortness of breath?,"If so, select all countries that apply","If so, select all infectious diseases that apply"
80000,N,,N,N,N,N,,N,,
80001,N,,N,N,N,N,,N,,
80002,N,,N,N,N,N,,N,,
80003,N,,N,N,N,N,,N,,
80004,N,,N,N,N,N,,N,,
80005,N,N,N,N,N,N,N,N,,
80006,,,,,,,,,,
80007,N,,N,N,Y,,,N,IND,
80008,N,,,,N,,,N,,
80009,N,,N,N,N,N,,N,,


## making the new target cells

In [19]:
admit_count= Counter(clin['AdmitLocation']); admit_count

Counter({nan: 120106,
         'N.SUR GEN': 1024,
         'N.MED CARD': 1738,
         'N.ER IN': 1186,
         'N.MED SIX': 604,
         'N.CC ICU': 455,
         'N.CC CCU': 584,
         'N.MED ONC': 1024,
         'N.MAU': 2365,
         'N.SIMCU': 301,
         'N.MED CPU': 329,
         'N.MED GER': 1070,
         'N.MED CT': 144,
         'N.WC PAED': 1291,
         'N.ERMH IN': 29,
         'N.MED SIM': 450,
         'N.SUR ORTH': 661,
         'N.MH CHADI': 152,
         'N.MH GER': 68,
         'N.MH GENB': 217,
         'N.MH INTEN': 234,
         'N.MED RESP': 944,
         'N.MED NEUR': 921,
         'N.SUR SS': 582,
         'N.MH CHAD': 210,
         'N.WC POST': 119,
         'N.MH GENC': 47,
         'N.MED DIAL': 48,
         'N.CC NICU': 5,
         'N.WC LD': 2,
         'N.WC LDN': 1,
         'N.MAIN OR': 1,
         'N.MED STTU': 1,
         'N.MED FLEX': 10,
         'N.MED DSU': 69,
         'N.SUR DSU': 1})

In [48]:
clin['outcome'] = clin['AdmitLocation'].astype('str').map(
{'N.CC CCU': 'madmit',
 'N.CC ICU': 'ICU',
 'N.ER IN' : 'madmit',
 'N.ERMH IN': 'madmit',
 'N.MAU': 'madmit',
 'N.MED CARD': 'madmit',
 'N.MED CPU': 'madmit',
 'N.MED CT': 'madmit',
 'N.MED GER': 'madmit',
 'N.MED DIAL': 'madmit',
 'N.MED NEUR': 'madmit',
 'N.MED ONC': 'madmit',
 'N.MED RESP': 'madmit',
 'N.MED SIM': 'madmit',
 'N.MED SIX': 'madmit',
 'N.MH CHAD': 'madmit',
 'N.MH CHADI': 'madmit',
 'N.MH GENB': 'madmit',
 'N.MH GER': 'madmit',
 'N.MH INTEN': 'madmit',
 'N.SIMCU': 'ICU',
 'N.SUR GEN': 'sadmit',
 'N.SUR ORTH': 'sadmit',
 'N.SUR SS': 'sadmit',
 'N.WC PAED': 'madmit',
 'N.WC POST': 'madmit',
 'N.MH GENC': 'madmit',
 'N.CC NICU': 'ICU',
 'N.WC LD': 'sadmit',
  'N.WC LDN': 'sadmit',
  'N.MAIN OR': 'sadmit',
  'N.MED STTU': 'madmit',
  'N.MED FLEX': 'madmit',
    'N.MED DSU': 'madmit',
    'N.SUR DSU': 'sadmit',
 'nan': 'discharge'})

In [49]:
#first target columns is based on the admitting location
clin['target'] = clin['outcome'].map({'discharge': 1, 'madmit': 2, 'sadmit': 3, 'ICU': 4})

In [50]:
outcome_count = Counter(clin['outcome']); outcome_count

Counter({'discharge': 120106, 'sadmit': 2272, 'madmit': 13854, 'ICU': 761})

In [51]:
target_count = Counter(clin['target']); target_count

Counter({1: 120106, 3: 2272, 2: 13854, 4: 761})

In [52]:
clin['service'] = clin['PatientService'].astype('str').map(
    {'nan': 'discharge',
         'General Surgery': 'sadmit',
         'Respirology': 'madmit',
         'General Medicine': 'madmit',
         'INTENSIVE CARE UNIT': 'ICU',
         'Cardiology': 'madmit',
         'Oncology': 'madmit',
         'Palliative': 'madmit',
         'AD PALLIATIVE': 'madmit',
         'Genitourinary': 'sadmit',
         'Paediatrics': 'madmit',
         'Mental Health': 'madmit',
         'Orthopaedics': 'sadmit',
         'MH Child & Adolescent': 'madmit',
         'AD Oncology': 'madmit',
         'Neurology': 'madmit',
         'ALC General Medicine': 'madmit',
         'ALC General Surgery': 'sadmit',
         'Acute Care for Elders': 'madmit',
         'ALC Neurology': 'madmit',
         'ALC Cardiology': 'madmit',
         'ALC Respirology': 'madmit',
         'ALC Orthopaedics': 'madmit',
         'Neonatal Retro Transfers': 'madmit',
         'Newborn': 'madmit',
         'Obstetrics': 'madmit',
         'ALC Oncology': 'madmit',
         'AD General Medicine': 'madmit',
         'AD Acute Care for Elders': 'madmit',
         'AD Respirology': 'madmit',
         'ALC ACUTE CARE FOR EDLERS': 'madmit',
         'AD Cardiology': 'madmit',
         'AD Neurology': 'madmit',
         'ALC Palliative': 'madmit',
         'AD General Surgery': 'sadmit'})

In [53]:
service_count = Counter(clin['service'])
service_count

Counter({'discharge': 120106, 'sadmit': 2528, 'madmit': 13905, 'ICU': 454})

In [54]:
#second target is based on the admitting service
clin['target2'] = clin['service'].map({'discharge': 1, 'madmit': 2, 'sadmit': 3, 'ICU': 4})

In [55]:
target2_count = Counter(clin['target2']); target2_count

Counter({1: 120106, 3: 2528, 2: 13905, 4: 454})

In [188]:
#creating this for later use to map the numbers in the dataframe column to their original meaning
out_dict = {'discharge': 1, 'madmit': 2, 'sadmit': 3, 'ICU': 4}
out_dict = {v:k for k,v in out_dict.items()}
out_dict

{1: 'discharge', 2: 'madmit', 3: 'sadmit', 4: 'ICU'}

In [56]:
#discharge vs. not target
clin['discharge'] = clin['outcome'].map({'discharge': 'discharge', 'madmit': 'admit', 'sadmit': 'admit', 'ICU': 'admit'})

In [57]:
clin['target3'] = clin['outcome'].map({'discharge': 1, 'madmit': 0, 'sadmit': 0, 'ICU': 0})

In [58]:
Counter(clin['discharge']), Counter(clin['target3'])

(Counter({'discharge': 120106, 'admit': 16887}),
 Counter({1: 120106, 0: 16887}))

In [59]:
#a target for trying to tell apart the inpatient disposition, only in admitted patients
clin['dispo']= clin['outcome'].map({'discharge': np.nan, 'madmit': 'madmit', 'sadmit': 'sadmit', 'ICU': 'ICU'})

In [60]:
clin['target4'] = clin['dispo'].map({np.nan:np.nan, 'madmit': 1, 'sadmit': 2, 'ICU': 3})
#for some reason a can't make a counter out of this without it crashing so I've stopped trying

In [61]:
Counter(clin['dispo']), Counter(clin['target4'].dropna())

(Counter({nan: 120106, 'sadmit': 2272, 'madmit': 13854, 'ICU': 761}),
 Counter({2.0: 2272, 1.0: 13854, 3.0: 761}))

In [62]:
#gonna make a target that is ICU vs other
clin['ICUvsother'] = clin['PatientService'][clin['PatientService'] == 'INTENSIVE CARE UNIT']
#clin['PatientService']

In [63]:
clin['target5'] = clin['ICUvsother'].map({'INTENSIVE CARE UNIT':1, np.nan:0})

In [64]:
sorted(clin.columns)

['AdmitLocation',
 'AgeInYrs',
 'AgeNumber',
 'BP',
 'BloodPressure_LastEDReading',
 'ChartNumber',
 'CleanSubjectiveNotes',
 'DischargeDisposition',
 'DischargeDispositionDesc',
 'Disposition Date & Time',
 'EncounterNumber',
 'Gender',
 'GenderDesc',
 'ICUvsother',
 'ID',
 'InfectionControlScreening',
 'Left ED Date & Time',
 'MainDiagnosisCode',
 'MainDiagnosisCodeDesc',
 'MedicalHistory',
 'O2Saturation_LastEDReading',
 'PIA Date & Time',
 'PatientService',
 'PresentingComplaint',
 'PresentingComplaintDesc',
 'Pulse_LastEDReading',
 'Reg Date & Time',
 'SubjectiveNotes',
 'Temperature_LastEDReading',
 'Triage Date & Time',
 'TriageLevel',
 'diastolic',
 'discharge',
 'dispo',
 'o2sat',
 'outcome',
 'pulse',
 'service',
 'systolic',
 'target',
 'target2',
 'target3',
 'target4',
 'target5',
 'temp']

## splitting into a bunch of sub dataframes and saving them as their own csv files
- loading the excel file is very slow, but loading csv files is faster so I'm making a bunch of dataframes of subsets of data for easy saving/loading/combining

In [65]:
tab_df = clin[['ID', 'TriageLevel',
       'AgeInYrs', 'GenderDesc', 'DischargeDisposition',
       'PresentingComplaint', 'PresentingComplaintDesc', 'AdmitLocation', 'PatientService',
       'BloodPressure_LastEDReading','systolic', 'diastolic','temp','pulse','o2sat']]

In [66]:
target_df = clin[['outcome','target', 'service','target2', 'discharge', 'target3','dispo','target4', 'ICUvsother', 'target5']]

In [67]:
date_df = clin[['Triage Date & Time', 'Reg Date & Time',
       'PIA Date & Time', 'Disposition Date & Time', 'Left ED Date & Time']]

## parsing of the medical history into categories

In [68]:
histories = Counter(clin['MedicalHistory']); len(histories)

41467

In [69]:
history = list(clin['MedicalHistory'].str[16:].astype('str'))

In [70]:
for item in history[1000:1050]:
    print (item)

No Significant Medical History
nan
nan
nan
nan
nan
nan
nan
nan
unknown
nan
nan
COPD, Depression, Hypertension
nan
nan
nan
nan
nan
nan
nan
nan
No Significant Medical History
nan
No Significant Medical History
nan
nan
nan
nan
Atrial Fibrillation (A Fib)<LT>LF>Dementia<LT>LF>Hypertension (HTN)<LT>LF>frequent UTI<LT>LF>Gerd
nan
Hypertension (HTN)<LT>LF>High Cholesterol<LT>LF>Hypothyroid<LT>LF>Osteoporosis<LT>LF>NIDDM (Non-Insulin-Dependent Diabetes Mellitus)<LT>LF>Hysterectomy
nan
No Significant Medical History
nan
No Significant Medical History
nan
nan
nan
nan
IDDM (Insulin-Dependent Diabetes Mellitus)<LT>LF>GERD (Gastroesophageal Reflux Disease)<LT>LF>Glaucoma
nan
nan
nan
nan
Benign Prostatic Hypertension<LT>LF>Asthma<LT>LF>hx of lung CA
nan
nan
nan
nan
nan


In [71]:
#going through the medical histories field and splitting into a list of co-morbid conditions
#split_hist = [item.split('<LT>LF>') for item in history]# if item != 'nan']

In [72]:
replacements = (',','<LT>LF>', '.')
split_hist = []
for item in history:
    for r in replacements:
        item = item.replace(r, '::')
    item = item.split('::')
    item = [re.sub(r'\(.*?\)\ *', '', s) for s in item]
    
    item = [x.strip().lower() for x in item if x != 'nan']
    
    item = list(filter(None, item))
    split_hist.append(item)

In [73]:
#rejoining them as a string with diagnoses separated by commas
join_hist = [','.join(item) for item in split_hist]

In [74]:
#replacing empty string with NaN
join_hist = [np.nan if item == 'nan' else item for item in join_hist]

In [75]:
len(join_hist)

136993

In [76]:
join_hist[2000:2020]

['hypertension,acid reflux',
 'hypertension,high cholesterol,depression',
 'hypertension,thyroid,arthritis',
 '',
 'no significant medical history',
 'anxiety,depression',
 'pericarditis',
 'anxiety',
 'asthma',
 'no significant medical history',
 '',
 'no significant medical history',
 'chronic back pain,arthritis,hypertension',
 'dementia,iddm,hypertension,high cholesterol,cva,pancreatitis,crf,legally blind',
 'no significant medical history,depression',
 'asthma',
 'no significant medical history',
 'hypertension',
 'no significant medical history',
 '']

In [77]:
clin['pmhx'] = join_hist

In [78]:
subj_df = clin[['SubjectiveNotes', 'MedicalHistory', 'pmhx']]

### exploring different diagnoses

In [79]:
diagnoses = Counter()
for hx in split_hist:
    for item in hx:
        diagnoses[item] += 1

In [80]:
#there are 20464 different medical histories used in the dataset!!!
len(diagnoses)

21951

In [81]:
#sorted diagnoses
diagnoses.most_common(40)

[('no significant medical history', 48412),
 ('hypertension', 19512),
 ('high cholesterol', 17310),
 ('niddm', 9426),
 ('depression', 6086),
 ('high bp', 5559),
 ('asthma', 5297),
 ('anxiety', 4713),
 ('thyroid', 3861),
 ('gerd', 3804),
 ('htn', 3575),
 ('iddm', 3385),
 ('hypothyroid', 3335),
 ('diabetes', 3327),
 ('mi', 2203),
 ('arthritis', 2082),
 ('copd', 1943),
 ('acid reflux', 1932),
 ('anemia', 1410),
 ('chf', 1377),
 ('dementia', 1251),
 ('seizure', 1189),
 ('cva', 1186),
 ('schizophrenia', 1101),
 ('atrial fibrillation', 1056),
 ('substance misuse', 956),
 ('gout', 934),
 ('kidney stones', 923),
 ('enlarged prostate', 855),
 ('osteoporosis', 791),
 ('cholesterol', 780),
 ('immunizations up to date', 727),
 ('adhd', 723),
 ('afib', 700),
 ('migraines', 697),
 ('high chol', 684),
 ('bipolar', 678),
 ('chronic back pain', 634),
 ('pacemaker/cardioverter/defib', 620),
 ('cad', 609)]

In [82]:
#this is a dictionary mapping the rank of the diagnosis to its name
dx_rank_dict = {item[0]:i+1 for i,item in enumerate(diagnoses.most_common())}

#this is a dictionary mapping the frequency of occurence to its name
dx_freq_dict = {item[0]:item[1]/len(diagnoses) for item in diagnoses.most_common()}

In [84]:
#trying to get a feel for how many diagoses are provided for each patient
split_hist_len = [len(item) for item in split_hist]
hxdf = pd.DataFrame(split_hist_len)

In [85]:
hxdf.describe()

Unnamed: 0,0
count,136993.0
mean,1.72273
std,1.60856
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,27.0


In [86]:
lengths = Counter(split_hist_len); lengths

Counter({0: 14782,
         1: 74900,
         6: 2387,
         4: 7667,
         5: 4428,
         2: 18245,
         3: 11959,
         8: 652,
         10: 180,
         9: 343,
         7: 1208,
         13: 38,
         11: 101,
         12: 51,
         15: 19,
         17: 7,
         21: 2,
         16: 6,
         14: 13,
         18: 2,
         20: 1,
         22: 1,
         27: 1})

In [87]:
#most patients have only a single diagnosis, 80% have 2 or less, 96% have 6 or less
for k,v in sorted(lengths.items()):
    print (k, '{0:2f}%'.format(v*100/len(split_hist_len)))

0 10.790332%
1 54.674326%
2 13.318199%
3 8.729643%
4 5.596636%
5 3.232282%
6 1.742425%
7 0.881797%
8 0.475937%
9 0.250378%
10 0.131394%
11 0.073726%
12 0.037228%
13 0.027739%
14 0.009490%
15 0.013869%
16 0.004380%
17 0.005110%
18 0.001460%
20 0.000730%
21 0.001460%
22 0.000730%
27 0.000730%


In [88]:
#this will create a list for each item where the top 6 diagnoses are listed in order
ordered_hist = []
for item in split_hist:
    #print (item)
    #print (len(item))
    top_6 = []
    for hx in item:
        #print (hx)
        #print (dx_rank_dict[hx])
        #print (dx_rank_dict[hx], hx)
        if item == 'nan':
            top_6.append(np.nan)
        else:
            top_6.append((dx_rank_dict[hx], hx))
    #print (top_6)
    sort = sorted(top_6)
    #print (sort)
    sort2 = [ii for i,ii in sort]
    #print(sort2)
    if len(sort2) > 6:
        sort2 = sort2[:5]
    if len(sort2) < 6:
        pad_req = 6-len(sort2)
        i = 0
        while i < pad_req:
            sort2.append(np.nan) #('n/a')
            i = i+1
        
    #print (top_6)
    #print (sorted(top_6))
    ordered_hist.append(sort2)
    #print ('next item')
    #print ()

In [89]:
medhx_df = pd.DataFrame(ordered_hist, columns = ['medhx' + str(i+1) for i in range(6)])
medhx_df.shape

(136993, 6)

In [90]:
ordered_hist[2000:2020]

[['hypertension', 'acid reflux', nan, nan, nan, nan],
 ['hypertension', 'high cholesterol', 'depression', nan, nan, nan],
 ['hypertension', 'thyroid', 'arthritis', nan, nan, nan],
 [nan, nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['depression', 'anxiety', nan, nan, nan, nan],
 ['pericarditis', nan, nan, nan, nan, nan],
 ['anxiety', nan, nan, nan, nan, nan],
 ['asthma', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 [nan, nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['hypertension', 'arthritis', 'chronic back pain', nan, nan, nan],
 ['hypertension', 'high cholesterol', 'iddm', 'dementia', 'cva', nan],
 ['no significant medical history', 'depression', nan, nan, nan, nan],
 ['asthma', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['hypertension', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan,

In [91]:
comorbids = []

for item in ordered_hist:
    res = [x for x in item if str(x) != 'nan']
    if len(res) == 0: comorbids.append(np.nan)
    else:
        res = [x for x in res if x != 'no significant medical history']
        #print (res, len(res))
        comorbids.append(len(res))

In [92]:
#so now I want to add this to the tabular data frame
tab_df['num_comorbids'] = comorbids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


### now I'm going to make a couple of balanced datasets

In [95]:
Counter(clin.target), Counter(clin.target3)

(Counter({1: 120106, 3: 2272, 2: 13854, 4: 761}),
 Counter({1: 120106, 0: 16887}))

In [96]:
dc_df = clin[clin.target3==1]
dc_df = dc_df.sample(n= 11494)  #this is the number of admissions from target 3

admit_df = clin[clin.target3 == 0]

In [97]:
balanced_df1 = pd.concat([admit_df, dc_df], axis = 0)

In [98]:
icu_df = clin[clin.target2 == 4]
madmit_df = clin[clin.target2 ==3].sample(n = len(icu_df))
sadmit_df = clin[clin.target2 ==2].sample(n = len(icu_df))
disc_df = clin[clin.target2 == 1].sample(n = len(icu_df))

In [99]:
#this is a dataframe with equal numbers of all 4 classes
balanced_df2 = pd.concat([icu_df, madmit_df, sadmit_df, disc_df], axis = 0)

In [100]:
non_icu_df = clin[clin.target2 !=4].sample(n = len(icu_df))

In [101]:
balanced_df3 = pd.concat([icu_df, non_icu_df], axis = 0)

In [102]:
balanced_df3.columns

Index(['ID', 'ChartNumber', 'EncounterNumber', 'TriageLevel', 'AgeNumber',
       'AgeInYrs', 'GenderDesc', 'Triage Date & Time', 'Reg Date & Time',
       'PIA Date & Time', 'Disposition Date & Time', 'DischargeDisposition',
       'DischargeDispositionDesc', 'Left ED Date & Time',
       'PresentingComplaint', 'PresentingComplaintDesc', 'MainDiagnosisCode',
       'MainDiagnosisCodeDesc', 'AdmitLocation', 'PatientService',
       'SubjectiveNotes', 'InfectionControlScreening', 'MedicalHistory',
       'BloodPressure_LastEDReading', 'O2Saturation_LastEDReading',
       'Pulse_LastEDReading', 'Temperature_LastEDReading', 'o2sat', 'pulse',
       'temp', 'CleanSubjectiveNotes', 'BP', 'systolic', 'diastolic', 'Gender',
       'outcome', 'target', 'service', 'target2', 'discharge', 'target3',
       'dispo', 'target4', 'ICUvsother', 'target5', 'pmhx'],
      dtype='object')

## saving dataframes to csv files

In [103]:
balanced_df1.to_csv(data_path + '/balanced_admit_dc_nlp_data.csv')

balanced_df2.to_csv(data_path +'/balanced_4cls_nlp_data.csv')

balanced_df3.to_csv(data_path +'/balanced_icuvsother_nlp_data.csv')

target_df.to_csv(data_path  + '/targets.csv')

screening_df.to_csv(data_path + '/inf_control_data.csv')

tab_df.to_csv(data_path + '/tabular_data.csv')

date_df.to_csv(data_path + '/data_data.csv')

subj_df.to_csv(data_path + '/subj_data.csv')

medhx_df.to_csv(data_path + '/med_hx.csv')