In [1]:
#run this cell when online
!pip install xlrd

[33mYou are using pip version 10.0.1, however version 19.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## install dependencies and set data paths

In [11]:
import pandas as pd
import numpy as np

import os
import re
from collections import Counter

In [12]:
#use this cell when working online
path = '/floyd/home/ed-triage'
data_path = '/floyd/home/data'

In [13]:
#use this cell when working from home
path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage'
data_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/data/ED triage project/egh'

In [111]:
#loading in the entire spreadsheet as a dataframe
clin = pd.read_excel(data_path + '/EGH ED visits FY1819_with additions.xlsx')

In [112]:
len(clin)

85154

In [113]:
clin.columns

Index(['ID', 'ChartNumber', 'EncounterNumber', 'TriageLevel', 'AgeNumber',
       'AgeInYrs', 'GenderDesc', 'Triage Date & Time', 'Reg Date & Time',
       'PIA Date & Time', 'Disposition Date & Time', 'DischargeDisposition',
       'DischargeDispositionDesc', 'Left ED Date & Time',
       'PresentingComplaint', 'PresentingComplaintDesc', 'MainDiagnosisCode',
       'MainDiagnosisCodeDesc', 'AdmitLocation', 'PatientService',
       'SubjectiveNotes', 'InfectionControlScreening', 'MedicalHistory',
       'BloodPressure_LastEDReading', 'O2Saturation_LastEDReading',
       'Pulse_LastEDReading', 'Temperature_LastEDReading'],
      dtype='object')

## The next few cells are for getting a feel of the data in the different columns and how they might be useful

In [9]:
discharge_count = Counter(clin['DischargeDisposition']); discharge_count

Counter({17: 70703,
         62: 762,
         7: 11199,
         6: 374,
         16: 493,
         40: 226,
         8: 242,
         72: 113,
         63: 250,
         30: 346,
         64: 171,
         12: 242,
         71: 3,
         14: 3,
         61: 12,
         90: 10,
         13: 3,
         9: 2})

In [10]:
discharge_count2 = Counter(clin['DischargeDispositionDesc']); discharge_count2

Counter({'Discharge to private home, condo, apt without support service/referral': 70703,
         'Left at his/her own risk post-initial treatment': 762,
         'Admit to reporting facility as inpatient to another unit from amb care': 11199,
         'Admit to reporting facility as inpatient to SCU or OR from amb care': 374,
         'Discharge to private home, condo, apt with support service/referral': 493,
         'Transfer to Group/supportive living': 226,
         'Transfer to another acute care facility directly from amb care': 242,
         'Died in Facility': 113,
         'Left After Triage': 250,
         'Transfer to Residential care': 346,
         'Left After Initial Assessment': 171,
         'Intra-facility transfer to day surgery': 242,
         'Dead on arrival': 3,
         'Intra-facility transfer to clinic': 3,
         'Left at his/her own risk following registration': 12,
         'Transfer to correctional facility': 10,
         'Intra-facility transfer to ED'

In [11]:
comp_count = Counter(clin['PatientService']); comp_count

Counter({nan: 73660,
         'Cardiology': 1550,
         'ALC General Medicine': 201,
         'General Medicine': 4512,
         'INTENSIVE CARE UNIT': 253,
         'Respirology': 852,
         'General Surgery': 650,
         'Paediatrics': 882,
         'Mental Health': 1198,
         'Orthopaedics': 430,
         'ALC Neurology': 64,
         'Genitourinary': 182,
         'Neurology': 456,
         'Palliative': 65,
         'ALC Respirology': 28,
         'Oncology': 38,
         'ALC Orthopaedics': 23,
         'ALC Cardiology': 38,
         'AD PALLIATIVE': 45,
         'AD General Medicine': 2,
         'Obstetrics': 17,
         'AD Respirology': 1,
         'AD Oncology': 1,
         'ALC General Surgery': 2,
         'Neonatal Retro Transfers': 1,
         'ALC Oncology': 1,
         'AD Neurology': 1,
         'Newborn': 1})

## refactoring some of the cells so they are clean and useful

In [114]:
o2sat = [item if item[-2:] != 'ra' else item[:-2] for item in clin['O2Saturation_LastEDReading'].astype('str')]
o2sat = [np.nan if (len(item) <2 or len(item) >3) else float(item) for item in o2sat] 
o2sat = [np.nan if item == 'nan' else float(item) for item in o2sat] 
clin['o2sat'] = o2sat

In [115]:
pulse = clin['Pulse_LastEDReading']
pulse = [re.sub("[^0-9]", "", str(item)) for item in pulse]
pulse = [np.nan if item == '' else float(item) for item in pulse] 
#set(pulse)
clin['pulse'] = pulse

In [116]:
temp = clin['Temperature_LastEDReading']
temp = [re.sub("[^0-9]", "", str(item)) for item in temp]
temp = [np.nan if item == '' else float(item) for item in temp] 
temp = [item/10 if item > 100 else item for item in temp]
temp = [np.nan if item > 40  else item for item in temp]
#set(temp)
clin['temp'] = temp

In [117]:
def bptrans(bp):
    if pd.isnull(bp) or len(bp.split('/')) !=2:
        return [np.nan, np.nan]
    res = []
    for x in bp.split('/'):
        try:
            float(x)
            res =  [float(x) for x in bp.split('/')]
        except: 
            res =  [np.nan, np.nan]
        return res


In [141]:
#this is to clean the text in the subjective notes column (a little bit)
def fixencode(s):
    if pd.isnull(s):
        s = "None"
    s = s[14:]
    s=s.replace('<LT><LF>','')
    s= s.replace('<LT>LF>','')
    s = s.lower()
    s=s.replace('Pt.', 'patient')
    s=s.replace('Pt', 'patient')
    s= s.replace('pt.', 'patient')
    s=s.replace('pt', 'patient')
    s = s.replace('y/o', 'year old')
    s = s.replace('c/o', 'complains of')
    s = s.replace('C/o', 'complains of')
    #s = s.replace('rt', 'right')
    #s = s.replace('lt', 'left')
    s = s.replace('sob', 'shortness of breath')
    s = s.replace('c/p', 'chest pain')
    s = s.replace('hrs', 'hours')
    s = s.replace('hx', 'history')
    s = s.replace('n/v', 'nausea and/or vomiting')
    s = s.replace('a/e', 'air entry')
    s = s.replace('a/o', 'alert and oriented')
    s = s.replace('a&o', 'alert and oriented')
    s = s.replace('d/c', 'discharge')
    s = s.replace('u/s', 'ultrasound')
    s = s.replace('yrs', 'years')
    s = s.replace('lmp', 'last menstrual period')
    s = s.replace('w/', 'with')
    
    #others to deal with cp, rt, lt
    
    #note that what's missing here is removal of punctuation
    return s

In [142]:
clin['CleanSubjectiveNotes'] = clin.SubjectiveNotes.map(fixencode)

In [147]:
for item in clin['CleanSubjectiveNotes'].iloc[30000:30010]:
    print (item)

patient complains of vomitting blood 30 minutes ago. patient 14 weeks pregnant. patient also complains of  lower abdominal pain.
patient complains of feeling " heated up and itchy ". patient here also to see a new psychatrist. patient also complains of hearing voices screaming his name.
staes @ work place on friday accidently hit her chest regrigrator door, pain increases on deep breathing.
patient complains of chest pain x1 week and seen to urgent care clinic prescribed medication as diagnosis with gerd. patient stopped taking medication and states return of pain. patient resumed medication but states pain still present.
patient complains of headache, chest pain, and left knee pain.

patient complains of abdominal pain x2 weeks post playing soccer. tonight patient complained of abdominal pain even when not playing soccer to parents. patient complains of upper abdominal pain. no change in diet.
patient had neck abcsess removed 2 weeks ago friday. patient here for dressing to back of ne

In [122]:
clin['BP'] = clin.BloodPressure_LastEDReading.map(bptrans)

clin[['systolic', 'diastolic']] = pd.DataFrame(clin.BP.tolist(), index= clin.index)
clin["Gender"] = clin.GenderDesc.map(lambda x: 1 if x == "Male" else 0)

## split the screening questions
- result is a df called: screening_df

In [152]:
qlist = ['Are you feeling feverish or have had shakes or chills in the last 24 hours?',
         'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?',
        'Do you have a new Rash?',
        'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?',
         'Have you travelled outside of Canada/USA in the last 3 weeks?',
         'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?',
         'Have you received Health Care in another country in the last 2 years?',
        'Do you have a new/worse cough or shortness of breath?',
         'If so, select all countries that apply',
        'If so, select all infectious diseases that apply']

In [153]:
#making a dict so that the indect of an item on the list, matches the question later
q_dict = {i:q for i,q in enumerate(qlist)}

In [154]:
q_dict

{0: 'Are you feeling feverish or have had shakes or chills in the last 24 hours?',
 1: 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?',
 2: 'Do you have a new Rash?',
 3: 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?',
 4: 'Have you travelled outside of Canada/USA in the last 3 weeks?',
 5: 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?',
 6: 'Have you received Health Care in another country in the last 2 years?',
 7: 'Do you have a new/worse cough or shortness of breath?',
 8: 'If so, select all countries that apply',
 9: 'If so, select all infectious diseases that apply'}

In [155]:
#this is going to be a list made of up all the entries in the infection control screening column
#casting the results as string to manipulate later
qans = list(clin['InfectionControlScreening'].str[14:].astype('str'))

In [156]:
#this is going to give me a way to split the questions by finding where they occur in the field
split_points = []
for item in qans:
    temp_points = []
    for word in qlist:
        temp_points.append(item.find(word))
    split_points.append(temp_points)
    

In [157]:
#this is going to give me list of places to split each entry (using only the questions that are actually present)
real_points = []
for item in split_points:
    real_points.append([idx for idx in sorted(item) if idx > 0])

In [158]:
split_points[10000:10010]

[[241, 432, 400, 325, 0, 70, 558, 179, -1, -1],
 [241, 432, 400, 325, 0, 70, 558, 179, -1, -1],
 [241, 432, 400, 325, 0, 70, 558, 179, -1, -1],
 [241, 432, 400, 325, 0, 70, 558, 179, -1, -1],
 [241, 432, 400, 325, 0, 70, 558, 179, -1, -1],
 [241, 432, 400, 325, 0, 70, 558, 179, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1],
 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 [241, -1, 400, 325, 0, 70, -1, 179, -1, -1]]

In [159]:
#anything from the list above which would have been empty in the original data
#gives a list of -1's in split_points and nothing in real_points
real_points[10000:10010]

[[70, 179, 241, 325, 400, 432, 558],
 [70, 179, 241, 325, 400, 432, 558],
 [70, 179, 241, 325, 400, 432, 558],
 [70, 179, 241, 325, 400, 432, 558],
 [70, 179, 241, 325, 400, 432, 558],
 [70, 179, 241, 325, 400, 432, 558],
 [70, 179, 241, 325, 400],
 [70, 179, 241, 325, 400],
 [],
 [70, 179, 241, 325, 400]]

In [160]:
print(qans[1000])
print()
print (split_points[1000])
print ()
print(real_points[1000])

Have you travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>Do you have a new/worse cough or shortness of breath? N<LT>LF>Are you feeling feverish or have had shakes or chills in the last 24 hours? N<LT>LF>Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N<LT>LF>Do you have a new Rash? N<LT>LF>Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting? N

[241, 432, 400, 325, 0, 70, -1, 179, -1, -1]

[70, 179, 241, 325, 400, 432]


In [161]:
#this should mutate qans so each item is now a list of q and a's
for idx in range(len(qans)):
    qans[idx] = [qans[idx][i : j] for i, j in zip([0] + real_points[idx], real_points[idx] + [None])]
    

In [162]:
qans[1000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>',
 'Do you have a new/worse cough or shortness of breath? N<LT>LF>',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N<LT>LF>',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N<LT>LF>',
 'Do you have a new Rash? N<LT>LF>',
 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting? N']

In [163]:
qans[10000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N<LT>LF>',
 'Do you have a new/worse cough or shortness of breath? N<LT>LF>',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N<LT>LF>',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N<LT>LF>',
 'Do you have a new Rash? N<LT>LF>',
 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting? N<LT>LF>',
 'Have you received Health Care in another country in the last 2 years? N']

In [164]:
#this will mutate each item of each list in qans and drop the <LT>LF> if present
for q in qans:
    for idx in range(len(q)):
        if q[idx].find('<LT>LF>') != -1:
            q[idx] = q[idx][:-7]
        

In [165]:
qans[1000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N',
 'Do you have a new/worse cough or shortness of breath? N',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N',
 'Do you have a new Rash? N',
 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting? N']

In [166]:
qans[10000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N',
 'Do you have a new/worse cough or shortness of breath? N',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N',
 'Do you have a new Rash? N',
 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting? N',
 'Have you received Health Care in another country in the last 2 years? N']

In [167]:
qans[1000]

['Have you travelled outside of Canada/USA in the last 3 weeks? N',
 'Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks? N',
 'Do you have a new/worse cough or shortness of breath? N',
 'Are you feeling feverish or have had shakes or chills in the last 24 hours? N',
 'Do you have a new onset of Vomiting/Diarrhea in the last 24 hours? N',
 'Do you have a new Rash? N',
 'Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting? N']

In [168]:
seg_list = []
for q in qans:
    #print ('initial:', q)
    test_question = q
    res = []
    for item in test_question:
        #print (item)
        for qq in qlist:
            if item.find(qq) != -1:
                #print ('question:',item[:len(qq)])
                #print ('answer:',item[len(qq)+1:])
                res.append([item[:len(qq)],item[len(qq)+1:] ])
    seg_list.append(res)
    #print ('\nresult:',res, '\n\n')

In [169]:
len(seg_list)

85154

In [170]:
seg_list2 = []
for q in qans:
    #print ('initial:', q)
    test_question = q
    res = {}
    for item in test_question:
        #print (item)
        for qq in qlist:
            if item.find(qq) != -1:
                question = item[:len(qq)]
                answer = item[len(qq)+1:]
                res[question] = answer
    seg_list2.append(res)

In [171]:
final_list = []
for q in range(len(split_points)):
    temp = []
    for i in range(len(split_points[q])):
        if split_points[q][i] == -1:
            #print('nan')
            temp.append('nan')
        else:
            #print (brief_list2[q][q_dict[i]])  # q_dict[i], i, brief_splits[q][i],
            temp.append(seg_list2[q][q_dict[i]])
    final_list.append(temp)
    #print ('\n next item \n')
            

In [172]:
len(final_list)

85154

In [173]:
final_list[100:110]

[['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan']]

In [174]:
final_list[1000:1020]

[['N', 'N', 'N', 'N', 'N', 'N', 'nan', 'N', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'nan', 'Y', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'nan', 'N', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'nan', 'N', 'nan', 'nan'],
 ['N', 'nan', 'N', 'N', 'N', 'N', 'nan', 'N', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'nan', 'Y', 'nan', 'nan'],
 ['N', 'N', 'nan', 'N', 'N', 'N', 'N', 'nan', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'nan', 'nan'],
 ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'nan', 'nan'],
 ['nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'

In [175]:
screening_df = pd.DataFrame(final_list, columns = qlist)
screening_df.head()

Unnamed: 0,Are you feeling feverish or have had shakes or chills in the last 24 hours?,Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?,Do you have a new Rash?,Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?,Have you travelled outside of Canada/USA in the last 3 weeks?,Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?,Have you received Health Care in another country in the last 2 years?,Do you have a new/worse cough or shortness of breath?,"If so, select all countries that apply","If so, select all infectious diseases that apply"
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,


In [176]:
screening_df.shape

(85154, 10)

In [177]:
screening_df.iloc[80000:80010]

Unnamed: 0,Are you feeling feverish or have had shakes or chills in the last 24 hours?,Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?,Do you have a new Rash?,Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?,Have you travelled outside of Canada/USA in the last 3 weeks?,Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?,Have you received Health Care in another country in the last 2 years?,Do you have a new/worse cough or shortness of breath?,"If so, select all countries that apply","If so, select all infectious diseases that apply"
80000,Y,,N,N,N,N,,Y,,
80001,N,,N,N,N,N,,N,,
80002,N,,N,N,N,N,,N,,
80003,,,,,,,,,,
80004,N,,N,N,N,N,,N,,
80005,N,,Y,N,N,N,,N,,
80006,,,,,,,,,,
80007,Y,,N,N,N,N,,Y,,
80008,,,,,,,,,,
80009,N,,N,N,N,,,Y,,


In [178]:
screening_df.replace('nan',np.NaN, inplace = True)

In [179]:
screening_df.iloc[80000:80010]

Unnamed: 0,Are you feeling feverish or have had shakes or chills in the last 24 hours?,Have you ever been isolated/required isolation for an infectious disease when receiving care in a healthcare setting?,Do you have a new Rash?,Do you have a new onset of Vomiting/Diarrhea in the last 24 hours?,Have you travelled outside of Canada/USA in the last 3 weeks?,Have you had contact with a sick person who has travelled outside of Canada/USA in the last 3 weeks?,Have you received Health Care in another country in the last 2 years?,Do you have a new/worse cough or shortness of breath?,"If so, select all countries that apply","If so, select all infectious diseases that apply"
80000,Y,,N,N,N,N,,Y,,
80001,N,,N,N,N,N,,N,,
80002,N,,N,N,N,N,,N,,
80003,,,,,,,,,,
80004,N,,N,N,N,N,,N,,
80005,N,,Y,N,N,N,,N,,
80006,,,,,,,,,,
80007,Y,,N,N,N,N,,Y,,
80008,,,,,,,,,,
80009,N,,N,N,N,,,Y,,


## making the new target cells

In [180]:
admit_count= Counter(clin['AdmitLocation']); admit_count

Counter({nan: 73660,
         'E.OVE': 1102,
         'E.9MEDICAL': 307,
         'E.CCU': 426,
         'E.10.RESP': 1133,
         'E.MAU': 1700,
         'E.ICU': 237,
         'E.8SURGERY': 1164,
         'E.6TH': 764,
         'E.5TH': 919,
         'E.OVEMH': 279,
         'E.7TH West': 725,
         'E.9th MED': 454,
         'E.7TH East': 1544,
         'E.4TH': 16,
         'E.8MEDFLEX': 110,
         'E.SCN': 2,
         'E.9TH EAST': 246,
         'E.9TH WEST': 351,
         'E.7MEDFLEX': 15})

In [181]:
clin['outcome'] = clin['AdmitLocation'].astype('str').map(
{'nan': 'discharge',
         'E.OVE': 'madmit',
         'E.9MEDICAL': 'madmit',
         'E.CCU': 'madmit',
         'E.10.RESP': 'madmit',
         'E.MAU': 'madmit',
         'E.ICU': 'ICU',
         'E.8SURGERY': 'sadmit',
         'E.6TH': 'madmit',
         'E.5TH': 'madmit',
         'E.OVEMH': 'madmit',
         'E.7TH West': 'madmit',
         'E.9th MED': 'madmit',
         'E.7TH East': 'madmit',
         'E.4TH': 'madmit',
         'E.8MEDFLEX': 'madmit',
         'E.SCN': 'madmit',
         'E.9TH EAST': 'madmit',
         'E.9TH WEST': 'madmit',
         'E.7MEDFLEX': 'madmit'})

In [182]:
#first target columns is based on the admitting location
clin['target'] = clin['outcome'].map({'discharge': 1, 'madmit': 2, 'sadmit': 3, 'ICU': 4})

In [183]:
serv_count = Counter(clin['PatientService']); serv_count

Counter({nan: 73660,
         'Cardiology': 1550,
         'ALC General Medicine': 201,
         'General Medicine': 4512,
         'INTENSIVE CARE UNIT': 253,
         'Respirology': 852,
         'General Surgery': 650,
         'Paediatrics': 882,
         'Mental Health': 1198,
         'Orthopaedics': 430,
         'ALC Neurology': 64,
         'Genitourinary': 182,
         'Neurology': 456,
         'Palliative': 65,
         'ALC Respirology': 28,
         'Oncology': 38,
         'ALC Orthopaedics': 23,
         'ALC Cardiology': 38,
         'AD PALLIATIVE': 45,
         'AD General Medicine': 2,
         'Obstetrics': 17,
         'AD Respirology': 1,
         'AD Oncology': 1,
         'ALC General Surgery': 2,
         'Neonatal Retro Transfers': 1,
         'ALC Oncology': 1,
         'AD Neurology': 1,
         'Newborn': 1})

In [184]:
clin['service'] = clin['PatientService'].astype('str').map(
    {'nan': 'discharge',
         'Cardiology': 'madmit',
         'ALC General Medicine': 'madmit',
         'General Medicine': 'madmit',
         'INTENSIVE CARE UNIT': 'ICU',
         'Respirology': 'madmit',
         'General Surgery': 'sadmit',
         'Paediatrics': 'madmit',
         'Mental Health': 'madmit',
         'Orthopaedics': 'sadmit',
         'ALC Neurology': 'madmit',
         'Genitourinary': 'sadmit',
         'Neurology': 'madmit',
         'Palliative': 'madmit',
         'ALC Respirology': 'madmit',
         'Oncology': 'madmit',
         'ALC Orthopaedics': 'sadmit',
         'ALC Cardiology': 'madmit',
         'AD PALLIATIVE': 'madmit',
         'AD General Medicine': 'madmit',
         'Obstetrics': 'madmit',
         'AD Respirology': 'madmit',
         'AD Oncology': 'madmit',
         'ALC General Surgery': 'sadmit',
         'Neonatal Retro Transfers': 'madmit',
         'ALC Oncology': 'madmit',
         'AD Neurology': 'madmit',
         'Newborn': 'madmit'})

In [185]:
service_count = Counter(clin['service']); service_count

Counter({'discharge': 73660, 'madmit': 9954, 'ICU': 253, 'sadmit': 1287})

In [186]:
#second target is based on the admitting service
clin['target2'] = clin['service'].map({'discharge': 1, 'madmit': 2, 'sadmit': 3, 'ICU': 4})

In [187]:
Counter(clin['target2'])

Counter({1: 73660, 2: 9954, 4: 253, 3: 1287})

In [188]:
#creating this for later use to map the numbers in the dataframe column to their original meaning
out_dict = {'discharge': 1, 'madmit': 2, 'sadmit': 3, 'ICU': 4}
out_dict = {v:k for k,v in out_dict.items()}
out_dict

{1: 'discharge', 2: 'madmit', 3: 'sadmit', 4: 'ICU'}

In [189]:
#discharge vs. not target
clin['discharge'] = clin['outcome'].map({'discharge': 'discharge', 'madmit': 'admit', 'sadmit': 'admit', 'ICU': 'admit'})

In [190]:
clin['target3'] = clin['outcome'].map({'discharge': 1, 'madmit': 0, 'sadmit': 0, 'ICU': 0})

In [191]:
Counter(clin['discharge']), Counter(clin['target3'])

(Counter({'discharge': 73660, 'admit': 11494}), Counter({1: 73660, 0: 11494}))

In [192]:
#a target for trying to tell apart the inpatient disposition, only in admitted patients
clin['dispo']= clin['outcome'].map({'discharge': np.nan, 'madmit': 'madmit', 'sadmit': 'sadmit', 'ICU': 'ICU'})

In [193]:
clin['target4'] = clin['dispo'].map({np.nan:np.nan, 'madmit': 1, 'sadmit': 2, 'ICU': 3})
#for some reason a can't make a counter out of this without it crashing so I've stopped trying

In [194]:
Counter(clin['dispo']), Counter(clin['target4'].dropna())

(Counter({nan: 73660, 'madmit': 10093, 'ICU': 237, 'sadmit': 1164}),
 Counter({1.0: 10093, 3.0: 237, 2.0: 1164}))

In [195]:
#gonna make a target that is ICU vs other
clin['ICUvsother'] = clin['PatientService'][clin['PatientService'] == 'INTENSIVE CARE UNIT']
#clin['PatientService']

In [196]:
clin['target5'] = clin['ICUvsother'].map({'INTENSIVE CARE UNIT':1, np.nan:0})

In [197]:
sorted(clin.columns)

['AdmitLocation',
 'AgeInYrs',
 'AgeNumber',
 'BP',
 'BloodPressure_LastEDReading',
 'ChartNumber',
 'CleanSubjectiveNotes',
 'DischargeDisposition',
 'DischargeDispositionDesc',
 'Disposition Date & Time',
 'EncounterNumber',
 'Gender',
 'GenderDesc',
 'ICUvsother',
 'ID',
 'InfectionControlScreening',
 'Left ED Date & Time',
 'MainDiagnosisCode',
 'MainDiagnosisCodeDesc',
 'MedicalHistory',
 'O2Saturation_LastEDReading',
 'PIA Date & Time',
 'PatientService',
 'PresentingComplaint',
 'PresentingComplaintDesc',
 'Pulse_LastEDReading',
 'Reg Date & Time',
 'SubjectiveNotes',
 'Temperature_LastEDReading',
 'Triage Date & Time',
 'TriageLevel',
 'diastolic',
 'discharge',
 'dispo',
 'o2sat',
 'outcome',
 'pulse',
 'service',
 'systolic',
 'target',
 'target2',
 'target3',
 'target4',
 'target5',
 'temp']

## splitting into a bunch of sub dataframes and saving them as their own csv files
- loading the excel file is very slow, but loading csv files is faster so I'm making a bunch of dataframes of subsets of data for easy saving/loading/combining

In [198]:
tab_df = clin[['ID', 'TriageLevel',
       'AgeInYrs', 'GenderDesc', 'DischargeDisposition',
       'PresentingComplaint', 'PresentingComplaintDesc', 'AdmitLocation', 'PatientService',
       'BloodPressure_LastEDReading','systolic', 'diastolic','temp','pulse','o2sat']]

In [200]:
target_df = clin[['outcome','target', 'service','target2', 'discharge', 'target3','dispo','target4', 'ICUvsother', 'target5']]

In [201]:
date_df = clin[['Triage Date & Time', 'Reg Date & Time',
       'PIA Date & Time', 'Disposition Date & Time', 'Left ED Date & Time']]

## parsing of the medical history into categories

In [204]:
histories = Counter(clin['MedicalHistory']); len(histories)

24733

In [205]:
history = list(clin['MedicalHistory'].str[16:].astype('str'))

In [277]:
for item in history[1000:1050]:
    print (item)

No Significant Medical History
nan
Anemia, depression , HTN, schzophrenia, High cholesterol, NIDDM,
nan
No Significant Medical History
hypothyroid
chf,niddm.high chol
No Significant Medical History
nan
nan
Kidney Stones
No Significant Medical History
No Significant Medical History
vitamin D defiency
No Significant Medical History
nan
Select from list or enter free textgerd/dementia/<LT>LF>bph<LT>LF>gerd/dementia/bph
chf<LT>LF>iddm, htn<LT>LF>enlarged liver
nan
High Cholesterol<LT>LF>Colitis<LT>LF>Diabetes
Vertigo<LT>LF>Hypertension (HTN)
No Significant Medical History
Hypertension (HTN)<LT>LF>kidney <LT>LF>exzema<LT>LF>Diabetes
iddm,htn,
No Significant Medical History
Hypertension (HTN)<LT>LF>High Cholesterol
High Cholesterol
schizophrenia
htn, high cholestrol, glaucoma <LT>LF>thyroid problem
nan
nan
PE
Hypertension (HTN)<LT>LF>Diverticulitis
nan
nan
nan
nan
Breast Cancer
Hypertension (HTN)<LT>LF>NIDDM (Non-Insulin-Dependent Diabetes Mellitus)<LT>LF>High Cholesterol
constipation
G3P0, 

In [274]:
#going through the medical histories field and splitting into a list of co-morbid conditions
#split_hist = [item.split('<LT>LF>') for item in history]# if item != 'nan']

In [400]:
replacements = (',','<LT>LF>', '.')
split_hist = []
for item in history:
    for r in replacements:
        item = item.replace(r, '::')
    item = item.split('::')
    item = [re.sub(r'\(.*?\)\ *', '', s) for s in item]
    
    item = [x.strip().lower() for x in item if x != 'nan']
    
    item = list(filter(None, item))
    split_hist.append(item)

In [401]:
#rejoining them as a string with diagnoses separated by commas
join_hist = [','.join(item) for item in split_hist]

In [426]:
#replacing empty string with NaN
join_hist = [np.nan if item == 'nan' else item for item in join_hist]

In [427]:
len(join_hist)

85154

In [428]:
join_hist[2000:2020]

['urterine fibroids',
 'strep throat',
 'no significant medical history',
 'no significant medical history',
 'niddm',
 'no significant medical history',
 'uti 3-4wk ago treated with abx,eczema',
 nan,
 'asthma,hypothyroid,hypertension',
 'no significant medical history',
 'asthma,atrial fibrillation,hypothyroid',
 'born 38wks,c-section',
 'no significant medical history',
 'no significant medical history',
 'no significant medical history',
 'chf,arthritis,acute renal failure,hypothyroid,high cholesterol,atherosclerosis',
 'niddm,hypertension,chronic back pain',
 'one kidney,htn,anxiety,depression',
 'no significant medical history',
 'no significant medical history']

In [429]:
clin['pmhx'] = join_hist

In [430]:
subj_df = clin[['SubjectiveNotes', 'MedicalHistory', 'pmhx']]

### exploring different diagnoses

In [406]:
diagnoses = Counter()
for hx in split_hist:
    for item in hx:
        diagnoses[item] += 1

In [407]:
#there are 20464 different medical histories used in the dataset!!!
len(diagnoses)

13194

In [409]:
#sorted diagnoses
diagnoses.most_common(40)

[('no significant medical history', 32253),
 ('hypertension', 8662),
 ('high cholesterol', 8225),
 ('htn', 7489),
 ('niddm', 6534),
 ('asthma', 3226),
 ('depression', 2681),
 ('high bp', 2122),
 ('anxiety', 2044),
 ('hypothyroid', 2001),
 ('thyroid', 1964),
 ('gerd', 1896),
 ('iddm', 1867),
 ('arthritis', 1130),
 ('high chol', 1069),
 ('dementia', 1054),
 ('mi', 958),
 ('copd', 882),
 ('anemia', 855),
 ('chf', 847),
 ('acid reflux', 766),
 ('cva', 689),
 ('cholesterol', 682),
 ('gout', 647),
 ('seizure', 625),
 ('afib', 601),
 ('chol', 582),
 ('osteoporosis', 575),
 ('diabetes', 563),
 ('atrial fibrillation', 484),
 ('schizophrenia', 475),
 ('enlarged prostate', 434),
 ('cad', 391),
 ('bph', 387),
 ('chronic back pain', 363),
 ('bipolar', 353),
 ('uti', 349),
 ('cholestrol', 334),
 ('migraines', 331),
 ('pacemaker', 327)]

In [410]:
#this is a dictionary mapping the rank of the diagnosis to its name
dx_rank_dict = {item[0]:i+1 for i,item in enumerate(diagnoses.most_common())}

#this is a dictionary mapping the frequency of occurence to its name
dx_freq_dict = {item[0]:item[1]/len(diagnoses) for item in diagnoses.most_common()}

In [454]:
split_hist[0:20]

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [411]:
#trying to get a feel for how many diagoses are provided for each patient
split_hist_len = [len(item) for item in split_hist]
hxdf = pd.DataFrame(split_hist_len)

In [412]:
hxdf.describe()

Unnamed: 0,0
count,85154.0
mean,1.603037
std,1.504511
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,22.0


In [413]:
lengths = Counter(split_hist_len); lengths

Counter({0: 10523,
         2: 10378,
         1: 48073,
         3: 7057,
         4: 4348,
         9: 191,
         6: 1188,
         7: 595,
         5: 2299,
         15: 4,
         8: 316,
         10: 87,
         14: 6,
         11: 51,
         12: 24,
         13: 7,
         16: 3,
         19: 1,
         22: 1,
         17: 1,
         18: 1})

In [414]:
#most patients have only a single diagnosis, 80% have 2 or less, 96% have 6 or less
for k,v in sorted(lengths.items()):
    print (k, '{0:2f}%'.format(v*100/len(split_hist_len)))

0 12.357611%
1 56.454189%
2 12.187331%
3 8.287338%
4 5.106043%
5 2.699814%
6 1.395119%
7 0.698734%
8 0.371092%
9 0.224300%
10 0.102168%
11 0.059891%
12 0.028184%
13 0.008220%
14 0.007046%
15 0.004697%
16 0.003523%
17 0.001174%
18 0.001174%
19 0.001174%
22 0.001174%


In [415]:
#this will create a list for each item where the top 6 diagnoses are listed in order
ordered_hist = []
for item in split_hist:
    #print (item)
    #print (len(item))
    top_6 = []
    for hx in item:
        #print (hx)
        #print (dx_rank_dict[hx])
        #print (dx_rank_dict[hx], hx)
        if item == 'nan':
            top_6.append(np.nan)
        else:
            top_6.append((dx_rank_dict[hx], hx))
    #print (top_6)
    sort = sorted(top_6)
    #print (sort)
    sort2 = [ii for i,ii in sort]
    #print(sort2)
    if len(sort2) > 6:
        sort2 = sort2[:5]
    if len(sort2) < 6:
        pad_req = 6-len(sort2)
        i = 0
        while i < pad_req:
            sort2.append(np.nan) #('n/a')
            i = i+1
        
    #print (top_6)
    #print (sorted(top_6))
    ordered_hist.append(sort2)
    #print ('next item')
    #print ()

In [420]:
medhx_df = pd.DataFrame(ordered_hist, columns = ['medhx' + str(i+1) for i in range(6)])
medhx_df.shape

(85154, 6)

In [458]:
ordered_hist[2000:2020]

[['urterine fibroids', nan, nan, nan, nan, nan],
 ['strep throat', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['niddm', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['eczema', 'uti 3-4wk ago treated with abx', nan, nan, nan, nan],
 [nan, nan, nan, nan, nan, nan],
 ['hypertension', 'asthma', 'hypothyroid', nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['asthma', 'hypothyroid', 'atrial fibrillation', nan, nan, nan],
 ['c-section', 'born 38wks', nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['no significant medical history', nan, nan, nan, nan, nan],
 ['high cholesterol',
  'hypothyroid',
  'arthritis',
  'chf',
  'atherosclerosis',
  'acute renal failure'],
 ['hypertension', 'niddm', 'chronic back pain', nan

In [477]:
comorbids = []

for item in ordered_hist:
    res = [x for x in item if str(x) != 'nan']
    if len(res) == 0: comorbids.append(np.nan)
    else:
        res = [x for x in res if x != 'no significant medical history']
        #print (res, len(res))
        comorbids.append(len(res))

In [481]:
#so now I want to add this to the tabular data frame
tab_df['num_comorbids'] = comorbids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


### now I'm going to make a couple of balanced datasets

In [431]:
Counter(clin.target), Counter(clin.target3)

(Counter({1: 73660, 2: 10093, 4: 237, 3: 1164}), Counter({1: 73660, 0: 11494}))

In [433]:
dc_df = clin[clin.target3==1]
dc_df = dc_df.sample(n= 11494)  #this is the number of admissions from target 3

admit_df = clin[clin.target3 == 0]

In [437]:
balanced_df1 = pd.concat([admit_df, dc_df], axis = 0)

In [444]:
icu_df = clin[clin.target2 == 4]
madmit_df = clin[clin.target2 ==3].sample(n = len(icu_df))
sadmit_df = clin[clin.target2 ==2].sample(n = len(icu_df))
disc_df = clin[clin.target2 == 1].sample(n = len(icu_df))

In [446]:
#this is a dataframe with equal numbers of all 4 classes
balanced_df2 = pd.concat([icu_df, madmit_df, sadmit_df, disc_df], axis = 0)

In [447]:
non_icu_df = clin[clin.target2 !=4].sample(n = len(icu_df))

In [449]:
balanced_df3 = pd.concat([icu_df, non_icu_df], axis = 0)

In [452]:
balanced_df3.columns

Index(['ID', 'ChartNumber', 'EncounterNumber', 'TriageLevel', 'AgeNumber',
       'AgeInYrs', 'GenderDesc', 'Triage Date & Time', 'Reg Date & Time',
       'PIA Date & Time', 'Disposition Date & Time', 'DischargeDisposition',
       'DischargeDispositionDesc', 'Left ED Date & Time',
       'PresentingComplaint', 'PresentingComplaintDesc', 'MainDiagnosisCode',
       'MainDiagnosisCodeDesc', 'AdmitLocation', 'PatientService',
       'SubjectiveNotes', 'InfectionControlScreening', 'MedicalHistory',
       'BloodPressure_LastEDReading', 'O2Saturation_LastEDReading',
       'Pulse_LastEDReading', 'Temperature_LastEDReading', 'o2sat', 'pulse',
       'temp', 'CleanSubjectiveNotes', 'BP', 'systolic', 'diastolic', 'Gender',
       'outcome', 'target', 'service', 'target2', 'discharge', 'target3',
       'dispo', 'target4', 'ICUvsother', 'target5', 'pmhx'],
      dtype='object')

## saving dataframes to csv files

In [483]:
balanced_df1.to_csv(data_path + '/balanced_admit_dc_nlp_data.csv')

balanced_df2.to_csv(data_path +'/balanced_4cls_nlp_data.csv')

balanced_df3.to_csv(data_path +'/balanced_icuvsother_nlp_data.csv')

target_df.to_csv(data_path  + '/targets.csv')

screening_df.to_csv(data_path + '/inf_control_data.csv')

tab_df.to_csv(data_path + '/tabular_data.csv')

date_df.to_csv(data_path + '/data_data.csv')

subj_df.to_csv(data_path + '/subj_data.csv')

medhx_df.to_csv(data_path + '/med_hx.csv')