In [1]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import csv
import math
import operator
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import os
import json
import asyncio
import requests
import aiohttp
from nltk.tokenize import RegexpTokenizer
tqdm.pandas()

### Full

In [3]:
data_path = '../data/'

In [23]:
dfproc = pd.read_csv(data_path + 'PROCEDURES_ICD.csv')
dfdiag = pd.read_csv(data_path + 'DIAGNOSES_ICD.csv')

In [24]:
dfproc.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,944,62641,154460,3,3404
1,945,2592,130856,1,9671
2,946,2592,130856,2,3893
3,947,55357,119355,1,9672
4,948,55357,119355,2,331


In [25]:
dfdiag.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


Put a period in the right place because the MIMIC-3 data files exclude them.Generally, procedure codes have dots after the first two digits, while diagnosis codes have dots after the first three digits.

In [19]:
def reformat(code, is_diag):
    code = ''.join(code.split('.'))
    if is_diag:
        if code.startswith('E'):
            if len(code) > 4:
                code = code[:4] + '.' + code[4:]
        else:
            if len(code) > 3:
                code = code[:3] + '.' + code[3:]
    else:
        code = code[:2] + '.' + code[2:]
    return code

In [28]:
dfdiag['ICD9_CODE'] = dfdiag.apply(lambda row: str(reformat(str(row[4]), True)), axis=1)
dfproc['ICD9_CODE'] = dfproc.apply(lambda row: str(reformat(str(row[4]), False)), axis=1)

In [29]:
dfcodes = pd.concat([dfdiag, dfproc])
dfcodes.to_csv(data_path + 'ALL_CODES.csv', index=False,
               columns=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'],
               header=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'])


In [30]:
dfcodes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,403.01
1,1298,109,172335,2.0,486.0
2,1299,109,172335,3.0,582.81
3,1300,109,172335,4.0,585.5
4,1301,109,172335,5.0,425.4


In [31]:
df = pd.read_csv(data_path + 'ALL_CODES.csv',dtype={"ICD9_CODE": str})
len(df['ICD9_CODE'].unique())

8994

In [11]:
#df = df[df['ICD9_CODE'].notna()]

### MAP

In [32]:
icd9codes = df['ICD9_CODE'].unique()
len(icd9codes)

8994

In [13]:
# mapping = {} 
# async def main():
#     count = 0
#     tasks = []
#     headers = {
#         "user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
#     async with aiohttp.ClientSession(headers=headers) as session:
#         for i in tqdm(range(len(icd9codes))):
#             icd_9 = icd9codes[i]
#             url = f'https://www.icd10data.com/Convert/{icd_9}'
#             async with session.get(url) as response:
#                 text = await response.text()
#                 tasks.append(text)
#         for i in tqdm(range(len(tasks))):
#             icd_9 = icd9codes[i]
#             url = f'https://www.icd10data.com/Convert/{icd_9}'
#             task = tasks[i]
#             soup = BeautifulSoup(task, 'html.parser')
#             icd_10 = soup.find_all("span", {"class": "identifier"})
#             if len(icd_10) == 0:
#                 # print("ERROR: No ICD10 code found for code: ",icd_9)
#                 print(url,end=" ")
#                 print(icd_10)
#                 icd_10 = icd_9
#             elif len(icd_10) == 1:
#                 icd_10 = str(icd_10[0]).split('>')[1].split('<')[0]
#             else:
#                 icd_10 = str(icd_10[1]).split('>')[1].split('<')[0]
#             mapping[icd_9] = icd_10
#     print(count)

# await main()

In [14]:
# with open('icd9to10mapping.json', 'w') as f:
#     json.dump(mapping, f)

In [4]:
f = open(data_path + 'icd9to10mapping.json','r')
mapping = json.load(f)

In [34]:
len(mapping.keys())

8994

In [38]:
def icd9to10(icd_9):
    if icd_9 not in mapping.keys():
        return icd_9
    return mapping[icd_9]
df['ICD10_CODE'] = df.apply(lambda row: icd9to10(row[4]), axis=1)

In [39]:
df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ICD10_CODE
0,1297,109,172335,1.0,403.01,I12.0
1,1298,109,172335,2.0,486.0,J18.9
2,1299,109,172335,3.0,582.81,N08
3,1300,109,172335,4.0,585.5,N18.5
4,1301,109,172335,5.0,425.4,I42.5


In [40]:
len(df['ICD9_CODE'].unique())

8994

In [41]:
df.to_csv(data_path + 'ALL_MAPPED_CODES.csv', index=False,
               columns=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE','ICD10_CODE'],
               header=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE','ICD10_CODE'])

In [42]:
df = pd.read_csv(data_path + 'ALL_MAPPED_CODES.csv',dtype={"ICD9_CODE": str})
len(df['ICD9_CODE'].unique())

8994

### Get Discharge Summary

In [43]:
noteeventsdf = pd.read_csv(data_path + 'NOTEEVENTS.csv')

  noteeventsdf = pd.read_csv(data_path + 'NOTEEVENTS.csv')


In [44]:
noteeventsdf.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [45]:
len(noteeventsdf.index)

2083180

In [46]:
dich_full = noteeventsdf[noteeventsdf['CATEGORY']=='Discharge summary']

In [47]:
len(dich_full.index)

59652

In [48]:
len(dich_full['HADM_ID'].unique())

52726

In [49]:
# dich_full = dich_full[dich_full['HADM_ID'].notna()]

In [50]:
dich_full = dich_full.sort_values(['SUBJECT_ID', 'HADM_ID'])

In [51]:
dich_full.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
48470,44005,3,145834.0,2101-10-31,,,Discharge summary,Report,,,Admission Date: [**2101-10-20**] Discharg...
4782,4788,4,185777.0,2191-03-23,,,Discharge summary,Report,,,Admission Date: [**2191-3-16**] Discharge...
24476,20825,6,107064.0,2175-06-15,,,Discharge summary,Report,,,Admission Date: [**2175-5-30**] Dischar...
22764,20070,9,150750.0,2149-11-13,,,Discharge summary,Report,,,Admission Date: [**2149-11-9**] Dischar...
57341,57115,9,150750.0,2149-11-14,,,Discharge summary,Addendum,,,"Name: [**Known lastname 10050**], [**Known fi..."


In [52]:
dich_full.to_csv(data_path + 'disch_full.csv', index=False)

In [53]:
del noteeventsdf

In [54]:
dich_full = pd.read_csv(data_path + 'disch_full.csv')
dich_full.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,44005,3,145834.0,2101-10-31,,,Discharge summary,Report,,,Admission Date: [**2101-10-20**] Discharg...
1,4788,4,185777.0,2191-03-23,,,Discharge summary,Report,,,Admission Date: [**2191-3-16**] Discharge...
2,20825,6,107064.0,2175-06-15,,,Discharge summary,Report,,,Admission Date: [**2175-5-30**] Dischar...
3,20070,9,150750.0,2149-11-13,,,Discharge summary,Report,,,Admission Date: [**2149-11-9**] Dischar...
4,57115,9,150750.0,2149-11-14,,,Discharge summary,Addendum,,,"Name: [**Known lastname 10050**], [**Known fi..."


### Sort

In [55]:
labeldf = pd.read_csv(data_path + 'ALL_MAPPED_CODES.csv',dtype={"ICD9_CODE": str})

In [56]:
labeldf = labeldf.drop(['ROW_ID'],axis=1)
labeldf = labeldf.sort_values(['SUBJECT_ID', 'HADM_ID'])
labeldf.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ICD10_CODE
608,2,163353,1.0,V30.01,Z38.01
609,2,163353,2.0,V05.3,Z23
610,2,163353,3.0,V29.0,Z05.1
746132,2,163353,1.0,99.55,3E0134Z
611,3,145834,1.0,038.9,A41.9


In [57]:
labeldf.to_csv(data_path + 'ALL_MAPPED_CODES.csv', index=False)

### Filter

In [58]:
labeldf = pd.read_csv(data_path + 'ALL_MAPPED_CODES.csv',dtype={"ICD9_CODE": str})
labeldf.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ICD10_CODE
0,2,163353,1.0,V30.01,Z38.01
1,2,163353,2.0,V05.3,Z23
2,2,163353,3.0,V29.0,Z05.1
3,2,163353,1.0,99.55,3E0134Z
4,3,145834,1.0,038.9,A41.9


In [59]:
len(dich_full['HADM_ID'].unique()), len(labeldf['HADM_ID'].unique())

(52726, 58976)

In [60]:
admission_ids = list(dich_full['HADM_ID'].unique()) 
labeldf = labeldf.loc[labeldf['HADM_ID'].isin(admission_ids)]

In [61]:
labeldf = labeldf[labeldf['ICD9_CODE'].notna()]

In [62]:
labeldf.to_csv(data_path + 'ALL_CODES_filtered.csv', index=False)

In [63]:
labeldf = pd.read_csv(data_path + 'ALL_CODES_filtered.csv',dtype={"ICD9_CODE": str})
len(labeldf['HADM_ID'].unique())

52722

In [64]:
labeldf.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ICD10_CODE
0,3,145834,1.0,38.9,A41.9
1,3,145834,2.0,785.59,R57.1
2,3,145834,3.0,584.9,N17.9
3,3,145834,4.0,427.5,I46.9
4,3,145834,5.0,410.71,I21.4


In [65]:
dich_full = pd.read_csv(data_path + 'disch_full.csv')
admission_ids = list(labeldf['HADM_ID'].unique()) 
dich_full = dich_full.loc[dich_full['HADM_ID'].isin(admission_ids)]

In [66]:
dich_full.to_csv(data_path + 'disch_full.csv', index=False)

In [67]:
dich_full = pd.read_csv(data_path + 'disch_full.csv')
len(dich_full['HADM_ID'].unique())

52722

### concatenate

In [68]:
labeldf = pd.read_csv(data_path + 'ALL_CODES_filtered.csv',dtype={"ICD9_CODE": str})

In [69]:
labeldf=labeldf.groupby(['SUBJECT_ID','HADM_ID']).aggregate([('ICD9_CODE',';'.join)],[('ICD10_CODE',';'.join)])

  labeldf=labeldf.groupby(['SUBJECT_ID','HADM_ID']).aggregate([('ICD9_CODE',';'.join)],[('ICD10_CODE',';'.join)])


In [70]:
labeldf = labeldf.droplevel(axis=1, level=1).reset_index()

In [71]:
labeldf.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,ICD10_CODE
0,3,145834,038.9;785.59;584.9;427.5;410.71;428.0;682.6;42...,A41.9;R57.1;N17.9;I46.9;I21.4;I50.814;L03.119;...
1,4,185777,042;136.3;799.4;276.3;790.7;571.5;041.11;V09.0...,B20;B59;R64;E87.3;R78.81;K74.0;B95.61;Z16.11;E...
2,6,107064,403.91;444.0;997.2;276.6;276.7;285.9;275.3;V15...,I12.0;444.0;T81.719A;276.6;E87.5;D64.9;E83.30;...
3,9,150750,431;507.0;428.0;584.9;276.5;401.9;96.72;96.04,I61.9;J69.0;I50.814;N17.9;276.5;I10;5A1955Z;0B...
4,10,184167,V30.00;774.2;765.25;765.15;V29.0;99.83;99.15;96.6,Z38.00;P59.0;P07.32;P07.15;Z05.1;6A600ZZ;3E033...


In [72]:
dich_full = pd.read_csv(data_path + 'disch_full.csv')

In [73]:
dich_full=dich_full.groupby(['SUBJECT_ID','HADM_ID']).aggregate([('TEXT',' '.join)])

  dich_full=dich_full.groupby(['SUBJECT_ID','HADM_ID']).aggregate([('TEXT',' '.join)])


In [74]:
dich_full = dich_full.droplevel(axis=1, level=1).reset_index()
dich_full = dich_full.drop(['CHARTDATE','CATEGORY','DESCRIPTION'],axis=1)

In [75]:
dich_full.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT
0,3,145834.0,Admission Date: [**2101-10-20**] Discharg...
1,4,185777.0,Admission Date: [**2191-3-16**] Discharge...
2,6,107064.0,Admission Date: [**2175-5-30**] Dischar...
3,9,150750.0,Admission Date: [**2149-11-9**] Dischar...
4,10,184167.0,Admission Date: [**2103-6-28**] Dischar...


In [76]:
notes_labeled_full = pd.merge(labeldf,dich_full,on=['HADM_ID','SUBJECT_ID'],how='inner')

In [77]:
notes_labeled_full = notes_labeled_full.loc[:,['SUBJECT_ID','HADM_ID','TEXT','ICD9_CODE','ICD10_CODE']]

In [78]:
notes_labeled_full.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,3,145834,Admission Date: [**2101-10-20**] Discharg...,038.9;785.59;584.9;427.5;410.71;428.0;682.6;42...,A41.9;R57.1;N17.9;I46.9;I21.4;I50.814;L03.119;...
1,4,185777,Admission Date: [**2191-3-16**] Discharge...,042;136.3;799.4;276.3;790.7;571.5;041.11;V09.0...,B20;B59;R64;E87.3;R78.81;K74.0;B95.61;Z16.11;E...
2,6,107064,Admission Date: [**2175-5-30**] Dischar...,403.91;444.0;997.2;276.6;276.7;285.9;275.3;V15...,I12.0;444.0;T81.719A;276.6;E87.5;D64.9;E83.30;...
3,9,150750,Admission Date: [**2149-11-9**] Dischar...,431;507.0;428.0;584.9;276.5;401.9;96.72;96.04,I61.9;J69.0;I50.814;N17.9;276.5;I10;5A1955Z;0B...
4,10,184167,Admission Date: [**2103-6-28**] Dischar...,V30.00;774.2;765.25;765.15;V29.0;99.83;99.15;96.6,Z38.00;P59.0;P07.32;P07.15;Z05.1;6A600ZZ;3E033...


In [79]:
notes_labeled_full.to_csv(data_path + 'notes_labeled.csv', index=False)

In [80]:
del dich_full
del notes_labeled_full

### Preprocess Text

In [81]:
df  = pd.read_csv(data_path + 'notes_labeled.csv')

In [82]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,3,145834,Admission Date: [**2101-10-20**] Discharg...,038.9;785.59;584.9;427.5;410.71;428.0;682.6;42...,A41.9;R57.1;N17.9;I46.9;I21.4;I50.814;L03.119;...
1,4,185777,Admission Date: [**2191-3-16**] Discharge...,042;136.3;799.4;276.3;790.7;571.5;041.11;V09.0...,B20;B59;R64;E87.3;R78.81;K74.0;B95.61;Z16.11;E...
2,6,107064,Admission Date: [**2175-5-30**] Dischar...,403.91;444.0;997.2;276.6;276.7;285.9;275.3;V15...,I12.0;444.0;T81.719A;276.6;E87.5;D64.9;E83.30;...
3,9,150750,Admission Date: [**2149-11-9**] Dischar...,431;507.0;428.0;584.9;276.5;401.9;96.72;96.04,I61.9;J69.0;I50.814;N17.9;276.5;I10;5A1955Z;0B...
4,10,184167,Admission Date: [**2103-6-28**] Dischar...,V30.00;774.2;765.25;765.15;V29.0;99.83;99.15;96.6,Z38.00;P59.0;P07.32;P07.15;Z05.1;6A600ZZ;3E033...


In [83]:
df.iloc[0,2]

"Admission Date:  [**2101-10-20**]     Discharge Date:  [**2101-10-31**]\n\nDate of Birth:   [**2025-4-11**]     Sex:  M\n\nService:  Medicine\n\nCHIEF COMPLAINT:  Admitted from rehabilitation for\nhypotension (systolic blood pressure to the 70s) and\ndecreased urine output.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 76-year-old\nmale who had been hospitalized at the [**Hospital1 190**] from [**10-11**] through [**10-19**] of [**2101**]\nafter undergoing a left femoral-AT bypass graft and was\nsubsequently discharged to a rehabilitation facility.\n\nOn [**2101-10-20**], he presented again to the [**Hospital1 346**] after being found to have a systolic\nblood pressure in the 70s and no urine output for 17 hours.\nA Foley catheter placed at the rehabilitation facility\nyielded 100 cc of murky/brown urine.  There may also have\nbeen purulent discharge at the penile meatus at this time.\n\nOn presentation to the Emergency Department, the patient was\nwithout subjective complaints.  I

In [87]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [88]:
tokenizer = RegexpTokenizer(r'\w+')
def preprocess_text(discharge_summary):
#     ds = discharge_summary
#     ds = re.sub('\[.*\]', '', ds)
#     ds = ds.replace('\n',' ')
#     ds = ds.replace('\r',' ')
#     terms = ['Admission Date:','Discharge Date:','Date of Birth:','Service:','ADDENDUM:','Date/Time:','Completed by:','Dictated By:',' D:',' T:','JOB#:','Followup Instructions:','Follow up w/ Dr.', 'Provider:']
#     for term in terms:
#         ds = re.sub(term,'',ds)
#     ds = re.sub('\d+:\d+','',ds)
#     ds = re.sub(' +',' ',ds)
#     ds = re.sub('\.','',ds)
#     ds = ds.strip()
#     tokens = [t.lower() for t in tokenizer.tokenize(ds)]
#     ds = ' '.join(tokens)
#     return ds

    ds = discharge_summary
    ds = re.sub('\[.*\]', '', ds)
    ds = ds.replace('\n',' ')
    ds = ds.replace('\r',' ')
#     terms = ['Admission Date:','Discharge Date:','Date of Birth:','Service:','ADDENDUM:','Date/Time:','Completed by:','Dictated By:',' D:',' T:','JOB#:','Followup Instructions:','Follow up w/ Dr.', 'Provider:']
#     for term in terms:
#         ds = re.sub(term,'',ds)
    ds = re.sub('\d+:\d+','',ds)
    ds = re.sub('\d+','',ds)
    ds = re.sub(' +',' ',ds)
    ds = re.sub('\.','',ds)
    ds = ds.strip()
    tokens = [t.lower() for t in tokenizer.tokenize(ds)]
    tokens = [w for w in tokens if not w in stop_words]
    ds = ' '.join(tokens)   
    return ds
preprocess_text(df.iloc[0,2])

'admission date date birth sex service medicine chief complaint admitted rehabilitation hypotension systolic blood pressure decreased urine output history present illness patient year old male hospitalized undergoing left femoral bypass graft subsequently discharged rehabilitation facility found systolic blood pressure urine output hours foley catheter placed rehabilitation facility yielded cc murky brown urine may also purulent discharge penile meatus time presentation emergency department patient without subjective complaints emergency department found systolic blood pressure given liters intravenous fluids transiently started dopamine systolic blood pressure past medical history coronary artery disease diffuse vessel disease right dominant status post proximal left circumflex stent occlusion distal left circumflex status post right coronary artery stent percutaneous coronary intervention diagonal left circumflex small proximal left anterior descending artery small distal left anteri

In [91]:
df['TEXT'] = df.apply(lambda row: str(preprocess_text(row[2])), axis=1)

In [92]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,3,145834,admission date date birth sex service medicine...,038.9;785.59;584.9;427.5;410.71;428.0;682.6;42...,A41.9;R57.1;N17.9;I46.9;I21.4;I50.814;L03.119;...
1,4,185777,admission date date birth sex f service chief ...,042;136.3;799.4;276.3;790.7;571.5;041.11;V09.0...,B20;B59;R64;E87.3;R78.81;K74.0;B95.61;Z16.11;E...
2,6,107064,admission date date birth sex f service admiss...,403.91;444.0;997.2;276.6;276.7;285.9;275.3;V15...,I12.0;444.0;T81.719A;276.6;E87.5;D64.9;E83.30;...
3,9,150750,admission date date birth sex service neurolog...,431;507.0;428.0;584.9;276.5;401.9;96.72;96.04,I61.9;J69.0;I50.814;N17.9;276.5;I10;5A1955Z;0B...
4,10,184167,admission date date birth sex f service histor...,V30.00;774.2;765.25;765.15;V29.0;99.83;99.15;96.6,Z38.00;P59.0;P07.32;P07.15;Z05.1;6A600ZZ;3E033...


In [93]:
types = set()
num_tok = 0
for row in df.itertuples():
    for w in row[-3].split(' '):
        types.add(w)
        num_tok += 1
print("Num types", len(types))
print("Num tokens", str(num_tok))

Num types 139263
Num tokens 51895155


In [94]:
df.to_csv(data_path + 'processed_notes_labeled.csv', index=False)

In [95]:
del df

### Train and Test split

In [96]:
df  = pd.read_csv(data_path + 'processed_notes_labeled.csv')

In [97]:
import numpy as np
X = df.iloc[:,:2]
y = df.iloc[:,-1:]

from collections import defaultdict
from sklearn.model_selection import GroupShuffleSplit

subject_ids = list(df['SUBJECT_ID'])
# print(subject_ids)
d_dict = defaultdict(lambda: len(d_dict))
list_ids= [d_dict[n] for n in subject_ids]
# Print ids of the dictionary
# print("The list of ids : ", list_ids)

# groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])
# print(groups.shape)

gss = GroupShuffleSplit(n_splits=1, train_size=.9050978607, random_state=42)
gss.get_n_splits()
train_splt = []
test_splt = []
for train_idx, test_idx in gss.split(X, y, list_ids):
    print("TRAIN:", train_idx, "TEST:", test_idx)
    train_splt = train_idx
    test_splt = test_idx
    print(len(train_idx), len(test_idx))

TRAIN: [    0     2     3 ... 52719 52720 52721] TEST: [    1     4     7 ... 52701 52710 52716]
47762 4960


In [98]:
traindf = df.iloc[train_splt]
train_subid = list(traindf['SUBJECT_ID'].unique())
traindf['length'] = traindf.apply(lambda row: len(str(row.loc['TEXT']).split()), axis=1)
traindf = traindf.sort_values(['length'])
traindf.to_csv(data_path + 'train_full.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  traindf['length'] = traindf.apply(lambda row: len(str(row.loc['TEXT']).split()), axis=1)


In [99]:
traindf.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE,length
2992,2896,178124,name admission date date birth sex service med...,211.3;427.31;578.9;560.1;496;584.9;428.0;276.5...,D12.0;I48.91;K92.2;K56.0;J44.9;N17.9;I50.814;2...,20
184,158,169433,admission date date birth sex service med blum...,532.40;493.20;V45.81;412;401.9;44.43,K26.4;J44.9;Z95.1;I25.2;I10;0W3P8ZZ,23
8357,7995,190945,admission date date birth sex f service vascul...,440.22;492.8;401.9;714.0;39.29;88.48,I70.229;J43.9;I10;M06.9;0312090;B40F0ZZ,30
3678,3564,117638,admission date service medicine please see omr...,038.49;041.6;785.59;518.81;507.0;592.1;591;276...,A41.59;B96.4;R57.1;J96.00;J69.0;N20.1;N13.30;E...,36
43779,70734,124010,name admission date date birth sex f service s...,852.20;410.71;805.6;285.1;816.12;E880.9;922.32...,S06.5X0A;I21.4;S32.10XA;D62;S62.523B;W10.8XXA;...,41


In [100]:
testdf = df.iloc[test_splt]
test_subid = list(testdf['SUBJECT_ID'].unique())
testdf['length'] = testdf.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
testdf = testdf.sort_values(['length'])
testdf.to_csv(data_path + 'test_full.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf['length'] = testdf.apply(lambda row: len(str(row['TEXT']).split()), axis=1)


In [101]:
testdf.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE,length
6751,6495,139808,admission date date birth sex service psu disc...,998.59;998.32;905.4;E929.0;041.85;86.22;86.69;...,K68.11;T81.31XA;M84.369S;V87.0XXS;B96.89;0HB0X...,29
22375,21434,191109,admission date date birth sex service micu his...,510.0;162.3;410.91;427.32;427.31;518.81;507.0;...,J86.0;C34.10;I21.3;I48.92;I48.91;J96.00;J69.0;...,64
1379,1284,118883,admission date date birth sex service history ...,571.2;572.2;789.5;584.5;54.91;89.64;38.95;38.93,K70.30;K72.90;789.5;N17.0;0D9530Z;02HP32Z;05HY...,70
17229,16525,134157,name admission date date birth sex f service a...,851.00;518.5;780.39;276.1;285.9;401.9;E888.9;7...,S06.330A;518.5;R56.9;E87.1;D64.9;I10;W19.XXXA;...,74
18942,18139,152558,admission date service medicine history presen...,507.0;518.81;413.9;707.14;008.45;038.9;250.01;...,J69.0;J96.00;I20.8;L97.409;A04.71;A41.9;E10.9;...,78


In [102]:
# devdf = df.iloc[dev_splt]
# dev_subid = list(devdf['SUBJECT_ID'].unique())
# devdf['length'] = devdf.progress_apply(lambda row: len(str(row['TEXT']).split()), axis=1)
# devdf = devdf.sort_values(['length'])
# devdf.to_csv(data_path + 'dev_full.csv',index=False)

In [103]:
# devdf.head()

In [104]:
(len(train_subid),len(test_subid))

(37223, 3903)

### Top 50 codes

In [105]:
#first calculate the top k
counts = Counter()
dfnl = pd.read_csv(data_path + 'processed_notes_labeled.csv')
for row in dfnl.itertuples():
    for label in str(row[-1]).split(';'):
        counts[label] += 1

In [106]:
codes_50 = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

In [107]:
codes_50 = [code[0] for code in codes_50[:50]]

In [108]:
ICD10_50 = codes_50

In [109]:
ICD10_50

['I10',
 '02H633Z',
 'I25.10',
 'I50.814',
 'I48.91',
 '0BH17EZ',
 '0DH67UZ',
 'N17.9',
 'E11.9',
 '5A1935Z',
 'E78.4',
 'J96.00',
 '30233N1',
 '5A1221Z',
 'N39.0',
 'B2000ZZ',
 'K21.9',
 '5A1955Z',
 'E78.00',
 'D64.9',
 'J44.9',
 'A41.9',
 'E03.9',
 'J18.9',
 '02HW32Z',
 'D62',
 '0210088',
 'E87.2',
 '3E0336Z',
 'R65.20',
 'Z79.01',
 'J69.0',
 'B244YZZ',
 'I12.9',
 'F32.9',
 'N18.9',
 'F17.200',
 'I21.4',
 '4A020N7',
 'I25.2',
 '0B933ZX',
 '5A1D70Z',
 '0DJ08ZZ',
 'D69.6',
 'E87.1',
 'Z95.1',
 'I34.0',
 '0BJ08ZZ',
 'I12.0',
 'Z87.891']

In [110]:
#first calculate the top k
counts = Counter()
dfnl = pd.read_csv(data_path + 'processed_notes_labeled.csv')
for row in dfnl.itertuples():
    for label in str(row[-2]).split(';'):
        counts[label] += 1

In [111]:
codes_50 = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

In [112]:
codes_50 = [code[0] for code in codes_50[:50]]

In [113]:
ICD9_50 = codes_50

In [114]:
ICD9_50

['401.9',
 '38.93',
 '428.0',
 '427.31',
 '414.01',
 '96.04',
 '96.6',
 '584.9',
 '250.00',
 '96.71',
 '272.4',
 '518.81',
 '99.04',
 '39.61',
 '599.0',
 '530.81',
 '96.72',
 '272.0',
 '285.9',
 '88.56',
 '244.9',
 '486',
 '38.91',
 '285.1',
 '36.15',
 '276.2',
 '496',
 '99.15',
 '995.92',
 'V58.61',
 '507.0',
 '038.9',
 '88.72',
 '585.9',
 '403.90',
 '311',
 '305.1',
 '37.22',
 '412',
 '33.24',
 '39.95',
 '287.5',
 '410.71',
 '276.1',
 'V45.81',
 '424.0',
 '45.13',
 'V15.82',
 '511.9',
 '93.90']

In [115]:
ICD9_50[-1] = '37.23'

In [116]:
with open(data_path + 'processed_notes_labeled.csv', 'r') as f:
    with open(data_path + 'TOP50_notes_labeled.csv', 'w') as of:
        r = csv.reader(f)
        w = csv.writer(of)
        #header
        w.writerow(next(r))
        for row in r:
#             codes10 = set(str(row[-2]).split(';'))
#             filtered_codes10 = codes10.intersection(set(ICD10_50))
            codes9 = set(str(row[-2]).split(';'))
            filtered_codes9 = codes9.intersection(set(ICD9_50))
            filtered_codes10 = [mapping[code] for code in filtered_codes9]
            if len(filtered_codes9)>0:
                w.writerow(row[:-2]+ [';'.join(filtered_codes9)] + [';'.join(filtered_codes10)])

In [117]:
t50df = pd.read_csv(data_path + 'TOP50_notes_labeled.csv')

In [118]:
t50df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,3,145834,admission date date birth sex service medicine...,96.72;96.6;38.93;038.9;410.71;96.04;428.0;584.9,5A1955Z;0DH67UZ;02H633Z;A41.9;I21.4;0BH17EZ;I5...
1,4,185777,admission date date birth sex f service chief ...,88.72;38.93,B244YZZ;02H633Z
2,6,107064,admission date date birth sex f service admiss...,99.04;V15.82;285.9,30233N1;Z87.891;D64.9
3,9,150750,admission date date birth sex service neurolog...,96.72;507.0;96.04;401.9;428.0;584.9,5A1955Z;J69.0;0BH17EZ;I10;I50.814;N17.9
4,10,184167,admission date date birth sex f service histor...,96.6;99.15,0DH67UZ;3E0336Z


In [119]:
len(t50df.index)

49414

### Subset

In [120]:
t50df = pd.read_csv(data_path + 'TOP50_notes_labeled.csv')

In [121]:
train_ids = pd.read_csv(data_path+'train_50_hadm_ids.csv',names=['HADM_ID'],header=None)
train_admisn_ids  = list(train_ids['HADM_ID'])
train_50 = t50df.loc[t50df['HADM_ID'].isin(train_admisn_ids)]

In [122]:
len(train_50.index)

8066

In [123]:
train_50.to_csv(data_path + 'train.csv', index=False)

In [124]:
test_ids = pd.read_csv(data_path+'test_50_hadm_ids.csv',names=['HADM_ID'],header=None)
test_admisn_ids  = list(test_ids['HADM_ID'])
test_50 = t50df.loc[t50df['HADM_ID'].isin(test_admisn_ids)]

In [125]:
len(test_50.index)

1729

In [126]:
test_50.to_csv(data_path + 'test.csv', index=False)

In [127]:
dev_ids = pd.read_csv(data_path+'dev_50_hadm_ids.csv',names=['HADM_ID'],header=None)
dev_admisn_ids  = list(dev_ids['HADM_ID'])
dev_50 = t50df.loc[t50df['HADM_ID'].isin(dev_admisn_ids)]

In [128]:
len(dev_50.index)

1573

In [129]:
dev_50.to_csv(data_path + 'dev.csv', index=False)

In [130]:
all_admisn_ids = train_admisn_ids + test_admisn_ids + dev_admisn_ids

In [131]:
all_50 = t50df.loc[t50df['HADM_ID'].isin(all_admisn_ids)]

In [132]:
len(all_50.index)

11368

In [133]:
all_50.to_csv(data_path + 'all_50.csv', index=False)

### sort

In [135]:
train_50 = pd.read_csv(data_path + 'train.csv')
train_50['length'] = train_50.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
train_50 = train_50.sort_values(['length'])
train_50.to_csv(data_path + 'train.csv', index=False)

In [136]:
test_50 = pd.read_csv(data_path + 'test.csv')
test_50['length'] = test_50.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
test_50 = test_50.sort_values(['length'])
test_50.to_csv(data_path + 'test.csv', index=False)

In [138]:
dev_50 = pd.read_csv(data_path + 'dev.csv')
dev_50['length'] = dev_50.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
dev_50 = dev_50.sort_values(['length'])
dev_50.to_csv(data_path + 'dev.csv', index=False)

### Label Description

In [16]:
diag_desc = pd.read_csv(data_path + 'D_ICD_DIAGNOSES.csv')
proc_desc = pd.read_csv(data_path + 'D_ICD_PROCEDURES.csv')

In [17]:
diag_desc.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [18]:
proc_desc.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,264,851,Canthotomy,Canthotomy
1,265,852,Blepharorrhaphy,Blepharorrhaphy
2,266,859,Adjust lid position NEC,Other adjustment of lid position
3,267,861,Lid reconst w skin graft,Reconstruction of eyelid with skin flap or graft
4,268,862,Lid reconst w muc graft,Reconstruction of eyelid with mucous membrane ...


In [20]:
diag_desc['ICD9_CODE'] = diag_desc.apply(lambda row: str(reformat(str(row[1]), True)), axis=1)
proc_desc['ICD9_CODE'] = proc_desc.apply(lambda row: str(reformat(str(row[1]), False)), axis=1)

In [21]:
diag_desc.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,11.66,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,11.7,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,11.71,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,11.72,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,11.73,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [22]:
proc_desc.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,264,85.1,Canthotomy,Canthotomy
1,265,85.2,Blepharorrhaphy,Blepharorrhaphy
2,266,85.9,Adjust lid position NEC,Other adjustment of lid position
3,267,86.1,Lid reconst w skin graft,Reconstruction of eyelid with skin flap or graft
4,268,86.2,Lid reconst w muc graft,Reconstruction of eyelid with mucous membrane ...


In [23]:
code_desc = pd.concat([diag_desc, proc_desc])
code_desc.to_csv(data_path + 'D_ICD.csv', index=False,
               columns=['ICD9_CODE','LONG_TITLE'],
               header=['ICD9_CODE','DESC'])

In [24]:
code_desc = pd.read_csv(data_path + 'D_ICD.csv')

In [25]:
code_desc.head()

Unnamed: 0,ICD9_CODE,DESC
0,11.66,"Tuberculous pneumonia [any form], tubercle bac..."
1,11.7,"Tuberculous pneumothorax, unspecified"
2,11.71,"Tuberculous pneumothorax, bacteriological or h..."
3,11.72,"Tuberculous pneumothorax, bacteriological or h..."
4,11.73,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [149]:
code_desc_50 = code_desc.loc[code_desc['ICD9_CODE'].isin(ICD9_50)]

In [150]:
code_desc_50.head()

Unnamed: 0,ICD9_CODE,DESC
609,38.9,Unspecified septicemia
1588,250.0,Diabetes mellitus without mention of complicat...
1796,244.9,Unspecified acquired hypothyroidism
2386,272.0,Pure hypercholesterolemia
2390,272.4,Other and unspecified hyperlipidemia


In [151]:
code_desc_50.to_csv(data_path + 'D_ICD_50.csv', index=False)

In [152]:
code_desc_50 = pd.read_csv(data_path + 'D_ICD_50.csv')

In [153]:
code_desc_50.head()

Unnamed: 0,ICD9_CODE,DESC
0,38.9,Unspecified septicemia
1,250.0,Diabetes mellitus without mention of complicat...
2,244.9,Unspecified acquired hypothyroidism
3,272.0,Pure hypercholesterolemia
4,272.4,Other and unspecified hyperlipidemia


In [154]:
code_desc_50.iloc[39,-1]

'Closed [endoscopic] biopsy of bronchus'

In [155]:
tokenizer = RegexpTokenizer(r'\w+')
def preprocess_desc(desc):
    ds = desc
    ds = ds.replace('\n',' ')
    ds = ds.replace('\r',' ')
    ds = re.sub(' +',' ',ds)
    ds = re.sub('{\(,\),\[,\]}','',ds)
    ds = ds.strip()
    tokens = [t.lower() for t in tokenizer.tokenize(ds)]
    ds = ' '.join(tokens)
    return ds
preprocess_desc(code_desc_50.iloc[39,-1])

'closed endoscopic biopsy of bronchus'

In [158]:
code_desc_50['DESC'] = code_desc_50.apply(lambda row: str(preprocess_desc(row[-1])), axis=1)

In [159]:
code_desc_50.head()

Unnamed: 0,ICD9_CODE,DESC
0,38.9,unspecified septicemia
1,250.0,diabetes mellitus without mention of complicat...
2,244.9,unspecified acquired hypothyroidism
3,272.0,pure hypercholesterolemia
4,272.4,other and unspecified hyperlipidemia


In [160]:
code_desc_50 = pd.read_csv(data_path + 'D_ICD_50.csv')

In [162]:
def icd9to10(icd_9):
    return mapping[icd_9]
code_desc_50['ICD9_CODE'] = code_desc_50.apply(lambda row: icd9to10(row[0]), axis=1)

In [163]:
code_desc_50.to_csv(data_path + 'D_ICD_50.csv', index=False,
                    columns=['ICD9_CODE','DESC'],
                    header=['ICD10_CODE','DESC'])

In [164]:
code_desc_50 = pd.read_csv(data_path + 'D_ICD_50.csv')

In [165]:
code_desc_50

Unnamed: 0,ICD10_CODE,DESC
0,A41.9,Unspecified septicemia
1,E11.9,Diabetes mellitus without mention of complicat...
2,E03.9,Unspecified acquired hypothyroidism
3,E78.00,Pure hypercholesterolemia
4,E78.4,Other and unspecified hyperlipidemia
5,E87.1,Hyposmolality and/or hyponatremia
6,E87.2,Acidosis
7,F17.200,Tobacco use disorder
8,F32.9,"Depressive disorder, not elsewhere classified"
9,D62,Acute posthemorrhagic anemia


In [166]:
final_codes = ['0210088', '02H633Z', '02HW32Z', '0B933ZX', '0BH17EZ', '0DH67UZ', '0DJ08ZZ', '30233N1', '3E0336Z', '4A020N7', '4A020N8', '5A1221Z', '5A1935Z', '5A1955Z', '5A1D70Z', 'A41.9', 'B2000ZZ', 'B244YZZ', 'D62', 'D64.9', 'D69.6', 'E03.9', 'E11.9', 'E78.00', 'E78.4', 'E87.1', 'E87.2', 'F17.200', 'F32.9', 'I10', 'I12.9', 'I21.4', 'I25.10', 'I25.2', 'I34.0', 'I48.91', 'I50.814', 'J18.9', 'J44.9', 'J69.0', 'J91.8', 'J96.00', 'K21.9', 'N17.9', 'N18.9', 'N39.0', 'R65.20', 'Z79.01', 'Z87.891', 'Z95.1']

In [167]:
got_codes = list(code_desc_50['ICD10_CODE'])

In [168]:
for c in final_codes:
    if c not in got_codes:
        print(c)

In [4]:
df = pd.read_csv(data_path + 'processed_notes_labeled.csv')
df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,3,145834,admission date date birth sex service medicine...,038.9;785.59;584.9;427.5;410.71;428.0;682.6;42...,A41.9;R57.1;N17.9;I46.9;I21.4;I50.814;L03.119;...
1,4,185777,admission date date birth sex f service chief ...,042;136.3;799.4;276.3;790.7;571.5;041.11;V09.0...,B20;B59;R64;E87.3;R78.81;K74.0;B95.61;Z16.11;E...
2,6,107064,admission date date birth sex f service admiss...,403.91;444.0;997.2;276.6;276.7;285.9;275.3;V15...,I12.0;444.0;T81.719A;276.6;E87.5;D64.9;E83.30;...
3,9,150750,admission date date birth sex service neurolog...,431;507.0;428.0;584.9;276.5;401.9;96.72;96.04,I61.9;J69.0;I50.814;N17.9;276.5;I10;5A1955Z;0B...
4,10,184167,admission date date birth sex f service histor...,V30.00;774.2;765.25;765.15;V29.0;99.83;99.15;96.6,Z38.00;P59.0;P07.32;P07.15;Z05.1;6A600ZZ;3E033...
...,...,...,...,...,...
52717,99985,176670,admission date date birth sex service medicine...,038.9;518.81;482.41;487.0;785.52;V42.81;995.92...,A41.9;J96.00;J15.211;J11.00;R65.21;Z94.81;R65....
52718,99991,151118,admission date date birth sex service surgery ...,562.11;038.9;557.0;584.9;995.92;560.81;789.59;...,K57.32;A41.9;K55.011;N17.9;R65.20;K56.50;R18.8...
52719,99992,197084,admission date date birth sex f service medici...,999.9;568.81;577.2;285.1;584.9;579.9;729.92;53...,T88.8XXA;K66.1;K86.2;D62;N17.9;K90.9;M79.81;K2...
52720,99995,137810,admission date service surgery allergies zanta...,441.4;428.33;998.12;285.1;424.1;250.00;998.11;...,I71.4;I50.33;D78.01;D62;I35.0;E11.9;D78.01;T82...


In [9]:
ICD10_32 = ['I10','I50.9','I48.91','I25.10','N17.9','E11.9','E78.5','N39.0','E78.0','D64.9','E03.9','J18.9','D62','R65.20','F32.9','F17.200','D69.6','Z95.1','Z87.891','I12.0','R65.21','Z79.4','G47.33','J45.909','M81.0','R56.9','N18.6','E66.9','R78.81','F05','E46','E66.01']

In [6]:
len(ICD10_32)

32

In [8]:
with open(data_path + 'processed_notes_labeled.csv', 'r') as f:
    with open(data_path + 'TOP32_notes_labeled.csv', 'w') as of:
        r = csv.reader(f)
        w = csv.writer(of)
        #header
        w.writerow(next(r))
        for row in r:
            codes10 = set(str(row[-1]).split(';'))
            filtered_codes10 = codes10.intersection(set(ICD10_32))
            if len(filtered_codes10)>0:
                w.writerow(row[:-2]+ [';'.join(filtered_codes10)])

In [9]:
df32 = pd.read_csv(data_path + 'TOP32_notes_labeled.csv')
df32

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,3,145834,admission date date birth sex service medicine...,N17.9;E46,
1,4,185777,admission date date birth sex f service chief ...,R78.81,
2,6,107064,admission date date birth sex f service admiss...,D64.9;I12.0;Z87.891,
3,9,150750,admission date date birth sex service neurolog...,N17.9;I10,
4,12,112213,admission date date birth sex service surgery ...,I10,
...,...,...,...,...,...
44133,99985,176670,admission date date birth sex service medicine...,E03.9;D64.9;R65.21;R65.20,
44134,99991,151118,admission date date birth sex service surgery ...,N17.9;E11.9;R65.20,
44135,99992,197084,admission date date birth sex f service medici...,N17.9;I10;D62,
44136,99995,137810,admission date service surgery allergies zanta...,I25.10;Z95.1;E11.9;D62;Z87.891,


In [11]:
df32 = df32.drop(['ICD10_CODE'],axis=1)

In [13]:
df32.to_csv(data_path + 'TOP32_notes_labeled.csv',index=False)

In [18]:
df = pd.read_csv(data_path + 'TOP32_notes_labeled.csv')

In [43]:
import numpy as np
X = df.iloc[:,:2]
y = df.iloc[:,-1:]

from collections import defaultdict
from sklearn.model_selection import GroupShuffleSplit

subject_ids = list(df['SUBJECT_ID'])
# print(subject_ids)
d_dict = defaultdict(lambda: len(d_dict))
list_ids= [d_dict[n] for n in subject_ids]
# Print ids of the dictionary
# print("The list of ids : ", list_ids)

# groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])
# print(groups.shape)

gss = GroupShuffleSplit(n_splits=1, train_size=.7, random_state=42)
gss.get_n_splits()
train_splt = []
test_splt = []
for train_idx, test_idx in gss.split(X, y, list_ids):
    print("TRAIN:", train_idx, "TEST:", test_idx)
    train_splt = train_idx
    test_splt = test_idx
    print(len(train_idx), len(test_idx))

TRAIN: [    1     2     4 ... 12966 12967 12970] TEST: [    0     3    10 ... 12969 12971 12972]
9042 3931


In [34]:
train_df = df.iloc[train_splt]
train_df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
1,4,185777,admission date date birth sex f service chief ...,R78.81
2,6,107064,admission date date birth sex f service admiss...,D64.9;I12.0;Z87.891
5,13,143045,admission date date birth sex f service cardia...,I25.10;I10;E11.9
9,20,157681,admission date date birth sex f service histor...,I25.10;I10;E11.9
10,21,109451,admission date service medicine allergies pati...,N17.9;I25.10;E11.9;I48.91;I12.0
...,...,...,...,...
44131,99982,183791,admission date date birth sex service medicine...,N17.9;D69.6;I48.91
44132,99983,117390,admission date date birth sex service medicine...,N17.9;I25.10;I10
44134,99991,151118,admission date date birth sex service surgery ...,N17.9;E11.9;R65.20
44135,99992,197084,admission date date birth sex f service medici...,N17.9;I10;D62


In [37]:
train_df.to_csv(data_path + 'train32.csv',index=False)

In [35]:
test_df = df.iloc[test_splt]
test_df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
0,3,145834,admission date date birth sex service medicine...,N17.9;E46
3,9,150750,admission date date birth sex service neurolog...,N17.9;I10
4,12,112213,admission date date birth sex service surgery ...,I10
6,17,161087,admission date date birth sex f service cardio...,F32.9
7,18,188822,admission date date birth sex service psychiat...,Z79.4;R56.9;I10
...,...,...,...,...
44087,99830,176834,admission date date birth sex service cardioth...,N17.9;I10;E11.9;R65.20;I48.91;D62
44118,99936,107913,admission date service neurology allergies pat...,I48.91
44123,99946,157197,admission date date birth sex service medicine...,N39.0
44133,99985,176670,admission date date birth sex service medicine...,E03.9;D64.9;R65.21;R65.20


In [38]:
df= test_df

In [44]:
test_df = df.iloc[train_splt]
test_df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
3,9,150750,admission date date birth sex service neurolog...,N17.9;I10
4,12,112213,admission date date birth sex service surgery ...,I10
7,18,188822,admission date date birth sex service psychiat...,Z79.4;R56.9;I10
8,19,109235,admission date service surgery allergies patie...,M81.0;N39.0;I10
19,30,104557,admission date service identification chief co...,I25.10;I10
...,...,...,...,...
44080,99814,186518,admission date service surgery allergies morph...,D64.9;N39.0;E11.9;I48.91
44083,99822,146997,admission date date birth sex service medicine...,Z87.891;I10
44084,99822,163117,admission date date birth sex service medicine...,I10;J18.9
44085,99822,195871,admission date date birth sex service medicine...,N17.9;I10;R65.20;I48.91;E46


In [45]:
test_df.to_csv(data_path + 'test32.csv',index=False)

In [46]:
val_df = df.iloc[test_splt]
val_df.to_csv(data_path + 'val32.csv',index=False)

In [47]:
val_df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
0,3,145834,admission date date birth sex service medicine...,N17.9;E46
6,17,161087,admission date date birth sex f service cardio...,F32.9
33,45,157907,admission date date birth sex service history ...,F17.200
37,53,155385,admission date date birth sex service cme hist...,I25.10;I10
39,56,181711,admission date service medicine allergies pati...,R56.9;I10
...,...,...,...,...
44081,99817,195557,admission date date birth sex service medicine...,N39.0;I25.10;N17.9;J18.9
44087,99830,176834,admission date date birth sex service cardioth...,N17.9;I10;E11.9;R65.20;I48.91;D62
44118,99936,107913,admission date service neurology allergies pat...,I48.91
44133,99985,176670,admission date date birth sex service medicine...,E03.9;D64.9;R65.21;R65.20


In [49]:
mapping

{'403.01': 'I12.0',
 '486': 'J18.9',
 '582.81': 'N08',
 '585.5': 'N18.5',
 '425.4': 'I42.5',
 '276.2': 'E87.2',
 '710.0': 'M32.10',
 '276.7': 'E87.5',
 '724.3': 'M54.30',
 '458.29': 'I95.2',
 '287.5': 'D69.6',
 '285.21': 'D63.1',
 '285.29': 'D63.8',
 '275.41': 'E83.51',
 '585.6': 'N18.6',
 '583.81': 'N08',
 '558.9': 'K52.3',
 '327.23': 'G47.33',
 '228.04': 'D18.03',
 '338.29': 'G89.29',
 '789.00': 'R10.9',
 '790.92': 'R79.1',
 'V45.11': 'Z99.2',
 '531.00': 'K25.0',
 '410.71': 'I21.4',
 '285.9': 'D64.9',
 '414.01': 'I25.10',
 '725': 'M35.3',
 '191.5': 'C71.5',
 '331.4': 'G91.1',
 '530.81': 'K21.9',
 '411.1': 'I20.0',
 '482.83': 'J15.6',
 '272.0': 'E78.00',
 '305.1': 'F17.200',
 '194.0': 'C74.90',
 '197.7': 'C78.7',
 '255.3': 'E27.0',
 '424.0': 'I34.0',
 '584.5': 'N17.0',
 '998.59': 'K68.11',
 '682.2': 'L03.319',
 '511.9': 'J91.8',
 '599.0': 'N39.0',
 '428.0': 'I50.814',
 '349.82': 'G92',
 '401.9': 'I10',
 'V10.00': 'Z85.00',
 'V45.3': 'Z98.0',
 'V58.65': 'Z79.51',
 '041.3': 'B96.1',
 '2

In [5]:
reverse_mapping = {v: k for k, v in mapping.items()}

In [34]:
mapping = {v: k for k, v in reverse_mapping.items()}

In [15]:
ICD9_32 = [reverse_mapping[key] for key in ICD10_32]

In [13]:
reverse_mapping['E78.0'] = 272.0

In [14]:
for key in ICD10_32:
    if key not in reverse_mapping.keys():
        print(key)

In [26]:
ICD9_32

['401.1',
 428.0,
 '427.31',
 '429.2',
 '584.9',
 '250.00',
 272.4,
 '599.0',
 272.0,
 '285.9',
 '244.9',
 '486',
 '285.1',
 '995.92',
 '296.20',
 '305.1',
 '287.5',
 'V45.81',
 'V15.82',
 '403.11',
 '785.52',
 'V58.67',
 '327.23',
 '493.90',
 '733.01',
 '780.39',
 '585.6',
 '278.00',
 '771.83',
 '293.1',
 '263.8',
 '278.01']

In [27]:
code_desc_32 = code_desc.loc[code_desc['ICD9_CODE'].isin(ICD9_32)]

In [32]:
len(code_desc_32.index)

32

In [31]:
code_desc_32.loc[len(code_desc_32.index)] = ['428.0','Congestive heart failure unspecified']
code_desc_32.loc[len(code_desc_32.index)] = ['272.4','Other and unspecified hyperlipidemia']
code_desc_32.loc[len(code_desc_32.index)] = ['272.0','Pure hypercholesterolemia']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  code_desc_32.loc[len(code_desc_32.index)] = ['428.0','Congestive heart failure unspecified']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  code_desc_32.loc[len(code_desc_32.index)] = ['272.4','Other and unspecified hyperlipidemia']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  code_desc_32.loc[len(code_desc_32.index)] = ['272.0','Pure hypercholesterolemia']


In [35]:
def map9_10(value):
    return mapping[value]
code_desc_32['ICD9_CODE'] = code_desc_32.apply(lambda row: map9_10(row['ICD9_CODE']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  code_desc_32['ICD9_CODE'] = code_desc_32.apply(lambda row: map9_10(row['ICD9_CODE']),axis=1)


In [42]:
code_desc_32

Unnamed: 0,ICD9_CODE,DESC
1588,E11.9,Diabetes mellitus without mention of complicat...
1638,E46,Other protein-calorie malnutrition
1796,E03.9,Unspecified acquired hypothyroidism
2462,E66.9,"Obesity, unspecified"
2463,E66.01,Morbid obesity
2898,F17.200,Tobacco use disorder
3118,G47.33,Obstructive sleep apnea (adult)(pediatric)
3140,D62,Acute posthemorrhagic anemia
3146,D64.9,"Anemia, unspecified"
3168,D69.6,"Thrombocytopenia, unspecified"


In [37]:
reverse_mapping['I50.9']

428.0

In [41]:
code_desc_32.iloc[-1,0] = 'E78.0'

In [44]:
code_desc_32.to_csv(data_path + 'D_ICD_32.csv',index=False)