In [1]:
# Settings
import pickle
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import scispacy
import spacy
from functools import reduce
import swifter
import sys
from spacy import displacy
import en_ner_bc5cdr_md

In [2]:
# Load WV
filename = '/home/mimic/Downloads/BioWordVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'
wv_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [3]:
# Load NLP
nlp_model = en_ner_bc5cdr_md.load()
file_code = 'bc5cdr'

# WV Code block
```
from gensim.models import KeyedVectors

filename = '/home/mimic/Downloads/BioWordVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'
wv_model = KeyedVectors.load_word2vec_format(filename, binary=True)
word = ""
wv_model.get_vector(word.lower())
```

In [4]:
train_data_path = '/home/mimic/Data/NOTEEVENTS_all_fold0/train_notes_seq.pickle'
val_data_path = '/home/mimic/Data/NOTEEVENTS_all_fold0/val_notes_seq.pickle'
test_data_path = '/home/mimic/Data/NOTEEVENTS_all_fold0/test_notes_seq.pickle'

with open(train_data_path, 'rb') as f:
    train_data = pickle.load(f)
    
with open(val_data_path, 'rb') as f:
    val_data = pickle.load(f)

with open(test_data_path, 'rb') as f:
    test_data = pickle.load(f)

In [32]:
match_list = ['CHIEF COMPLAINT:',
'History of Present Illness:',
'Past Medical History:',
'Social History:',
'Family History:',
'Physical Exam:',
'Pertinent Results:',
'Brief Hospital Course:',
'Discharge Medication:',
'Discharge Diagnosis:',
'Discharge Condition:',
'Followup Instructions:',
'PAST MEDICAL HISTORY:'
'MEDICATIONS ON ADMISSION:',
'DISCHARGE CONDITION:',
'DISCHARGE DIAGNOSES:',
'DISCHARGE MEDICATIONS:']
match_list = list(set([w.lower() for w in match_list]))  # matching을 위해 소문자로 변경, 중복 인원으로부터 리스트를 단순 병합하므로 unique 수행

def get_matched(text):
    text_split = text.split('\n\n')
    
    is_keep = False  # 저장해야하는 단락인지 구분
    for text_in in text_split:
        # match_list 키워드가 있는 경우에 저장, matching을 위해 소문자로 변경하고 원문 형태는 유지
        text_in_lower = text_in.lower()
        if any(match_key in text_in_lower  for match_key in match_list):
            is_keep = True  # match에 해당하는 단락이므로 저장
            continue
        else: 
            if ':' in text_in:  # match가 아닌 새로운 단락이므로 제거
                text_split.remove(text_in)
                is_keep = False  # 저장할 필요 없는 단락이므로 제거
            else:
                if is_keep:  # 저장할 단락인지 여부에 따라 저장하거나 제거
                    continue
                else:
                    text_split.remove(text_in)
    text = '\n\n'.join(text_split)
    unique_ner= set([str(ent).lower() for ent in nlp_model(text).ents])  # 향후 wv을 위해 소문자로 변경, wv모델이 소문자만 허용
    
    return unique_ner

def convert_to_vector(word_list):
    vector_list = []
    for word in word_list:
        try:
            vector_list.append(wv_model.get_vector(word))
        except KeyError:
            continue
    vector_list = [0]  if len(vector_list) == 0 else vector_list # 낮은 확률도 빈 값이 가능, 빈 값은 0으로 조정
    return np.mean(vector_list, axis=0, dtype=np.float64)

In [6]:
%%time
total_data = pd.concat([train_data, val_data,  test_data])

CPU times: user 18.8 ms, sys: 11.3 ms, total: 30 ms
Wall time: 33.9 ms


In [7]:
%%time
text_df = total_data[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'TEXT']]\
    .groupby(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'])\
    .agg('\n\n'.join)\
    .reset_index()

CPU times: user 330 ms, sys: 22.7 ms, total: 353 ms
Wall time: 359 ms


In [8]:
%%time
text_df['word_list'] = text_df['TEXT'].swifter.apply(lambda text: get_matched(text))
with open('/home/mimic/Data/pickle/text_df.pk', 'wb') as f:
    pickle.dump(text_df, f, pickle.HIGHEST_PROTOCOL)

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))


CPU times: user 2h 15min 59s, sys: 2min 53s, total: 2h 18min 53s
Wall time: 2h 18min 36s


In [9]:
%%time
text_df['mean_wv']  = text_df['word_list'].swifter.apply(lambda word_list : convert_to_vector(word_list))
with open('/home/mimic/Data/pickle/discharge_note_wv.pk', 'wb') as f:
    pickle.dump(text_df, f, pickle.HIGHEST_PROTOCOL)

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))


CPU times: user 6.98 s, sys: 552 ms, total: 7.53 s
Wall time: 8.96 s


In [4]:
# Load from pk
discharge_note_wv_path = '/home/mimic/Data/pickle/discharge_note_wv.pk'
with open(discharge_note_wv_path, 'rb') as f:
    result_df = pickle.load(f)
result_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,TEXT,word_list,mean_wv
0,3,145834.0,211552,Admission Date: [**2101-10-20**] Discharg...,"{q.4, 3-vessel disease, metoprolol, atenolol, ...","[0.320220878461794, 0.03231701895366518, -0.28..."
1,4,185777.0,294638,Admission Date: [**2191-3-16**] Discharge...,"{lantus, rash, fatigue, pneumocystis carinii, ...","[0.21231900266296155, 0.11549778116514554, -0...."
2,12,112213.0,232669,Admission Date: [**2104-8-7**] Discharge ...,"{allopurinol, atenolol, lymphadenopathy, cyano...","[0.17977227432032425, 0.3296726663907369, -0.3..."
3,13,143045.0,263738,Admission Date: [**2167-1-8**] Discharg...,"{back pain, metoprolol, euglycemia, atenolol, ...","[0.33918255337170866, 0.19825733437104837, -0...."
4,17,161087.0,257980,Admission Date: [**2135-5-9**] D...,"{pleural effusion, metoprolol tartrate, cardio...","[-0.0037960425357926974, 0.27882304292341525, ..."


In [55]:
def get_code_set(file_name):
    code_df = pd.read_csv(file_name)
    if 'TOTAL' in code_df.columns:
        code_df['m'] = code_df['TOTAL']  # Input 종류마다 대응을 위해 추가
        code_df['w'] = code_df['0']
    code_df = code_df[['w', 'm']]
    code_df = code_df[code_df['m'] > 0]  # 0 이하는 버림
    return set(code_df['w'])

In [65]:
# Select code
note_300 = get_code_set('/home/mimic/Data/csv/m300.csv')
note_400 = get_code_set('/home/mimic/Data/csv/m400.csv')
note_500 = get_code_set('/home/mimic/Data/csv/m500.csv')
note_600 = get_code_set('/home/mimic/Data/csv/m600.csv')
note_900 = get_code_set('/home/mimic/Data/csv/m900.csv')

In [66]:
len(note_300), len(note_400), len(note_500), len(note_600), len(note_900)

(320, 401, 527, 601, 934)

In [67]:
%%time
result_df['k300_list'] = result_df['word_list'].apply(lambda word_list: word_list &  note_300)
result_df['k300_mean_wv']  = result_df['k300_list'].swifter.apply(lambda word_list : convert_to_vector(word_list))
result_df['k400_list'] = result_df['word_list'].apply(lambda word_list: word_list & note_400)
result_df['k400_mean_wv']  = result_df['k400_list'].swifter.apply(lambda word_list : convert_to_vector(word_list))
result_df['k500_list'] = result_df['word_list'].apply(lambda word_list: word_list & note_500)
result_df['k500_mean_wv']  = result_df['k500_list'].swifter.apply(lambda word_list : convert_to_vector(word_list))
result_df['k600_list'] = result_df['word_list'].apply(lambda word_list: word_list & note_600)
result_df['k600_mean_wv']  = result_df['k500_list'].swifter.apply(lambda word_list : convert_to_vector(word_list))
result_df['k900_list'] = result_df['word_list'].apply(lambda word_list: word_list & note_900)  # 겹치는 항목 없이 별도 추출
result_df['k900_mean_wv']  = result_df['k500_list'].swifter.apply(lambda word_list : convert_to_vector(word_list))

with open('/home/mimic/Data/pickle/keyword_discharge_note_wv.pk', 'wb') as f:
    pickle.dump(result_df, f, pickle.HIGHEST_PROTOCOL)

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))




HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=37981.0), HTML(value='')))


CPU times: user 15.2 s, sys: 1.91 s, total: 17.1 s
Wall time: 18.7 s


In [68]:
result_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,TEXT,word_list,mean_wv,k300_list,k300_mean_wv,k400_list,k400_mean_wv,k500_list,k500_mean_wv,k600_list,k600_mean_wv,k900_list,k900_mean_wv
0,3,145834.0,211552,Admission Date: [**2101-10-20**] Discharg...,"{q.4, 3-vessel disease, metoprolol, atenolol, ...","[0.320220878461794, 0.03231701895366518, -0.28...","{ulcers, wheezing, hypotension, hypoxia, tachy...","[0.01767680738121271, 0.2923035994172096, -0.2...","{ulcers, furosemide, fluconazole, amiodarone, ...","[0.34048854805190454, -0.0165365453470837, -0....","{ulcers, furosemide, fluconazole, amiodarone, ...","[0.31337650236673653, 0.050148998076717057, -0...","{ulcers, furosemide, fluconazole, amiodarone, ...","[0.31337650236673653, 0.050148998076717057, -0...","{ulcers, hypercholesterolemia, pertinent, whee...","[0.31337650236673653, 0.050148998076717057, -0..."
1,4,185777.0,294638,Admission Date: [**2191-3-16**] Discharge...,"{lantus, rash, fatigue, pneumocystis carinii, ...","[0.21231900266296155, 0.11549778116514554, -0....","{pneumonia, hypothyroidism, dyspnea, fevers}","[-0.09185649687424302, 0.5237937457859516, -0....","{pneumonia, prednisone, nausea, dyspnea, fever...","[0.16540342862052576, 0.1545292820249285, -0.1...","{pneumonia, prednisone, nausea, dyspnea, fever...","[0.2055454987566918, 0.11993937194347382, -0.1...","{pneumonia, prednisone, nausea, dyspnea, fever...","[0.2055454987566918, 0.11993937194347382, -0.1...","{pneumonia, myalgias, fatigue, fibrosis, chill...","[0.2055454987566918, 0.11993937194347382, -0.1..."
2,12,112213.0,232669,Admission Date: [**2104-8-7**] Discharge ...,"{allopurinol, atenolol, lymphadenopathy, cyano...","[0.17977227432032425, 0.3296726663907369, -0.3...","{jaundiced, edema, asystolic}","[-0.09920000036557515, 0.18325999875863394, -0...","{jaundiced, edema, asystolic}","[-0.09920000036557515, 0.18325999875863394, -0...","{jaundiced, haldol, hypertension, edema, asyst...","[-0.20152320079505442, 0.34668799340724943, -0...","{jaundiced, haldol, hypertension, edema, asyst...","[-0.20152320079505442, 0.34668799340724943, -0...","{jaundiced, lymphadenopathy, agitation, clubbi...","[-0.20152320079505442, 0.34668799340724943, -0..."
3,13,143045.0,263738,Admission Date: [**2167-1-8**] Discharg...,"{back pain, metoprolol, euglycemia, atenolol, ...","[0.33918255337170866, 0.19825733437104837, -0....","{obesity, hyperglycemia, infarction}","[0.3263743395606677, 0.5349100132783254, -0.13...","{nitroglycerin, hyperglycemia, infarction, nau...","[0.3651686057448387, 0.40142000615596773, -0.3...","{nitroglycerin, hyperglycemia, infarction, hyp...","[0.30683117142568034, 0.46513166775306064, -0....","{nitroglycerin, hyperglycemia, infarction, hyp...","[0.30683117142568034, 0.46513166775306064, -0....","{euglycemia, hyperglycemia, constipation, infa...","[0.30683117142568034, 0.46513166775306064, -0...."
4,17,161087.0,257980,Admission Date: [**2135-5-9**] D...,"{pleural effusion, metoprolol tartrate, cardio...","[-0.0037960425357926974, 0.27882304292341525, ...","{depression, anxiety, anxiety/depression, hypo...","[-0.34851058945059776, 0.5781935974955559, -0....","{furosemide, depression, anxiety, anxiety/depr...","[-0.07847756307039942, 0.2979339946593557, -0....","{furosemide, depression, anxiety, anxiety/depr...","[-0.07847756307039942, 0.2979339946593557, -0....","{furosemide, depression, anxiety, anxiety/depr...","[-0.07847756307039942, 0.2979339946593557, -0....","{depression, anxiety, anxiety/depression, hypo...","[-0.07847756307039942, 0.2979339946593557, -0...."


In [71]:
np.mean(result_df['word_list'].apply(lambda l: len(l))), \
np.mean(result_df['k300_list'].apply(lambda l: len(l))), \
np.mean(result_df['k400_list'].apply(lambda l: len(l))), \
np.mean(result_df['k500_list'].apply(lambda l: len(l))), \
np.mean(result_df['k600_list'].apply(lambda l: len(l))), \
np.mean(result_df['k900_list'].apply(lambda l: len(l)))

(57.75263947763355,
 5.178984228956583,
 7.133040204312683,
 9.071720070561597,
 9.190753271372529,
 10.839340722993075)

In [72]:
np.mean(result_df['word_list'].apply(lambda l: len(l)) == 0), \
np.mean(result_df['k300_list'].apply(lambda l: len(l)) == 0), \
np.mean(result_df['k400_list'].apply(lambda l: len(l)) == 0), \
np.mean(result_df['k500_list'].apply(lambda l: len(l)) == 0), \
np.mean(result_df['k600_list'].apply(lambda l: len(l)) == 0), \
np.mean(result_df['k900_list'].apply(lambda l: len(l)) == 0)

(0.00018430267765461679,
 0.04818198572970696,
 0.0218003738711461,
 0.009583739238040073,
 0.009346778652484136,
 0.006213633132355651)