# Word2Vec

In [1]:
num_threads = 4
w2vType = 'Word2Vec' # or Word2Vec 'FastText'
n_dim = 100
embed_name = '{}{}d'.format(w2vType, n_dim)
vocab_name = 'vocab.json'
min_count = 1
matrix_name = 'wv_matrix{}d'.format(n_dim)
#filename = 'mlds_hw2_2_data/clr_conversation.txt'

In [2]:
import json
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors, FastText
from tqdm import tqdm_notebook as tqdm
import numpy as np
import os

In [3]:
BOS = '<bos>'
EOS = '<eos>'
PAD = '<pad>'
UNK = '<unk>'
MAX_Q_LEN = 10
MAX_A_LEN = 10

In [4]:
sents = []
def loadsents(name):
    with open(name, 'r', encoding='utf-8') as f:
        for s in f:
            if s is not "+++$+++":
                sents.append([BOS] + s.split() + [EOS] + [PAD])
#loadsents(filename)
#sents[0]

In [5]:
for root, dirs, files in os.walk("../week3/"):
    for _dir in dirs:
        if(_dir.isnumeric()):
            dir_name = os.path.join(root, _dir);
            for f in os.listdir(dir_name):
                if(f.find('ipynb') == -1):
                    filename = os.path.join(dir_name, f)
                    loadsents(filename)
                

In [6]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [7]:
epoch_logger = EpochLogger()
if w2vType == 'FastText':
    model = FastText(size=n_dim, window=5, min_count=min_count, workers=num_threads, callbacks=[epoch_logger])
else:
    model = Word2Vec(size=n_dim, window=5, min_count=min_count, workers=num_threads, callbacks=[epoch_logger])

model.build_vocab(sents)
total_examples = model.corpus_count
model.train(sents, total_examples=total_examples, epochs=5)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


(3593898, 5455625)

# 測試

In [8]:
word = 'patient'
print(word)
print(model.wv.similar_by_word(word))

patient
[("patient's", 0.955096960067749), ('recommended', 0.9459973573684692), ('son', 0.9380838871002197), ('wife', 0.9299428462982178), ('that', 0.9284614324569702), ('made', 0.9284200668334961), ('family,', 0.9238922595977783), ('patient,', 0.9200592637062073), ('morning,', 0.9185083508491516), ('they', 0.9183948636054993)]


# 假設用藥與病患身體器官的疾病有關，且該器官會影響病患生命週期

In [9]:
df = pd.read_pickle("../week5_6_7/id_life_med")
df

Unnamed: 0,life span,frequent medicine
1,46,H
10,34,m
100,16,C
101,14,C
102,27,po
103,19,m
104,1,F
105,8,C
106,127,m
107,15,F


In [10]:
with open("../week4/medicines_dict.json") as f:
    medicines_dict = json.load(f) 

with open("../week4/medicines_abb_map.json") as f:
    medicines_abb_map = json.load(f) 

In [11]:
effective = []
for med in medicines_dict:
    try:
        print(med)
        print(model.wv.similar_by_word(med))
        effective.append(med)
    except KeyError:
        print("not in vocab")
    print("")
        

EG abb
not in vocab

ABG
[('EF', 0.9812142848968506), ('RBBB', 0.9769055843353271), ('GLU:10', 0.9765130281448364), ('VBG', 0.9762697219848633), ('2/28', 0.9762067198753357), ('color,', 0.9758282899856567), ('screening', 0.9753310680389404), ('5/8:', 0.9749839305877686), ('material.', 0.9748139381408691), ('ERCP,', 0.9743691682815552)]

ACE
not in vocab

ACTH
[('2015/10/04', 0.9913219213485718), ('[0547]', 0.9907346963882446), ('SVV', 0.9901636242866516), ('336', 0.9895258545875549), ('No:', 0.9895095825195312), ('11.2', 0.9887610673904419), ('21.2', 0.9886276721954346), ('01:32', 0.9881524443626404), ('237', 0.9880552887916565), ('MRI)', 0.9879759550094604)]

ADH
not in vocab

AED
[('albumin', 0.9606443643569946), ('use', 0.9518072605133057), ('antibiotics', 0.9513625502586365), ('titrate', 0.9487053155899048), ('Taper', 0.9479968547821045), ('Abx', 0.9472439885139465), ('PPI', 0.9435383081436157), ('TPN', 0.9424633979797363), ('Brosym', 0.9414775371551514), ('abx', 0.9406676888465881

[('Pigtail', 0.9612295627593994), ('ABG', 0.958040714263916), ('screening', 0.9568829536437988), ('Ascites', 0.9558737277984619), ('D7', 0.9513881206512451), ('0.20.', 0.9509432315826416), ('CXR,', 0.9486680030822754), ('NT-proBNP', 0.9482648372650146), ('material.', 0.9479853510856628), ('testing', 0.946348249912262)]

PET
[('disclosed', 0.964920699596405), ('07/07', 0.947624921798706), ('Bone', 0.9447804093360901), ('FDG', 0.9428813457489014), ('14:20', 0.9424992203712463), ('2016/08/22', 0.9424794316291809), ('showed:', 0.9424680471420288), ('2017/02/25', 0.9422338604927063), ('2012/10/23', 0.9344688653945923), ('|Contrast-Abdomen', 0.9303827285766602)]

pg
[('M/μL', 0.9913740158081055), ('<-', 0.9911940097808838), ('fL', 0.9911817312240601), ('1050630', 0.9911141991615295), ('SpO2:98%(30%,L,VCR)', 0.9905661940574646), ('toleralte', 0.990378201007843), ('1041204', 0.9902157783508301), ('家屬接受安寧共照且希望加強疼痛控制.', 0.9891231060028076), ('[0502]', 0.9888980388641357), ('說明安寧照顧內涵,', 0.9888115

In [12]:
print(effective)

['ABG', 'ACTH', 'AED', 'ALP', 'ALT', 'AST', 'bid', 'BP', 'BUN', 'BR', 'C', 'Ca', 'CBC', 'CK', 'Cl', 'cm', 'CNS', 'CO2', 'COPD', 'CPK', 'CPR', 'CSF', 'CT', 'DNA', 'ECG', 'EEG', 'EGD', 'ENT', 'ERCP', 'ESR', 'F', 'G', 'GFR', 'GI', 'GVHD', 'G6PD', 'GU', 'Hb', 'HCl', 'HCO3', 'Hct', 'HIV', 'HLA', 'hs', 'ICU', 'IgA', 'IM', 'INR', 'IU', 'IV', 'K', 'kg', 'L', 'LDH', 'M', 'm', 'MCH', 'MCHC', 'mCi', 'MCV', 'mEq', 'Mg', 'mg', 'MI', 'MIC', 'mL', 'mm', 'MRI', 'N', 'Na', 'NaCl', 'ng', 'NSAID', 'O2', 'P', 'PCR', 'PET', 'pg', 'pH', 'PMN', 'po', 'PPD', 'ppm', 'prn', 'PT', 'PTT', 'RA', 'RBC', 'RNA', 'sc', 'SI', 'SLE', 'sp', 'TB', 'TIBC', 'tid', 'TPN', 'URI', 'UTI', 'WBC', 'WHO', 'wt']


In [13]:
import requests
url = "http://www.vhct.gov.tw/index.php?mo=SitePage&ac=sitepage_show&pgsn=92"
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)
organ_table = pd.read_html(r.text)



In [14]:
print(organ_table[0])

            0                                                  1
0         NaN                                                NaN
1         中文名                                                英文名
2        人體器官                               Organs of Human Body
3          頭部                                               Head
4          頭髮                                   Hair of the head
5         後頭部                                   Back of the head
6           臉                                               Face
7           額  Forehead, frontal emninence, frontal bump, sup...
8           頰                                              Cheek
9           口                                              Mouth
10          頦                                               Chin
11          眼                                                Eye
12          鼻                                               Nose
13          眉                                            Eyebrow
14        上眼瞼            

In [15]:
organs = []
for ct,i in enumerate(list(organ_table[0].loc[: ,0])):
        if(ct>1):
            organs.append(i)

In [16]:
print(organs)

['人體器官', '頭部', '頭髮', '後頭部', '臉', '額', '頰', '口', '頦', '眼', '鼻', '眉', '上眼瞼', '下眼瞼', '睫毛', '虹膜', '瞳孔', '眼外肌', '眼球', '晶體狀', '玻璃體', '角膜', '視網膜', '鞏膜', '視神經', '盲點', '鼻唇溝', '人中（上唇中溝）', '頸', '項頸背', '喉（嚨）', '頜，顎', '上頜', '腦', '大腦', '小腦', '耳朵', '外耳', '耳廓', '耳垂', '外耳道', '中耳', '鼓膜', '鼓室', '聽小骨', '錘骨', '砧骨', '鐙骨', '耳咽管（咽鼓管）', '內耳', '半規管', '耳蝸', '前庭', '聽神經', '鼻子', '鼻梁', '鼻孔', '鼻中隔', '鼻腔', '鼻骨', '鼻旁竇', '鼻甲', '鼻後孔', '鼻翼', '鼻道', '鼻毛', '口腔和咽', '上唇', '下唇', '牙齦', '牙', '硬腭', '軟腭', '口角', '懸雍垂（小舌）', '（腭）扁桃體', '咽（峽）', '舌', '喉', '會厭', '聲帶', '聲門', '牙齒', '切齒門齒', '尖牙，犬牙', '雙尖牙，前磨牙，前臼齒', '磨牙，臼齒（後牙）', '乳牙（暫齒）', '智齒', '桓牙', '三尖牙', '牙槽', '牙周膜', '牙骨質', '牙冠', '牙根', '牙釉質', '牙質', '牙髓', '血管和神經', '人體', '肩', '肩胛', '腰', '腋，腋窩', '腋毛', '胸，胸膛', '乳房', '乳頭', '乳暈', '脅', '後臀', '臍', '腹部', '上腹', '下腹', '腹股溝', '屁股', '上臂', '前臂', '肘', '腕', '拳', '大腿', '膝，膝�', '膕窩', '小腿', '腓腸', '手', '拇指', '食指', '中指', '無名指，環指', '小指', '手背', '手掌', '撓（骨）側', '尺（骨）側', '指甲', '指紋', '魚際', '掌紋', '腕骨', '腕關節', '指骨', '指尖', '指關節', '腳，足', '拇指', '第二趾', '第三趾', '第四趾', '小趾', 

In [None]:
for med in effective:
    print(med)
    for organ in organs:
        #model.wv.dot(med, )
        try:
            ans = model.similarity(med, organ)
            print(organ)
            print(ans)
            input("")
        except KeyError:
            pass
        


  


ABG
wtf
wtf
wtf
wtf
wtf
wtf
wtf
wtf
wtf
眼
0.8773076185029987
