In [14]:
import csv
import numpy as np
import pandas as pd
import operator
from collections import defaultdict
from scipy.sparse import csr_matrix
import gensim.models.word2vec as w2v
import gensim.models
from tqdm.notebook import tqdm
tqdm.pandas()

In [15]:
data_path = '../data/'

In [16]:
df = pd.read_csv(data_path + 'all_50.csv')

In [17]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,17,161087,admission date date birth sex f service cardio...,38.93;272.4;511.9;88.72;311,02H633Z;E78.4;J91.8;B244YZZ;F32.9
1,20,157681,admission date date birth sex f service histor...,272.4;36.15;414.01;401.9;250.00;39.61,E78.4;0210088;I25.10;I10;E11.9;5A1221Z
2,33,176176,admission date service history present illness...,038.9,A41.9
3,38,185910,admission date date birth sex service urology ...,96.72;995.92;96.6;38.93;427.31;038.9;428.0;584...,5A1955Z;R65.20;0DH67UZ;02H633Z;I48.91;A41.9;I5...
4,41,101757,admission date date birth sex service medicine...,96.71;507.0;285.9;96.6;38.93;401.9;99.04;305.1...,5A1935Z;J69.0;D64.9;0DH67UZ;02H633Z;I10;30233N...


In [18]:
labeldf = pd.read_csv(data_path + 'D_ICD_50.csv')

In [19]:
labeldf.head()

Unnamed: 0,ICD10_CODE,DESC
0,A41.9,Unspecified septicemia
1,E11.9,Diabetes mellitus without mention of complicat...
2,E03.9,Unspecified acquired hypothyroidism
3,E78.00,Pure hypercholesterolemia
4,E78.4,Other and unspecified hyperlipidemia


### Build Vocab

In [20]:
types = set()
for row in df.itertuples():
    for w in row[-3].split(' '):
        types.add(w)

In [21]:
len(types)

66320

In [22]:
for row in labeldf.itertuples():
    for w in row[-1].split(' '):
        types.add(w)

In [23]:
len(types)

66391

In [24]:
vocab_list = list(types)

In [25]:
vocab_list.sort()

In [26]:
with open(data_path + 'vocab.csv', 'w') as vocab_file:
    for word in vocab_list:
        vocab_file.write(word + "\n")

### Pre-train word embeddings

In [27]:
textsentences = df.progress_apply(lambda row: str(row['TEXT']).split(), axis=1)

  0%|          | 0/11368 [00:00<?, ?it/s]

In [28]:
labelsentences = labeldf.progress_apply(lambda row: str(row['DESC']).split(), axis=1)

  0%|          | 0/50 [00:00<?, ?it/s]

In [29]:
sentences = pd.concat([textsentences,labelsentences])

In [30]:
model = w2v.Word2Vec(vector_size=100, min_count=0, workers=4, epochs=50)
model.build_vocab(sentences)

In [31]:
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(560173079, 599665250)

In [32]:
model.wv.most_similar('fever', topn=10)

[('fevers', 0.8807467222213745),
 ('fivers', 0.6618841290473938),
 ('yestderday', 0.6599153876304626),
 ('temps', 0.6471483707427979),
 ('addominal', 0.6402809023857117),
 ('temperatures', 0.637224555015564),
 ('temperature', 0.6262152194976807),
 ('cevers', 0.5718331336975098),
 ('nonprod', 0.5659204125404358),
 ('febrile', 0.5648596286773682)]

In [33]:
model.save(data_path + 'processed_50.w2v')

### Map words with its embeddings

In [34]:
model = gensim.models.Word2Vec.load(data_path + 'processed_50.w2v')
wv = model.wv                             

In [36]:
vocabdf = pd.read_csv(data_path+'vocab.csv',names=['word'],header=None,on_bad_lines='skip')

In [37]:
vocab = set(vocabdf['word'])

In [38]:
ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}

In [39]:
PAD_CHAR = "**PAD**"
W = np.zeros((len(ind2w)+1, len(wv.get_vector(wv.index_to_key[0])) ))
words = [PAD_CHAR]
W[0][:] = np.zeros(len(wv.get_vector(wv.index_to_key[0])))
for idx, word in tqdm(ind2w.items()):
    if idx >= W.shape[0]:
        break    
    W[idx][:] = wv.get_vector(word)
    words.append(word)

  0%|          | 0/66377 [00:00<?, ?it/s]

In [40]:
W.shape[0]

66378

In [41]:
with open(data_path + 'processed_50.embed', 'w') as o:
    #pad token already included
    for i in range(len(words)):
        line = [words[i]]
        line.extend([str(d) for d in W[i]])
        o.write(" ".join(line) + "\n")