In [34]:
import csv
import numpy as np
import pandas as pd
import operator
from collections import defaultdict
from scipy.sparse import csr_matrix
import gensim.models.word2vec as w2v
import gensim.models
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Base directory for the input CSV and every artifact this notebook writes.
# Keep the trailing slash: later cells build file paths by plain string concatenation.
data_path = "../data/"

In [5]:
# Load the notes table (the preview below shows TEXT plus the ICD-9 / ICD-10 code columns).
df = pd.read_csv(f"{data_path}all_50.csv")

In [6]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,17,161087,sex f cardiothoracic allergies bactrim ampicil...,311;38.93;511.9;88.72;272.4,F32.9;02H633Z;J91.8;B244YZZ;E78.4
1,20,157681,sex f history of present illness the patient i...,250.00;401.9;414.01;36.15;272.4;39.61,E11.9;I10;I25.10;0210088;E78.4;5A1221Z
2,33,176176,history of present illness patient is an 82 ye...,038.9,A41.9
3,38,185910,sex m urology allergies inderal bactrim codein...,V45.81;428.0;96.72;38.93;96.6;427.31;995.92;03...,Z95.1;I50.814;5A1955Z;02H633Z;0DH67UZ;I48.91;R...
4,41,101757,sex m medicine allergies patient recorded as h...,496;285.9;33.24;99.04;401.9;38.93;96.6;507.0;9...,J44.9;D64.9;0B933ZX;30233N1;I10;02H633Z;0DH67U...


### Build Vocab

In [15]:
# Collect every distinct token that appears in the TEXT column.
# Tokenise exactly like the Word2Vec corpus cell (`str(text).split()`): splitting on
# any run of whitespace never yields empty-string tokens, and using the same
# tokeniser means each vocabulary entry is a token the corpus also contains.
# The column is addressed by name rather than by tuple position so that adding or
# reordering columns in the CSV cannot silently pick up the wrong field.
types = set()
for text in df['TEXT']:
    types.update(str(text).split())

In [16]:
# Number of distinct tokens held in `types`.
len(types)

78896

In [17]:
# Materialise the token set as a list so it can be ordered before being written out.
vocab_list = [*types]

In [18]:
# Sort in place: a set has no defined order, so sorting makes the order in which
# the vocabulary is written to disk reproducible.
vocab_list.sort()

In [19]:
# Write one token per line. The encoding is pinned to UTF-8 because the file is read
# back with pandas, which decodes as UTF-8 by default; relying on the platform's
# locale encoding could corrupt or reject non-ASCII tokens.
with open(data_path + 'vocab.csv', 'w', encoding='utf-8') as vocab_file:
    vocab_file.writelines(word + "\n" for word in vocab_list)

### Pre-train word embeddings

In [24]:
# Tokenise each note into a list of whitespace-separated words (one "sentence" per row).
# Applying over the TEXT column directly avoids building a full row object for every
# record, which is what a row-wise DataFrame apply (axis=1) does; the progress bar
# and the resulting per-row token lists are unchanged.
sentences = df['TEXT'].progress_apply(lambda text: str(text).split())

  0%|          | 0/11368 [00:00<?, ?it/s]

In [27]:
# Create an untrained Word2Vec model (100-dimensional vectors, 50 epochs, 4 worker
# threads) and register the corpus vocabulary with it.
# min_count=0 disables the frequency cut-off, so rare tokens are kept as well.
model = w2v.Word2Vec(
    vector_size=100,
    min_count=0,
    workers=4,
    epochs=50,
)
model.build_vocab(corpus_iterable=sentences)

In [28]:
# Train on the tokenised notes for the number of epochs configured on the model.
# The call is the cell's last expression, so its return value is displayed.
model.train(
    corpus_iterable=sentences,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(796434445, 946923450)

In [29]:
# Sanity check of the trained vectors: the ten tokens most similar to 'fever'.
model.wv.most_similar(positive=['fever'], topn=10)

[('fevers', 0.8748624920845032),
 ('fivers', 0.6912068128585815),
 ('temps', 0.6847617030143738),
 ('temperatures', 0.6791739463806152),
 ('febrile', 0.638131856918335),
 ('associatd', 0.6223352551460266),
 ('condused', 0.6027154922485352),
 ('temperature', 0.5882642269134521),
 ('addominal', 0.585747480392456),
 ('nonprod', 0.5726056098937988)]

In [33]:
# Persist the trained model next to the input data so it can be reloaded later.
model.save(f"{data_path}processed_50.w2v")

### Map words to their embeddings

In [None]:
# NOTE: leftover helper call, kept only for reference. `extract_wvs`, `MIMIC_3_DIR`
# and `Y` are not defined anywhere in the visible notebook, so executing this cell
# raises NameError on a fresh kernel. The following cells appear to perform the same
# word -> embedding mapping inline for 'processed_50.w2v'.
# extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR, Y)

In [40]:
# Reload the saved model from disk and keep a handle on its word vectors for lookups.
model = gensim.models.Word2Vec.load(f"{data_path}processed_50.w2v")
wv = model.wv

In [35]:
# Read the vocabulary back as plain strings, one token per line.
# With pandas' default NA handling, tokens such as "nan", "null" or "n/a" would be
# parsed as float NaN; those cannot be sorted together with string tokens and have
# no embedding to look up. na_filter=False turns that conversion off and dtype=str
# keeps numeric-looking tokens (e.g. "007") verbatim.
vocabdf = pd.read_csv(
    data_path + 'vocab.csv',
    names=['word'],
    header=None,
    dtype=str,
    na_filter=False,
)

In [37]:
# De-duplicate the tokens read from the vocabulary file.
vocab = set(vocabdf['word'].tolist())

In [38]:
# Map 1-based indices to tokens in sorted order; index 0 is left free for padding.
ind2w = dict(enumerate(sorted(vocab), start=1))

In [49]:
PAD_CHAR = "**PAD**"
# Embedding matrix: row 0 is the all-zero padding vector and row idx holds the vector
# of ind2w[idx]. np.zeros already initialises the padding row, so it needs no
# explicit reset.
W = np.zeros((len(ind2w) + 1, wv.vector_size))
# words[i] is the token whose vector is stored in W[i].
words = [PAD_CHAR]
# ind2w is keyed 1..len(ind2w), so idx can never run past the last row of W and no
# bounds check is required. get_vector raises KeyError if a vocabulary word is
# missing from the model.
for idx, word in tqdm(ind2w.items()):
    W[idx] = wv.get_vector(word)
    words.append(word)
assert len(words) == W.shape[0], "every matrix row must have a matching token"

  0%|          | 0/78896 [00:00<?, ?it/s]

In [51]:
# Row count of the embedding matrix (vocabulary size plus the padding row).
len(W)

78897

In [58]:
# Dump the embeddings as text: one line per token, "<token> <v1> <v2> ...".
# UTF-8 is set explicitly so the output does not depend on the platform's locale.
with open(data_path + 'processed_50.embed', 'w', encoding='utf-8') as o:
    # words[0] is the pad token, so it is written like any other row.
    for word, vector in zip(words, W):
        o.write(" ".join([word, *map(str, vector)]) + "\n")