In [2]:
import csv
import numpy as np
import pandas as pd
import operator
from collections import defaultdict
from scipy.sparse import csr_matrix
import gensim.models.word2vec as w2v
import gensim.models
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
data_path = '../data/'

In [4]:
train_df = pd.read_csv(data_path + 'ensemble_train_32.csv')
test_df = pd.read_csv(data_path + 'ensemble_test_32.csv')
dev_df = pd.read_csv(data_path + 'ensemble_dev_32.csv')

In [6]:
all_32_df = train_df.append(test_df)
all_32_df = all_32_df.append(dev_df)
all_32_df

  all_32_df = train_df.append(test_df)
  all_32_df = all_32_df.append(dev_df)


Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE
0,4,185777,admission date date birth sex f service chief ...,R78.81
1,21,109451,admission date service medicine allergies pati...,N17.9;I25.10;E11.9;I48.91;I12.0
2,21,111970,admission date service medicine allergies pati...,D64.9;I25.10;E03.9;E11.9;R65.20;I48.91;R65.21;...
3,26,197661,admission date date birth sex service ccu hist...,I25.10;I48.91
4,31,128652,admission date date birth sex service neurolog...,I10;J18.9
...,...,...,...,...
1489,99389,196423,admission date date birth sex f service neuros...,N39.0;I25.10;I10;I48.91
1490,99783,126090,admission date date birth sex service cardioth...,N17.9;R65.20;I48.91;R65.21;D62
1491,99817,195557,admission date date birth sex service medicine...,N39.0;I25.10;N17.9;J18.9
1492,99830,176834,admission date date birth sex service cardioth...,N17.9;I10;E11.9;R65.20;I48.91;D62


In [7]:
df = all_32_df

In [16]:
df = pd.read_csv(data_path + 'all_50.csv')

In [17]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD10_CODE
0,17,161087,admission date date birth sex f service cardio...,38.93;272.4;511.9;88.72;311,02H633Z;E78.4;J91.8;B244YZZ;F32.9
1,20,157681,admission date date birth sex f service histor...,272.4;36.15;414.01;401.9;250.00;39.61,E78.4;0210088;I25.10;I10;E11.9;5A1221Z
2,33,176176,admission date service history present illness...,038.9,A41.9
3,38,185910,admission date date birth sex service urology ...,96.72;995.92;96.6;38.93;427.31;038.9;428.0;584...,5A1955Z;R65.20;0DH67UZ;02H633Z;I48.91;A41.9;I5...
4,41,101757,admission date date birth sex service medicine...,96.71;507.0;285.9;96.6;38.93;401.9;99.04;305.1...,5A1935Z;J69.0;D64.9;0DH67UZ;02H633Z;I10;30233N...


In [9]:
labeldf = pd.read_csv(data_path + 'D_ICD_32.csv')

In [10]:
labeldf.head()

Unnamed: 0,ICD9_CODE,DESC
0,E11.9,Diabetes mellitus without mention of complicat...
1,E46,Other protein-calorie malnutrition
2,E03.9,Unspecified acquired hypothyroidism
3,E66.9,"Obesity, unspecified"
4,E66.01,Morbid obesity


### Build Vocab

In [12]:
types = set()
for row in df.itertuples():
    for w in row[-2].split(' '):
        types.add(w)

In [13]:
len(types)

89942

In [14]:
for row in labeldf.itertuples():
    for w in row[-1].split(' '):
        types.add(w)

In [15]:
len(types)

89991

In [16]:
vocab_list = list(types)

In [17]:
vocab_list.sort()

In [18]:
with open(data_path + 'vocab.csv', 'w') as vocab_file:
    for word in vocab_list:
        vocab_file.write(word + "\n")

### Pre-train word embeddings

In [19]:
textsentences = df.progress_apply(lambda row: str(row['TEXT']).split(), axis=1)

  0%|          | 0/17087 [00:00<?, ?it/s]

In [20]:
labelsentences = labeldf.progress_apply(lambda row: str(row['DESC']).split(), axis=1)

  0%|          | 0/32 [00:00<?, ?it/s]

In [21]:
sentences = pd.concat([textsentences,labelsentences])

In [22]:
model = w2v.Word2Vec(vector_size=100, min_count=0, workers=4, epochs=50)
model.build_vocab(sentences)

In [23]:
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(978003656, 1041306250)

In [24]:
model.wv.most_similar('fever', topn=10)

[('fevers', 0.912713348865509),
 ('temperatures', 0.6981709003448486),
 ('yestderday', 0.6981277465820312),
 ('temps', 0.6843907833099365),
 ('temperature', 0.6769816875457764),
 ('coughin', 0.6671100854873657),
 ('temprature', 0.6273037791252136),
 ('temp', 0.6269897222518921),
 ('febrile', 0.6213597059249878),
 ('condused', 0.6149826645851135)]

In [25]:
model.save(data_path + 'processed_32.w2v')

### Map words with its embeddings

In [26]:
model = gensim.models.Word2Vec.load(data_path + 'processed_32.w2v')
wv = model.wv                             

In [27]:
vocabdf = pd.read_csv(data_path+'vocab.csv',names=['word'],header=None,on_bad_lines='skip')

In [28]:
vocab = set(vocabdf['word'])

In [29]:
ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}

In [30]:
PAD_CHAR = "**PAD**"
W = np.zeros((len(ind2w)+1, len(wv.get_vector(wv.index_to_key[0])) ))
words = [PAD_CHAR]
W[0][:] = np.zeros(len(wv.get_vector(wv.index_to_key[0])))
for idx, word in tqdm(ind2w.items()):
    if idx >= W.shape[0]:
        break    
    W[idx][:] = wv.get_vector(word)
    words.append(word)

  0%|          | 0/89978 [00:00<?, ?it/s]

In [31]:
W.shape[0]

89979

In [32]:
with open(data_path + 'processed_32.embed', 'w') as o:
    #pad token already included
    for i in range(len(words)):
        line = [words[i]]
        line.extend([str(d) for d in W[i]])
        o.write(" ".join(line) + "\n")