## Install and Import

In [14]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [15]:
!pip install pytorch-pretrained-bert pytorch-nlp

[31mmenpo 0.8.1 has requirement matplotlib<2.0,>=1.4, but you'll have matplotlib 3.0.2 which is incompatible.[0m
[31mmenpo 0.8.1 has requirement pillow<5.0,>=3.0, but you'll have pillow 5.4.0 which is incompatible.[0m
[31mmenpo 0.8.1 has requirement scipy<1.0,>=0.16, but you'll have scipy 1.2.0 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
#% matplotlib inline

In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device. 

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

In [19]:
#use this cell when working online
path = '/floyd/home/ed-triage'
data_path = '/floyd/home/data/egh'

## second pass
now going to figure out how to run this on the whole dataset

In [20]:
def create_dummy_column(s):
  if str(s) == 'nan':
    return 'empty cell'
  else:
    return s

In [21]:
data = pd.read_csv(data_path + '/subj_data.csv', index_col = 0)
data.shape

(63474, 2)

In [22]:
data.head()

Unnamed: 0,CleanSubjectiveNotes,pmhx
511,complains of central chest tightness since yes...,"childhood heart murmur, drug abuse"
754,pt says noted blood in stool yesterday and mor...,seizure
755,"rt flank pain since morning,on her periods now.","no significant medical history, ovarian cyst"
757,"abdo pain onset monday,not seen by fd.",no significant medical history
758,"fell last monday landed on ground, "" dizzy"" at...","high cholesterol, 2 stents 2014, low bp"


In [23]:
data['for embedding'] = data.CleanSubjectiveNotes.map(create_dummy_column)

In [24]:
sentences = data['for embedding'].values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'complain', '##s', 'of', 'central', 'chest', 'tight', '##ness', 'since', 'yesterday', 'morning', '.', 'worse', 'with', 'pal', '##pati', '##on', '.', 'relieved', 'with', 'movement', 'and', 'deep', 'breathing', '.', 'denies', 'nausea', 'and', '/', 'or', 'vomiting', '.', 'complain', '##s', 'of', 'feeling', 'cold', 'and', '"', 'tired', '"', '.', 'says', 'took', 'nearly', 'mel', '##aton', '##in', '100', '##mg', ',', 'cocaine', ',', 'and', 'et', '##oh', 'on', 'sunday', '.', '[SEP]']


In [26]:
print ('show a few other tokenized sentences')
print (tokenized_texts[10])
print (tokenized_texts[100])
print (tokenized_texts[1000])
print (tokenized_texts[10000])

show a few other tokenized sentences
['[CLS]', 'presently', 'feels', 'like', 'heart', 'is', 'going', 'fast', '.', 'intermittent', 'lt', 'sided', ',', 'non', '-', 'radiating', '.', 'no', 'cp', 'now', '.', 'says', 'he', 'has', 'thoughts', 'of', 'harm', '##ing', 'self', 'and', 'others', 'at', 'tri', '##age', 'but', 'says', 'plan', 'is', '"', 'to', 'get', 'help', '"', '.', 'says', 'takes', 'hiv', '/', 'aids', 'med', '##s', 'when', 'he', 'remembers', '.', 'cannot', 'remember', 'the', 'names', 'of', 'med', '##s', '.', '[SEP]']
['[CLS]', 'vomit', '##ed', 'x', '5', ',', 'headache', ',', 'di', '##zziness', ',', 'not', 'able', 'to', 'tolerate', 'food', 'or', 'fluids', 'since', '113', '##0', 'hours', '[SEP]']
['[CLS]', 'pt', 'complain', '##s', 'of', 'short', '##ness', 'of', 'breath', 'that', 'started', '1', 'week', 'ago', ',', 'denies', 'cp', '.', 'pt', 'has', 'swelling', 'to', 'feet', '.', 'report', 'feeling', 'weak', '.', 'wife', 'stated', 'pt', 'has', 'blood', 'in', 'stool', '.', '[SEP]']
['[C

In [27]:
data['tokenized_subj_notes'] = sentences

In [28]:
sent_lens = [len(x) for x in tokenized_texts]
print ('mean sentence length', np.mean(sent_lens))
print ('max sentence length', np.max(sent_lens))

mean sentence length 40.27126067366166
max sentence length 216


In [29]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = np.max(sent_lens)

In [30]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [31]:
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [32]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [33]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)

In [34]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [35]:
#we are going to download the model and transfer it to cuda
from pytorch_pretrained_bert import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
model.cuda();

## sentence embedding below

In [36]:
#gonna redo, but this time only keep the sentences
model.eval()
embedded_notes = []
for i, x in enumerate(train_dataloader):
  
    inpseq = x[0].to(device)
    inpmask = x[1].to(device)
    embeds,_ = model(inpseq, attention_mask = inpmask)
    sentence_vec = torch.mean(embeds[11],1)  #this is supposedly where the sentences are
    embedded_notes.append(sentence_vec.cpu().detach().numpy())
    if i%500 == 0:
        print ('batch #:', i+1)
        print ('results stored...')


batch #: 1
results stored...
batch #: 501
results stored...
batch #: 1001
results stored...
batch #: 1501
results stored...
batch #: 2001
results stored...
batch #: 2501
results stored...
batch #: 3001
results stored...
batch #: 3501
results stored...


In [42]:
len(embedded_notes)

3968

In [43]:
flat_sentences = [item for sublist in embedded_notes for item in sublist]

In [44]:
data['embedded_subjnotes'] = flat_sentences

In [45]:
data.head()

Unnamed: 0,CleanSubjectiveNotes,pmhx,for embedding,tokenized_subj_notes,embedded_subjnotes
511,complains of central chest tightness since yes...,"childhood heart murmur, drug abuse",complains of central chest tightness since yes...,[CLS] complains of central chest tightness sin...,"[-0.2552049, 0.060189128, 0.2712358, -0.307374..."
754,pt says noted blood in stool yesterday and mor...,seizure,pt says noted blood in stool yesterday and mor...,[CLS] pt says noted blood in stool yesterday a...,"[0.23464553, -0.18784478, 0.47729325, -0.02351..."
755,"rt flank pain since morning,on her periods now.","no significant medical history, ovarian cyst","rt flank pain since morning,on her periods now.","[CLS] rt flank pain since morning,on her perio...","[-0.26586953, -0.1800395, 0.37560245, -0.34440..."
757,"abdo pain onset monday,not seen by fd.",no significant medical history,"abdo pain onset monday,not seen by fd.","[CLS] abdo pain onset monday,not seen by fd. ...","[-0.20064865, -0.20033248, 0.32697883, -0.2581..."
758,"fell last monday landed on ground, "" dizzy"" at...","high cholesterol, 2 stents 2014, low bp","fell last monday landed on ground, "" dizzy"" at...","[CLS] fell last monday landed on ground, "" diz...","[-0.09094222, -0.15834723, 0.48943293, 0.03535..."


In [47]:
data['embedded_subjnotes'].to_csv(data_path + '/subjnote_embeds.csv')

## now gonna sentence embed medical history

In [48]:
#need an way to handle empty cells so i can still embed the whole dataset and plug it back into the dataframe
data['for embedding'] = data['pmhx'].map(create_dummy_column)

In [49]:
sentences = data['for embedding'].values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

In [50]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])


Tokenize the first sentence:
['[CLS]', 'childhood', 'heart', 'murmur', ',', 'drug', 'abuse', '[SEP]']


In [51]:
print ('show a few other tokenized sentences')
print (tokenized_texts[10])
print (tokenized_texts[100])
print (tokenized_texts[1000])
print (tokenized_texts[10000])

show a few other tokenized sentences
['[CLS]', 'hiv', '[SEP]']
['[CLS]', 'no', 'significant', 'medical', 'history', '[SEP]']
['[CLS]', 'hyper', '##tension', ',', 'high', 'cho', '##les', '##terol', '[SEP]']
['[CLS]', 'ge', '##rd', '[SEP]']


In [52]:
data['tokenized_medhx'] = sentences

In [53]:
sent_lens = [len(x) for x in tokenized_texts]
print ('mean sentence length:', np.mean(sent_lens))
print ('max sentence length:', np.max(sent_lens))

mean sentence length: 7.990295238995494
max sentence length: 101


In [54]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = np.max(sent_lens)

In [55]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [56]:
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

Create the attention masks 

In [57]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [58]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)

In [59]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [60]:
#we are going to download the model and transfer it to cuda
from pytorch_pretrained_bert import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
model.cuda();

In [61]:
#gonna redo, but this time only keep the sentences
model.eval()
embedded_notes = []
for i, x in enumerate(train_dataloader):
  
    inpseq = x[0].to(device)
    inpmask = x[1].to(device)
    embeds,_ = model(inpseq, attention_mask = inpmask)
    sentence_vec = torch.mean(embeds[11],1)  #this is supposedly where the sentences are
    embedded_notes.append(sentence_vec.cpu().detach().numpy())
    if i%500 == 0:
        print ('batch #:', i+1)
        print ('results stored...')


batch #: 1
results stored...
batch #: 501
results stored...
batch #: 1001
results stored...
batch #: 1501
results stored...
batch #: 2001
results stored...
batch #: 2501
results stored...
batch #: 3001
results stored...
batch #: 3501
results stored...


In [62]:
len(embedded_notes)

3968

In [63]:
flat_sentences = [item for sublist in embedded_notes for item in sublist]

In [64]:
data['embedded_pmhx'] = flat_sentences

In [65]:
data['embedded_pmhx'].to_csv(data_path + 'pmhx_embeds.csv')

## below here, I'm just learning to import the data and merge it.  works well