## Install and Import

In [1]:
import tensorflow as tf

# Ask TensorFlow for the GPU it can see and stop immediately unless it is
# the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
!pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 7.4MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K    100% |████████████████████████████████| 92kB 17.3MB/s 
[?25hCollecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/1d/c1/c90beb2dbbfbf19f3634e16a441d5f11fa787bdf0748a35b8b88452c0e78/regex-2020.4.4-cp36-cp36m-manylinux1_x86_64.whl (679kB)
[K    100% |████████████████████████████████| 686kB 15.3MB/s 
Collecting boto3 (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/27/87/de75e5a24584d82cca60b86f95d06e56412ed9e23807dcf

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
#% matplotlib inline

Using TensorFlow backend.


In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device. 

In [4]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a GPU exists: get_device_name(0) raises on a
# CPU-only machine, which would defeat the CPU fallback chosen above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

'Tesla V100-SXM2-16GB'

In [9]:
# Use this cell when working online. `data_path` is the directory the CSV
# files in this notebook are read from and written to.
path, data_path = '/floyd/home/ed-triage', '/floyd/home/data/combined'

## second pass
Next, work out how to run this on the whole dataset.

In [10]:
def create_dummy_column(s):
  """Return the placeholder text 'empty cell' for a missing value, else `s` unchanged.

  A value counts as missing when its string form is exactly 'nan' (which is
  how a float NaN renders). Every other value is passed through untouched.
  """
  return 'empty cell' if str(s) == 'nan' else s

In [19]:
# Read the combined dataset, using the first CSV column as the index, then
# show its dimensions.
csv_file = data_path + '/complete_clean_combo_data.csv'
data = pd.read_csv(csv_file, index_col=0, low_memory=False)
data.shape

(165433, 121)

In [20]:
# NOTE(review): the rendered preview includes identifier columns
# (ChartNumber, EncounterNumber) — clear or redact this output before sharing.
data.head()

Unnamed: 0,ID,ChartNumber,EncounterNumber,TriageLevel,AgeNumber,AgeInYrs,GenderDesc,PIA Date & Time,Disposition Date & Time,DischargeDisposition,...,Reg Date & Timeday_year_cos,Reg Date & Timeday_year_sin,Reg Date & Timehour_cos,Reg Date & Timehour_sin,Reg Date & Timeclock_cos,Reg Date & Timeclock_sin,Reg Date & Timemin_cos,Reg Date & Timemin_sin,Reg Date & Timesec_cos,Reg Date & Timesec_sin
0,149.0,N179474,NE000150/18,2.0,43.0,43.0,Female,01/04/2018 12:25:00PM,01/04/2018 2:30:00PM,17.0,...,0.998667,0.05162,-0.965926,0.258819,0.8660254,-0.5,0.809017,-0.587785,1.0,0.0
1,198.0,N798201,NE000199/18,3.0,13.0,13.0,Female,01/04/2018 3:23:00PM,01/04/2018 4:13:00PM,17.0,...,0.998667,0.05162,-0.866025,-0.5,0.5,0.866025,0.994522,0.104528,1.0,0.0
2,218.0,N798204,NE000225/18,2.0,23.0,23.0,Female,01/04/2018 3:00:00PM,01/04/2018 5:05:00PM,17.0,...,0.998667,0.05162,-0.707107,-0.707107,6.123234000000001e-17,1.0,0.406737,0.913545,1.0,0.0
3,219.0,N110229,NE000226/18,2.0,30.0,30.0,Female,01/04/2018 4:36:00PM,01/04/2018 11:00:00PM,17.0,...,0.998667,0.05162,-0.707107,-0.707107,6.123234000000001e-17,1.0,0.104528,0.994522,1.0,0.0
4,227.0,N739034,NE000222/18,4.0,36.0,36.0,Female,,01/04/2018 3:30:00PM,63.0,...,0.998667,0.05162,-0.707107,-0.707107,6.123234000000001e-17,1.0,0.866025,0.5,1.0,0.0


In [21]:
# Swap missing subjective notes for placeholder text so every row can be embedded.
subjective_notes = data['CleanSubjectiveNotes']
data['for embedding'] = subjective_notes.map(create_dummy_column)

In [22]:
# BERT needs a [CLS] marker at the start and a [SEP] marker at the end of
# every input, so wrap each note with them.
raw_notes = data['for embedding'].values
sentences = ["[CLS] " + note + " [SEP]" for note in raw_notes]

In [23]:
# Load the lower-casing tokenizer that belongs to bert-base-uncased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize every wrapped note, then show the first result as a sanity check.
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

100%|██████████| 231508/231508 [00:00<00:00, 1021109.54B/s]


Tokenize the first sentence:
['[CLS]', 'patient', 'states', 'that', 'she', 'feels', 'shaky', '.', 'patient', 'denies', 'any', 'pain', '.', '[SEP]']


In [24]:
print ('show a few other tokenized sentences')
# Spot-check rows spread across the dataset.
for row in (10, 100, 1000, 10000):
    print (tokenized_texts[row])

show a few other tokenized sentences
['[CLS]', 'patient', 'states', 'everything', 'is', 'ok', '.', 'denies', 'any', 'head', 'pain', '.', '[SEP]']
['[CLS]', 'patient', 'had', 'an', 'renal', 'ultrasound', 'today', 'and', 'coming', 'back', 'for', 'the', 'result', 'her', 'cr', '##ea', '##tin', '##ine', 'was', 'elevated', ',', 'also', 'complain', '##s', 'of', 'head', '##ca', '##he', 'for', 'over', 'a', 'week', 'no', 'blur', '##ry', 'vision', 'nausea', '##ted', 'no', 'vomiting', '.', '[SEP]']
['[CLS]', 'patient', 'slipped', 'and', 'fell', 'at', '22', '00', ',', 'head', 'hit', 'to', 'concrete', 'and', 'had', 'small', 'hem', '##ato', '##ma', 'x', '2', 'on', 'the', 'back', 'of', 'the', 'head', ',', 'lost', 'lo', '##c', 'x', '1', 'mt', '.', '[SEP]']
['[CLS]', 'sts', 'playing', 'basketball', ',', 'twisted', 'left', 'ankle', ',', 'with', 'swelling', ',', 'pain', 'with', 'movement', ',', 'with', 'ice', 'packed', 'on', '.', '[SEP]']


In [25]:
# NOTE(review): despite the column name, this stores `sentences` (the note text
# wrapped in [CLS]/[SEP]), not the token lists in `tokenized_texts` — confirm intent.
data['tokenized_subj_notes'] = sentences

In [26]:
# Write the stored column out next to the source data.
out_file = data_path + '/tokenized_subj_notes.csv'
data['tokenized_subj_notes'].to_csv(out_file)

In [27]:
# Token count of every note, followed by summary statistics.
sent_lens = list(map(len, tokenized_texts))
print ('mean sentence length', np.mean(sent_lens))
print ('max sentence length', np.max(sent_lens))

mean sentence length 47.59394437627317
max sentence length 268


In [28]:
# Set the maximum sequence length to the longest tokenized note in the dataset
# (the statistics printed above show a mean of ~47 tokens and a maximum of 268).
# BERT accepts at most 512 positions, so cap the value there; anything longer
# is truncated by the padding step instead of failing inside the model.
MAX_LEN = min(int(np.max(sent_lens)), 512)

In [29]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [30]:
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [31]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [32]:
# Convert all of our data into torch tensors, the required datatype for our model

# The model consumes torch tensors, so convert the padded ids and their masks.
train_inputs, train_masks = torch.tensor(input_ids), torch.tensor(attention_masks)

In [33]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [35]:
#we are going to download the model and transfer it to cuda
from pytorch_pretrained_bert import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
model.cuda();

## sentence embedding below

In [36]:
# Embed every note: run each batch through the model and keep one vector per note.
model.eval()
embedded_notes = []
# Inference only: no_grad() stops autograd from recording the forward pass, so
# no graph is kept alive for any batch. The numeric results are unchanged.
with torch.no_grad():
    for i, x in enumerate(train_dataloader):
        inpseq = x[0].to(device)
        inpmask = x[1].to(device)
        embeds,_ = model(inpseq, attention_mask = inpmask)
        # Average element 11 of the returned layer outputs over dim 1
        # (assumed to be the token axis of a (batch, tokens, hidden) tensor — TODO confirm).
        # NOTE(review): the mean also covers padded positions — confirm that is intended.
        sentence_vec = torch.mean(embeds[11],1)
        embedded_notes.append(sentence_vec.cpu().detach().numpy())
        if i%500 == 0:
            print ('batch #:', i+1)
            print ('results stored...')


batch #: 1
results stored...
batch #: 501
results stored...
batch #: 1001
results stored...
batch #: 1501
results stored...
batch #: 2001
results stored...
batch #: 2501
results stored...
batch #: 3001
results stored...
batch #: 3501
results stored...
batch #: 4001
results stored...
batch #: 4501
results stored...
batch #: 5001
results stored...
batch #: 5501
results stored...
batch #: 6001
results stored...
batch #: 6501
results stored...
batch #: 7001
results stored...
batch #: 7501
results stored...
batch #: 8001
results stored...
batch #: 8501
results stored...
batch #: 9001
results stored...
batch #: 9501
results stored...
batch #: 10001
results stored...


In [37]:
# One entry per batch, not per note: the loop appends one array for each batch.
len(embedded_notes)

10340

In [38]:
# Unpack the per-batch arrays into one flat list holding one vector per note.
flat_sentences = []
for batch_vectors in embedded_notes:
    flat_sentences.extend(batch_vectors)

In [39]:
# Attach one embedding vector per row; assumes the flattened vectors are in the
# same row order as `data` — TODO confirm.
data['embedded_subjnotes'] = flat_sentences

In [45]:
# Preview the first few stored embedding vectors.
data['embedded_subjnotes'].head()

0    [-0.15257795, -0.26369357, 0.53711694, 0.09271...
1    [-0.089328445, -0.10715083, 0.27685848, -0.280...
2    [-0.24219792, 0.25459412, 0.08023955, 0.024560...
3    [-0.007053161, -0.15278126, 0.15155132, -0.132...
4    [-0.22487935, 0.059827298, 0.26115206, -0.2373...
Name: embedded_subjnotes, dtype: object

In [47]:
# Build a table from the flat list of vectors (one list entry per row).
subj_embed_df = pd.DataFrame(data=flat_sentences)

In [48]:
# Save the subjective-note embeddings alongside the source data.
subj_embeds_file = data_path + '/subjnote_embeds.csv'
subj_embed_df.to_csv(subj_embeds_file)

## Sentence-embed the medical history

In [53]:
# Empty medical-history cells get placeholder text so the whole dataset can be
# embedded and the results plugged back into the dataframe row for row.
medical_history = data['CleanMedicalHistory']
data['for embedding'] = medical_history.map(create_dummy_column)

In [54]:
# BERT needs a [CLS] marker at the start and a [SEP] marker at the end of
# every input, so wrap each medical-history entry with them.
raw_history = data['for embedding'].values
sentences = ["[CLS] " + entry + " [SEP]" for entry in raw_history]

In [55]:
# Load the lower-casing tokenizer that belongs to bert-base-uncased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize every wrapped entry, then show the first result as a sanity check.
tokenized_texts = list(map(tokenizer.tokenize, sentences))
print ("Tokenize the first sentence:")
print (tokenized_texts[0])


Tokenize the first sentence:
['[CLS]', 'no', 'significant', 'medical', 'history', '[SEP]']


In [56]:
print ('show a few other tokenized sentences')
# Spot-check rows spread across the dataset.
for row in (10, 100, 1000, 10000):
    print (tokenized_texts[row])

show a few other tokenized sentences
['[CLS]', 'no', 'significant', 'medical', 'history', '[SEP]']
['[CLS]', 'h', '##yp', '##oth', '##yr', '##oid', '[SEP]']
['[CLS]', 'no', 'significant', 'medical', 'history', '[SEP]']
['[CLS]', 'ad', '##hd', '(', 'attention', 'deficit', 'hyper', '##act', '##ivity', 'disorder', ')', '[SEP]']


In [57]:
# NOTE(review): despite the column name, this stores `sentences` (the entry text
# wrapped in [CLS]/[SEP]), not the token lists in `tokenized_texts` — confirm intent.
data['tokenized_medhx'] = sentences

In [64]:
# Token count of every medical-history entry, followed by summary statistics.
sent_lens = list(map(len, tokenized_texts))
print ('mean sentence length:', np.mean(sent_lens))
print ('max sentence length:', np.max(sent_lens))

mean sentence length: 9.551733934583789
max sentence length: 107


In [65]:
# Set the maximum sequence length to the longest tokenized medical-history entry
# (the statistics printed above show a mean of ~9.6 tokens and a maximum of 107).
# BERT accepts at most 512 positions, so cap the value there; anything longer
# is truncated by the padding step instead of failing inside the model.
MAX_LEN = min(int(np.max(sent_lens)), 512)

In [66]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [67]:
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

Create the attention masks 

In [68]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [69]:
# Convert all of our data into torch tensors, the required datatype for our model

# The model consumes torch tensors, so convert the padded ids and their masks.
train_inputs, train_masks = torch.tensor(input_ids), torch.tensor(attention_masks)

In [70]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [71]:
#we are going to download the model and transfer it to cuda
from pytorch_pretrained_bert import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
model.cuda();

In [72]:
# Embed every medical-history entry: run each batch through the model and keep
# one vector per entry.
model.eval()
embedded_notes = []
# Inference only: no_grad() stops autograd from recording the forward pass, so
# no graph is kept alive for any batch. The numeric results are unchanged.
with torch.no_grad():
    for i, x in enumerate(train_dataloader):
        inpseq = x[0].to(device)
        inpmask = x[1].to(device)
        embeds,_ = model(inpseq, attention_mask = inpmask)
        # Average element 11 of the returned layer outputs over dim 1
        # (assumed to be the token axis of a (batch, tokens, hidden) tensor — TODO confirm).
        # NOTE(review): the mean also covers padded positions — confirm that is intended.
        sentence_vec = torch.mean(embeds[11],1)
        embedded_notes.append(sentence_vec.cpu().detach().numpy())
        if i%500 == 0:
            print ('batch #:', i+1)
            print ('results stored...')


batch #: 1
results stored...
batch #: 501
results stored...
batch #: 1001
results stored...
batch #: 1501
results stored...
batch #: 2001
results stored...
batch #: 2501
results stored...
batch #: 3001
results stored...
batch #: 3501
results stored...
batch #: 4001
results stored...
batch #: 4501
results stored...
batch #: 5001
results stored...
batch #: 5501
results stored...
batch #: 6001
results stored...
batch #: 6501
results stored...
batch #: 7001
results stored...
batch #: 7501
results stored...
batch #: 8001
results stored...
batch #: 8501
results stored...
batch #: 9001
results stored...
batch #: 9501
results stored...
batch #: 10001
results stored...


In [73]:
# One entry per batch, not per row: the loop appends one array for each batch.
len(embedded_notes)

10340

In [74]:
# Unpack the per-batch arrays into one flat list holding one vector per entry.
flat_sentences = []
for batch_vectors in embedded_notes:
    flat_sentences.extend(batch_vectors)

In [75]:
# Build a table from the flat list of vectors (one list entry per row).
pmhx_embed_df = pd.DataFrame(data=flat_sentences)

In [76]:
# Save the medical-history embeddings alongside the source data.
pmhx_embeds_file = data_path + '/pmhx_embeds.csv'
pmhx_embed_df.to_csv(pmhx_embeds_file)