## Install and Import

In [1]:
import tensorflow as tf

# Abort early unless TensorFlow reports the first GPU under its expected name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
!pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 6.7MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/df/ae/b6d18c3f37da5a78e83701469e6153811f4b0ecb3f9387bb3e9a65ca48ee/pytorch_nlp-0.4.1-py3-none-any.whl (82kB)
[K    100% |████████████████████████████████| 92kB 22.7MB/s 
[?25hCollecting boto3 (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/f6/fa/6397049020b312f71c397fff8d10247c2e49da760e2807af7d21e3c23695/boto3-1.9.253-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 17.1MB/s 
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/60/d9782c56ceefa76033a00e1f84cd8c586c75e6e7fea2cd45ee8b46a386c5

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.
UsageError: Line magic function `%` not found.


In order for torch to use the GPU, we need to identify and specify the GPU as the device. Later, in our training loop, we will load data onto the device. 

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a machine without CUDA, so only query it when a GPU exists.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu (no CUDA device found)"

In [4]:
# Use this cell when working online.
# `data_path` is the directory the input CSV is read from; `path` is not referenced by the cells shown here.
# NOTE(review): both are hard-coded absolute paths (they look like hosted-workspace locations) — adjust per environment.
path = '/floyd/home/ed-triage'
data_path = '/floyd/home/data/egh'

## second pass
now going to figure out how to run this on the whole dataset

In [None]:
def create_dummy_column(s):
  """Return the placeholder text 'empty cell' for missing values, else the value unchanged.

  A value counts as missing when its string form is exactly 'nan' (which is
  how a float NaN prints), so every row ends up with something to embed.
  """
  return 'empty cell' if str(s) == 'nan' else s

In [6]:
# Load the notes table from the data directory; the first CSV column becomes the index.
data = pd.read_csv('{}/subj_data.csv'.format(data_path), index_col=0)
data.shape

(85154, 3)

In [None]:
# Swap missing subjective notes for the placeholder text so every row can be embedded.
data['for embedding'] = data['CleanSubjectiveNotes'].map(create_dummy_column)

In [None]:
# BERT expects each input wrapped in its special start ([CLS]) and end ([SEP]) markers.
sentences = [" ".join(("[CLS]", note, "[SEP]")) for note in data['for embedding'].values]

In [10]:
# Load the lower-casing WordPiece tokenizer and split every marked-up note into tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = list(map(tokenizer.tokenize, sentences))
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

100%|██████████| 231508/231508 [00:00<00:00, 931811.75B/s]


Tokenize the first sentence:
['[CLS]', 'empty', 'cell', '[SEP]']


In [11]:
# Spot-check the tokenization at a handful of row positions, one token list per line.
print ('show a few other tokenized sentences')
print (*(tokenized_texts[pos] for pos in (10, 100, 1000, 10000)), sep='\n')

show a few other tokenized sentences
['[CLS]', 'empty', 'cell', '[SEP]']
['[CLS]', 'empty', 'cell', '[SEP]']
['[CLS]', 'lt', 'sided', 'chest', 'pain', 'since', '113', '##0', 'hours', 'lasted', 'for', '3', '-', '4', 'minutes', ',', 'pain', 'back', 'again', 'at', '1200', 'hours', '.', 'no', 'short', '##ness', 'of', 'breath', ',', 'no', 'di', '##zziness', '[SEP]']
['[CLS]', 'complain', '##s', 'of', 'flank', 'pain', ',', 'advise', 'by', 'dr', '.', 'ts', '##ilia', '##s', 'to', 'come', 'to', 'er', '[SEP]']


In [None]:
# Despite the column name, this stores the "[CLS] ... [SEP]" strings, not the token lists
# (the token lists live in `tokenized_texts`).
data['tokenized_subj_notes'] = sentences

BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create:

- **input ids**: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary
- **segment mask**: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence
- **attention mask**: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph)
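- **labels**: a single value of 1 or 0 for classification tasks (e.g. 1 means "grammatical" and 0 means "ungrammatical" in a grammaticality task). This notebook only extracts embeddings, so no labels are created here.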

In [13]:
# Token count per note (special tokens included), used to choose the padding length.
sent_lens = list(map(len, tokenized_texts))
print ('mean sentence length', np.mean(sent_lens))
print ('max sentence length', np.max(sent_lens))

mean sentence length 33.50418066092021
max sentence length 216


In [None]:
# Pad every sequence to the longest tokenized note so that nothing is truncated.
# BERT's position embeddings cover at most 512 tokens (the length used in the original paper),
# so cap the value there; anything longer would be truncated by the padding step.
MAX_LEN = min(int(np.max(sent_lens)), 512)

In [None]:
# Look up each token's index in the BERT vocabulary, one id list per note.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [None]:
# Pad (and, if ever needed, truncate) every id list at the end so all rows have length MAX_LEN.
# NOTE(review): the fill value is the library default, presumably 0 — the mask cell below relies on that.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

Create the attention masks 

In [None]:
# Attention mask per note: 1.0 where the token id is positive (a real token), 0.0 elsewhere (padding).
attention_masks = [[float(token_id > 0) for token_id in id_row] for id_row in input_ids]

In [None]:
# Convert the padded ids and the attention masks into torch tensors, the datatype the model expects.
# The `train_` prefix is historical: these tensors are only fed through the model to extract embeddings.

train_inputs = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)

In [None]:
# Number of notes pushed through the model per forward pass.
batch_size = 16

# Wrap the tensors in a DataLoader so the model sees one batch at a time.
# No sampler/shuffle is passed, so batches come out in dataframe row order; the cells that write the
# embeddings back into `data` depend on that ordering.

train_data = TensorDataset(train_inputs, train_masks)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [20]:
# Load the pretrained BERT encoder and move it to the selected device.
from pytorch_pretrained_bert import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
# Use the shared `device` rather than model.cuda() so the CPU fallback chosen when `device` was set also works.
model.to(device);

100%|██████████| 407873900/407873900 [00:14<00:00, 27588936.43B/s]


## sentence embedding below

In [None]:
# Run every batch through BERT and keep one mean-pooled vector per note.
model.eval()
embedded_notes = []
# Inference only: no_grad() stops autograd from recording the forward pass, which saves memory.
with torch.no_grad():
  for i, x in enumerate(train_dataloader):
    inpseq = x[0].to(device)
    inpmask = x[1].to(device)
    embeds, pooled = model(inpseq, attention_mask=inpmask)
    # embeds is indexed per encoder layer; 11 is presumably the last layer of the 12-layer base model.
    # NOTE(review): this mean runs over every position, padding included — consider weighting by inpmask.
    sentence_vec = torch.mean(embeds[11], 1)
    embedded_notes.append(sentence_vec.cpu().detach().numpy())
    if i % 500 == 0:
      print ('batch #:', i+1)
      print ('results stored...')


batch #: 1
results stored...
batch #: 501
results stored...
batch #: 1001
results stored...


In [69]:
# Number of batches collected (one array per DataLoader batch), not the number of notes.
len(embedded_notes)

5323

In [None]:
# Unroll the per-batch arrays into one flat list holding a single vector per note.
flat_sentences = []
for batch_vectors in embedded_notes:
  flat_sentences.extend(batch_vectors)

In [None]:
# Attach one vector per row; this lines up with the dataframe only because the DataLoader was not shuffled.
data['temp'] = flat_sentences

In [None]:
# Keep embeddings only for rows that had a real note; placeholder rows end up as missing values.
data['embedded_subnotes'] = data.loc[data['CleanSubjectiveNotes'].notnull(), 'temp']

In [77]:
# NOTE(review): the saved output shows columns such as `pooled` and `embedded_sentences` that the cells
# above do not create, so it appears to come from an earlier session — re-run to refresh it.
data.head()

Unnamed: 0,CleanSubjectiveNotes,MedicalHistory,pmhx,tokenized_subj_notes,pooled,embedded_sentences,new
0,,,,[CLS] empty cell [SEP],"[-0.81541735, -0.21443638, -0.17744659, 0.6907...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
1,,,,[CLS] empty cell [SEP],"[-0.9214988, -0.31617814, -0.4026264, 0.831041...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
2,,,,[CLS] empty cell [SEP],"[-0.76925135, -0.16731277, 0.63148165, 0.60955...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
3,,,,[CLS] empty cell [SEP],"[-0.8026976, -0.20626362, 0.33342388, 0.616272...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
4,,,,[CLS] empty cell [SEP],"[-0.9585385, -0.45988795, -0.8535837, 0.861542...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
5,,,,[CLS] empty cell [SEP],"[-0.8474464, -0.28719974, -0.3201902, 0.689953...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
6,,,,[CLS] empty cell [SEP],"[-0.8720149, -0.2216901, 0.31625006, 0.7715695...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
7,,,,[CLS] empty cell [SEP],"[-0.7794212, -0.12808736, 0.60340405, 0.635327...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
8,,,,[CLS] empty cell [SEP],"[-0.78633195, -0.18919525, 0.3827547, 0.634590...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
9,,,,[CLS] empty cell [SEP],"[-0.75231934, -0.21003105, 0.4703341, 0.559403...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",


In [78]:
# Inspect ten rows further down by position, where real notes appear alongside empty ones.
# NOTE(review): the saved output lists columns the cells above do not create — it looks stale.
data.iloc[20000:20010]

Unnamed: 0,CleanSubjectiveNotes,MedicalHistory,pmhx,tokenized_subj_notes,pooled,embedded_sentences,new
20000,intermittent abdominal pain since last night. ...,^c^^^^ctasMEHPL=No Significant Medical History,no significant medical history,[CLS] intermittent abdominal pain since last n...,"[-0.7541863, -0.38550034, -0.935306, 0.6506193...","[-0.17362829, -0.1274244, 0.38404685, -0.16352...","[-0.17362829, -0.1274244, 0.38404685, -0.16352..."
20001,for suture removal rt thumb.,,,[CLS] for suture removal rt thumb. [SEP],"[-0.7758408, -0.26031324, -0.6724933, 0.686550...","[-0.28641352, -0.27055612, 0.48553735, -0.2155...","[-0.28641352, -0.27055612, 0.48553735, -0.2155..."
20002,was seen at mackenzie health last sunday afte...,"^c^^^^ctasMEHPL=HTN, NIDDM,","htn, niddm",[CLS] was seen at mackenzie health last sunda...,"[-0.8877131, -0.40006533, -0.941798, 0.8546101...","[-0.025925823, -0.16426979, 0.42920288, -0.115...","[-0.025925823, -0.16426979, 0.42920288, -0.115..."
20003,"pain across the chest non radiating x3 days, ...","^c^^^^ctasMEHPL=Niddm, cardiac x2, high choles...","niddm, cardiac x2, high cholesterol, htn, anxi...",[CLS] pain across the chest non radiating x3 ...,"[-0.75477433, -0.38048938, -0.9076633, 0.62457...","[-0.13821964, 0.1634989, 0.31855455, -0.376983...","[-0.13821964, 0.1634989, 0.31855455, -0.376983..."
20004,,"^c^^^^ctasMEHPL=htn, gerd, skin lesion to head","htn, gerd, skin lesion to head",[CLS] empty cell [SEP],"[-0.9424013, -0.3451816, 0.33916035, 0.7134917...","[0.11423147, 0.051171873, 0.01352569, 0.083628...",
20005,kicked by resident at the nh 3 days ago compla...,"^c^^^^ctasMEHPL=appendectomy, abd hernia, asthma,","appendectomy, abd hernia, asthma",[CLS] kicked by resident at the nh 3 days ago ...,"[-0.766297, -0.15209605, -0.69855046, 0.506783...","[-0.13807704, -0.10910831, 0.17822601, -0.2196...","[-0.13807704, -0.10910831, 0.17822601, -0.2196..."
20006,left arm/shoulder pain x 2 weeks. was seen by...,"^c^^^^ctasMEHPL=NIDDM, gerd,HTN, high cholesterol","niddm, gerd, htn, high cholesterol",[CLS] left arm/shoulder pain x 2 weeks. was s...,"[-0.72606647, -0.3469878, -0.98886853, 0.87553...","[-0.2114657, 0.110205136, 0.45870924, -0.23148...","[-0.2114657, 0.110205136, 0.45870924, -0.23148..."
20007,"diagnosed with psoriasis 10 years ago, skin ra...","^c^^^^ctasMEHPL=psoriasis, htn, high cholester...","psoriasis, htn, high cholesterol, niddm","[CLS] diagnosed with psoriasis 10 years ago, s...","[-0.78073835, -0.42687625, -0.91565096, 0.6277...","[-0.0015239774, -0.007246774, 0.15447551, -0.2...","[-0.0015239774, -0.007246774, 0.15447551, -0.2..."
20008,generalized weakness x 2 weeks.has an appointm...,"^c^^^^ctasMEHPL=niddm, HIGH CHOLESTEROL","niddm, high cholesterol",[CLS] generalized weakness x 2 weeks.has an ap...,"[-0.9108423, -0.4612296, -0.9929832, 0.9246962...","[-0.24177133, 0.04832459, 0.29758218, -0.29893...","[-0.24177133, 0.04832459, 0.29758218, -0.29893..."
20009,"on and off mid abdominal pain x3 weeks, more ...","^c^^^^ctasMEHPL=abdominal hernia,",abdominal hernia,"[CLS] on and off mid abdominal pain x3 weeks, ...","[-0.7534101, -0.35356155, -0.94269687, 0.73001...","[-0.10759002, -0.19900884, 0.3312311, -0.28770...","[-0.10759002, -0.19900884, 0.3312311, -0.28770..."


In [79]:
# Persist the subjective-note embeddings (a single column) as CSV.
# NOTE(review): this writes under '/content' while the configured paths live under '/floyd' — confirm the target.
# NOTE(review): Series.to_csv's default for `header` has differed between pandas versions — consider passing it explicitly.
data['embedded_subnotes'].to_csv('/content/subjnote_embeds.csv')

  """Entry point for launching an IPython kernel.


## now gonna sentence embed medical history

In [None]:
# Replace missing medical-history cells with the placeholder text so the whole dataset can be
# embedded and the results plugged back into the dataframe. This overwrites 'for embedding'.
data['for embedding'] = data.pmhx.map(create_dummy_column)

In [None]:
# BERT expects each input wrapped in its special start ([CLS]) and end ([SEP]) markers.
sentences = [" ".join(("[CLS]", history, "[SEP]")) for history in data['for embedding'].values]

In [87]:
# Load the lower-casing WordPiece tokenizer and split every marked-up history into tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = list(map(tokenizer.tokenize, sentences))
print ("Tokenize the first sentence:")
print (tokenized_texts[0])


Tokenize the first sentence:
['[CLS]', 'empty', 'cell', '[SEP]']


In [88]:
# Spot-check the tokenization at a handful of row positions, one token list per line.
print ('show a few other tokenized sentences')
print (*(tokenized_texts[pos] for pos in (10, 100, 1000, 10000)), sep='\n')

['[CLS]', 'empty', 'cell', '[SEP]']
['[CLS]', 'empty', 'cell', '[SEP]']
['[CLS]', 'no', 'significant', 'medical', 'history', '[SEP]']


In [None]:
# Despite the column name, this stores the "[CLS] ... [SEP]" strings, not the token lists
# (the token lists live in `tokenized_texts`).
data['tokenized_medhx'] = sentences

In [90]:
# Token count per medical-history entry (special tokens included), used to choose the padding length.
sent_lens = list(map(len, tokenized_texts))
print ('mean sentence length:', np.mean(sent_lens))
print ('max sentence length:', np.max(sent_lens))

mean sentence length 7.951229537074007
max sentence length 101


In [None]:
# Pad every sequence to the longest tokenized history so that nothing is truncated.
# BERT's position embeddings cover at most 512 tokens (the length used in the original paper),
# so cap the value there; anything longer would be truncated by the padding step.
MAX_LEN = min(int(np.max(sent_lens)), 512)

In [None]:
# Look up each token's index in the BERT vocabulary, one id list per history entry.
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [None]:
# Pad (and, if ever needed, truncate) every id list at the end so all rows have length MAX_LEN.
# NOTE(review): the fill value is the library default, presumably 0 — the mask cell below relies on that.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

Create the attention masks 

In [None]:
# Attention mask per entry: 1.0 where the token id is positive (a real token), 0.0 elsewhere (padding).
attention_masks = [[float(token_id > 0) for token_id in id_row] for id_row in input_ids]

In [None]:
# Convert the padded ids and the attention masks into torch tensors, the datatype the model expects.
# The `train_` prefix is historical: these tensors are only fed through the model to extract embeddings.

train_inputs = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)

In [None]:
# Number of entries pushed through the model per forward pass.
batch_size = 16

# Wrap the tensors in a DataLoader so the model sees one batch at a time.
# No sampler/shuffle is passed, so batches come out in dataframe row order; writing the
# embeddings back into `data` depends on that ordering.

train_data = TensorDataset(train_inputs, train_masks)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [None]:
# Load the pretrained BERT encoder and move it to the selected device.
from pytorch_pretrained_bert import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
# Use the shared `device` rather than model.cuda() so the CPU fallback chosen when `device` was set also works.
model.to(device);

In [98]:
# Run every batch through BERT and keep one mean-pooled vector per medical-history entry.
model.eval()
embedded_notes = []
# Inference only: no_grad() stops autograd from recording the forward pass, which saves memory.
with torch.no_grad():
  for i, x in enumerate(train_dataloader):
    inpseq = x[0].to(device)
    inpmask = x[1].to(device)
    embeds, pooled = model(inpseq, attention_mask=inpmask)
    # embeds is indexed per encoder layer; 11 is presumably the last layer of the 12-layer base model.
    # NOTE(review): this mean runs over every position, padding included — consider weighting by inpmask.
    sentence_vec = torch.mean(embeds[11], 1)
    embedded_notes.append(sentence_vec.cpu().detach().numpy())
    if i % 500 == 0:
      print ('batch #:', i+1)
      print ('results stored...')


batch #: 1
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 51
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 101
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 151
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 201
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 251
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 301
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 351
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 401
input sequence shape: torch.Size([16, 101])
mask shape: torch.Size([16, 101])
results stored...
batch #: 451
input seq

In [None]:
# Copy the embeddings into 'new' only for rows that have a subjective note; other rows become missing.
# NOTE(review): no visible cell creates the 'embedded_sentences' column, so on a fresh kernel this
# presumably raises a KeyError — confirm where that column is meant to come from.
data['new'] = data['embedded_sentences'][data['CleanSubjectiveNotes'].notnull() ]