# Imports

In [1]:
!pip install transformers --quiet
!pip install gensim==3.8.3
!pip install -q sentencepiece
!pip install tensorflow_datasets
!pip install -q tf-models-official

import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
from pprint import pprint

import os
import nltk
from nltk.data import find
import pandas as pd

import matplotlib.pyplot as plt

import re

from transformers import BertTokenizer, TFBertModel, AutoTokenizer, AutoModel
import textwrap

from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
from transformers import GPT2Tokenizer
# import tensorflow_datasets as tfds



In [2]:
data_path = "nbme-score-clinical-patient-notes/"
print("Project Path:", data_path)

Project Path: nbme-score-clinical-patient-notes/


In [3]:
os.listdir(data_path)

['test.csv',
 'patient_notes.csv',
 'train.csv',
 'features.csv',
 'sample_submission.csv']

In [4]:
features = pd.read_csv(data_path + 'features.csv')
patient_notes = pd.read_csv(data_path + 'patient_notes.csv')
sample_submission = pd.read_csv(data_path + 'sample_submission.csv')
training_data = pd.read_csv(data_path + 'train.csv')
test_data = pd.read_csv(data_path + 'train.csv')

In [5]:
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


In [6]:
text_notes = patient_notes["pn_history"]
text_notes

0        17-year-old male, has come to the student heal...
1        17 yo male with recurrent palpitations for the...
2        Dillon Cleveland is a 17 y.o. male patient wit...
3        a 17 yo m c/o palpitation started 3 mos ago; \...
4        17yo male with no pmh here for evaluation of p...
                               ...                        
42141    Ms. Madden is a 20 yo female presenting w/ the...
42142    A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143    Ms. Madden is a 20yo female who presents with ...
42144    Stephanie madden is a 20 year old woman compla...
42145    patient is a 20 yo F who presents with a heada...
Name: pn_history, Length: 42146, dtype: object

In [7]:
patient_notes_w_training = pd.merge(training_data, patient_notes, how="left", on="pn_num")
patient_notes_w_training = patient_notes_w_training.drop(["case_num_y"], axis=1)
patient_notes_w_training = patient_notes_w_training[patient_notes_w_training.annotation.apply(lambda w: len(w)) > 2]
patient_notes_w_training = pd.merge(patient_notes_w_training, features, how="inner", on="feature_num")
patient_notes_w_training

Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,pn_history,case_num,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,00046_000,0,46,0,['father: heart attack'],['824 844'],Mr. Cleveland is a 17yo M who was consented by...,0,Family-history-of-MI-OR-Family-history-of-myoc...
2,00082_000,0,82,0,['Father MI'],['622 631'],17 yo M w/ no cardiac or arrhythmia PMH presen...,0,Family-history-of-MI-OR-Family-history-of-myoc...
3,00100_000,0,100,0,['Dad-MI'],['735 741'],HPI: Dillon Cleveland is an otherwise healthy ...,0,Family-history-of-MI-OR-Family-history-of-myoc...
4,00161_000,0,161,0,['father had acute MI'],['601 620'],"17 y/o M , Dillon Cleveland comes with c/o of ...",0,Family-history-of-MI-OR-Family-history-of-myoc...
...,...,...,...,...,...,...,...,...,...
9896,95128_905,9,95128,905,['neck pain'],['218 227'],20 year odl female c/o headaches x few hrs. He...,9,Neck-pain
9897,95145_905,9,95145,905,['neck pain'],['158 167'],Pt is 20 yo F w headache since yesterday morni...,9,Neck-pain
9898,95333_905,9,95333,905,['Neck stiffness'],['338 352'],Stephanie madden is a 20 year old woman compla...,9,Neck-pain
9899,92203_911,9,92203,911,['unsure of meningitis shot'],['512 521;544 559'],Stephanie Madden is a 20 year old female who p...,9,Meningococcal-vaccine-status-unknown


In [8]:
#import tensorflow_datasets as tfds
import tensorflow_hub as hub
#import tensorflow_models as tfm

In [9]:
bio_folder_bert = "biobert_v1.1_pubmed/"
tf.io.gfile.listdir(bio_folder_bert)

['model.ckpt-1000000.meta',
 'config.json',
 'model.ckpt-1000000.index',
 'vocab.txt',
 'pytorch_model.bin',
 '.ipynb_checkpoints',
 'model.ckpt-1000000.data-00000-of-00001']

In [10]:
train_data = (tf.data.Dataset.from_tensor_slices(
    (
    tf.cast(text_notes, tf.string),
    tf.cast(patient_notes["case_num"], tf.int32
            ))
))

In [11]:
display_data = next(iter(train_data.batch(100)))

In [12]:
display_data[:5]

(<tf.Tensor: shape=(100,), dtype=string, numpy=
 array([b"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\r\n-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\r\n-associated with dispnea on exersion and rest,stressed out about school\r\n-reports fe feels like his heart is jumping out of his chest\r\n-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\r\n-pmh:non,meds :aderol (from a friend),nkda\r\n-fh:father had MI recently,mother has thyroid dz\r\n-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\r\n-sh:no std",
        b'17 yo male with recurrent palpitations for the past 3 mo lasting about 3 - 4 min, it happened about 5 - 6 times since the beginning. One time durign a baskeball game two days ago light headedness, pressure in the chest

### Initialize BioBert Model

In [13]:
import json

### Tokenizer

In [2]:
bio_bert_tokenizer = BertTokenizer.from_pretrained(bio_folder_bert, local_files_only=True)
bio_bert_tokenizer_gh = AutoTokenizer.from_pretrained("gsarti/biobert-nli")

NameError: name 'bio_folder_bert' is not defined

In [15]:
test_input = bio_bert_tokenizer(['This is great!', 'This is terrible!'], 
              max_length=10,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')
test_input

{'input_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[ 101, 1142, 1110, 1632,  106,  102,    0,    0,    0,    0],
       [ 101, 1142, 1110, 6434,  106,  102,    0,    0,    0,    0]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], dtype=int32)>}

### Model

In [16]:
import sklearn
# !pip install torch torchvision



In [17]:
config_path = os.path.join(bio_folder_bert, "bert_config.json")
config_path

'biobert_v1.1_pubmed/bert_config.json'

In [18]:
bio_bert_model = TFBertModel.from_pretrained(bio_folder_bert, local_files_only=False, from_pt=True)
bio_bert_model_gc = TFAutoModel.from_pretrained("gsarti/biobert-nli")

All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [19]:
len(bio_bert_model.weights)

199

In [20]:
bio_bert_model.weights[0]

<tf.Variable 'tf_bert_model/bert/embeddings/word_embeddings/weight:0' shape=(28996, 768) dtype=float32, numpy=
array([[-0.02646292, -0.00737567, -0.02888124, ..., -0.036186  ,
        -0.03402966,  0.01820285],
       [-0.01152121,  0.01323619, -0.05758354, ..., -0.02354121,
        -0.06789913, -0.00655153],
       [ 0.02323636,  0.00516385, -0.02507166, ..., -0.06960029,
        -0.02208178, -0.02544149],
       ...,
       [-0.02961127, -0.05189098, -0.05919506, ..., -0.05500455,
        -0.05904943,  0.01434717],
       [-0.04530165,  0.01503256, -0.06112099, ..., -0.0789494 ,
        -0.02989064, -0.00505865],
       [ 0.03025113, -0.00555819, -0.04065674, ..., -0.02147963,
        -0.05192445, -0.0016351 ]], dtype=float32)>

### Train & Test Data Set

In [21]:
filtered_training_data = training_data[training_data.annotation.apply(lambda w: len(w)) > 2]
filtered_training_data

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
...,...,...,...,...,...,...
14291,95333_908,9,95333,908,['Nausea'],['354 360']
14293,95333_910,9,95333,910,['lives with roomate'],['576 594']
14297,95333_914,9,95333,914,['photobia'],['274 282']
14298,95333_915,9,95333,915,['no sick contacts'],['421 437']


In [22]:
patient_notes_w_training

Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,pn_history,case_num,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,00046_000,0,46,0,['father: heart attack'],['824 844'],Mr. Cleveland is a 17yo M who was consented by...,0,Family-history-of-MI-OR-Family-history-of-myoc...
2,00082_000,0,82,0,['Father MI'],['622 631'],17 yo M w/ no cardiac or arrhythmia PMH presen...,0,Family-history-of-MI-OR-Family-history-of-myoc...
3,00100_000,0,100,0,['Dad-MI'],['735 741'],HPI: Dillon Cleveland is an otherwise healthy ...,0,Family-history-of-MI-OR-Family-history-of-myoc...
4,00161_000,0,161,0,['father had acute MI'],['601 620'],"17 y/o M , Dillon Cleveland comes with c/o of ...",0,Family-history-of-MI-OR-Family-history-of-myoc...
...,...,...,...,...,...,...,...,...,...
9896,95128_905,9,95128,905,['neck pain'],['218 227'],20 year odl female c/o headaches x few hrs. He...,9,Neck-pain
9897,95145_905,9,95145,905,['neck pain'],['158 167'],Pt is 20 yo F w headache since yesterday morni...,9,Neck-pain
9898,95333_905,9,95333,905,['Neck stiffness'],['338 352'],Stephanie madden is a 20 year old woman compla...,9,Neck-pain
9899,92203_911,9,92203,911,['unsure of meningitis shot'],['512 521;544 559'],Stephanie Madden is a 20 year old female who p...,9,Meningococcal-vaccine-status-unknown


In [23]:
x_train_set, y_train_set, x_valid_set, y_valid_set = sk.model_selection.train_test_split(patient_notes_w_training['pn_history'], patient_notes_w_training['case_num_x'], test_size=0.25, random_state=44)
x_train_set                                                                                                                                                                                  
                                                                                         

5173    CC: chest palpitations\r\nHPI: 26 yo females p...
5677    26 YO F COMING FOR A FOLLOW UP due to cardiac ...
9207    20 Y OLD F C/O HEADACHE X 2 DAYS \r\n- STARTED...
7790    Loraine Wicks is a 67 yo F with a history of H...
617     HPI: Mr. Cleveland is a 17 yo m that presents ...
                              ...                        
2144    Mrs. Montgomery is a 44yo female presenting wi...
3971    CC: Stomach problems\r\n35yo M who presents wi...
571     17 yo CC palpitation \r\n-Started 3 months ago...
9389    Ms. Madden is a previously healthy 20 y/o fema...
3491    HPI 35 yo M complains of epigastric pain for 2...
Name: pn_history, Length: 7425, dtype: object

In [24]:
y_valid_set

7918    8
1657    1
1642    1
8099    8
9008    9
       ..
8146    8
9267    9
7623    7
2852    2
6956    6
Name: case_num_x, Length: 2476, dtype: int64

In [25]:
x_train_data_set = (tf.data.Dataset.from_tensor_slices(
    (
    tf.cast(x_train_set, tf.string),
    tf.cast(x_valid_set, tf.int32
            ))
))

In [58]:
valid_data_set = (tf.data.Dataset.from_tensor_slices(
    (
    tf.cast(y_train_set, tf.string),
    tf.cast(y_valid_set, tf.int32
            ))
))

In [59]:
x_train_data_set_2 = (tf.data.Dataset.from_tensor_slices(
    (
    tf.cast(filtered_training_data['annotation'], tf.string),
    tf.cast(filtered_training_data['case_num'], tf.int32
            ))
))

In [60]:
x_train_data_set, x_train_labels = next(iter(x_train_data_set.batch(5000)))
x_valid_data_set, x_valid_labels = next(iter(valid_data_set.batch(2000)))

In [38]:
x_train_data_set_2, x_train_labels_2 = next(iter(x_train_data_set_2.batch(5000)))

In [39]:
x_train_data_set[:4]

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'CC: chest palpitations\r\nHPI: 26 yo females presenting with episodic chest palpitations for the past 3 weeks. States that multiple times throughout the day, with no identifiable trigger, she will feel chest palpatations with associated SOB, nausea, throat swelling and feeling "something bad is going to happen." The episodes last 15-30min then she feels back normal again with no intervention. Episodes have been happening for 5 years but since 3 weeks ago got more frequent. She presented to the ED 2 weeks ago where they did an ECG, troponins, CBC and metabolic panel which were wnl. Denies weight loss, changes to skin or hair.\r\nROS: negative except as above, PMHx: healthy\r\nMeds: none  Allegeries: NKDA\r\nFHX noncontributory,  SurgHx: none\r\nSocial: no ETOH, no tobacco use, no illicit drug use, lives alone in an apartment in midtown, currently unemployed, sexually active with monogamus boyfriend with consistent condom use',
      

In [40]:
x_train_labels[:4]

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([5, 5, 9, 8], dtype=int32)>

In [41]:
x_train_data_set_2[:4]

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b"['dad with recent heart attcak']",
       b'[\'mom with "thyroid disease\']', b"['chest pressure']",
       b"['intermittent episodes', 'episode']"], dtype=object)>

### Model Parameters

In [42]:
bio_bert_input = bio_bert_tokenizer(x_train_set[0], 
              max_length=10,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

In [43]:
bio_bert_output = bio_bert_model(bio_bert_input)
bio_bert_output

(<tf.Tensor: shape=(1, 10, 768), dtype=float32, numpy=
 array([[[-0.26176637,  0.28847873,  0.48205814, ..., -0.05416885,
           0.9281506 ,  0.3872112 ],
         [-0.5331787 ,  0.63052183,  0.47746453, ...,  0.42377853,
           0.45871764,  0.5198115 ],
         [-0.17617214,  0.44873574,  0.43347174, ..., -0.05542408,
           0.7582871 , -0.00843454],
         ...,
         [-0.01996813, -0.00797248,  0.5948601 , ...,  0.8508129 ,
           0.81240296,  0.46004915],
         [-0.00454436,  0.02857213,  0.34732428, ...,  0.17573778,
           0.97152716,  0.01631852],
         [-0.26176617,  0.28847826,  0.48205784, ..., -0.05416885,
           0.9281506 ,  0.3872112 ]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[ 1.39911011e-01,  3.25473547e-01,  7.45896280e-01,
         -9.90046680e-01,  9.60329711e-01, -4.37748758e-03,
         -2.55200148e-01,  5.43704808e-01, -3.68514396e-02,
          5.00914574e-01,  5.16136050e-01,  8.90100360e-0

In [62]:
num_train_examples = 5000
num_test_examples = 2000

MAX_SEQUENCE_LENGTH = 100
train_examples = [x.decode('utf-8') for x in x_train_data_set.numpy()]
valid_examples = [x.decode('utf-8') for x in x_valid_data_set.numpy()]

x_train = bio_bert_tokenizer(train_examples[:num_train_examples],
                             max_length = MAX_SEQUENCE_LENGTH,
                             truncation=True,
                             padding='max_length',
                             return_tensors='tf'
                            )

valid_train = bio_bert_tokenizer(valid_examples[:num_train_examples],
                                max_length = MAX_SEQUENCE_LENGTH,
                                truncation = True,
                                padding='max_length',
                                return_tensors='tf')

y_train = x_train_labels[:num_train_examples]
valid_train_labels = x_valid_labels[:num_train_examples]

In [73]:
### Create a label function that will compare the annotations and patient notes using dice and cosine similarities to start
### Then try implementing the Coreference resolution

In [69]:
def create_bert_classifier_model(bert_model, 
                              num_training_layers = 0, 
                              dimension_size = 200, 
                              dropout=0.3, 
                              learning_rate = 0.01):
    
#     bert_model.trainable = True
    input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='token_type_id_layer')
    attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='attention_mask_layer')
    
    bio_bert_input = {'input_ids': input_ids,
                     'token_type_ids': token_type_ids,
                     'attention_mask': attention_mask}
    
    bio_bert_output = bert_model(bio_bert_input)
    
    pooler_token = bio_bert_output[1]
    
    hidden_layer = tf.keras.layers.Dense(dimension_size, activation= "ReLU", name = 'hidden_layer')(pooler_token)
    
    hidden_layer = tf.keras.layers.Dropout(dropout)(hidden_layer)
    
    classification = tf.keras.layers.Dense(10, activation = "softmax", name='classification_layer')(hidden_layer)
    
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    
    classification_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                                 loss = "sparse_categorical_crossentropy",
                                                 metrics = ['Accuracy'])
    return classification_model
    

In [70]:
baseline_bio_bert_model = create_bert_classifier_model(bio_bert_model)

In [71]:
baseline_bio_bert_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer (InputLay  [(None, 100)]       0           []                               
 er)                                                                                              
                                                                                                  
 input_ids_layer (InputLayer)   [(None, 100)]        0           []                               
                                                                                                  
 token_type_id_layer (InputLaye  [(None, 100)]       0           []                               
 r)                                                                                               
                                                                                            

In [72]:
baseline_bio_bert_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask], y_train, 
                            validation_data = ([valid_train.input_ids, valid_train.token_type_ids, valid_train.attention_mask], valid_train_labels),
                            batch_size=32, epochs=2)


Epoch 1/2

KeyboardInterrupt: 