# Imports

In [2]:
!pip install transformers --quiet
!pip install gensim==3.8.3
!pip install -q sentencepiece
!pip install tensorflow_datasets
!pip install -q tf-models-official

import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
from pprint import pprint

import os
import nltk
from nltk.data import find
import pandas as pd

import matplotlib.pyplot as plt

import re

from transformers import BertTokenizer, TFBertModel, AutoModel, TFAutoModel, AutoTokenizer
import textwrap

from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
from transformers import GPT2Tokenizer



In [31]:
import ast

In [3]:
data_path = "nbme-score-clinical-patient-notes/"
print("Project Path:", data_path)

Project Path: nbme-score-clinical-patient-notes/


In [4]:
features = pd.read_csv(data_path + 'features.csv')
patient_notes = pd.read_csv(data_path + 'patient_notes.csv')
sample_submission = pd.read_csv(data_path + 'sample_submission.csv')
training_data = pd.read_csv(data_path + 'train.csv')
test_data = pd.read_csv(data_path + 'train.csv')

In [5]:
text_notes = patient_notes["pn_history"]
text_notes

0        17-year-old male, has come to the student heal...
1        17 yo male with recurrent palpitations for the...
2        Dillon Cleveland is a 17 y.o. male patient wit...
3        a 17 yo m c/o palpitation started 3 mos ago; \...
4        17yo male with no pmh here for evaluation of p...
                               ...                        
42141    Ms. Madden is a 20 yo female presenting w/ the...
42142    A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143    Ms. Madden is a 20yo female who presents with ...
42144    Stephanie madden is a 20 year old woman compla...
42145    patient is a 20 yo F who presents with a heada...
Name: pn_history, Length: 42146, dtype: object

In [6]:
patient_notes_w_training = pd.merge(training_data, patient_notes, how="left", on="pn_num")
patient_notes_w_training = patient_notes_w_training.drop(["case_num_y"], axis=1)
patient_notes_w_training = patient_notes_w_training[patient_notes_w_training.annotation.apply(lambda w: len(w)) > 2]
patient_notes_w_training = pd.merge(patient_notes_w_training, features, how="inner", on="feature_num")
patient_notes_w_training

Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,pn_history,case_num,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,00046_000,0,46,0,['father: heart attack'],['824 844'],Mr. Cleveland is a 17yo M who was consented by...,0,Family-history-of-MI-OR-Family-history-of-myoc...
2,00082_000,0,82,0,['Father MI'],['622 631'],17 yo M w/ no cardiac or arrhythmia PMH presen...,0,Family-history-of-MI-OR-Family-history-of-myoc...
3,00100_000,0,100,0,['Dad-MI'],['735 741'],HPI: Dillon Cleveland is an otherwise healthy ...,0,Family-history-of-MI-OR-Family-history-of-myoc...
4,00161_000,0,161,0,['father had acute MI'],['601 620'],"17 y/o M , Dillon Cleveland comes with c/o of ...",0,Family-history-of-MI-OR-Family-history-of-myoc...
...,...,...,...,...,...,...,...,...,...
9896,95128_905,9,95128,905,['neck pain'],['218 227'],20 year odl female c/o headaches x few hrs. He...,9,Neck-pain
9897,95145_905,9,95145,905,['neck pain'],['158 167'],Pt is 20 yo F w headache since yesterday morni...,9,Neck-pain
9898,95333_905,9,95333,905,['Neck stiffness'],['338 352'],Stephanie madden is a 20 year old woman compla...,9,Neck-pain
9899,92203_911,9,92203,911,['unsure of meningitis shot'],['512 521;544 559'],Stephanie Madden is a 20 year old female who p...,9,Meningococcal-vaccine-status-unknown


In [7]:
train_data = (tf.data.Dataset.from_tensor_slices(
    (
    tf.cast(text_notes, tf.string),
    tf.cast(patient_notes["case_num"], tf.int32
            ))
))
display_data = next(iter(train_data.batch(100)))

In [8]:
display_data[:5]

(<tf.Tensor: shape=(100,), dtype=string, numpy=
 array([b"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\r\n-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\r\n-associated with dispnea on exersion and rest,stressed out about school\r\n-reports fe feels like his heart is jumping out of his chest\r\n-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\r\n-pmh:non,meds :aderol (from a friend),nkda\r\n-fh:father had MI recently,mother has thyroid dz\r\n-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\r\n-sh:no std",
        b'17 yo male with recurrent palpitations for the past 3 mo lasting about 3 - 4 min, it happened about 5 - 6 times since the beginning. One time durign a baskeball game two days ago light headedness, pressure in the chest

In [35]:
for i in range(len(patient_notes_w_training['annotation'])):
    patient_notes_w_training['annotation'][i] = ast.literal_eval(patient_notes_w_training['annotation'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_notes_w_training['annotation'][i] = ast.literal_eval(patient_notes_w_training['annotation'][i])


In [38]:
 patient_notes_w_training['annotation'][2]

['Father MI']

## Initialize BioMed Roberta

In [15]:
import json
from transformers import AutoModel
import sklearn as sk

In [11]:
biomed_roberta_tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
biomed_roberta_model = AutoModel.from_pretrained("allenai/biomed_roberta_base")

Downloading:   0%|          | 0.00/656M [00:00<?, ?B/s]

In [18]:
biomed_roberta_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0

In [47]:
x_train_set, y_train_set, x_valid_set, y_valid_set = sk.model_selection.train_test_split(patient_notes_w_training['pn_history'], patient_notes_w_training['annotation'], test_size=0.25, random_state=44)
x_train_set

5173    CC: chest palpitations\r\nHPI: 26 yo females p...
5677    26 YO F COMING FOR A FOLLOW UP due to cardiac ...
9207    20 Y OLD F C/O HEADACHE X 2 DAYS \r\n- STARTED...
7790    Loraine Wicks is a 67 yo F with a history of H...
617     HPI: Mr. Cleveland is a 17 yo m that presents ...
                              ...                        
2144    Mrs. Montgomery is a 44yo female presenting wi...
3971    CC: Stomach problems\r\n35yo M who presents wi...
571     17 yo CC palpitation \r\n-Started 3 months ago...
9389    Ms. Madden is a previously healthy 20 y/o fema...
3491    HPI 35 yo M complains of epigastric pain for 2...
Name: pn_history, Length: 7425, dtype: object

In [52]:
x_valid_set
" ".join(str(a) for a in x_valid_set)

'[\'presented to the ED 2 weeks ago where they did an ECG, troponins, CBC and metabolic panel which were wnl\'] [\'26\'] [\'VOMITING\'] [\'appetite has increased\'] [\'or the last 2-3 months\', \'over the last 2 months\'] [\'female\'] [\'f\'] [\'F\'] [\'preceded with diarrhea\'] [\'feeling warm\'] [\'exercis induced asthma\'] [\'waking early\'] [\'20 Yo\'] [\'Last sexual encounter was 9 months ago\'] [\'diarrhoe\'] [\'for 3 weeks\'] [\'decreased energy\'] [\'Tums does not seem to relieve it now\'] [\'F\'] [\'26yo\'] [\'similar pain in the past\'] [\'loosing weight\'] [\'Thyroid disease in mother\'] [\'dark stools\'] [\'for the last 3 years\'] [\'mortrin\'] [\'Chest Pain\', \'chest pain\', \'chest pain\', \'chest pain\'] [\'weight gain\'] [\'sharp\'] [\'night sweats\'] [\'17 yo\'] [\'M - Migraines\', \'FH Migraines\'] [\'LMP was 2 months ago\'] [\'female\'] [\'3-4 months\'] [\'more frequent the past 3 weeks\'] [\'20 yo\'] [\'vaginal dryness\'] [\'burning\'] [\'increased appetite\'] [\'f

In [53]:
# x_valid_set = np.array([np.array(train) for train in x_valid_set])
x_valid_set.dtype

dtype('O')

In [54]:
x_train_data_set = (tf.data.Dataset.from_tensor_slices(
    (
    tf.cast(x_train_set, tf.string),
    tf.cast(x_valid_set, tf.string
            ))
))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [23]:
x_train_data_set, x_train_labels = next(iter(x_train_data_set.batch(5000)))

In [26]:
x_train_data_set[:5]

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'CC: chest palpitations\r\nHPI: 26 yo females presenting with episodic chest palpitations for the past 3 weeks. States that multiple times throughout the day, with no identifiable trigger, she will feel chest palpatations with associated SOB, nausea, throat swelling and feeling "something bad is going to happen." The episodes last 15-30min then she feels back normal again with no intervention. Episodes have been happening for 5 years but since 3 weeks ago got more frequent. She presented to the ED 2 weeks ago where they did an ECG, troponins, CBC and metabolic panel which were wnl. Denies weight loss, changes to skin or hair.\r\nROS: negative except as above, PMHx: healthy\r\nMeds: none  Allegeries: NKDA\r\nFHX noncontributory,  SurgHx: none\r\nSocial: no ETOH, no tobacco use, no illicit drug use, lives alone in an apartment in midtown, currently unemployed, sexually active with monogamus boyfriend with consistent condom use',
      

In [27]:
x_train_labels[:5]

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b"['presented to the ED 2 weeks ago where they did an ECG, troponins, CBC and metabolic panel which were wnl']",
       b"['26']", b"['VOMITING']", b"['appetite has increased']",
       b"['or the last 2-3 months', 'over the last 2 months']"],
      dtype=object)>