![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Clinical Named Entity Recognition (NER)

In [3]:
%sh
# Confirm the license variable is present WITHOUT printing its value:
# cell output is stored with the notebook, so echoing the secret would leak it.
if [ -n "$SPARK_NLP_LICENSE" ]; then echo "SPARK_NLP_LICENSE is set"; else echo "SPARK_NLP_LICENSE is NOT set"; fi

In [4]:
spark

In [5]:
import os
import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession


from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl


In [6]:
import sparknlp

sparknlp.version()

# Clinical NER Pipeline (with pretrained models)

In [8]:
# Stage 1 - turn the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2 - split each document into sentences.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3 - split each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4 - pretrained clinical word embeddings (trained on PubMED).
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5 - pretrained NER tagger (trained on i2b2, sampled from MIMIC).
clinical_ner = (
    NerDLModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 6 - convert the token-level tags in "ner" into "ner_chunk" annotations.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
    ]
)

# Fit on a one-row placeholder frame (empty text) to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)


In [9]:
%sh
# -N only downloads when the remote file is newer than the local copy, so re-running
# this cell does not pile up "pubmed_sample_text_small.csv.1", ".2", ... duplicates.
# -nv keeps the cell output short while still reporting errors.
wget -nv -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed_sample_text_small.csv

In [10]:
%fs ls "file:/databricks/driver"

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/derby.log,derby.log,726
file:/databricks/driver/news_category_test.csv,news_category_test.csv,1504408
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/pubmed_sample_text_small.csv,pubmed_sample_text_small.csv,9363435
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/ganglia/,ganglia/,4096
file:/databricks/driver/news_category_train.csv,news_category_train.csv,24032125


In [11]:
dbutils.fs.cp("file:/databricks/driver/pubmed_sample_text_small.csv", "dbfs:/")

In [12]:
%fs ls "dbfs:/"

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/andres/,andres/,0
dbfs:/cluster-logs/,cluster-logs/,0
dbfs:/context-spell-checker/,context-spell-checker/,0
dbfs:/databricks/,databricks/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/datasets/,datasets/,0
dbfs:/ensemble_serialized/,ensemble_serialized/,0
dbfs:/local_disk0/,local_disk0/,0


In [13]:
import pyspark.sql.functions as F

# Read the sample CSV (first row is the header) and spread it over 4 partitions.
pubMedDF = (
    spark.read
    .option("header", "true")
    .csv("pubmed_sample_text_small.csv")
    .repartition(4)
)

pubMedDF.show(truncate=50)

In [14]:
pubMedDF.printSchema()

In [15]:
result = model.transform(pubMedDF.limit(100))

In [16]:
result.show()

In [17]:
# Zip the token and tag arrays position by position, then emit one row per (token, tag) pair.
token_tag_pairs = F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")

result_df = (
    result.select(token_tag_pairs)
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
)

result_df.show(50, truncate=100)


In [18]:
# Tag frequency, most common first.
(
    result_df
    .select("token", "ner_label")
    .groupBy('ner_label')
    .count()
    .orderBy('count', ascending=False)
    .show(truncate=False)
)

In [19]:
# One row per detected chunk: zip the chunk text with its metadata map and
# read the entity label out of that map.
chunk_with_meta = F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")

(
    result.select(chunk_with_meta)
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

In [20]:
# fullAnnotate in LightPipeline

text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . 
The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . She had close follow-up with endocrinology post discharge .
'''

print (text)

light_model = LightPipeline(model)

light_result = light_model.fullAnnotate(text)


# Chunk annotations of the first (and only) annotated text.
clinical_chunks = light_result[0]['ner_chunk']

# One list per output column, all in chunk order.
chunks = [annotation.result for annotation in clinical_chunks]
entities = [annotation.metadata['entity'] for annotation in clinical_chunks]
sentence = [annotation.metadata['sentence'] for annotation in clinical_chunks]
begin = [annotation.begin for annotation in clinical_chunks]
end = [annotation.end for annotation in clinical_chunks]

import pandas as pd

df = pd.DataFrame({
    'chunks': chunks,
    'begin': begin,
    'end': end,
    'sentence_id': sentence,
    'entities': entities,
})

df.head(20)

Unnamed: 0,chunks,begin,end,sentence_id,entities
0,gestational diabetes mellitus,40,68,0,PROBLEM
1,type two diabetes mellitus,129,154,0,PROBLEM
2,"T2DM ),",158,164,0,PROBLEM
3,HTG-induced pancreatitis,187,210,0,PROBLEM
4,an acute hepatitis,264,281,0,PROBLEM
5,obesity,289,295,0,PROBLEM
6,a body mass index,302,318,0,PROBLEM
7,BMI ) of 33.5 kg/m2,322,340,0,PROBLEM
8,polyuria,381,388,0,PROBLEM
9,polydipsia,392,401,0,PROBLEM


## Posology NER

In [22]:
# Pretrained "ner_posology" tagger; same input/output columns as the clinical tagger,
# so it can be swapped into the same pipeline.
posology_ner = (
    NerDLModel.pretrained("ner_posology", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

In [23]:
# Same stages as the clinical pipeline, with the posology tagger in place of clinical_ner.
# Note: this rebinds the notebook-level names `nlpPipeline` and `empty_data`.
posology_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    posology_ner,
    ner_converter,
]
nlpPipeline = Pipeline(stages=posology_stages)

# Fit on a one-row placeholder frame (empty text) to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
posology_model = nlpPipeline.fit(empty_data)


In [24]:
posology_result = posology_model.transform(pubMedDF.limit(100))

In [25]:
posology_result.show(10)

In [26]:
posology_result.printSchema()

In [27]:
from pyspark.sql.functions import monotonically_increasing_id

# This will return a new DF with all the columns + id
posology_result = posology_result.withColumn("id", monotonically_increasing_id())

posology_result.show(3)

In [28]:
posology_result.select('token.result','ner.result').take(2)

In [29]:
from pyspark.sql import functions as F

# Pair each token with its posology tag and keep only tokens tagged as part of an entity.
posology_token_tags = F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")

posology_result_df = (
    posology_result.select(posology_token_tags)
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
    .filter("ner_label!='O'")
)

posology_result_df.show(20, truncate=100)


In [30]:
# Per-row chunk table: zip chunk text, offsets and metadata, then unpack the zipped
# struct by position (0=text, 1=begin, 2=end, 3=metadata map).
posology_chunk_cols = F.explode(
    F.arrays_zip('ner_chunk.result', 'ner_chunk.begin', 'ner_chunk.end', 'ner_chunk.metadata')
).alias("cols")

(
    posology_result.select('id', posology_chunk_cols)
    .select(
        'id',
        F.expr("cols['3']['sentence']").alias("sentence_id"),
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"),
    )
    .filter("ner_label!='O'")
    .show(truncate=False)
)

In [31]:
posology_result.select('ner_chunk').take(2)[1][0][0].result

In [32]:
posology_result.select('ner_chunk').take(2)[1][0][0].metadata

### With LightPipelines

In [34]:
light_model = LightPipeline(posology_model)

text ='The patient was prescribed 1 capsule of Advil for 5 days . He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'

light_result = light_model.annotate(text)

list(zip(light_result['token'], light_result['ner']))

In [35]:
light_result = light_model.fullAnnotate(text)

# Chunk annotations of the first (and only) annotated text.
posology_chunks = light_result[0]['ner_chunk']

# One list per output column, all in chunk order.
chunks = [annotation.result for annotation in posology_chunks]
entities = [annotation.metadata['entity'] for annotation in posology_chunks]
begin = [annotation.begin for annotation in posology_chunks]
end = [annotation.end for annotation in posology_chunks]

import pandas as pd

df = pd.DataFrame({
    'chunks': chunks,
    'entities': entities,
    'begin': begin,
    'end': end,
})

df

Unnamed: 0,chunks,entities,begin,end
0,1,DOSAGE,27,27
1,capsule,FORM,29,35
2,Advil,DRUG,40,44
3,for 5 days,DURATION,46,55
4,40 units,DOSAGE,126,133
5,insulin glargine,DRUG,138,153
6,at night,FREQUENCY,155,162
7,12 units,DOSAGE,166,173
8,insulin lispro,DRUG,178,191
9,with meals,FREQUENCY,193,202


## Writing a generic NER function

In [37]:
def get_base_pipeline (embeddings = 'embeddings_clinical'):
  """Build and fit the shared preprocessing pipeline.

  The pipeline contains exactly four stages, in order: DocumentAssembler
  ("text" -> "document"), SentenceDetector ("document" -> "sentence"),
  Tokenizer ("sentence" -> "token") and a pretrained WordEmbeddingsModel
  ("sentence", "token" -> "embeddings").

  Args:
    embeddings: name of the pretrained word-embeddings model to load from
      "clinical/models" (language "en").

  Returns:
    The fitted pipeline model, ready to be used on its own or as the first
    stage of a larger Pipeline.
  """

  documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

  # Sentence Detector annotator: splits each document into sentences

  sentenceDetector = SentenceDetector()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")

  # Tokenizer: splits each sentence into tokens

  tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

  # Word embeddings selected by the `embeddings` argument
  word_embeddings = WordEmbeddingsModel.pretrained(embeddings, "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")

  # Assemble the pipeline from the stages created above. The earlier code fitted the
  # notebook-global `nlpPipeline` here instead, so the local stages and the
  # `embeddings` argument were silently ignored and the result depended on
  # whichever pipeline an earlier cell had last assigned to that global.
  base_pipeline = Pipeline(stages=[
      documentAssembler,
      sentenceDetector,
      tokenizer,
      word_embeddings])

  # Fit on a one-row placeholder frame (empty text) to obtain the PipelineModel.
  empty_data = spark.createDataFrame([[""]]).toDF("text")

  model = base_pipeline.fit(empty_data)

  return model



def get_clinical_entities (embeddings, spark_df, nrows = 100, model_name = 'ner_clinical'):
  """Run a pretrained clinical NER model over the first rows of a Spark DataFrame.

  Args:
    embeddings: embeddings model name, forwarded to ``get_base_pipeline``.
    spark_df: Spark DataFrame to annotate; the pipeline reads its "text" column.
    nrows: number of rows taken from ``spark_df`` (via ``limit``) before annotating.
    model_name: name of the pretrained NerDLModel to load from "clinical/models".

  Returns:
    A Spark DataFrame with one row per detected chunk and the columns
    id, sentence_id, chunk, begin, end, ner_label.
  """

  # Pretrained tagger selected by `model_name`.
  tagger = (
      NerDLModel.pretrained(model_name, "en", "clinical/models")
      .setInputCols(["sentence", "token", "embeddings"])
      .setOutputCol("ner")
  )

  # Converts the token-level tags in "ner" into "ner_chunk" annotations.
  chunker = (
      NerConverter()
      .setInputCols(["sentence", "token", "ner"])
      .setOutputCol("ner_chunk")
  )

  # The fitted preprocessing pipeline is used as a single stage of the full pipeline.
  preprocessing = get_base_pipeline (embeddings)
  full_pipeline = Pipeline(stages=[preprocessing, tagger, chunker])

  placeholder = spark.createDataFrame([[""]]).toDF("text")
  fitted = full_pipeline.fit(placeholder)

  # Annotate the first `nrows` rows and add an "id" column for each input row.
  annotated = fitted.transform(spark_df.limit(nrows))
  annotated = annotated.withColumn("id", monotonically_increasing_id())

  # Zipped struct fields by position: 0=text, 1=begin, 2=end, 3=metadata map.
  zipped_chunks = F.explode(
      F.arrays_zip('ner_chunk.result', 'ner_chunk.begin', 'ner_chunk.end', 'ner_chunk.metadata')
  ).alias("cols")

  return (
      annotated.select('id', zipped_chunks)
      .select(
          'id',
          F.expr("cols['3']['sentence']").alias("sentence_id"),
          F.expr("cols['0']").alias("chunk"),
          F.expr("cols['1']").alias("begin"),
          F.expr("cols['2']").alias("end"),
          F.expr("cols['3']['entity']").alias("ner_label"),
      )
      .filter("ner_label!='O'")
  )

In [38]:
embeddings = 'embeddings_clinical'

model_name = 'ner_clinical'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

ner_df.show()

In [39]:
embeddings = 'embeddings_clinical'

model_name = 'ner_posology'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

ner_df.show()

In [40]:
import pandas as pd

def get_clinical_entities_light (light_model, text):
  """Annotate ``text`` and tabulate the detected NER chunks.

  Args:
    light_model: object exposing ``fullAnnotate(text)``; only the first element of
      the returned sequence is used, and its 'ner_chunk' entry is read.
    text: the text to annotate.

  Returns:
    A pandas DataFrame with the columns 'chunks' (chunk text) and 'entities'
    (the 'entity' value of each chunk's metadata), one row per chunk.
  """
  ner_chunks = light_model.fullAnnotate(text)[0]['ner_chunk']

  # (chunk text, entity label) pairs in detection order.
  pairs = [(annotation.result, annotation.metadata['entity']) for annotation in ner_chunks]

  return pd.DataFrame({
      'chunks': [chunk_text for chunk_text, _ in pairs],
      'entities': [label for _, label in pairs],
  })

In [41]:
text ='The patient was prescribed 1 capsle of Parol with meals . He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'

light_model = LightPipeline(posology_model)

get_clinical_entities_light (light_model, text)


Unnamed: 0,chunks,entities
0,1,DOSAGE
1,capsle,FORM
2,Parol,DRUG
3,with meals,FREQUENCY
4,40 units,DOSAGE
5,insulin glargine,DRUG
6,at night,FREQUENCY
7,12 units,DOSAGE
8,insulin lispro,DRUG
9,with meals,FREQUENCY


## PHI NER

In [43]:
embeddings = 'embeddings_clinical'

model_name = 'deidentify_dl'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

pd_ner_df = ner_df.toPandas()


In [44]:
pd_ner_df.sample(20)

Unnamed: 0,id,sentence_id,chunk,begin,end,ner_label
6,24,0,1977,5,8,DATE
26,39,2,3112,292,295,DATE
24,38,1,May 2007,359,366,DATE
51,82,6,Plant,787,791,PATIENT
3,8,5,Form Board,1003,1012,HOSPITAL
48,79,6,26,768,769,AGE
59,98,1,Daegu City,277,286,HOSPITAL
50,79,7,26,1139,1140,AGE
14,26,6,winter,970,975,DATE
1,1,3,2011,327,330,DATE


In [45]:
pd_ner_df.ner_label.value_counts()

## BioNLP (Cancer Genetics) NER

In [47]:
embeddings = 'embeddings_clinical'

model_name = 'ner_bionlp'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

ner_df.show(truncate = False)

## Training your own NER Model

In [49]:
%sh
# -N only downloads when the remote file is newer than the local copy, so re-running
# this cell does not pile up "gnm_ner_dataset.csv.1", ".2", ... duplicates.
# -nv keeps the cell output short while still reporting errors.
wget -nv -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/ner/groningen_name_bank_conll/gnm_ner_dataset.csv

In [50]:
dbutils.fs.cp("file:/databricks/driver/gnm_ner_dataset.csv", "dbfs:/")

In [51]:
import pandas as pd

# Malformed rows are skipped (with a warning) instead of aborting the read.
# The keyword for this changed across pandas releases: `on_bad_lines` (pandas >= 1.3)
# replaces `error_bad_lines`, which was removed in pandas 2.0. Try the current spelling
# first and fall back to the old one so the cell runs on either side of the change.
try:
    df = pd.read_csv('gnm_ner_dataset.csv', encoding = "ISO-8859-1", on_bad_lines='warn')
except TypeError:
    df = pd.read_csv('gnm_ner_dataset.csv', encoding = "ISO-8859-1", error_bad_lines=False)

In [52]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [53]:
df.Tag.value_counts()

In [54]:
df[df.Tag=='B-nat']

Unnamed: 0,Sentence #,Word,POS,Tag
2723,,H5N1,NNP,B-nat
4554,,H5N1,NNP,B-nat
5044,,Jing,NNP,B-nat
5073,,Jing,NNP,B-nat
5606,,H5N1,NNP,B-nat
12506,,SARS,NNP,B-nat
12508,,Severe,NNP,B-nat
13162,,HIV,NNP,B-nat
13164,,AIDS,NNP,B-nat
22260,,AIDS,NNP,B-nat


In [55]:
df['Sentence #'].value_counts()

In [56]:
# Only the first token row of each sentence carries "Sentence: N"; the other rows are NaN.
# Forward-fill the marker down the sentence, then keep the second whitespace-separated
# piece (the number). Series.ffill() is the direct equivalent of fillna(method='ffill'),
# whose `method` argument is deprecated in recent pandas releases.
df['sentence']=df['Sentence #'].ffill().str.split(expand=True)[1]

In [57]:
df['sentence'] = df['sentence'].astype(int)

In [58]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence
0,Sentence: 1,Thousands,NNS,O,1
1,,of,IN,O,1
2,,demonstrators,NNS,O,1
3,,have,VBP,O,1
4,,marched,VBN,O,1


In [59]:
df['Tag']=df['Tag'].str.upper()

In [60]:
df['Tag'].value_counts()

In [61]:
# Abbreviated tag suffix -> descriptive name; each entry is expanded below for both
# the "B-" and the "I-" prefix.
_TAG_SUFFIXES = [
    ('TIM', 'TIME_INDICATOR'),
    ('GEO', 'GEOGRAPHICAL_ENTITY'),
    ('ORG', 'ORGANIZATION'),
    ('PER', 'PERSON'),
    ('GPE', 'GEOPOLITICAL_ENTITY'),
    ('ART', 'ARTIFACT'),
    ('EVE', 'EVENT'),
    ('NAT', 'NATURAL_PHENOMENON'),
]

conversion_dict = {
    '{}-{}'.format(prefix, short): '{}-{}'.format(prefix, full)
    for short, full in _TAG_SUFFIXES
    for prefix in ('B', 'I')
}

# Tags without an entry in the mapping (e.g. 'O') are left as they are.
df['Tag'] = df['Tag'].replace(conversion_dict)

In [62]:
df['Tag'].value_counts()

In [63]:
df.shape

In [64]:
# The output starts with the document marker line followed by a blank line.
conll_lines=["-DOCSTART- -X- -X- O\n\n"]

# Sentence number of the most recently written row; a change marks a sentence boundary.
s=1

# Every formatted token line, without the sentence separators.
# Not used further in this cell; kept so the name stays available to later code.
labels=[]

# Only the first 100000 token rows are converted.
for i,row in df[:100000].iterrows():

    # One token per line: word, POS, POS, tag. POS is written twice
    # (presumably to fill the chunk column of the 4-column CoNLL layout - confirm).
    lines = '{} {} {} {}\n'.format(row['Word'], row['POS'], row['POS'], row['Tag'])

    labels.append(lines)

    if row['sentence']!= s:

        # A blank line separates sentences. Track the row's actual sentence number rather
        # than incrementing by one: if the numbers are not consecutive (e.g. rows were
        # dropped while reading the CSV), an incremented counter lags behind and would
        # insert spurious blank lines in the middle of the following sentence.
        conll_lines.append("\n")
        s = row['sentence']

    conll_lines.append(lines)

    # Progress indicator every 1000 rows.
    if i%1000==0:
        print (i)


conll_lines.append("\n")


In [65]:
for i in conll_lines[:30]:
  print (i, end='')

In [66]:
# Use an explicit encoding so the file content does not depend on the driver's locale
# default: the source CSV was decoded as ISO-8859-1 and may contain non-ASCII characters.
# NOTE(review): UTF-8 is presumably what the CoNLL reader used later expects - confirm.
with open('GMB_NER.conll', 'w', encoding='utf-8') as f:
    f.writelines(conll_lines)

In [67]:
%fs ls "file:/databricks/driver"

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/derby.log,derby.log,726
file:/databricks/driver/news_category_test.csv,news_category_test.csv,1504408
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/gnm_ner_dataset.csv,gnm_ner_dataset.csv,15208151
file:/databricks/driver/pubmed_sample_text_small.csv,pubmed_sample_text_small.csv,9363435
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/ganglia/,ganglia/,4096
file:/databricks/driver/news_category_train.csv,news_category_train.csv,24032125
file:/databricks/driver/GMB_NER.conll,GMB_NER.conll,1663772


In [68]:
%fs ls "dbfs:/"

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/GMB_NER.conll,GMB_NER.conll,1663772
dbfs:/andres/,andres/,0
dbfs:/cluster-logs/,cluster-logs/,0
dbfs:/context-spell-checker/,context-spell-checker/,0
dbfs:/databricks/,databricks/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/datasets/,datasets/,0
dbfs:/ensemble_serialized/,ensemble_serialized/,0


In [69]:
dbutils.fs.cp("file:/databricks/driver/GMB_NER.conll", "dbfs:/")

In [70]:
from sparknlp.training import CoNLL

conll_data = CoNLL().readDataset(spark, 'dbfs:/GMB_NER.conll')

conll_data.show(3)

In [71]:
conll_data.count()

In [72]:
from pyspark.sql import functions as F

# Ground-truth label frequency over all tokens of the CoNLL data, most common first.
ground_truth_counts = (
    conll_data
    .select(F.explode(F.arrays_zip('token.result','label.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy('ground_truth')
    .count()
    .orderBy('count', ascending=False)
)

ground_truth_counts.show(100,truncate=False)


In [73]:
# 80/20 random split with a fixed seed (100).
training_data, test_data = conll_data.randomSplit([0.8, 0.2], seed = 100)

print("Training Dataset Count: {}".format(training_data.count()))
print("Test Dataset Count: {}".format(test_data.count()))

In [74]:
# Pretrained GloVe embeddings; reads "sentence" and "token", writes "embeddings".
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en", "public/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Add the embeddings column to the held-out split before saving it.
test_data = glove_embeddings.transform(test_data)

# Replace any copy left by a previous run: the default save mode raises when the
# target path already exists, which made this cell fail on every re-run.
test_data.write.mode('overwrite').parquet('gnm_test.parquet')

In [75]:
test_data.show(3)

In [76]:

# Trainer configuration. It reads "sentence", "token" and "embeddings", learns from the
# "label" column and writes predictions to "ner".
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setLr(0.001)
    .setPo(0.005)
    .setBatchSize(512)
    .setRandomSeed(0)
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(False)
    .setIncludeConfidence(True)
    .setTestDataset("gnm_test.parquet")
    # Optional, left disabled: .setGraphFolder('graph/')
)

# Embeddings are computed first, then fed to the trainer.
ner_pipeline = Pipeline(stages=[glove_embeddings, nerTagger])

In [77]:
%time

ner_model = ner_pipeline.fit(training_data)

# if you get an error for incompatible TF graph, use 4.1 NerDL-Graph.ipynb notebook to create a graph

In [78]:
! cd ~/annotator_logs && ls -lt


In [79]:
!cat ~/annotator_logs/NerDLApproach_21d345fae919.log


In [80]:
ner_model.stages

In [81]:
type(ner_model.stages[1])

In [82]:
# Predict on the held-out split and list every token tagged as part of an entity,
# with its character offsets. Zipped struct fields by position: 0=token, 1=begin, 2=end, 3=tag.
predicted_tokens = F.explode(
    F.arrays_zip('token.result', 'token.begin', 'token.end', 'ner.result')
).alias("cols")

(
    ner_model.transform(test_data)
    .select(predicted_tokens)
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']").alias("ner_label"),
    )
    .filter("ner_label!='O'")
    .show(truncate=False)
)

In [83]:
ner_model.stages[1].write().overwrite().save('trained_ner_model')