![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Training and Reusing Clinical Named Entity Recognition Models

Please make sure that your cluster is set up properly according to https://nlp.johnsnowlabs.com/docs/en/licensed_install#install-spark-nlp-for-healthcare-on-databricks

## Blogposts and videos:

https://towardsdatascience.com/named-entity-recognition-ner-with-bert-in-spark-nlp-874df20d1d77

https://www.youtube.com/watch?v=YM-e4eOiQ34

https://medium.com/spark-nlp/named-entity-recognition-for-healthcare-with-sparknlp-nerdl-and-nercrf-a7751b6ad571

https://medium.com/atlas-research/ner-for-clinical-text-7c73caddd180

In [0]:
import os
import json
import string
import numpy as np
import pandas as pd


import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel

pd.set_option('max_colwidth', 100)
pd.set_option('display.max_columns', 100)  
pd.set_option('display.expand_frame_repr', False)


print('sparknlp.version : ',sparknlp.version())
print('sparknlp_jsl.version : ',sparknlp_jsl.version())

spark

# Clinical NER Pipeline (with pretrained models)

In [0]:
# Annotator that transforms a text column from dataframe into an Annotation ready for NLP
documentAssembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Sentence Detector annotator, processes various sentences per line
sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") \
  .setInputCols(["document"]) \
  .setOutputCol("sentence") 

#sentenceDetector = SentenceDetector()\
#   .setInputCols(["document"])\
#   .setOutputCol("sentence")

# Tokenizer splits words in a relevant format for NLP
tokenizer = Tokenizer()\
  .setInputCols(["sentence"])\
  .setOutputCol("token")

# Clinical word embeddings trained on PubMED dataset
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("embeddings")

# NER model trained on i2b2 (sampled from MIMIC) dataset
clinical_ner = MedicalNerModel.pretrained("ner_clinical_large", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_converter = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

In [0]:
#checking the stages in pipeline
model.stages

In [0]:
#getting the classes in pretrained model
clinical_ner.getClasses()

In [0]:
#extracting the embedded default param values
clinical_ner.extractParamMap()

In [0]:
#checking the embeddings
clinical_ner.getStorageRef()

In [0]:
#downloading the sample dataset
! wget -q https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed_sample_text_small.csv

In [0]:
%fs ls file:/databricks/driver 

path,name,size
file:/databricks/driver/conf/,conf/,4096
file:/databricks/driver/eventlogs/,eventlogs/,4096
file:/databricks/driver/metadata/,metadata/,4096
file:/databricks/driver/storage/,storage/,4096
file:/databricks/driver/sample-sentences-en.txt,sample-sentences-en.txt,284
file:/databricks/driver/logs/,logs/,4096
file:/databricks/driver/pubmed_sample_text_small.csv,pubmed_sample_text_small.csv,9363435
file:/databricks/driver/ganglia/,ganglia/,4096


In [0]:
import pyspark.sql.functions as F

# The sample CSV was downloaded into the driver's working directory
# (file:/databricks/driver). Copy it to the DBFS root so that the "dbfs:/"
# path read below actually exists.
dbutils.fs.cp("file:/databricks/driver/pubmed_sample_text_small.csv", "dbfs:/")

# Load the PubMed sample, using the first row of the file as column names.
pubMedDF = spark.read\
                .option("header", "true")\
                .csv("dbfs:/pubmed_sample_text_small.csv")

pubMedDF.show(truncate=80)

In [0]:
pubMedDF.printSchema()

In [0]:
result = model.transform(pubMedDF.limit(100))

In [0]:
result.show()

In [0]:
result.select('token.result','ner.result').show(truncate=80)

In [0]:
result_df = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result_df.show(50, truncate=100)

In [0]:
result_df.select("token", "ner_label").groupBy('ner_label').count().orderBy('count', ascending=False).show(truncate=False)

In [0]:
result.select('ner_chunk').take(1)

In [0]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

In [0]:
# fullAnnotate in LightPipeline

text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl , bicarbonate 18 mmol/l , anion gap 20 , creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , glycated hemoglobin ( HbA1c ) 10% , and venous pH 7.27 . Serum lipase was normal at 43 U/L . Serum acetone levels could not be assessed as blood samples kept hemolyzing due to significant lipemia . The patient was initially admitted for starvation ketosis , as she reported poor oral intake for three days prior to admission . However , serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL , the anion gap was still elevated at 21 , serum bicarbonate was 16 mmol/L , triglyceride level peaked at 2050 mg/dL , and lipase was 52 U/L . The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - the original sample was centrifuged and the chylomicron layer removed prior to analysis due to interference from turbidity caused by lipemia again . 
The patient was treated with an insulin drip for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL , within 24 hours . Her euDKA was thought to be precipitated by her respiratory tract infection in the setting of SGLT2 inhibitor use . The patient was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely . She had close follow-up with endocrinology post discharge .
'''

print (text)

# Annotate the raw string with a LightPipeline built from the fitted model.
light_model = LightPipeline(model)
light_result = light_model.fullAnnotate(text)

import pandas as pd

# One list per output column, all taken from the chunks of the first result.
ner_chunks = light_result[0]['ner_chunk']

chunks = [n.result for n in ner_chunks]
begin = [n.begin for n in ner_chunks]
end = [n.end for n in ner_chunks]
sentence = [n.metadata['sentence'] for n in ner_chunks]
entities = [n.metadata['entity'] for n in ner_chunks]

df = pd.DataFrame({
    'chunks': chunks,
    'begin': begin,
    'end': end,
    'sentence_id': sentence,
    'entities': entities,
})

df.head(20)

Unnamed: 0,chunks,begin,end,sentence_id,entities
0,gestational diabetes mellitus,40,68,0,PROBLEM
1,subsequent type two diabetes mellitus,118,154,0,PROBLEM
2,T2DM,158,161,0,PROBLEM
3,HTG-induced pancreatitis,187,210,0,PROBLEM
4,an acute hepatitis,264,281,0,PROBLEM
5,obesity,289,295,0,PROBLEM
6,a body mass index,302,318,0,PROBLEM
7,BMI,322,324,0,TEST
8,polyuria,381,388,0,PROBLEM
9,polydipsia,392,401,0,PROBLEM


In [0]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

ner_vis = visualiser.display(light_result[0], label_col='ner_chunk', document_col='document', return_html=True)

# Change color of an entity label

#visualiser.set_label_colors({'PROBLEM':'#008080', 'TEST':'#800080', 'TREATMENT':'#808080'})
#visualiser.display(light_result[0], label_col='ner_chunk')

# Set label filter

# visualiser.display(light_result, label_col='ner_chunk', document_col='document',
                   #labels=['PROBLEM','TEST'])
  
displayHTML(ner_vis)

## Posology NER

In [0]:
# NER model trained on i2b2 (sampled from MIMIC) dataset
posology_ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner")

ner_converter = NerConverter()\
    .setInputCols(["sentence","token","ner"])\
    .setOutputCol("ner_chunk")

posology_ner_greedy = MedicalNerModel.pretrained("ner_posology_greedy", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner_greedy")

ner_converter_greedy = NerConverter()\
    .setInputCols(["sentence","token","ner_greedy"])\
    .setOutputCol("ner_chunk_greedy")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    posology_ner,
    ner_converter,
    posology_ner_greedy,
    ner_converter_greedy])

empty_data = spark.createDataFrame([[""]]).toDF("text")
posology_model = nlpPipeline.fit(empty_data)

In [0]:
posology_ner.getClasses()

In [0]:
posology_result = posology_model.transform(pubMedDF.limit(100))

In [0]:
posology_result.show(10)

In [0]:
posology_result.printSchema()

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# This will return a new DF with all the columns + id
posology_result = posology_result.withColumn("id", monotonically_increasing_id())

posology_result.show(3)

In [0]:
posology_result.select('token.result','ner.result').show(truncate=100)

In [0]:
from pyspark.sql import functions as F

posology_result_df = posology_result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))\
        .filter("ner_label!='O'")

posology_result_df.show(20, truncate=100)

In [0]:
posology_greedy_result_df = posology_result.select(F.explode(F.arrays_zip('token.result', 'ner_greedy.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))\
        .filter("ner_label!='O'")

posology_greedy_result_df.show(20, truncate=100)

In [0]:
posology_result.select('id',F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.begin', 'ner_chunk.end', 'ner_chunk.metadata')).alias("cols")) \
.select('id', F.expr("cols['3']['sentence']").alias("sentence_id"),
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"))\
        .filter("ner_label!='O'")\
        .show(truncate=False)

In [0]:
posology_result.select('id',F.explode(F.arrays_zip('ner_chunk_greedy.result', 'ner_chunk_greedy.begin', 'ner_chunk_greedy.end', 'ner_chunk_greedy.metadata')).alias("cols")) \
.select('id', F.expr("cols['3']['sentence']").alias("sentence_id"),
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']").alias("begin"),
        F.expr("cols['2']").alias("end"),
        F.expr("cols['3']['entity']").alias("ner_label"))\
        .filter("ner_label!='O'")\
        .show(truncate=False)

In [0]:
posology_result.select('ner_chunk').take(2)[1][0][0].result

In [0]:
posology_result.select('ner_chunk').take(2)[1][0][0].metadata

### with LightPipelines

In [0]:
light_model = LightPipeline(posology_model)

text ='The patient was prescribed 1 capsule of Advil for 5 days . He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely for 3 months .'

light_result = light_model.annotate(text)

list(zip(light_result['token'], light_result['ner']))

In [0]:
list(zip(light_result['token'], light_result['ner_greedy']))

In [0]:
# Full annotations expose begin/end offsets and metadata for every chunk.
light_result = light_model.fullAnnotate(text)

import pandas as pd

# One list per output column, all taken from the chunks of the first result.
ner_chunks = light_result[0]['ner_chunk']

chunks = [n.result for n in ner_chunks]
entities = [n.metadata['entity'] for n in ner_chunks]
begin = [n.begin for n in ner_chunks]
end = [n.end for n in ner_chunks]

df = pd.DataFrame({
    'chunks': chunks,
    'entities': entities,
    'begin': begin,
    'end': end,
})

df

Unnamed: 0,chunks,entities,begin,end
0,1,DOSAGE,27,27
1,capsule,FORM,29,35
2,Advil,DRUG,40,44
3,for 5 days,DURATION,46,55
4,40 units,DOSAGE,126,133
5,insulin glargine,DRUG,138,153
6,at night,FREQUENCY,155,162
7,12 units,DOSAGE,166,173
8,insulin lispro,DRUG,178,191
9,with meals,FREQUENCY,193,202


#### NER Visualization

In [0]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

ner_vis = visualiser.display(light_result[0], label_col='ner_chunk', document_col='document', return_html=True)


# Change color of an entity label

#visualiser.set_label_colors({'PROBLEM':'#008080', 'TEST':'#800080', 'TREATMENT':'#808080'})
#visualiser.display(light_result[0], label_col='ner_chunk')

# Set label filter

# visualiser.display(light_result, label_col='ner_chunk', document_col='document',
                   #labels=['PROBLEM','TEST'])
  
displayHTML(ner_vis)

In [0]:
# ner_greedy

visualiser_greedy = NerVisualizer()

ner_greedy_vis = visualiser_greedy.display(light_result[0], label_col='ner_chunk_greedy', document_col='document', return_html=True)

displayHTML(ner_greedy_vis)

## Writing a generic NER function

In [0]:
def get_base_pipeline(embeddings='embeddings_clinical'):
    """Build the unfitted preprocessing pipeline shared by the NER helpers.

    Stages, in order:
      1. DocumentAssembler: "text" -> "document"
      2. SentenceDetector: "document" -> "sentence"
      3. Tokenizer: "sentence" -> "token"
      4. pretrained WordEmbeddingsModel: ["sentence", "token"] -> "embeddings"

    Args:
        embeddings: name of the pretrained English ("en") embeddings model to
            load from "clinical/models".

    Returns:
        A pyspark.ml ``Pipeline`` holding the four stages; it is not fitted here.
    """
    to_document = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    to_sentences = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    to_tokens = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    embed_tokens = (
        WordEmbeddingsModel.pretrained(embeddings, "en", "clinical/models")
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
    )

    return Pipeline(stages=[to_document, to_sentences, to_tokens, embed_tokens])



def get_clinical_entities (embeddings, spark_df, nrows = 100, model_name = 'ner_clinical'):
  """Run a pretrained clinical NER model over a Spark DataFrame and list its chunks.

  The base pipeline from ``get_base_pipeline(embeddings)`` is extended with the
  pretrained ``MedicalNerModel`` named ``model_name`` and a ``NerConverter``,
  fitted on an empty DataFrame, and applied to the first ``nrows`` rows of
  ``spark_df``.

  Args:
      embeddings: name of the pretrained embeddings model passed to
          ``get_base_pipeline``.
      spark_df: Spark DataFrame to annotate; the pipeline reads its "text" column.
      nrows: number of rows of ``spark_df`` to process (``spark_df.limit(nrows)``).
      model_name: name of the pretrained English NER model in "clinical/models".

  Returns:
      A Spark DataFrame with one row per extracted chunk and the columns
      ``id``, ``sentence_id``, ``chunk``, ``begin``, ``end`` and ``ner_label``.
      ``id`` comes from ``monotonically_increasing_id`` and identifies the
      source row; per the PySpark documentation the values are unique and
      increasing but not necessarily consecutive.
  """

  # Pretrained NER model selected by the caller
  loaded_ner_model = MedicalNerModel.pretrained(model_name, "en", "clinical/models") \
      .setInputCols(["sentence", "token", "embeddings"]) \
      .setOutputCol("ner")

  # Groups the token-level tags in "ner" into chunks
  ner_converter = NerConverter() \
      .setInputCols(["sentence", "token", "ner"]) \
      .setOutputCol("ner_chunk")

  base_pipeline = get_base_pipeline (embeddings)

  nlpPipeline = Pipeline(stages=[
      base_pipeline,
      loaded_ner_model,
      ner_converter])

  # Every stage is either pretrained or stateless, so an empty frame is enough to fit.
  empty_data = spark.createDataFrame([[""]]).toDF("text")

  model = nlpPipeline.fit(empty_data)

  result = model.transform(spark_df.limit(nrows))

  # Use the module-level `F` alias so that this function does not depend on a
  # separate `from pyspark.sql.functions import monotonically_increasing_id`
  # having been executed in some earlier notebook cell.
  result = result.withColumn("id", F.monotonically_increasing_id())

  # arrays_zip yields structs with positional fields: 0=result, 1=begin, 2=end, 3=metadata.
  result_df = result.select('id',F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.begin', 'ner_chunk.end', 'ner_chunk.metadata')).alias("cols")) \
  .select('id', F.expr("cols['3']['sentence']").alias("sentence_id"),
          F.expr("cols['0']").alias("chunk"),
          F.expr("cols['1']").alias("begin"),
          F.expr("cols['2']").alias("end"),
          F.expr("cols['3']['entity']").alias("ner_label"))\
          .filter("ner_label!='O'")

  return result_df

In [0]:
embeddings = 'embeddings_clinical'

model_name = 'ner_clinical'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

ner_df.show()

In [0]:
embeddings = 'embeddings_clinical'

model_name = 'ner_posology'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

ner_df.show()

In [0]:
import pandas as pd

def get_clinical_entities_light(light_model, text):
    """Return the NER chunks found in *text* as a two-column pandas DataFrame.

    Args:
        light_model: object exposing ``fullAnnotate(text)``; the first element
            of its return value must map ``'ner_chunk'`` to annotations that
            have a ``result`` attribute and a ``metadata['entity']`` entry.
        text: the string to annotate.

    Returns:
        A ``pandas.DataFrame`` with the columns ``chunks`` (chunk text) and
        ``entities`` (entity label), one row per chunk in annotation order.
    """
    first_result = light_model.fullAnnotate(text)[0]
    ner_chunks = list(first_result['ner_chunk'])

    return pd.DataFrame({
        'chunks': [chunk.result for chunk in ner_chunks],
        'entities': [chunk.metadata['entity'] for chunk in ner_chunks],
    })

In [0]:
text ='The patient was prescribed 1 capsule of Parol with meals . He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'

light_model = LightPipeline(posology_model)

get_clinical_entities_light (light_model, text)

Unnamed: 0,chunks,entities
0,1,DOSAGE
1,capsule,FORM
2,Parol,DRUG
3,with meals,FREQUENCY
4,40 units,DOSAGE
5,insulin glargine,DRUG
6,at night,FREQUENCY
7,12 units,DOSAGE
8,insulin lispro,DRUG
9,with meals,FREQUENCY


## PHI NER

In [0]:
embeddings = 'embeddings_clinical'

model_name = 'ner_deid_large'

# deidentify_dl
# ner_deid_large

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

pd_ner_df = ner_df.toPandas()

In [0]:
pd_ner_df.sample(20)

Unnamed: 0,id,sentence_id,chunk,begin,end,ner_label
53,57,15,Korean,1826,1831,LOCATION
4,17,0,Yaqui Valley,203,214,LOCATION
18,24,8,Japanese,1337,1344,LOCATION
8,24,0,Sweden.One,152,161,LOCATION
35,44,2,Florence,353,360,LOCATION
36,44,2,Italy,363,367,LOCATION
54,57,15,Korea,1884,1888,LOCATION
33,44,2,1002,302,306,DATE
14,24,6,Swedish students,1076,1091,LOCATION
82,96,8,UK,1005,1006,LOCATION


In [0]:
pd_ner_df.ner_label.value_counts()

## BioNLP (Cancer Genetics) NER

In [0]:
embeddings = 'embeddings_clinical'

model_name = 'ner_bionlp'

nrows = 100

ner_df = get_clinical_entities (embeddings, pubMedDF, nrows, model_name)

ner_df.show(truncate = False)

# NER Chunker
We can extract phrases that fit a known pattern using the NER tags. NerChunker is handy for extracting groups of entities together with their neighboring tokens when no pretrained NER model addresses the need directly. Let's say we want to extract clinical findings and body parts together as a single chunk, even if there are some unwanted tokens in between. The example below applies the same idea to the posology model, using a pattern that starts at a DRUG tag and ends at a FREQUENCY tag.

In [0]:
posology_ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner")

# Build "ner_chunk" from the sentence and its NER tags using a pattern written
# over tag names: it begins at a <DRUG> tag and ends at a <FREQUENCY> tag.
# NOTE(review): presumably `.*` lets any tokens/tags sit in between — confirm
# against the NerChunker documentation.
ner_chunker = NerChunker()\
    .setInputCols(["sentence","ner"])\
    .setOutputCol("ner_chunk")\
    .setRegexParsers(["<DRUG>.*<FREQUENCY>"])

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    posology_ner,
    ner_chunker])

empty_data = spark.createDataFrame([[""]]).toDF("text")

ner_chunker_model = nlpPipeline.fit(empty_data)

In [0]:
posology_ner.getClasses()

In [0]:
light_model = LightPipeline(ner_chunker_model)

text ='The patient was prescribed 1 capsule of Advil for 5 days . He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'

light_result = light_model.annotate(text)

list(zip(light_result['token'], light_result['ner']))

In [0]:
light_result["ner_chunk"]

# Chunk Filterer
ChunkFilterer will allow you to filter out named entities by some conditions or predefined look-up lists, so that you can feed these entities to other annotators like Assertion Status or Entity Resolvers. It can be used with two criteria: isin and regex.

In [0]:
posology_ner = MedicalNerModel.pretrained("ner_posology", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "embeddings"]) \
      .setOutputCol("ner")

ner_converter = NerConverter()\
      .setInputCols(["sentence","token","ner"])\
      .setOutputCol("ner_chunk")
      
chunk_filterer = ChunkFilterer()\
      .setInputCols("sentence","ner_chunk")\
      .setOutputCol("chunk_filtered")\
      .setCriteria("isin")\
      .setWhiteList(['Advil','metformin', 'insulin lispro'])

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    posology_ner,
    ner_converter,
    chunk_filterer])

empty_data = spark.createDataFrame([[""]]).toDF("text")

chunk_filter_model = nlpPipeline.fit(empty_data)

In [0]:
light_model = LightPipeline(chunk_filter_model)

text ='The patient was prescribed 1 capsule of Advil for 5 days . He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , metformin 1000 mg two times a day . It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'

light_result = light_model.annotate(text)

light_result.keys()

In [0]:
light_result['ner_chunk'] 

In [0]:
light_result["chunk_filtered"]

In [0]:
ner_model = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")\
      .setInputCols("sentence","token","embeddings")\
      .setOutputCol("ner")
      
chunk_filterer = ChunkFilterer()\
      .setInputCols("sentence","ner_chunk")\
      .setOutputCol("chunk_filtered")\
      .setCriteria("isin")\
      .setWhiteList(['severe fever','sore throat'])

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    ner_model,
    ner_converter,
    chunk_filterer])

empty_data = spark.createDataFrame([[""]]).toDF("text")

chunk_filter_model = nlpPipeline.fit(empty_data)

In [0]:
text = 'Patient with severe fever, severe cough, sore throat, stomach pain, and a headache.'

filter_df = spark.createDataFrame([[text]]).toDF('text')

chunk_filter_result = chunk_filter_model.transform(filter_df)

In [0]:
chunk_filter_result.select('ner_chunk.result', 'chunk_filtered.result').show(truncate=False)

## Training a Clinical NER (NCBI Disease Dataset)

In [0]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/NER_NCBIconlltrain.txt
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/NER_NCBIconlltest.txt
  
dbutils.fs.cp("file:/databricks/driver/NER_NCBIconlltest.txt", "dbfs:/")
dbutils.fs.cp("file:/databricks/driver/NER_NCBIconlltrain.txt", "dbfs:/")

In [0]:
from sparknlp.training import CoNLL

conll_data = CoNLL().readDataset(spark, 'file:/databricks/driver/NER_NCBIconlltrain.txt')

conll_data.show(3)

In [0]:
conll_data.count()

In [0]:
from pyspark.sql import functions as F

conll_data.select(F.explode(F.arrays_zip('token.result','label.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth")).groupBy('ground_truth').count().orderBy('count', ascending=False).show(100,truncate=False)


In [0]:
conll_data.select("label.result").distinct().count()

In [0]:
import pyspark.sql.functions as F

#conll_data.select(F.countDistinct("label.result")).show()
#conll_data.groupBy("label.result").count().show(truncate=False)

# Keep only the rows whose label array holds more than one distinct tag:
# 'unique' is the de-duplicated label array of the row and 'c' is its size.
# NOTE(review): presumably this drops sentences tagged entirely with a single
# label (e.g. all 'O') to rebalance the training data — confirm the intent.
conll_data = conll_data.withColumn('unique', F.array_distinct("label.result"))\
.withColumn('c', F.size('unique'))\
.filter(F.col('c')>1)

conll_data.select(F.explode(F.arrays_zip('token.result','label.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth")).groupBy('ground_truth').count().orderBy('count', ascending=False).show(100,truncate=False)



In [0]:
# Clinical word embeddings trained on PubMED dataset
clinical_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("embeddings")


In [0]:
# Read the CoNLL test split and attach the clinical embeddings to it.
test_data = CoNLL().readDataset(spark, "file:/databricks/driver/NER_NCBIconlltest.txt")

test_data = clinical_embeddings.transform(test_data)

# Overwrite any previous export: with the default save mode this cell fails on
# re-run once '/NER_NCBIconlltest.parquet' already exists.
test_data.write.mode("overwrite").parquet('/NER_NCBIconlltest.parquet')

In [0]:
%fs ls dbfs:/

path,name,size
dbfs:/400.txt,400.txt,2669
dbfs:/400_rot.pdf,400_rot.pdf,2240141
dbfs:/AntBNC_lemmas_ver_001.txt,AntBNC_lemmas_ver_001.txt,1348552
dbfs:/AskAPatient.fold-0.test.txt,AskAPatient.fold-0.test.txt,33834
dbfs:/AskAPatient.fold-0.train.txt,AskAPatient.fold-0.train.txt,594794
dbfs:/AskAPatient.fold-0.validation.txt,AskAPatient.fold-0.validation.txt,32162
dbfs:/ClassifierDL_USE_20200727_e5/,ClassifierDL_USE_20200727_e5/,0
dbfs:/ClassifierDL_USE_20200923_e5/,ClassifierDL_USE_20200923_e5/,0
dbfs:/ClassifierDL_wordemb_e5/,ClassifierDL_wordemb_e5/,0
dbfs:/Databricks/,Databricks/,0


In [0]:
!mkdir ner_logs
!mkdir ner_graph
'''!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/nerdl_graph/blstm_3_200_128_83.pb -P /content/ner_graph'''
# !!! Create MedicalNER Graph with the given functions at the end of this Notebook

dbutils.fs.cp("file:/databricks/driver/ner_graph", "dbfs:/", True)

## MedicalNER Graph

In [0]:
from sparknlp_jsl.training import tf_graph
tf_graph.print_model_params("ner_dl")
tf_graph.build("ner_dl", build_params={"embeddings_dim": 200, "nchars": 83, "ntags": 12, "is_medical": 1}, model_location="/medical_ner_graphs", model_filename="auto")

In [0]:
# for open source users
'''
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/create_graph.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/dataset_encoder.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/ner_model.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/ner_model_saver.py
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/training/english/dl-ner/nerdl-graph/sentence_grouper.py

!pip -q install tensorflow==1.15.0

import create_graph

ntags = 3 # number of labels
embeddings_dim = 200
nchars =83

create_graph.create_graph(ntags, embeddings_dim, nchars)
'''

In [0]:
%fs ls file:/

path,name,size
file:/lib64/,lib64/,4096
file:/BUILD,BUILD,88
file:/dev/,dev/,520
file:/usr/,usr/,4096
file:/root/,root/,4096
file:/media/,media/,4096
file:/opt/,opt/,4096
file:/srv/,srv/,4096
file:/proc/,proc/,0
file:/run/,run/,540


In [0]:
# Configure the trainable NER annotator:
#  - reads the "sentence", "token" and "embeddings" columns, learns from the
#    "label" column and writes its predictions to "ner"
#  - max 2 epochs, batch size 64, random seed 0, validation split 0.2
#  - extended evaluation logs and output logs enabled, logs go to 'ner_logs'
#  - test dataset is the parquet at "/NER_NCBIconlltest.parquet"
#  - TensorFlow graphs are looked up in 'file:/medical_ner_graphs'
# No comments can be placed between the chained calls because each line ends
# with a backslash continuation.
nerTagger = MedicalNerApproach()\
  .setInputCols(["sentence", "token", "embeddings"])\
  .setLabelColumn("label")\
  .setOutputCol("ner")\
  .setMaxEpochs(2)\
  .setBatchSize(64)\
  .setRandomSeed(0)\
  .setVerbose(1)\
  .setValidationSplit(0.2)\
  .setEvaluationLogExtended(True) \
  .setEnableOutputLogs(True)\
  .setIncludeConfidence(True)\
  .setTestDataset("/NER_NCBIconlltest.parquet")\
  .setOutputLogsPath('ner_logs')\
  .setGraphFolder('file:/medical_ner_graphs') # control your graph folder

ner_pipeline = Pipeline(stages=[
          clinical_embeddings,
          nerTagger
 ])

In [0]:
# 2 epochs 47 sec
ner_model = ner_pipeline.fit(conll_data)

# if you get an error for incompatible TF graph, use 4.1 NerDL-Graph.ipynb notebook to create a graph
# licensed users can also use 17.Graph_builder_for_DL_models.ipynb to create tf graphs easily.

In [0]:
%sh cd /databricks/driver/ner_logs && ls -lt

In [0]:
%sh
# The suffix of the log file name differs from run to run (list ner_logs to
# see it), so match the file with a glob instead of a hard-coded name.
cat ner_logs/MedicalNerApproach_*.log

### Evaluate your model

In [0]:
pred_df = ner_model.stages[1].transform(test_data)

In [0]:
pred_df.columns

In [0]:
from sparknlp_jsl.eval import NerDLMetrics
import pyspark.sql.functions as F

# Per-label metrics in "full_chunk" mode, computed from the gold "label"
# column and the predicted "ner" column.
evaler = NerDLMetrics(mode="full_chunk", dropO=True)

# Cached because the result is reused by the three actions below.
eval_result = evaler.computeMetricsFromDF(pred_df.select("label","ner"), prediction_col="ner", label_col="label").cache()

eval_result.withColumn("precision", F.round(eval_result["precision"],4))\
    .withColumn("recall", F.round(eval_result["recall"],4))\
    .withColumn("f1", F.round(eval_result["f1"],4)).show(100)

# show() prints the table itself and returns None; wrapping it in print()
# only emitted a stray "None" line after each table.
# "macro" is the plain mean of the per-label f1 values.
eval_result.selectExpr("avg(f1) as macro").show()
# "micro" here is the mean of the per-label f1 values weighted by `total`.
eval_result.selectExpr("sum(f1*total) as sumprod","sum(total) as sumtotal").selectExpr("sumprod/sumtotal as micro").show()

In [0]:
# Per-label metrics in "partial_chunk_per_token" mode, computed from the gold
# "label" column and the predicted "ner" column.
evaler = NerDLMetrics(mode="partial_chunk_per_token", dropO=True)

# Cached because the result is reused by the three actions below.
eval_result = evaler.computeMetricsFromDF(pred_df.select("label","ner"), prediction_col="ner", label_col="label").cache()

eval_result.withColumn("precision", F.round(eval_result["precision"],4))\
    .withColumn("recall", F.round(eval_result["recall"],4))\
    .withColumn("f1", F.round(eval_result["f1"],4)).show(100)

# show() prints the table itself and returns None; wrapping it in print()
# only emitted a stray "None" line after each table.
# "macro" is the plain mean of the per-label f1 values.
eval_result.selectExpr("avg(f1) as macro").show()
# "micro" here is the mean of the per-label f1 values weighted by `total`.
eval_result.selectExpr("sum(f1*total) as sumprod","sum(total) as sumtotal").selectExpr("sumprod/sumtotal as micro").show()

In [0]:
ner_model.stages[1].write().overwrite().save('/databricks/driver/models/custom_NER_3epoch')

In [0]:
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

token = Tokenizer()\
    .setInputCols(['sentence'])\
    .setOutputCol('token')

loaded_ner_model = MedicalNerModel.load("dbfs:/databricks/driver/models/custom_NER_3epoch")\
 .setInputCols(["sentence", "token", "embeddings"])\
 .setOutputCol("ner")

# Feed the converter the same "sentence" column that the NER model consumed,
# as every other NerConverter in this notebook does, so that chunks are
# assembled against the sentence annotations the tags were predicted on.
converter = NerConverter()\
  .setInputCols(["sentence", "token", "ner"])\
  .setOutputCol("ner_span")

ner_prediction_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        token,
        clinical_embeddings,
        loaded_ner_model,
        converter])

empty_data = spark.createDataFrame([['']]).toDF("text")

prediction_model = ner_prediction_pipeline.fit(empty_data)

from sparknlp.base import LightPipeline

light_model = LightPipeline(prediction_model)

In [0]:
text = "She has a metastatic breast cancer to lung"

def get_preds(text, light_model):
    """Return ``(chunk_text, entity_label)`` pairs predicted for *text*.

    Args:
        text: the string to annotate.
        light_model: object exposing ``fullAnnotate(text)``; the first element
            of its return value must map ``'ner_span'`` to annotations that
            have a ``result`` attribute and a ``metadata['entity']`` entry.

    Returns:
        A list of ``(result, entity)`` tuples in annotation order.
    """
    first_result = light_model.fullAnnotate(text)[0]

    preds = []
    for span in first_result['ner_span']:
        preds.append((span.result, span.metadata['entity']))

    return preds

get_preds(text, light_model)

End of Notebook # 1