![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/LANGTEST_NER.ipynb)

# Colab Setup

In [1]:
import json, os
from google.colab import files

# Ask for the license file only when it is not already in the working directory.
if 'spark_jsl.json' not in os.listdir():
  uploaded = files.upload()
  # A cancelled upload yields an empty dict; fail with a clear message instead
  # of an opaque IndexError on the rename below.
  if not uploaded:
    raise FileNotFoundError("No license file was uploaded; 'spark_jsl.json' is required.")
  # The dict is keyed by the uploaded file's name; store the first upload
  # under the fixed name read below.
  os.rename(next(iter(uploaded)), 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Expose every license entry as a notebook-level variable and as an
# environment variable. globals() is used rather than locals(): at notebook
# top level they are the same dict, but only globals() is documented as safe
# to modify.
globals().update(license_keys)
os.environ.update(license_keys)

In [2]:
# Installing pyspark and spark-nlp
# pyspark is pinned to 3.1.2; the spark-nlp version is taken from $PUBLIC_VERSION
# (presumably one of the license-file entries exported by the setup cell -- TODO confirm).
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The package is pulled from the John Snow Labs index given via --extra-index-url.
# NOTE(review): that URL embeds $SECRET, so avoid sharing this cell's output if
# the -q flag is ever dropped.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET


In [3]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql.types import StringType, IntegerType

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Spark configuration overrides handed to sparknlp_jsl.start() below.
params = {"spark.driver.memory":"16G",
          "spark.kryoserializer.buffer.max":"2000M",
          "spark.driver.maxResultSize":"2000M"}

# Start the Spark session using the SECRET entry of the loaded license dict.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Report the library versions actually in use for this run.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Bare expression: the notebook shows the session object as the cell output.
spark

Spark NLP Version : 5.1.0
Spark NLP_JSL Version : 5.1.0


# Utility Functions

In [4]:
# Model-independent pipeline stages, defined once at notebook level and reused
# by every pipeline built in get_light_model().

# Reads the "text" column and writes "document".
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "sentence".
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Reads "sentence" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained "embeddings_clinical" model; reads "sentence" and "token",
# writes "embeddings".
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Reads "sentence", "token" and the "ner" column produced by the NER stage
# (added per model in get_light_model) and writes "ner_chunk".
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [5]:
def get_light_model(model_name):
  """Build a LightPipeline around the pretrained NER model `model_name`.

  The NER stage is loaded with
  MedicalNerModel.pretrained(model_name, "en", "clinical/models") and placed
  between the notebook-level stages (document_assembler, sentence_detector,
  tokenizer, word_embeddings) and ner_converter.

  Args:
    model_name: Name of the pretrained model to load.

  Returns:
    A LightPipeline wrapping the fitted pipeline.
  """
  # NER stage: reads "sentence", "token", "embeddings" and writes "ner".
  ner_model = (
      MedicalNerModel.pretrained(model_name, "en", "clinical/models")
      .setInputCols(["sentence", "token", "embeddings"])
      .setOutputCol("ner")
  )

  pipeline = Pipeline(stages=[
      document_assembler,
      sentence_detector,
      tokenizer,
      word_embeddings,
      ner_model,
      ner_converter,
  ])

  # The pipeline is fitted on a one-row DataFrame holding an empty string in
  # its "text" column.
  placeholder_df = spark.createDataFrame([[""]]).toDF("text")
  fitted_pipeline = pipeline.fit(placeholder_df)

  return LightPipeline(fitted_pipeline)


In [6]:
def get_table(light_result):
  """Tabulate the NER chunks of the first annotated document.

  Args:
    light_result: Sequence whose first element is a mapping with a
      'ner_chunk' entry; each chunk exposes `begin`, `end`, `result` and a
      `metadata` mapping containing 'entity' and 'confidence'.

  Returns:
    A pandas DataFrame with one row per chunk and the columns
    'chunks', 'entities', 'begin', 'end', 'confidence' (in that order).
  """
  # Only the first document of the result is tabulated.
  annotations = list(light_result[0]['ner_chunk'])

  return pd.DataFrame({
      'chunks': [ann.result for ann in annotations],
      'entities': [ann.metadata['entity'] for ann in annotations],
      'begin': [ann.begin for ann in annotations],
      'end': [ann.end for ann in annotations],
      'confidence': [ann.metadata['confidence'] for ann in annotations],
  })

In [7]:
from sparknlp_display import NerVisualizer
# Single visualizer instance, reused for every NER rendering later in the notebook.
visualiser = NerVisualizer()

# ner_deid_generic_augmented_langtest

## Detect PHI for Deidentification (LangTest - Generic - Augmented)

In [8]:
# Clean reference note: two clinical passages separated by a newline.
original_text="HPI : 65 yo M h/o HTN , DM , RAS , now ESRD on HD , recent admission in LH CCU 1/26 - 2/28/97 after presenting with PEA cardiac arrest , course c/b respiratory failure requiring trach and PEG , initiated on HD during that admission , d/c'ed to Homestead Hospital 2/28/97 , transferred from HH for fevers ( T to 102 on 3/22 ) and urine and blood cultures positive for gram negative rods for further management .\nMrs . Hannah ,  52 -yo woman with poorly-controlled DM diagnosed 2061 and h/o gastroparesis , retinopathy , nephropathy , peripheral neuropathy , and with multiple visits and three recent admissions for n/v/gastroparesis : May 1-3 and June 7-21 and July 22 . Her GI symptoms began in December , 2080 ."
# Perturbed copy of the same note: letter casing is changed throughout and
# noise is injected (e.g. "126" for "1/26", "gastroparesi", "Decembr",
# "HANNAH^03^YEAGER" for "Hannah"). Both models are run on this text below.
corrupted_text="hpi : 65 YO m h/o htn , dm , ras , now esrd on hd , recent admission IN lh ccu 126 - 2/28/97 after presenting with pea cardiac arrest , course c/b respiratory failure requiring trach and peg , initiated on hd during that admission , d/c'ed to homestead hospital 2/28/97 , transferred from hh for fevers ( t to 102 on 3/22 ) and urine and blood cultures positive for gram negative rods for further management .\nMrs . HANNAH^03^YEAGER , 52 -Yo WOMAN WITH POORLY-CONTROLLED DM diagnosed 2061 and h/o gastroparesi , RETINOPATHY , NEPHROPATHY , PERIPHERAL NEUROPATHY , AND WITH MULTIPLE VISITS AND THREE RECENT ADMISSIONS FOR N/V/GASTROPARESIS : MAY 1-3 and june 7-21 and JULY 22 . HER GI SYMPTOMS BEGAN IN Decembr , 2080 ."

In [9]:
# Names of the two pretrained NER models compared in this notebook.
original_model_name="ner_deid_generic_augmented"
langtest_model_name="ner_deid_generic_augmented_langtest"

In [10]:
# Load each pretrained model and wrap it in a LightPipeline (see get_light_model).
original_model=get_light_model(original_model_name)
langtest_model=get_light_model(langtest_model_name)

ner_deid_generic_augmented download started this may take some time.
[OK!]
ner_deid_generic_augmented_langtest download started this may take some time.
[OK!]


In [11]:
# Original model on the clean text.
model_org_text_result = original_model.fullAnnotate(original_text)
model_org_text_result_table=get_table(model_org_text_result)

# Original model on the corrupted text.
model_corr_text_result = original_model.fullAnnotate(corrupted_text)
model_corr_text_result_table=get_table(model_corr_text_result)

# LangTest model on the same corrupted text, for side-by-side comparison.
lang_model_corr_text_result = langtest_model.fullAnnotate(corrupted_text)
lang_model_corr_text_table=get_table(lang_model_corr_text_result)

In [12]:
from google.colab import widgets

# One tab per (model, text) combination; each tab shows the first 10 chunks.
t = widgets.TabBar(["model_org_text", "model_corr_text", "langtest_model_corr_text"])

# Tables listed in the same order as the tab titles above.
tables_by_tab = [
    model_org_text_result_table,
    model_corr_text_result_table,
    lang_model_corr_text_table,
]

for tab_index, table in enumerate(tables_by_tab):
    with t.output_to(tab_index):
        display(table.head(10))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,chunks,entities,begin,end,confidence
0,65,AGE,6,7,1.0
1,LH,LOCATION,72,73,1.0
2,1/26,DATE,79,82,1.0
3,2/28/97,DATE,86,92,1.0
4,Homestead Hospital,LOCATION,244,261,0.99215
5,2/28/97,DATE,263,269,1.0
6,HH,LOCATION,290,291,0.9997
7,3/22,DATE,318,321,0.9999
8,Hannah,NAME,417,422,0.9986
9,52,AGE,427,428,0.9876


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,chunks,entities,begin,end,confidence
0,65,AGE,6,7,0.9997
1,2/28/97,DATE,85,91,1.0
2,2/28/97,DATE,262,268,1.0
3,3/22,DATE,317,320,1.0
4,2061,DATE,484,487,0.9997
5,june 7-21,DATE,653,661,0.9728
6,2080,DATE,712,715,0.9072


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,chunks,entities,begin,end,confidence
0,65,AGE,6,7,0.9998
1,lh,LOCATION,72,73,0.9968
2,126,DATE,79,81,0.7953
3,2/28/97,DATE,85,91,0.9997
4,homestead hospital,LOCATION,243,260,0.92275
5,2/28/97,DATE,262,268,1.0
6,hh,LOCATION,289,290,1.0
7,3/22,DATE,317,320,1.0
8,HANNAH^03^YEAGER,NAME,416,431,0.9964
9,52,AGE,435,436,0.8726


<IPython.core.display.Javascript object>

In [13]:
# One tab per (model, text) combination; each tab renders the highlighted entities.
t = widgets.TabBar(["model_org_text", "model_corr_text", "langtest_model_corr_text"])

# Annotation results listed in the same order as the tab titles above.
results_by_tab = [
    model_org_text_result,
    model_corr_text_result,
    lang_model_corr_text_result,
]

for tab_index, result in enumerate(results_by_tab):
    with t.output_to(tab_index):
        # Only the first annotated document of each result is rendered.
        visualiser.display(result[0], label_col='ner_chunk', document_col='document')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>