## BioNLP NER model example

In [None]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import sparknlp

# Create the Spark session through Spark NLP's helper; later cells use `spark`
# to build the input DataFrame.
# NOTE(review): the pretrained models below are fetched from "clinical/models";
# presumably that requires a licensed Spark NLP for Healthcare session — confirm.
spark = sparknlp.start()

In [6]:
# Raw "text" column -> "document" annotations.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence".
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token".
token = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained clinical word embeddings, computed per token.
embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Pretrained clinical part-of-speech tagger; its "pos" output feeds the
# dependency parser below.
clinical_pos = (
    PerceptronModel.pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("pos")
)

# Pretrained dependency parser over sentences, tokens and POS tags.
dependency_parser = (
    DependencyParserModel.pretrained("dependency_conllu")
    .setInputCols(["sentence", "token", "pos"])
    .setOutputCol("dependency")
)

# Pretrained BioNLP NER tagger.
# Input and output columns are set explicitly instead of relying on whatever
# was saved with the pretrained model: the tagger reads the sentence, token and
# embeddings columns produced by the earlier stages and writes "ner", which is
# the column the NerConverter stage consumes.
bio_ner = (
    NerDLModel.pretrained('ner_bionlp', 'en', 'clinical/models')
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Convert the token-level "ner" tags into entity spans in "ner_span".
converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_span")
)

# Assemble every stage in the order its inputs become available.
clinical_ner_pipeline = Pipeline(stages=[
    document,
    sentence,
    token,
    embeddings,
    clinical_pos,
    dependency_parser,
    bio_ner,
    converter,
])


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.7 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.6 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [7]:
import pyspark.sql.functions as F

# Sample note used as the single input row.
text = """
Stage 4 adenocarcinoma of lung with b/l lung nodules , probable L-2 metastasis and a negative brain MRI . Molecular testing of tumor demonstrates EGFR mutation . On Tarceva . Disease Stable . Plan Chemotherapy toxicity reviewed again . Patient agrees to proceed . Counseling time : 40 mins . 1 ) Continue Tarceva for lung cancer . CT CAP ordered before next visit 2 ) Continue anti-anxiety medication 3 ) Xgeva today 4 ) Return in 1 month with labs , after scan
"""

# One-row DataFrame with a single string column named "text", which is the
# column the DocumentAssembler stage reads.
prediction_data = spark.createDataFrame([(text,)], ["text"])

prediction_data.show(truncate=False)


+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+-------------------------------------------------------------------

In [8]:

# Fit the pipeline on the sample DataFrame and annotate that same DataFrame.
prediction_model = clinical_ner_pipeline.fit(prediction_data)

preds = prediction_model.transform(prediction_data)

# Show one row per extracted entity: the chunk text and its entity label.
# The "ner_span" annotation array is exploded directly and its fields are read
# by name. The previous arrays_zip(...) + entities['0'] / entities['1'] form
# depended on the struct field names arrays_zip generates for nested column
# paths, which differ between Spark releases (positional "0"/"1" vs. the source
# field names), so it could fail with "no such struct field".
(
    preds
    .select(F.explode("ner_span").alias("entity_span"))
    .select(
        F.col("entity_span.result").alias("chunk"),
        F.col("entity_span.metadata").getItem("entity").alias("entity"),
    )
    .show(truncate=False)
)

+--------------+----------------------+
|chunk         |entity                |
+--------------+----------------------+
|adenocarcinoma|Cancer                |
|lung          |Organ                 |
|lung nodules  |Pathological_formation|
|L-2           |Gene_or_gene_product  |
|brain MRI     |Cancer                |
|tumor         |Cancer                |
|EGFR          |Gene_or_gene_product  |
|Tarceva       |Simple_chemical       |
|Patient       |Organism              |
|1             |Gene_or_gene_product  |
|Tarceva       |Simple_chemical       |
|lung cancer   |Cancer                |
|CT CAP        |Gene_or_gene_product  |
+--------------+----------------------+

