[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/enterprise/healthcare/ChunkMergeSample.ipynb)

In [1]:
import json

# Read the licence/credential bundle stored in keys.json in the working directory.
with open('keys.json') as keys_file:
    license_keys = json.loads(keys_file.read())

# Display only the key names, not the values they map to.
license_keys.keys()


dict_keys(['version', 'secret', 'SPARK_NLP_LICENSE', 'JSL_OCR_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'JSL_OCR_SECRET'])

In [2]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Pull the install secret and the version strings out of keys.json.
# secret / version / jsl_version try the short key name first, then the
# SPARK_NLP_* name, and finally fall back to an empty string.
# spark_version is resolved differently: the SPARK_VERSION environment
# variable wins, then keys.json, then the default "2.4".
secret = license_keys.get("secret",license_keys.get('SPARK_NLP_SECRET', ""))
spark_version = os.environ.get("SPARK_VERSION", license_keys.get("SPARK_VERSION","2.4"))
version = license_keys.get("version",license_keys.get('SPARK_NLP_PUBLIC_VERSION', ""))
jsl_version = license_keys.get("jsl_version",license_keys.get('SPARK_NLP_VERSION', ""))

# Point JAVA_HOME at the expected OpenJDK 8 location and put its bin/ first
# on PATH so the `java -version` call below resolves to it.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Export the licences and AWS credentials as environment variables.
# These use direct indexing, so a key missing from keys.json raises KeyError here.
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['JSL_OCR_LICENSE'] = license_keys['JSL_OCR_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

# NOTE(review): `version` is only printed in this cell; the install commands
# below use spark_version and jsl_version.
print(spark_version, version, jsl_version)

# IPython expands $spark_version, $jsl_version and $secret into the shell commands.
# NOTE(review): the secret is part of the extra index URL, and pip echoes that URL
# ("Looking in indexes: ...") - clear this cell's output before sharing the notebook.
! python -m pip install "pyspark==$spark_version".*
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

# These imports come after the pip installs above; keep that order.
import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

print (sparknlp.version())
print (sparknlp_jsl.version())

# Start the licensed session with the secret; spark23 is True only when
# spark_version begins with "2.3". Later cells use the result as `spark`.
spark = sparknlp_jsl.start(secret, gpu=False, spark23=(spark_version[:3]=="2.3"))

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/<REDACTED>
Collecting spark-nlp-jsl==2.5.2
Collecting spark-nlp==2.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/3e/b0/c272273674b5810c0909b369c57669197907a15d84bbdf058007bb909c99/spark_nlp-2.5.2-py2.py3-none-any.whl (122kB)
[K     |████████████████████████████████| 133kB 2.9MB/s 
[?25hCollecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 58kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████

In [0]:
# Two toy sentences with an id each; they mix person names, TNM staging codes,
# a size expression and a location so that several chunkers overlap on them.
data_chunk_merge = spark.createDataFrame([
  (1,"Zacarias Woods would not have T2N1 at Los Angeles California where he presented lymphocite leukimia",),
  (2,"Andre Agassi had 2 x 3 x 1 mm hairwig better than T1N2M1 with adenocarcinoma",)
]).toDF("id","text")

# Rules for the RegexMatcher: one "<pattern>,<identifier>" pair per line.
# Raw string so that `\.` and `\s` reach the regex engine untouched without
# triggering Python's invalid-escape-sequence warning; the resulting text is
# byte-identical to the non-raw spelling.
# NOTE(review): the pT rule is listed twice, so every T match is reported twice
# by the matcher; left as-is to keep the recorded outputs reproducible.
# NOTE(review): the unit group of the SIZE rule ends with an empty alternative,
# so bare numbers (e.g. the "2" in "T2") also match as SIZE.
regex = r'''(c|p|yc|yp|r|rp|a)?(C[1-5])?M(x|X|0|1[a-d]?),pM
(c|p|yc|yp|r|rp|a)?(C[1-5])?N(x|X|0|[1-3][a-d]?),pN
(c|p|yc|yp|r|rp|a)?(C[1-5])?T(x|X|is|0|[1-4][a-d]?),pT
(c|p|yc|yp|r|rp|a)?(C[1-5])?T(x|X|is|0|[1-4][a-d]?),pT
([0-9]+(\.[0-9]+)?\s?x\s?)*([0-9]+(\.[0-9]+)?)\s?(mg|MG|mm|cm|MM|CM|),SIZE
T1N2M1,TNM
at Los Angeles California,LOCATION
Zacarias,PERSON
better than,BLOCK'''

# Entity renames applied by the chunk mergers: "<old label>,<new label>".
replace_dict = '''pT,TNM
pM,TNM'''

# Chunks the mergers should discard: "<chunk text>,<label>".
false_positives = '''better than,BLOCK'''

# Write every rule set to the CSV file the annotators read it from.
rule_files = {
    'ner_regex.csv': regex,
    'replace_dict.csv': replace_dict,
    'false_positives.csv': false_positives,
}
for rule_path, rule_text in rule_files.items():
    with open(rule_path, 'w') as f:
        f.write(rule_text)

In [4]:
# --- Text preparation: raw text -> document -> sentences -> tokens ---
da = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
sd = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)
tk = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

# --- Clinical word embeddings shared by both NER models ---
# NOTE(review): no setInputCols call here; this relies on whatever input
# columns the pretrained model carries - confirm.
emb = (
    WordEmbeddingsModel
    .pretrained("embeddings_clinical","en","clinical/models")
    .setOutputCol("embs")
)

# --- Two pretrained NER taggers over the same sentence/token/embedding columns ---
ner = (
    NerDLModel
    .pretrained("ner_deid_large","en","clinical/models")
    .setInputCols("sentence","token","embs")
    .setOutputCol("ner")
)
canner = (
    NerDLModel
    .pretrained("ner_bionlp","en","clinical/models")
    .setInputCols("sentence","token","embs")
    .setOutputCol("canner")
)

# --- Convert each tagger's output into chunk annotations ---
nc = (
    NerConverter()
    .setInputCols("sentence","token","ner")
    .setOutputCol("ner_chunk")
)
cannc = (
    NerConverter()
    .setInputCols("sentence","token","canner")
    .setOutputCol("canner_chunk")
)

# --- Rule-based chunks from the comma-delimited rules in ner_regex.csv ---
rex = (
    RegexMatcher()
    .setInputCols("sentence")
    .setOutputCol("rex")
    .setExternalRules("ner_regex.csv",",","TEXT")
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]


In [0]:
# First merge: combine the chunks of the two NER models into "combined",
# applying the false-positive list and the label replacement dictionary.
merger_can = (
    ChunkMergeApproach()
    .setInputCols("ner_chunk","canner_chunk")
    .setOutputCol("combined")
    .setFalsePositivesResource("false_positives.csv","TEXT", {"delimiter":","})
    .setReplaceDictResource("replace_dict.csv","TEXT", {"delimiter":","})
)

# Second merge: fold the regex matches into the result of the first merge.
# It reads "combined" and writes back to "combined", with the same resources.
merger_rex = (
    ChunkMergeApproach()
    .setInputCols("combined","rex")
    .setOutputCol("combined")
    .setFalsePositivesResource("false_positives.csv","TEXT", {"delimiter":","})
    .setReplaceDictResource("replace_dict.csv","TEXT", {"delimiter":","})
)

# Full pipeline: preparation, embeddings, both taggers, both converters,
# the regex matcher, then the two mergers in order.
pl = Pipeline().setStages([
    da, sd, tk, emb,
    ner, canner,
    nc, cannc,
    rex,
    merger_can, merger_rex,
])

In [0]:
# Fit the pipeline on the sample frame, then annotate that same frame.
# The result is cached because the cells below query it several times.
pipeline_model = pl.fit(data_chunk_merge)
merged_data = pipeline_model.transform(data_chunk_merge).cache()

In [7]:
# Chunks from the ner_deid_large tagger: one row per chunk, ordered by
# sentence id and start offset, shown untruncated.
(
    merged_data
    .selectExpr("id","explode(ner_chunk) as a")
    .selectExpr("id","a.begin","a.end","a.result as ner_chunk","a.metadata.entity")
    .orderBy("id","begin")
    .show(n=100, truncate=False)
)

+---+-----+---+----------------------+--------+
|id |begin|end|ner_chunk             |entity  |
+---+-----+---+----------------------+--------+
|1  |0    |13 |Zacarias Woods        |NAME    |
|1  |38   |59 |Los Angeles California|LOCATION|
|1  |80   |98 |lymphocite leukimia   |NAME    |
|2  |0    |11 |Andre Agassi          |NAME    |
+---+-----+---+----------------------+--------+



In [8]:
# Chunks from the ner_bionlp tagger: one row per chunk, ordered by
# sentence id and start offset, shown untruncated.
(
    merged_data
    .selectExpr("id","explode(canner_chunk) as a")
    .selectExpr("id","a.begin","a.end","a.result as ner_chunk","a.metadata.entity")
    .orderBy("id","begin")
    .show(n=100, truncate=False)
)

+---+-----+---+--------------+--------------------+
|id |begin|end|ner_chunk     |entity              |
+---+-----+---+--------------+--------------------+
|1  |0    |7  |Zacarias      |Gene_or_gene_product|
|2  |6    |11 |Agassi        |Gene_or_gene_product|
|2  |50   |55 |T1N2M1        |Gene_or_gene_product|
|2  |62   |75 |adenocarcinoma|Cancer              |
+---+-----+---+--------------+--------------------+



In [9]:
# Regex matcher hits: one row per match, with the rule identifier shown as the
# entity label, ordered by sentence id and start offset.
(
    merged_data
    .selectExpr("id","explode(rex) as a")
    .selectExpr("id","a.begin","a.end","a.result as ner_chunk","a.metadata.identifier as entity")
    .orderBy("id","begin")
    .show(n=100, truncate=False)
)

+---+-----+---+-------------------------+--------+
|id |begin|end|ner_chunk                |entity  |
+---+-----+---+-------------------------+--------+
|1  |0    |7  |Zacarias                 |PERSON  |
|1  |30   |31 |T2                       |pT      |
|1  |30   |31 |T2                       |pT      |
|1  |31   |31 |2                        |SIZE    |
|1  |32   |33 |N1                       |pN      |
|1  |33   |34 |1                        |SIZE    |
|1  |35   |59 |at Los Angeles California|LOCATION|
|2  |17   |28 |2 x 3 x 1 mm             |SIZE    |
|2  |38   |48 |better than              |BLOCK   |
|2  |50   |51 |T1                       |pT      |
|2  |50   |51 |T1                       |pT      |
|2  |50   |55 |T1N2M1                   |TNM     |
|2  |51   |51 |1                        |SIZE    |
|2  |52   |53 |N2                       |pN      |
|2  |53   |53 |2                        |SIZE    |
|2  |54   |55 |M1                       |pM      |
|2  |55   |56 |1               

In [10]:
# Final merged chunks, one row per chunk. The rows are sorted by sentence id and
# chunk start offset before the projection so the listing order is deterministic.
(
    merged_data
    .selectExpr("id","explode(combined) as a")
    .orderBy("id","a.begin")
    .selectExpr("id","a.result as chunk","a.metadata.entity as entity")
    .show(100, False)
)

+---+-------------------------+--------------------+
|id |chunk                    |entity              |
+---+-------------------------+--------------------+
|1  |Zacarias Woods           |NAME                |
|1  |T2                       |TNM                 |
|1  |N1                       |pN                  |
|1  |at Los Angeles California|LOCATION            |
|1  |lymphocite leukimia      |NAME                |
|2  |Andre Agassi             |NAME                |
|2  |2 x 3 x 1 mm             |SIZE                |
|2  |T1N2M1                   |Gene_or_gene_product|
|2  |adenocarcinoma           |Cancer              |
+---+-------------------------+--------------------+

