[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/enterprise/healthcare/colab/ChunkMergeSample.ipynb)

In [4]:
import json

# Read the license/credential entries from the local keys.json file.
with open('keys.json') as keys_file:
    license_keys = json.load(keys_file)

# Show only the entry names; the values themselves are not displayed.
license_keys.keys()


dict_keys(['secret', 'SPARK_NLP_LICENSE', 'JSL_OCR_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'JSL_OCR_SECRET'])

In [4]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

secret = license_keys['secret']
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['JSL_OCR_LICENSE'] = license_keys['JSL_OCR_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

! python -m pip install --upgrade spark-nlp-jsl==2.5.0  --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Install Spark NLP
! pip install --ignore-installed -q spark-nlp==2.5

import sparknlp

print (sparknlp.version())

import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession


from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl



def start(secret):
    """Return a local SparkSession configured for licensed Spark NLP 2.5.0.

    Args:
        secret: String spliced into the https://pypi.johnsnowlabs.com/ URL
            from which the spark-nlp-jsl 2.5.0 jar is referenced.

    Returns:
        The result of ``getOrCreate()`` on a builder named
        "Spark NLP Licensed", running on ``local[*]`` with the settings below.
    """
    spark_settings = {
        "spark.driver.memory": "16G",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.kryoserializer.buffer.max": "2000M",
        "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.0",
        "spark.jars": "https://pypi.johnsnowlabs.com/"+secret+"/spark-nlp-jsl-2.5.0.jar",
    }

    session_builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[*]")
    # Apply the settings in the order they are listed above.
    for setting, value in spark_settings.items():
        session_builder = session_builder.config(setting, value)

    return session_builder.getOrCreate()


# Start the Spark session with the custom parameters from the start() function above.
spark = start(secret)
# Alternative entry point, left disabled: sparknlp_jsl.start(secret)

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/l5rISdi5Xk
Collecting spark-nlp-jsl==2.5.0
  Downloading https://pypi.johnsnowlabs.com/l5rISdi5Xk/spark-nlp-jsl/spark_nlp_jsl-2.5.0-py3-none-any.whl
Collecting spark-nlp==2.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/75/b0/f50d169c49f5982f8be9e86e285b53e23f91fd7db0d10646c2d1de5c3ad0/spark_nlp-2.5.0-py2.py3-none-any.whl (120kB)
[K     |████████████████████████████████| 122kB 7.8MB/s 
Installing collected packages: spark-nlp, spark-nlp-jsl
Successfully installed spark-nlp-2.5.0 spark-nlp-jsl-2.5.0
2.5.0


In [15]:
# Two sample sentences, each paired with an integer id.
sample_rows = [
    (1, "Zacarias Woods would not have T2N1 at Los Angeles California"),
    (2, "Andre Agassi had 2 x 3 x 1 mm hairwig better than T1N2M1"),
]

# Build the input DataFrame and name its columns "id" and "text".
data_chunk_merge = spark.createDataFrame(sample_rows).toDF("id", "text")

# Rules written to ner_regex.csv, one "<pattern>,<label>" pair per line.
# Raw string literal: the patterns contain \. and \s, which are invalid escape
# sequences in a normal string literal (DeprecationWarning / SyntaxWarning on
# newer Python). The text written to disk is unchanged.
# Note: the unit group in the SIZE rule ends with an empty alternative, so a
# bare number also matches as SIZE.
regex = r'''(c|p|yc|yp|r|rp|a)?(C[1-5])?M(x|X|0|1[a-d]?),pM
(c|p|yc|yp|r|rp|a)?(C[1-5])?N(x|X|0|[1-3][a-d]?),pN
(c|p|yc|yp|r|rp|a)?(C[1-5])?T(x|X|is|0|[1-4][a-d]?),pT
([0-9]+(\.[0-9]+)?\s?x\s?)*([0-9]+(\.[0-9]+)?)\s?(mg|MG|mm|cm|MM|CM|),SIZE
at Los Angeles California,LOCATION
Zacarias,PERSON
better than,BLOCK'''

with open('ner_regex.csv', 'w') as f:
    f.write(regex)

# Label replacements written to replace_dict.csv as "<old>,<new>" lines:
# pT and pM are mapped to TNM; pN has no entry here.
replace_dict = '''pT,TNM
pM,TNM'''

with open('replace_dict.csv', 'w') as f:
    f.write(replace_dict)

# Single "<text>,<label>" entry written to false_positives.csv.
false_positives = '''beautiful thing,BLOCK'''

with open('false_positives.csv', 'w') as f:
    f.write(false_positives)

In [16]:
# Pipeline stages, in execution order. Each stage reads the column(s) named in
# setInputCol(s) and writes the column named in setOutputCol.
da = DocumentAssembler().setInputCol("text").setOutputCol("document")
sd = SentenceDetector().setInputCols("document").setOutputCol("sentence")
tk = Tokenizer().setInputCols("sentence").setOutputCol("token")
emb = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models").setOutputCol("embs")
ner = NerDLModel.pretrained("ner_deid_large","en","clinical/models").setInputCols("sentence","token","embs").setOutputCol("ner")
nc = NerConverter().setInputCols("sentence","token","ner").setOutputCol("ner_chunk")
# Rules come from ner_regex.csv, with "," separating the pattern from its label.
rex = RegexMatcher().setInputCols("sentence").setOutputCol("rex").setExternalRules("ner_regex.csv",",","TEXT")
# Merge the NER chunks with the regex matches into "combined".
# Parenthesised chain: no backslash continuations, so the statement no longer
# depends on a blank line following a trailing "\".
merger = (
    ChunkMergeApproach()
    .setInputCols("ner_chunk","rex")
    .setOutputCol("combined")
    .setFalsePositivesResource("false_positives.csv","TEXT", {"delimiter":","})
    .setReplaceDictResource("replace_dict.csv","TEXT", {"delimiter":","})
)

pl = Pipeline().setStages([da,sd,tk,emb,ner,nc,rex,merger])

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [17]:
# Fit the pipeline on the sample data, then annotate that same data and cache
# the result for the queries that follow.
fitted_pipeline = pl.fit(data_chunk_merge)
merged_data = fitted_pipeline.transform(data_chunk_merge).cache()

In [31]:
# NER chunks: zip begin/result/metadata, explode to one row per chunk, then
# list them by id and start offset (up to 100 rows, untruncated).
(
    merged_data
    .selectExpr("id", "explode(arrays_zip(ner_chunk.begin,ner_chunk.result, ner_chunk.metadata)) as a")
    .selectExpr("id", "a['0'] as begin", "a['1'] as ner_chunk", "a['2'].entity as entity")
    .orderBy("id", "begin")
    .show(100, False)
)

+---+-----+----------------------+--------+
|id |begin|ner_chunk             |entity  |
+---+-----+----------------------+--------+
|1  |0    |Zacarias Woods        |NAME    |
|1  |38   |Los Angeles California|LOCATION|
|2  |0    |Andre Agassi          |NAME    |
+---+-----+----------------------+--------+



In [32]:
# Regex matches: same layout as the NER query, but the label is read from the
# metadata field "identifier" (up to 100 rows, untruncated).
(
    merged_data
    .selectExpr("id", "explode(arrays_zip(rex.begin,rex.result, rex.metadata)) as a")
    .selectExpr("id", "a['0'] as begin", "a['1'] as ner_chunk", "a['2'].identifier as entity")
    .orderBy("id", "begin")
    .show(100, False)
)

+---+-----+-------------------------+--------+
|id |begin|ner_chunk                |entity  |
+---+-----+-------------------------+--------+
|1  |0    |Zacarias                 |PERSON  |
|1  |30   |T2                       |pT      |
|1  |31   |2                        |SIZE    |
|1  |32   |N1                       |pN      |
|1  |33   |1                        |SIZE    |
|1  |35   |at Los Angeles California|LOCATION|
|2  |17   |2 x 3 x 1 mm             |SIZE    |
|2  |38   |better than              |BLOCK   |
|2  |50   |T1                       |pT      |
|2  |51   |1                        |SIZE    |
|2  |52   |N2                       |pN      |
|2  |53   |2                        |SIZE    |
|2  |54   |M1                       |pM      |
|2  |55   |1                        |SIZE    |
+---+-----+-------------------------+--------+



In [33]:
# Merged chunks: one row per entry of the "combined" column with its entity
# label (up to 100 rows, untruncated; no explicit ordering is applied).
(
    merged_data
    .selectExpr("id", "explode(arrays_zip(combined.result, combined.metadata)) as a")
    .selectExpr("id", "a['0'] as chunk", "a['1'].entity as entity")
    .show(100, False)
)

+---+-------------------------+--------+
|id |chunk                    |entity  |
+---+-------------------------+--------+
|1  |Zacarias Woods           |NAME    |
|1  |T2                       |TNM     |
|1  |N1                       |pN      |
|1  |at Los Angeles California|LOCATION|
|2  |Andre Agassi             |NAME    |
|2  |2 x 3 x 1 mm             |SIZE    |
|2  |better than              |BLOCK   |
|2  |T1                       |TNM     |
|2  |N2                       |pN      |
|2  |M1                       |TNM     |
+---+-------------------------+--------+

