# Description
## This notebook provides a set of commands to install Spark NLP for offline usage. It contains 5 sections:
1) Download all dependencies for Spark NLP

2) Download all dependencies for Spark NLP (enterprise/licensed)

3) Download all dependencies for Spark NLP OCR

4) Download all models/embeddings for offline usage

5) Example of NER


## 1) Download all dependencies for Spark NLP

In [None]:
import json

# Read the workshop license bundle from the JSON file in the working directory.
with open('workshop_license_keys_365.json') as keys_file:
    license_keys = json.load(keys_file)

# Display only the key names so that no secret values end up in the cell output.
license_keys.keys()


dict_keys(['PUBLIC_VERSION', 'JSL_VERSION', 'SECRET', 'SPARK_NLP_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET'])

In [None]:

# `os` must be imported here: on a fresh kernel this cell runs before any other cell imports it.
import os

# Export the license and AWS credentials so that child processes started from
# this notebook (shell commands, the Spark JVM) can read them from the environment.
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']
# The OCR license is stored under SPARK_OCR_LICENSE in the key file but exported as JSL_OCR_LICENSE.
os.environ['JSL_OCR_LICENSE'] = license_keys['SPARK_OCR_LICENSE']

# Versions of the public and the licensed (JSL) Spark NLP libraries used to build file names later.
version = license_keys['PUBLIC_VERSION']
jsl_version = license_keys['JSL_VERSION']


In [None]:
# Refresh the package index, then install a headless Java 8 JDK.
# Output of the install step is discarded to keep the notebook readable.
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
!java -version

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)


In [None]:
# Install PySpark pinned to 2.4.4; --ignore-installed reinstalls it even if another version is already present.
!pip install --ignore-installed -q pyspark==2.4.4

[K     |████████████████████████████████| 215.7MB 65kB/s 
[K     |████████████████████████████████| 204kB 44.9MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
# List the installed packages whose name contains "spark" to confirm the PySpark install.
!pip list | grep spark

pyspark                       2.4.4          


In [None]:
!sudo apt install awscli

In [None]:
# spark-nlp jar
!wget -q https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/spark-nlp-assembly-2.7.3.jar

# spark-nlp wheel
!wget -q https://github.com/JohnSnowLabs/spark-nlp/archive/2.7.3.tar.gz

In [None]:
# Unpack the Spark NLP source archive into the current directory (verbose file listing).
!tar -xvf spark-nlp-2.7.3.tar.gz

In [None]:
# Install the Spark NLP Python package from the unpacked local directory (no network index lookup for the package itself).
!pip install -q spark-nlp-2.7.3/ 

  Building wheel for spark-nlp (setup.py) ... [?25l[?25hdone


## 2) Download all dependencies for Spark NLP (enterprise/licensed)

In [None]:
# Here you need to enter your AWS access key ID and AWS secret access key.
# As the default region name enter "us-east-2" (the Ohio region; the same value the s3 commands below pass via --region).
# As the default output format enter "json" (or simply press Enter to keep the default).
!aws configure

In [None]:
# Secret token that forms part of the S3 path of the licensed artifacts.
jsl_secret = license_keys['SECRET']

# File-name suffixes of the licensed jar and of the Python source tarball for this JSL version.
jsl_jar, jsl_tar = (jsl_version + suffix for suffix in ('.jar', '.tar.gz'))

In [None]:
# spark nlp JSL wheel
!sudo aws s3 cp --region us-east-2 s3://pypi.johnsnowlabs.com/$jsl_secret/spark-nlp-jsl-$jsl_jar spark-nlp-jsl-$jsl_jar
!sudo aws s3 cp --region us-east-2 s3://pypi.johnsnowlabs.com/$secret/spark-nlp-jsl/spark-nlp-jsl-$jsl_tar spark-nlp-jsl-$jsl_tar

In [None]:
# Unpack the licensed Spark NLP tarball; $jsl_tar is substituted from the Python variable of the same name.
!tar -xvf spark-nlp-jsl-$jsl_tar

In [None]:
# Install the licensed Python package from the unpacked directory.
# NOTE(review): the absolute /content prefix assumes the notebook's working directory is /content — confirm for your environment.
!pip install -q /content/spark-nlp-jsl-$jsl_version/ 

In [None]:
# List installed Spark-related packages to confirm spark-nlp and spark-nlp-jsl are now present.
!pip list | grep spark

pyspark                       2.4.4          
spark-nlp                     2.6.0          
spark-nlp-jsl                 2.6.0          


## 3) Download all dependencies for Spark NLP OCR

In [None]:
# Secret token used in the Spark OCR download URLs.
ocr_secret = license_keys['SPARK_OCR_SECRET']

# The OCR version is taken as the part of the secret before the first '-'.
ocr_version = ocr_secret.split('-')[0]

# File-name suffixes of the OCR jar and Python tarball.
# These two assignments were fused onto a single line, which is a SyntaxError.
ocr_jar = ocr_version+'.spark24.jar'
ocr_tar = ocr_version+'.spark24.tar.gz'

In [None]:
!wget -q https://pypi.johnsnowlabs.com/$ocr_secret/jars/spark-ocr-assembly-$ocr_jar!wget -q https://pypi.johnsnowlabs.com/$ocr_secret/spark-ocr/spark-ocr-$ocr_tar

In [None]:
# Unpack the Spark OCR Python tarball.
# NOTE(review): the absolute /content prefix assumes the download landed in /content — confirm for your environment.
!tar -xvf /content/spark-ocr-$ocr_tar

In [None]:
# Install the Spark OCR Python package from the unpacked directory.
!pip install -q /content/spark-ocr-$ocr_version/

In [None]:
# Sanity check: pyspark, spark-nlp, spark-nlp-jsl and spark-ocr should all be listed.
!pip list | grep spark

pyspark                       2.4.4          
spark-nlp                     2.6.0          
spark-nlp-jsl                 2.6.0          
spark-ocr                     1.5.0          


## Installation completed. Let's download models using AWS keys

## 4) Download all models/embeddings for offline usage

In [None]:
# This code will download >100 GB of Spark NLP models to your local disk
# !sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive 

In [None]:
# This code will likewise download >100 GB, this time of clinical models and embeddings
# !sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/clinical/models/ clinical_models/ --recursive 

In [None]:
# For example purposes, download only a subset of the public models: here the NER models.
# --exclude "*" drops everything first, then --include "ner_dl*" re-adds only the matching files.
!sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive --exclude "*" --include "ner_dl*"

download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_base_cased_en_2.6.0_2.4_1599550960441.zip to public_models/ner_dl_bert_base_cased_en_2.6.0_2.4_1599550960441.zip
download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_base_cased_en_2.4.0_2.4_1583223672963.zip to public_models/ner_dl_bert_base_cased_en_2.4.0_2.4_1583223672963.zip
download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip to public_models/ner_dl_bert_en_2.4.3_2.4_1584624951079.zip
download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.4.0_2.4_1583223672963.zip to public_models/ner_dl_bert_en_2.4.0_2.4_1583223672963.zip
download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_contrib_en_2.0.2_2.4_1556650375261.zip to public_models/ner_dl_bert_contrib_en_2.0.2_2.4_1556650375261.zip
download: s3://auxdata.johnsnowlabs.com/public/models/ner_dl_bert_en_2.2.0_2.4_1567854461249.zip to public_models/ner_dl_bert_en_2.2.0_2.4_1567854461249.zi

In [None]:
# Same filtered copy as for the NER models, this time keeping only files that match "glove*".
!sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/models/ public_models/ --recursive --exclude "*" --include "glove*"

download: s3://auxdata.johnsnowlabs.com/public/models/glove_6B_100_xx_2.4.0_2.4_1579690037117.zip to public_models/glove_6B_100_xx_2.4.0_2.4_1579690037117.zip
download: s3://auxdata.johnsnowlabs.com/public/models/glove_100d_en_2.0.0_2.4_1553028251278.zip to public_models/glove_100d_en_2.0.0_2.4_1553028251278.zip
download: s3://auxdata.johnsnowlabs.com/public/models/glove_100d_en_2.0.2_2.4_1556534397055.zip to public_models/glove_100d_en_2.0.2_2.4_1556534397055.zip
download: s3://auxdata.johnsnowlabs.com/public/models/glove_100d_en_2.4.0_2.4_1579690104032.zip to public_models/glove_100d_en_2.4.0_2.4_1579690104032.zip
download: s3://auxdata.johnsnowlabs.com/public/models/glove_6B_300_xx_2.4.0_2.4_1579698630432.zip to public_models/glove_6B_300_xx_2.4.0_2.4_1579698630432.zip
download: s3://auxdata.johnsnowlabs.com/public/models/glove_6B_300_xx_2.0.2_2.4_1559059806004.zip to public_models/glove_6B_300_xx_2.0.2_2.4_1559059806004.zip
download: s3://auxdata.johnsnowlabs.com/public/models/glov

In [None]:
# !sudo aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/clinical/models/ clinical_models/ --recursive --exclude "*" --include "embeddings_clinical*"

## 5) Example on NER

In [None]:
# Unpack one downloaded NER model archive into ner_dl_glove/ so it can be loaded from local disk.
!unzip -q /content/public_models/ner_dl_en_2.4.3_2.4_1584624950746.zip -d ner_dl_glove/

In [None]:
# Unpack the glove_100d embeddings archive into glove_embeddings/ so it can be loaded from local disk.
!unzip -q /content/public_models/glove_100d_en_2.4.0_2.4_1579690104032.zip -d glove_embeddings/

In [None]:
# Local directories (relative to the working directory) holding the unzipped NER model and embeddings.
ner_local_path, embeddings_local_path = 'ner_dl_glove', 'glove_embeddings'

In [None]:
# Build the comma-separated jar list handed to Spark: the public assembly jar first, then the licensed jar.
spark_nlp_jar_path = "/content/spark-nlp-assembly-" + version + ".jar"
spark_nlp_internal = "/content/spark-nlp-jsl-" + jsl_jar
spark_nlp_jar_path = ",".join((spark_nlp_jar_path, spark_nlp_internal))

In [None]:
import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

In [None]:
def start():
    """Create (or reuse) a local SparkSession configured with the Spark NLP jars.

    Returns:
        The session returned by ``getOrCreate()`` on a builder that runs on
        ``local[*]`` with Kryo serialization and the jars listed in
        ``spark_nlp_jar_path``.
    """
    settings = (
        ("spark.driver.memory", "10G"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "2000M"),
        ("spark.jars", spark_nlp_jar_path),
    )

    builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[*]")
    # Apply the options in the same order as they are listed above.
    for option, value in settings:
        builder = builder.config(option, value)
    return builder.getOrCreate()

spark = start()

In [None]:
# Turn the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Word embeddings loaded from the local (offline) directory.
# ner_dl model is trained with glove_100d, so the same embeddings are used in the pipeline.
glove_embeddings = (
    WordEmbeddingsModel.load(embeddings_local_path)
    .setInputCols(["document", 'token'])
    .setOutputCol("embeddings")
)

# Public NER model loaded from the local (offline) directory.
# NOTE(review): an earlier comment described this as trained on i2b2 (sampled from MIMIC), but the
# sample output in this notebook shows PER/LOC tags — confirm the training corpus.
public_ner = (
    NerDLModel.load(ner_local_path)
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

pipeline_stages = [documentAssembler, tokenizer, glove_embeddings, public_ner]
nlpPipeline = Pipeline(stages=pipeline_stages)

# All stages are either pretrained or need no training, so fitting on an empty frame is enough.
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [None]:
# Single-row DataFrame holding the sample sentence in a "text" column.
sample_rows = [['Peter Parker lives in New York.']]
df = spark.createDataFrame(sample_rows).toDF("text")

# Run the fitted pipeline and show the tokens next to their predicted NER tags.
result = pipelineModel.transform(df)
result.select('token.result', 'ner.result').show(truncate=False)

+----------------------------------------+-------------------------------------+
|result                                  |result                               |
+----------------------------------------+-------------------------------------+
|[Peter, Parker, lives, in, New, York, .]|[B-PER, I-PER, O, O, B-LOC, I-LOC, O]|
+----------------------------------------+-------------------------------------+



In [None]:
# Wrap the fitted pipeline in a LightPipeline to annotate plain Python strings directly.
light_model = LightPipeline(pipelineModel)

text = 'Peter Parker lives in New York.'
light_result = light_model.annotate(text)

# Pair every token with its predicted NER tag.
[(token, tag) for token, tag in zip(light_result['token'], light_result['ner'])]

[('Peter', 'B-PER'),
 ('Parker', 'I-PER'),
 ('lives', 'O'),
 ('in', 'O'),
 ('New', 'B-LOC'),
 ('York', 'I-LOC'),
 ('.', 'O')]