![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/platforms/airgap/Airgaped.ipynb)

# Installing Spark NLP in offline mode




Spark-nlp installation Doc : https://nlp.johnsnowlabs.com/docs/en/install#offline

Medium Airgapped https://medium.com/spark-nlp/installing-spark-nlp-and-spark-ocr-in-air-gapped-networks-offline-mode-f42a1ee6b7a8


installation platform : https://github.com/JohnSnowLabs/spark-nlp-workshop/tree/master/platforms

sparknlp from pypi : https://pypi.org/project/spark-nlp/3.4.2/#files

CPUvsGPUbenchmark: https://nlp.johnsnowlabs.com/docs/en/CPUvsGPUbenchmark

## install pyspark v3.1.2

In [None]:
# Installing pyspark
!pip install -q pyspark==3.1.2

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

## license key

In [None]:
import json
import os

LICENSE_FILE = 'spark_jsl.json'

# Ask for an upload only when the license file is not already in the working directory.
if LICENSE_FILE not in os.listdir():
    # Imported lazily so the cell also runs outside Colab when the file is already present.
    from google.colab import files

    uploaded = files.upload()
    if not uploaded:
        raise FileNotFoundError(
            f"No license file was uploaded; place {LICENSE_FILE} in the working directory and re-run this cell."
        )
    # Store the upload under the fixed name so later runs skip the upload prompt.
    os.rename(list(uploaded.keys())[0], LICENSE_FILE)

with open(LICENSE_FILE) as f:
    license_keys = json.load(f)

# Expose every license entry as a notebook-level variable (e.g. JSL_VERSION, PUBLIC_VERSION);
# later cells read these names directly.
globals().update(license_keys)
# Also export them as environment variables for the processes started from this notebook.
os.environ.update(license_keys)

In [3]:
license_keys.keys()

dict_keys(['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SPARK_NLP_LICENSE', 'SECRET', 'JSL_VERSION', 'PUBLIC_VERSION', 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET', 'OCR_VERSION'])

## download Spark NLP jars from S3

In [None]:
!pip install -q awscli

In [None]:
# The three copies below rely on the values loaded from the license file:
# $PUBLIC_VERSION, $JSL_VERSION and $SECRET are substituted into the commands, and the
# AWS CLI presumably picks up AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the
# environment variables exported in the license-key cell (confirm if the copy is denied).

# Public Spark NLP assembly jar, saved locally as spark-nlp-<PUBLIC_VERSION>.jar
!aws s3 cp --region us-east-2 s3://auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-$PUBLIC_VERSION.jar ./spark-nlp-$PUBLIC_VERSION.jar

# Healthcare (licensed) jar, stored under the license SECRET prefix
!aws s3 cp --region us-east-2 s3://pypi.johnsnowlabs.com/$SECRET/spark-nlp-jsl-$JSL_VERSION.jar ./spark-nlp-jsl-$JSL_VERSION.jar

# Healthcare (licensed) Python wheel
!aws s3 cp --region us-east-2 s3://pypi.johnsnowlabs.com/$SECRET/spark-nlp-jsl/spark_nlp_jsl-$JSL_VERSION-py3-none-any.whl ./spark_nlp_jsl-$JSL_VERSION-py3-none-any.whl

In [None]:
# public whl from pypi  https://pypi.org/project/spark-nlp/#files 
# get the whl download link 
!wget https://files.pythonhosted.org/packages/65/19/c439d42f7afd75d6c9c20207db8ee0c95d7c82177b759303c7601120e91a/spark_nlp-4.4.1-py2.py3-none-any.whl

## install 

In [None]:
! pip install ./spark_nlp-$PUBLIC_VERSION-py2.py3-none-any.whl
! pip install ./spark_nlp_jsl-$JSL_VERSION-py3-none-any.whl

## session start

In [8]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from sparknlp.base import LightPipeline

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

In [9]:
from pyspark.sql import SparkSession

def start():
    """Create (or reuse) a local Spark session that loads the downloaded Spark NLP jars.

    The jar file names are built from the notebook-level JSL_VERSION and
    PUBLIC_VERSION variables, so both must be defined before this is called.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    # Both jars are expected in the current working directory.
    local_jars = f"./spark-nlp-jsl-{JSL_VERSION}.jar,./spark-nlp-{PUBLIC_VERSION}.jar"

    settings = {
        "spark.driver.memory": "16G",
        "spark.driver.maxResultSize": "2000M",
        "spark.kryoserializer.buffer.max": "2000M",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.jars": local_jars,
    }

    builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[*]")
    for option, value in settings.items():
        builder = builder.config(option, value)

    return builder.getOrCreate()

In [10]:
#  SECRET is in your Licence key

spark = start()

spark

## online mode pipeline
USING THE RESOURCE DOWNLOADER   `.pretrained()`

In [11]:
# Annotator that transforms a text column from dataframe into an Annotation ready for NLP
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
        
sentenceDetector = SentenceDetector()\
    .setInputCols(["document"])\
    .setOutputCol("sentence")
 
# Tokenizer splits words in a relevant format for NLP
tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Clinical word embeddings trained on PubMED dataset
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")\
    .setInputCols(["sentence","token"])\
    .setOutputCol("embeddings")

# NER model trained on i2b2 (sampled from MIMIC) dataset
clinical_ner = MedicalNerModel.pretrained("ner_clinical_large","en","clinical/models")\
    .setInputCols(["sentence","token","embeddings"])\
    .setOutputCol("ner")\
    .setLabelCasing("upper") #decide if we want to return the tags in upper or lower case 

ner_converter = NerConverter()\
    .setInputCols(["sentence","token","ner"])\
    .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical_large download started this may take some time.
[OK!]


In [12]:
# fullAnnotate in LightPipeline

text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , and associated with an acute hepatitis , presented with a one-week history of polyuria , poor appetite , and vomiting . 
She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl ,  creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , and venous pH 7.27 . 
'''

print (text)

import pandas as pd

# Annotate the sample text with the fitted pipeline
light_model = LightPipeline(model)
light_result = light_model.fullAnnotate(text)

# Chunk annotations of the first (only) annotated document
ner_chunks = light_result[0]['ner_chunk']

# One list per output column, all in chunk order
chunks = [chunk.result for chunk in ner_chunks]
entities = [chunk.metadata['entity'] for chunk in ner_chunks]
sentence = [chunk.metadata['sentence'] for chunk in ner_chunks]
begin = [chunk.begin for chunk in ner_chunks]
end = [chunk.end for chunk in ner_chunks]
confidence = [chunk.metadata['confidence'] for chunk in ner_chunks]

df_clinical = pd.DataFrame({
    'chunks': chunks,
    'begin': begin,
    'end': end,
    'sentence_id': sentence,
    'entities': entities,
    'confidence': confidence,
})

df_clinical.head(20)


A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , and associated with an acute hepatitis , presented with a one-week history of polyuria , poor appetite , and vomiting . 
She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl ,  creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , and venous pH 7.27 . 



Unnamed: 0,chunks,begin,end,sentence_id,entities,confidence
0,gestational diabetes mellitus,40,68,0,PROBLEM,0.91976666
1,subsequent type two diabetes mellitus,118,154,0,PROBLEM,0.75924003
2,T2DM,158,161,0,PROBLEM,0.9917
3,HTG-induced pancreatitis,187,210,0,PROBLEM,0.97535
4,an acute hepatitis,268,285,0,PROBLEM,0.9440667
5,polyuria,326,333,0,PROBLEM,0.9728
6,poor appetite,337,349,0,PROBLEM,0.9934
7,vomiting,357,364,0,PROBLEM,0.9854
8,metformin,380,388,1,TREATMENT,0.9998
9,glipizide,392,400,1,TREATMENT,0.9999


In [13]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

visualiser.display(light_result[0], label_col='ner_chunk', document_col='document', save_path="display_result.html")

## offline mode pipeline

MANUALLY DOWNLOADING  `.load()`

### using boto3 for download 

In [14]:
! pip install -q boto3

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
license_keys.keys()

dict_keys(['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'SPARK_NLP_LICENSE', 'SECRET', 'JSL_VERSION', 'PUBLIC_VERSION', 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET', 'OCR_VERSION'])

In [16]:
import shutil
import boto3

# AWS credentials come from the license-key variables defined earlier in the notebook
ACCESS_KEY = AWS_ACCESS_KEY_ID
SECRET_KEY = AWS_SECRET_ACCESS_KEY

# Open an S3 resource with those credentials and select the bucket that holds the models
s3 = boto3.resource(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
buck_auxdata = s3.Bucket('auxdata.johnsnowlabs.com')

In [17]:
!mkdir -p ./zip_files ./models

**Download the embedding model**

In [18]:
# Archive name of the clinical embeddings model inside the bucket
embeddings_zip = 'embeddings_clinical_en_2.4.0_2.4_1580237286004.zip'

# Download the archive into zip_files/
buck_auxdata.download_file(f'clinical/models/{embeddings_zip}', f'zip_files/{embeddings_zip}')

# Extract it into the folder the offline pipeline loads from
shutil.unpack_archive(f'zip_files/{embeddings_zip}', 'models/embeddings_clinical', 'zip')

**Download the ner_clinical_large model**

In [19]:
# Archive name of the ner_clinical_large model inside the bucket
ner_zip = 'ner_clinical_large_en_3.0.0_3.0_1617206114650.zip'

# Download the archive into zip_files/
buck_auxdata.download_file(f'clinical/models/{ner_zip}', f'zip_files/{ner_zip}')

# Extract it into the folder the offline pipeline loads from
shutil.unpack_archive(f'zip_files/{ner_zip}', 'models/ner_clinical_large', 'zip')

**ner pipeline**

In [20]:
# Entry stage: turns the raw "text" column into the "document" annotation
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence"
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset, loaded from the unzipped local folder
word_embeddings_loaded = (
    WordEmbeddingsModel.load("./models/embeddings_clinical")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on i2b2 (sampled from MIMIC) dataset, loaded from the unzipped local folder
clinical_ner_loaded = (
    MedicalNerModel.load("./models/ner_clinical_large")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
    .setLabelCasing("upper")  # return the tags in upper case
)

# Converts the "ner" tags into the "ner_chunk" column
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings_loaded,
        clinical_ner_loaded,
        ner_converter,
    ]
)

# The pipeline is fitted on a one-row frame holding an empty string
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

In [21]:
# fullAnnotate in LightPipeline

text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , and associated with an acute hepatitis , presented with a one-week history of polyuria , poor appetite , and vomiting . 
She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl ,  creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , and venous pH 7.27 . 
'''

print (text)

import pandas as pd

# Annotate the sample text with the pipeline built from the locally loaded models
light_model = LightPipeline(model)
light_result = light_model.fullAnnotate(text)

# Chunk annotations of the first (only) annotated document
ner_chunks = light_result[0]['ner_chunk']

# One list per output column, all in chunk order
chunks = [chunk.result for chunk in ner_chunks]
entities = [chunk.metadata['entity'] for chunk in ner_chunks]
sentence = [chunk.metadata['sentence'] for chunk in ner_chunks]
begin = [chunk.begin for chunk in ner_chunks]
end = [chunk.end for chunk in ner_chunks]
confidence = [chunk.metadata['confidence'] for chunk in ner_chunks]

df_clinical = pd.DataFrame({
    'chunks': chunks,
    'begin': begin,
    'end': end,
    'sentence_id': sentence,
    'entities': entities,
    'confidence': confidence,
})

df_clinical.head(20)


A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , and associated with an acute hepatitis , presented with a one-week history of polyuria , poor appetite , and vomiting . 
She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl ,  creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , and venous pH 7.27 . 



Unnamed: 0,chunks,begin,end,sentence_id,entities,confidence
0,gestational diabetes mellitus,40,68,0,PROBLEM,0.91976666
1,subsequent type two diabetes mellitus,118,154,0,PROBLEM,0.75924003
2,T2DM,158,161,0,PROBLEM,0.9917
3,HTG-induced pancreatitis,187,210,0,PROBLEM,0.97535
4,an acute hepatitis,268,285,0,PROBLEM,0.9440667
5,polyuria,326,333,0,PROBLEM,0.9728
6,poor appetite,337,349,0,PROBLEM,0.9934
7,vomiting,357,364,0,PROBLEM,0.9854
8,metformin,380,388,1,TREATMENT,0.9998
9,glipizide,392,400,1,TREATMENT,0.9999


In [22]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

visualiser.display(light_result[0], label_col='ner_chunk', document_col='document', save_path="display_result.html")



### using ResourceDownloader

In [23]:
from sparknlp.pretrained import ResourceDownloader

#The first argument is the path to the zip file and the second one is the folder.
ResourceDownloader.downloadModelDirectly("clinical/models/embeddings_clinical_en_2.4.0_2.4_1580237286004.zip", "clinical/models")  
ResourceDownloader.downloadModelDirectly("clinical/models/ner_clinical_large_en_3.0.0_3.0_1617206114650.zip", "clinical/models") 

In [24]:
import os

# ResourceDownloader saves models in the "cache_pretrained" folder, which by default sits in
# the current user's home directory. Deriving it from the home directory keeps the Colab
# result ("/root/cache_pretrained/...") while also working when the notebook is not run as root.
CACHE_PRETRAINED_DIR = os.path.join(os.path.expanduser("~"), "cache_pretrained")

# Folder names are the downloaded zip names without the ".zip" extension.
WordEmbeddingsPath = os.path.join(CACHE_PRETRAINED_DIR, "embeddings_clinical_en_2.4.0_2.4_1580237286004")
NerModelPath = os.path.join(CACHE_PRETRAINED_DIR, "ner_clinical_large_en_3.0.0_3.0_1617206114650")

In [25]:
# clinical_ner = MedicalNerModel.pretrained("ner_clinical_large","en","clinical/models")\
#     .setInputCols(["sentence","token","embeddings"])\
#     .setOutputCol("ner")

**ner pipeline**

In [26]:
# Entry stage: turns the raw "text" column into the "document" annotation
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence"
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset, loaded from WordEmbeddingsPath
word_embeddings = (
    WordEmbeddingsModel.load(WordEmbeddingsPath)
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on i2b2 (sampled from MIMIC) dataset, loaded from NerModelPath
jsl_ner = (
    MedicalNerModel.load(NerModelPath)
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
    .setLabelCasing("upper")  # return the tags in upper case
)

# Converts the "ner" tags into the "ner_chunk" column
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        jsl_ner,
        ner_converter,
    ]
)

# The pipeline is fitted on a one-row frame holding an empty string
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

In [27]:
# fullAnnotate in LightPipeline

text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , and associated with an acute hepatitis , presented with a one-week history of polyuria , poor appetite , and vomiting . 
She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl ,  creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , and venous pH 7.27 . 
'''
print (text)

import pandas as pd

# Annotate the sample text with the pipeline built from the cached models
light_model = LightPipeline(model)
light_result = light_model.fullAnnotate(text)

# Chunk annotations of the first (only) annotated document
ner_chunks = light_result[0]['ner_chunk']

# One list per output column, all in chunk order
chunks = [chunk.result for chunk in ner_chunks]
entities = [chunk.metadata['entity'] for chunk in ner_chunks]
sentence = [chunk.metadata['sentence'] for chunk in ner_chunks]
begin = [chunk.begin for chunk in ner_chunks]
end = [chunk.end for chunk in ner_chunks]
confidence = [chunk.metadata['confidence'] for chunk in ner_chunks]

df_clinical = pd.DataFrame({
    'chunks': chunks,
    'begin': begin,
    'end': end,
    'sentence_id': sentence,
    'entities': entities,
    'confidence': confidence,
})

df_clinical.head(20)


A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , and associated with an acute hepatitis , presented with a one-week history of polyuria , poor appetite , and vomiting . 
She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . 
Physical examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was benign with no tenderness , guarding , or rigidity . Pertinent laboratory findings on admission were : serum glucose 111 mg/dl ,  creatinine 0.4 mg/dL , triglycerides 508 mg/dL , total cholesterol 122 mg/dL , and venous pH 7.27 . 



Unnamed: 0,chunks,begin,end,sentence_id,entities,confidence
0,gestational diabetes mellitus,40,68,0,PROBLEM,0.91976666
1,subsequent type two diabetes mellitus,118,154,0,PROBLEM,0.75924003
2,T2DM,158,161,0,PROBLEM,0.9917
3,HTG-induced pancreatitis,187,210,0,PROBLEM,0.97535
4,an acute hepatitis,268,285,0,PROBLEM,0.9440667
5,polyuria,326,333,0,PROBLEM,0.9728
6,poor appetite,337,349,0,PROBLEM,0.9934
7,vomiting,357,364,0,PROBLEM,0.9854
8,metformin,380,388,1,TREATMENT,0.9998
9,glipizide,392,400,1,TREATMENT,0.9999


In [28]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

visualiser.display(light_result[0], label_col='ner_chunk', document_col='document', save_path="display_result.html")



## offline mode pretrained pipeline

MANUALLY DOWNLOADING  `.from_disk()`

### using boto3 for download 

In [29]:
# ! pip install -q boto3

In [30]:
import shutil
import boto3

# AWS credentials come from the license-key variables defined earlier in the notebook
ACCESS_KEY = AWS_ACCESS_KEY_ID
SECRET_KEY = AWS_SECRET_ACCESS_KEY

# Open an S3 resource with those credentials and select the bucket that holds the models
s3 = boto3.resource(
    's3',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
buck_auxdata = s3.Bucket('auxdata.johnsnowlabs.com')

In [31]:
!mkdir -p ./zip_files ./models

**Download the explain_clinical_doc_carp model**

In [32]:
# Archive name of the explain_clinical_doc_carp pretrained pipeline; full location:
# s3://auxdata.johnsnowlabs.com/clinical/models/explain_clinical_doc_carp_en_3.0.0_3.0_1617296754955.zip
carp_zip = 'explain_clinical_doc_carp_en_3.0.0_3.0_1617296754955.zip'

# Download the archive into zip_files/
buck_auxdata.download_file(f'clinical/models/{carp_zip}', f'zip_files/{carp_zip}')

# Extract it into the folder PretrainedPipeline.from_disk() reads
shutil.unpack_archive(f'zip_files/{carp_zip}', 'models/explain_clinical_doc_carp', 'zip')

**explain_clinical_doc_carp pipeline**

In [33]:
from sparknlp.pretrained import PretrainedPipeline

pipeline = PretrainedPipeline.from_disk("models/explain_clinical_doc_carp")

In [34]:
text ="""A 28-year-old female with a history of gestational diabetes mellitus, used to take metformin 1000 mg two times a day, presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting .
She was seen by the endocrinology service and discharged on 40 units of insulin glargine at night, 12 units of insulin lispro with meals.
"""

annotations = pipeline.annotate(text)

annotations.keys()

dict_keys(['sentences', 'clinical_ner_tags', 'document', 'clinical_ner_chunks', 'assertion', 'clinical_relations', 'posology_ner_tags', 'tokens', 'posology_ner_chunks', 'embeddings', 'pos_tags', 'dependencies'])

In [35]:
import pandas as pd

# Keys of `annotations` to line up token by token, in output-column order
annotation_keys = ['tokens', 'clinical_ner_tags', 'posology_ner_tags', 'pos_tags', 'dependencies']

# zip() stops at the shortest list, so every row has one value per key
rows = list(zip(*(annotations[key] for key in annotation_keys)))

df = pd.DataFrame(
    rows,
    columns=['tokens', 'clinical_ner_tags', 'posology_ner_tags', 'POS_tags', 'dependencies'],
)

df.head(20)

Unnamed: 0,tokens,clinical_ner_tags,posology_ner_tags,POS_tags,dependencies
0,A,O,O,DD,female
1,28-year-old,O,O,NN,female
2,female,O,O,NN,ROOT
3,with,O,O,II,history
4,a,O,O,DD,history
5,history,O,O,NN,female
6,of,O,O,II,history
7,gestational,B-PROBLEM,O,JJ,of
8,diabetes,I-PROBLEM,O,NN,mellitus
9,mellitus,I-PROBLEM,O,NN,gestational


In [36]:
text = 'Patient has a headache for the last 2 weeks and appears anxious when she walks fast. No alopecia noted. She denies pain'

# Annotations of the first (only) document
result = pipeline.fullAnnotate(text)[0]

# Pair the clinical chunks with the assertion annotations position by position
paired = list(zip(result['clinical_ner_chunks'], result['assertion']))

chunks = [chunk.result for chunk, _ in paired]
entities = [chunk.metadata['entity'] for chunk, _ in paired]
status = [assertion.result for _, assertion in paired]

df = pd.DataFrame({'chunks': chunks, 'entities': entities, 'assertion': status})

df

Unnamed: 0,chunks,entities,assertion
0,a headache,PROBLEM,present
1,anxious,PROBLEM,conditional
2,alopecia,PROBLEM,absent
3,pain,PROBLEM,absent


In [37]:
text = """
The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also 
given 1 unit of Metformin daily.
He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 
12 units of insulin lispro with meals , and metformin 1000 mg two times a day.
"""

# Annotations of the first (only) document
result = pipeline.fullAnnotate(text)[0]

posology_chunks = result['posology_ner_chunks']

# One list per output column, all in chunk order
chunks = [chunk.result for chunk in posology_chunks]
entities = [chunk.metadata['entity'] for chunk in posology_chunks]
begins = [chunk.begin for chunk in posology_chunks]
ends = [chunk.end for chunk in posology_chunks]

df = pd.DataFrame({'chunks': chunks, 'begin': begins, 'end': ends, 'entities': entities})

df

Unnamed: 0,chunks,begin,end,entities
0,1 unit,28,33,DOSAGE
1,Advil,38,42,DRUG
2,for 5 days,44,53,DURATION
3,1 unit,96,101,DOSAGE
4,Metformin,106,114,DRUG
5,daily,116,120,FREQUENCY
6,40 units,190,197,DOSAGE
7,insulin glargine,202,217,DRUG
8,at night,219,226,FREQUENCY
9,12 units,231,238,DOSAGE


## offline mode public models

**Download the embedding model from Model Hub and upload it to the *zip_files* folder**

https://nlp.johnsnowlabs.com/2020/01/22/glove_100d.html

https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/glove_100d_en_2.4.0_2.4_1579690104032.zip

In [None]:
# or you can use internet connection
!mkdir -p modelhub_files zip_files
!wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/glove_100d_en_2.4.0_2.4_1579690104032.zip -P ./zip_files

drag and drop

In [39]:
# Unzip
shutil.unpack_archive('zip_files/glove_100d_en_2.4.0_2.4_1579690104032.zip',
'modelhub_files/glove_100d', 'zip')

**Download the ner_dl model from Model Hub and upload it to the *zip_files* folder**

https://nlp.johnsnowlabs.com/2020/03/19/ner_dl_en.html

https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_en_2.4.3_2.4_1584624950746.zip

drag and drop or download with wget

In [None]:
# or you can use internet connection
!mkdir -p modelhub_files zip_files
!wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ner_dl_en_2.4.3_2.4_1584624950746.zip -P ./zip_files

# Unzip
shutil.unpack_archive('zip_files/ner_dl_en_2.4.3_2.4_1584624950746.zip',
'modelhub_files/ner_dl', 'zip')

pipeline

In [41]:
# Entry stage: turns the raw "text" column into the "document" annotation
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# "document" -> "token" (this pipeline has no sentence detector)
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# ner_dl model is trained with glove_100d. So we use the same embeddings in the pipeline.
# Loaded with a relative path so it matches the folder the archive was unpacked into
# ('modelhub_files/glove_100d') whatever the working directory is, not only /content.
glove_embeddings = WordEmbeddingsModel.load('./modelhub_files/glove_100d')\
    .setInputCols(["document", 'token'])\
    .setOutputCol("embeddings")

# Same relative-path reasoning as above: the archive was unpacked into 'modelhub_files/ner_dl'.
public_ner = NerDLModel.load("./modelhub_files/ner_dl")\
    .setInputCols(["document", "token", "embeddings"])\
    .setOutputCol("ner")

# Converts the "ner" tags into the "ner_chunk" column
ner_converter = NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    tokenizer,
    glove_embeddings,
    public_ner,
    ner_converter
 ])

# The pipeline is fitted on a one-row frame holding an empty string
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

In [42]:
# fullAnnotate in LightPipeline

light_model = LightPipeline(pipelineModel)

light_result = light_model.annotate('Peter Parker is a nice persn and lives in New York. Bruce Wayne is also a nice guy and lives in Gotham City.')

list(zip(light_result['token'], light_result['ner']))

[('Peter', 'B-PER'),
 ('Parker', 'I-PER'),
 ('is', 'O'),
 ('a', 'O'),
 ('nice', 'O'),
 ('persn', 'O'),
 ('and', 'O'),
 ('lives', 'O'),
 ('in', 'O'),
 ('New', 'B-LOC'),
 ('York', 'I-LOC'),
 ('.', 'O'),
 ('Bruce', 'B-PER'),
 ('Wayne', 'I-PER'),
 ('is', 'O'),
 ('also', 'O'),
 ('a', 'O'),
 ('nice', 'O'),
 ('guy', 'O'),
 ('and', 'O'),
 ('lives', 'O'),
 ('in', 'O'),
 ('Gotham', 'B-LOC'),
 ('City', 'I-LOC'),
 ('.', 'O')]

In [43]:
import pandas as pd

# Annotate the sample sentence pair with the fitted public pipeline
light_model = LightPipeline(pipelineModel)
light_result = light_model.fullAnnotate('Peter Parker is a nice persn and lives in New York. Bruce Wayne is also a nice guy and lives in Gotham City.')

# Chunk annotations of the first (only) annotated document
ner_chunks = light_result[0]['ner_chunk']

chunks = [chunk.result for chunk in ner_chunks]
entities = [chunk.metadata['entity'] for chunk in ner_chunks]

df = pd.DataFrame({'chunks': chunks, 'entities': entities})

df

Unnamed: 0,chunks,entities
0,Peter Parker,PER
1,New York,LOC
2,Bruce Wayne,PER
3,Gotham City,LOC
