![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Annotation_Lab/Annotation_Lab_preannotate_upload.ipynb)


# Pre-annotating and Uploading Tasks to the Annotation Lab

## This tutorial provides instructions and code for the following operations:
1. Creating Pre-annotations for the Annotation Lab
2. Uploading Pre-annotations to the Annotation Lab
3. Uploading Tasks Without Pre-annotations to the Annotation Lab

In [1]:
import json
import os

from google.colab import files

# Ask for a file through the Colab upload widget (the license JSON is expected here).
license_keys = files.upload()

# Open the first uploaded entry by its key and replace `license_keys`
# with the parsed JSON content of that file.
with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
# (at the top level of a cell, locals() is the notebook namespace itself).
locals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

Saving jsl_keys.json to jsl_keys.json


In [2]:
# Installing pyspark and spark-nlp
# NOTE(review): $PUBLIC_VERSION is presumably one of the license values loaded in the previous cell - confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# NOTE(review): $JSL_VERSION and $SECRET are presumably license values as well - confirm.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# (no version is pinned for this package)
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 60 kB/s 
[K     |████████████████████████████████| 616 kB 7.8 MB/s 
[K     |████████████████████████████████| 198 kB 47.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 206 kB 413 kB/s 
[K     |████████████████████████████████| 95 kB 2.8 MB/s 
[K     |████████████████████████████████| 66 kB 4.7 MB/s 
[K     |████████████████████████████████| 1.6 MB 59.0 MB/s 
[?25h

In [3]:
import pandas as pd
import requests
import json
from zipfile import ZipFile
from io import BytesIO
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

# Spark configuration values passed to the session start call below.
params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

# Report the installed library versions.
print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the Spark session with the SECRET entry of the license keys and the params above.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Last expression of the cell: displays the session object.
spark

Spark NLP Version : 4.1.0
Spark NLP_JSL Version : 4.1.0


## 1. Creating Pre-annotations for the Annotation Lab

Please make sure to adapt this pre-annotation pipeline so that it is consistent with your Annotation Lab project configuration.

In [4]:
# Downloading sample datasets: fetches mt_samples.csv from the spark-nlp-workshop repository (quiet mode).
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv

In [8]:
# Read the downloaded CSV and keep only its first 20 rows.
sample_data = pd.read_csv('mt_samples.csv').head(20)
print (sample_data.shape)
# Last expression of the cell: preview of the first rows.
sample_data.head()

(20, 1)


Unnamed: 0,text
0,Sample Type / Medical Specialty:\nHematology -...
1,Sample Type / Medical Specialty:\nHematology -...
2,Sample Type / Medical Specialty:\nHematology -...
3,Sample Type / Medical Specialty:\nHematology -...
4,Sample Type / Medical Specialty:\nHematology -...


In [6]:
# pre-annotation pipeline
# Every pretrained model is loaded through the class-level `pretrained()`
# loader, so no throwaway annotator instance is constructed first.

# Wrap the raw "text" column into documents.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Sentence splitting, with '\n' registered as a custom bound.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')\
    .setCustomBounds(['\n'])

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# Embeddings consumed by the NER, assertion and relation models below.
word_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
    .setInputCols(["sentence", 'token'])\
    .setOutputCol("embeddings")

ner_model = MedicalNerModel.pretrained('ner_jsl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

# Turn the token-level NER output into chunks.
converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")

assertion_model = AssertionDLModel.pretrained('assertion_dl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "ner_chunk", 'embeddings'])\
    .setOutputCol("assertion_res")

# POS tags and dependencies are inputs of both relation extraction models.
pos_tagger = PerceptronModel.pretrained("pos_clinical", "en", "clinical/models")\
    .setInputCols(["sentence", "token"])\
    .setOutputCol("pos_tags")

dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "en")\
    .setInputCols(["sentence", "pos_tags", "token"])\
    .setOutputCol("dependencies")

# First relation model: restricted to the entity pairs listed below.
relation_clinical = RelationExtractionModel.pretrained('re_clinical', 'en', 'clinical/models')\
    .setInputCols(["embeddings", "pos_tags", "ner_chunk", "dependencies"])\
    .setOutputCol("relations_clinical")\
    .setRelationPairs(['procedure-disease_syndrome_disorder', 'test-oncological', 'test-disease_syndrome_disorder',
                       'external_body_part_or_region-procedure', 'oncological-external_body_part_or_region',
                       'oncological-procedure'])\
    .setMaxSyntacticDistance(0)

# Second relation model: posology relations between drug-related entities.
relation_pos = RelationExtractionModel.pretrained('posology_re', 'en', 'clinical/models')\
    .setInputCols(["embeddings", "pos_tags", "ner_chunk", "dependencies"])\
    .setOutputCol("relations_pos")\
    .setRelationPairs(['drug_ingredient-drug_brandname', 'drug_ingredient-dosage', 'drug_ingredient-strength', 'drug_ingredient-route'])\
    .setMaxSyntacticDistance(0)

ner_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        tokenizer,
        word_embeddings,
        ner_model,
        converter,
        assertion_model,
        pos_tagger,
        dependency_parser,
        relation_clinical,
        relation_pos
    ])

# Fit on a one-row frame holding an empty string to obtain a PipelineModel,
# then wrap it in a LightPipeline.
empty_data = spark.createDataFrame([['']]).toDF("text")
pipeline_model = ner_pipeline.fit(empty_data)
lmodel = LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
[OK!]
assertion_dl download started this may take some time.
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.5 MB
[OK!]
dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]
re_clinical download started this may take some time.
Approximate size to download 6 MB
[OK!]


Visualize on a sample doc

In [9]:
# Fully annotate a single sample (position 11 of the text column) with the light pipeline.
temp_res = lmodel.fullAnnotate(sample_data['text'].values[11])
from sparknlp_display import *
# Display the NER chunks of that sample together with their assertion results.
AssertionVisualizer().display(temp_res[0], 'ner_chunk', 'assertion_res')

In [10]:
# Display the 'relations_clinical' output for the same annotated sample.
RelationExtractionVisualizer().display(temp_res[0], 'relations_clinical')

In [11]:
# Display the 'relations_pos' output for the same annotated sample.
RelationExtractionVisualizer().display(temp_res[0], 'relations_pos')

Get Results from pipeline

In [13]:
# Alternative using the light pipeline (left disabled):
# results = lmodel.fullAnnotate(sample_data['text'])

# Full pipeline: turn the pandas frame into a Spark DataFrame, run the fitted
# pipeline on it, and collect every resulting row.
results = pipeline_model.transform(spark.createDataFrame(sample_data)).collect()


In [18]:
# Fixed import: the original line repeated the `from` keyword, which is a
# SyntaxError and stopped the whole cell from running.
from sparknlp_jsl.alab import get_preannotations


# Build the pre-annotation payload from the collected pipeline results.
annotation_json = get_preannotations(all_results = results, # pipeline results
                                     document_column = 'document', # document column from pipeline - to get original string
                                     ner_columns = ['ner_chunk'], # can define multiple ner column names - even if entities overlap.
                                     assertion_columns = ['assertion_res'], # can define multiple assertion column names.
                                     relations_columns = ['relations_clinical', 'relations_pos'], # can define multiple relation models column names - as shown in this example.
                                     user_name = 'demo_model', # define custom name for your model / pipeline.
                                     titles_list = [], # if want to assign custom titles to tasks in alab. Note: if this list is defined, it has to be of equal length as number of tasks.
                                     id_offset = 0 # if you already have existing documents in the alab project, you can change this id offset to avoid overwriting. This will increment task ids by this number.
                                     )

# Number of entries in the generated payload.
print (len(annotation_json))


20


Write as json to disk

In [19]:
# Serialize the pre-annotations straight into the file, indented by 4 spaces.
with open('pre_annotations.json', 'w') as f_:
    json.dump(annotation_json, f_, indent=4)

## 2. Uploading Pre-annotations to the Annotation Lab

Start by providing your user credentials and making sure the `get_cookies` function prints a `200` response code. The base URL for this demo is https://annotationlab.johnsnowlabs.com - make sure to change it to match your own Annotation Lab instance. The code blocks below are based on the Annotation Lab [API documentation](https://nlp.johnsnowlabs.com/docs/en/alab/api).

In [None]:
# provide user credentials
# These are placeholder values: replace them with your own before running,
# and avoid saving real credentials together with the notebook.
username = 'user'
password = 'pass'
client_secret = "secret"

# helper function to get cookies
def get_cookies(username, password, secret=None,
                base_url="https://annotationlab.johnsnowlabs.com", timeout=60):
    """Log in to the Annotation Lab and build the cookies for later API calls.

    Args:
        username: Annotation Lab user name.
        password: Password of that user.
        secret: Client secret sent with the login request. When omitted, the
            notebook-level ``client_secret`` variable is used.
        base_url: Root URL of the Annotation Lab instance to log in to.
        timeout: Seconds to wait for the server before the request fails.

    Returns:
        A dict with ``access_token`` (prefixed with ``"Bearer "``) and
        ``refresh_token``.

    Raises:
        requests.HTTPError: if the token endpoint answers with an error status.
    """
    if secret is None:
        # Fall back to the global defined next to the user credentials.
        secret = client_secret

    url = "{}/openid-connect/token".format(base_url.rstrip("/"))

    headers = {
        "Content-Type": "application/json",
        "accept": "*/*",
    }

    data = {
      "username": username,
      "password": password,
      "client_id": "annotator",
      "client_secret": secret
    }

    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    print (resp.status_code)
    # Stop with the HTTP error itself rather than with a KeyError on the
    # missing 'access_token' a few lines below.
    resp.raise_for_status()
    auth_info = resp.json()

    cookies = {
        'access_token': f"Bearer {auth_info['access_token']}",
        'refresh_token': auth_info['refresh_token']
    }
    return cookies

cookies = get_cookies(username, password)
# NOTE: displaying `cookies` writes the access and refresh tokens into the
# cell output - clear that output before sharing the notebook.
cookies

Insert the name of your project, making sure it is the official project name used by the Annotation Lab, i.e. the one that appears in the project's URL.

In [22]:
# Import endpoint of the target project.
project_name = 'dummy'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print(url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again right before the upload.
cookies = get_cookies(username, password)

# Send the pre-annotations as the JSON body of the import request.
resp = requests.post(url, headers=headers, cookies=cookies, json=annotation_json)

# A bare `resp.status_code` in the middle of a cell displays nothing,
# so the upload status is printed explicitly.
print (resp.status_code)
print (resp.text)

https://annotationlab.johnsnowlabs.com/api/projects/dummy/import
200



## 3. Uploading Tasks Without Pre-annotations to the Annotation Lab

Start by providing your user credentials and making sure the `get_cookies` function prints a `200` response code. The base URL for this demo is https://annotationlab.johnsnowlabs.com - make sure to change it to match your own Annotation Lab instance. The code blocks below are based on the Annotation Lab [API documentation](https://nlp.johnsnowlabs.com/docs/en/alab/api).

In [None]:
# provide user credentials
# These are placeholder values: replace them with your own before running,
# and avoid saving real credentials together with the notebook.
username = 'user'
password = 'pass'
client_secret = "secret"

# helper function to get cookies
def get_cookies(username, password, secret=None,
                base_url="https://annotationlab.johnsnowlabs.com", timeout=60):
    """Log in to the Annotation Lab and build the cookies for later API calls.

    Args:
        username: Annotation Lab user name.
        password: Password of that user.
        secret: Client secret sent with the login request. When omitted, the
            notebook-level ``client_secret`` variable is used.
        base_url: Root URL of the Annotation Lab instance to log in to.
        timeout: Seconds to wait for the server before the request fails.

    Returns:
        A dict with ``access_token`` (prefixed with ``"Bearer "``) and
        ``refresh_token``.

    Raises:
        requests.HTTPError: if the token endpoint answers with an error status.
    """
    if secret is None:
        # Fall back to the global defined next to the user credentials.
        secret = client_secret

    url = "{}/openid-connect/token".format(base_url.rstrip("/"))

    headers = {
        "Content-Type": "application/json",
        "accept": "*/*",
    }

    data = {
      "username": username,
      "password": password,
      "client_id": "annotator",
      "client_secret": secret
    }

    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    print (resp.status_code)
    # Stop with the HTTP error itself rather than with a KeyError on the
    # missing 'access_token' a few lines below.
    resp.raise_for_status()
    auth_info = resp.json()

    cookies = {
        'access_token': f"Bearer {auth_info['access_token']}",
        'refresh_token': auth_info['refresh_token']
    }
    return cookies

cookies = get_cookies(username, password)
# NOTE: displaying `cookies` writes the access and refresh tokens into the
# cell output - clear that output before sharing the notebook.
cookies

For the purposes of this tutorial, we will be uploading the text samples from this csv.

In [None]:
# Downloading sample datasets: fetches mt_samples.csv from the spark-nlp-workshop repository (quiet mode).
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv

In [None]:
# Read the whole downloaded CSV (no row limit in this section).
sample_data = pd.read_csv('mt_samples.csv')
print (sample_data.shape)
# Last expression of the cell: preview of the first rows.
sample_data.head()

(50, 1)


Unnamed: 0,text
0,Sample Type / Medical Specialty:\nHematology -...
1,Sample Type / Medical Specialty:\nHematology -...
2,Sample Type / Medical Specialty:\nHematology -...
3,Sample Type / Medical Specialty:\nHematology -...
4,Sample Type / Medical Specialty:\nHematology -...


In [None]:
def create_sample_data(text_list):
    """Build one task dictionary per text.

    Each entry uses the position of the text in ``text_list`` as its
    ``title`` and the text itself as its ``text``; the entries are returned
    as a list in the same order as the input.
    """
    return [
        {'title': position, 'text': content}
        for position, content in enumerate(text_list)
    ]

# Build the upload payload from the values of the 'text' column.
sample_data_for_upload = create_sample_data(sample_data['text'].values)

Insert the name of your project, making sure it is the official project name used by the Annotation Lab, i.e. the one that appears in the project's URL.

In [None]:
# Import endpoint of the target project.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print(url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again right before the upload.
cookies = get_cookies(username, password)

# Send the plain tasks (no pre-annotations) as the JSON body of the import request.
resp = requests.post(url, headers=headers, cookies=cookies, json=sample_data_for_upload)

# A bare `resp.status_code` in the middle of a cell displays nothing,
# so the upload status is printed explicitly.
print (resp.status_code)
# Last expression of the cell: shows the response body.
resp.text