![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Annotation_Lab/AnnotationLab_preannotate_upload.ipynb)


# Pre-annotating and Uploading Tasks to the Annotation Lab

## This tutorial provides instructions and code for the following operations:
1. Creating Pre-annotations for the Annotation Lab
2. Uploading Pre-annotations to the Annotation Lab
3. Uploading Tasks Without Pre-annotations to the Annotation Lab

In [None]:
import json
import os

from google.colab import files

# Open the Colab upload widget. The keys of the returned mapping are used
# below as file paths, and the first uploaded file is taken to be the
# license JSON.
license_keys = files.upload()

# Rebind `license_keys` from the upload result to the parsed JSON content.
with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
# (at the top level of a cell/module, locals() is the same dict as globals(),
# so every license key becomes a global name)
locals().update(license_keys)

# Adding license key-value pairs to environment variables
# NOTE(review): os.environ only accepts string values - presumably every
# value in the license file is a string; confirm against your license JSON.
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [3]:
import pandas as pd
import requests
import json
from zipfile import ZipFile
from io import BytesIO
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

# Spark configuration overrides handed to sparknlp_jsl.start() below.
params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

# Report the installed library versions before the session starts.
print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the licensed session using the SECRET entry of the license JSON
# loaded earlier; `spark` is used as a global by the cells that follow.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Bare expression: shows the session object when run as a notebook cell.
spark

Spark NLP Version : 3.4.4
Spark NLP_JSL Version : 3.5.3


## 1. Creating Pre-annotations for the Annotation Lab

Please make sure to modify this pre-annotation pipeline so that it is consistent with your Annotation Lab project configuration.

In [None]:
# Downloading sample datasets.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv

In [None]:
# Load the CSV fetched by the wget cell above (expected in the working
# directory), print its (rows, columns) shape and preview the first rows.
sample_data = pd.read_csv('mt_samples.csv')
print (sample_data.shape)
sample_data.head()

(50, 1)


Unnamed: 0,text
0,Sample Type / Medical Specialty:\nHematology -...
1,Sample Type / Medical Specialty:\nHematology -...
2,Sample Type / Medical Specialty:\nHematology -...
3,Sample Type / Medical Specialty:\nHematology -...
4,Sample Type / Medical Specialty:\nHematology -...


In [None]:
# pre-annotation pipeline
# Stage order matters: each stage consumes the output column(s) of the
# stages before it (text -> document -> sentence -> token -> embeddings
# -> ner -> ner_chunk -> assertion_res).
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# '\n' is registered as an additional custom sentence bound.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')\
    .setCustomBounds(['\n'])

tokenizer = Tokenizer()\
    .setInputCols(["sentence"])\
    .setOutputCol("token")

# pretrained() is called on the class (as for MedicalNerModel below) instead
# of on a throwaway instance. The original trailing backslash after the last
# setter is dropped so the statement ends where it visibly ends.
word_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
    .setInputCols(["sentence", 'token'])\
    .setOutputCol("embeddings")

ner_model = MedicalNerModel.pretrained('ner_jsl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

# Produces the "ner_chunk" column that the JSON conversion step reads.
converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")

# Produces the "assertion_res" column; drop this stage (and the matching
# column/loop downstream) if assertion labels are not needed.
assertion_model = AssertionDLModel.pretrained('assertion_dl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "ner_chunk", 'embeddings'])\
    .setOutputCol("assertion_res")

ner_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        tokenizer,
        word_embeddings,
        ner_model,
        converter,
        assertion_model
    ])

# All stages are pretrained or rule based, so the pipeline is fitted on a
# single empty "text" row just to obtain a PipelineModel.
empty_data = spark.createDataFrame([['']]).toDF("text")
pipeline_model = ner_pipeline.fit(empty_data)
lmodel = LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14.5 MB
[OK!]
assertion_dl download started this may take some time.
Approximate size to download 1.3 MB
[OK!]


In [None]:
# pre-annotate dataframe using the pipeline above
from pyspark.sql.types import *

def get_preannotations_from_NER (list_of_files, ner_prediction_model):
    """Run the pre-annotation model over raw texts and return a pandas dataframe.

    Uses the module-level ``spark`` session to build the input dataframe.

    Args:
        list_of_files: sized sequence of document texts; the position of each
            text becomes its task id.
        ner_prediction_model: object whose ``transform`` is applied to the
            Spark dataframe; its output must contain the ``ner_chunk`` and
            ``assertion_res`` columns selected below.

    Returns:
        pandas dataframe with the columns task_id (cast to int), title, text,
        ner_chunk and assertion_res.
    """
    print(len(list_of_files), " documents will be preannotated ...")

    # One (id, title, text) record per document; the title is what appears
    # on the UI.
    records = [
        (doc_idx, 'demo_mt_samples_{}'.format(doc_idx), doc_text)
        for doc_idx, doc_text in enumerate(list_of_files)
    ]

    # task_id is declared as a string column here and cast back to int after
    # the pandas conversion.
    task_schema = StructType([
        StructField("task_id", StringType(), True),
        StructField("title", StringType(), True),
        StructField("text", StringType(), True)
    ])

    input_df = spark.createDataFrame(records, task_schema)
    print("created spark dataframe ... transforming started")

    annotated_df = ner_prediction_model.transform(input_df)
    print("pandas conversion started")

    wanted_columns = [
        "task_id",
        'title',
        "text",
        "ner_chunk",      # change to the final chunk column of your pipeline
        "assertion_res",  # only needed if you want assertion annotations
    ]
    view_df = annotated_df.select(*wanted_columns).toPandas()
    view_df['task_id'] = view_df['task_id'].astype(int)

    return view_df

# Pre-annotate every text of the sample CSV with the fitted pipeline model;
# the resulting pandas dataframe feeds the JSON conversion step.
preds_df = get_preannotations_from_NER(sample_data['text'].values, pipeline_model)

In [None]:
# convert pre-annotations to JSON format for uploading to Annotation Lab
import random as rand
import datetime

def generate_hash(length=10):
    """Return a random string of ``length`` characters from 0-9, A-Z and a-z.

    Characters are drawn with the ``random`` module (imported as ``rand``),
    so the result is not suitable as a security token.

    Args:
        length: number of characters to generate (default 10).

    Returns:
        The generated string; empty when ``length`` is 0.
    """
    # Same characters, in the same order, as the ASCII code ranges
    # 48-57 (digits), 65-90 (upper case) and 97-122 (lower case).
    alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    last_index = len(alphabet) - 1

    picked = []
    for _ in range(length):
        picked.append(alphabet[rand.randint(0, last_index)])
    return "".join(picked)

def create_import_json (username, pandas_pred_df, project_id):
    """Convert a pre-annotated pandas dataframe into an import payload.

    Args:
        username: stored as ``created_username`` of every prediction.
        pandas_pred_df: dataframe with the columns task_id, title, text,
            ner_chunk and assertion_res; the annotation entries are read
            through their ``result``, ``begin``, ``end`` and ``metadata``
            attributes.
        project_id: accepted for interface compatibility; not used here.

    Returns:
        A list with one task dict per dataframe row, each holding the
        predictions, the task data (text and title) and the task id.
    """

    def build_label(chunk, start, end, label):
        # A single labelled span; every span gets its own random id.
        return {
            "from_name": "label",
            "id": generate_hash(),
            "source": "$text",
            "to_name": "text",
            "type": "labels",
            "value": {
                "end": end,
                "labels": [label],
                "start": start,
                "text": chunk
            }
        }

    tasks = []

    for _, task_row in pandas_pred_df.iterrows():

        spans = []

        # Chunk text keyed by start offset, so that an assertion result can
        # be paired with the text of the chunk it starts at.
        chunk_text_by_begin = {}
        for chunk in task_row["ner_chunk"]:  # assign proper column name
            if pd.isna(chunk):
                continue
            # end + 1: presumably turns an inclusive end offset into an
            # exclusive one - confirm against the import format.
            spans.append(
                build_label(chunk.result, chunk.begin, chunk.end + 1, chunk.metadata["entity"])
            )
            chunk_text_by_begin[chunk.begin] = chunk.result

        # comment out this loop if assertion is not required
        for assertion in task_row["assertion_res"]:
            if pd.isna(assertion):
                continue
            spans.append(
                build_label(
                    chunk_text_by_begin[assertion.begin],
                    assertion.begin,
                    assertion.end + 1,
                    assertion.result,
                )
            )

        prediction = {
            'created_username': username,
            "result": spans
        }
        task_data = {
            "text": task_row["text"],
            "title": task_row['title']
        }
        tasks.append({
            "predictions": [prediction],
            "data": task_data,
            'id': task_row['task_id']
        })

    print("Annotations payload is ready")

    return tasks

# 'ner_jsl' is recorded as the created_username of every prediction.
# 'demo_100' is passed as project_id, which create_import_json accepts but
# does not use.
annotation_json = create_import_json('ner_jsl', preds_df, 'demo_100')

# One payload entry is produced per pre-annotated document.
print('Annotated Documents:' , len(annotation_json))

Annotations payload is ready
Annotated Documents: 50


## 2. Uploading Pre-annotations to the Annotation Lab

Start by providing your user credentials and making sure the `get_cookies` function prints a `200` response code. The base URL for this demo is https://annotationlab.johnsnowlabs.com - make sure to change it to match your own deployment. The code blocks below are based on the Annotation Lab [API documentation](https://nlp.johnsnowlabs.com/docs/en/alab/api).

In [None]:
# provide user credentials
# The values below are placeholders - replace them with your own account
# details. client_secret is read by get_cookies as a module-level variable,
# so it has to be defined before get_cookies is called.
username = 'user'
password = 'pass'
client_secret = "secret"

# helper function to get cookies
def get_cookies(username, password, timeout=30):
    """Log in to the Annotation Lab and return the cookies for API calls.

    Posts the credentials to the token endpoint and prints the HTTP status
    code of the response. The module-level ``client_secret`` variable is
    sent together with the credentials.

    Args:
        username: Annotation Lab user name.
        password: password of that user.
        timeout: seconds to wait for the server before giving up. Optional;
            without a timeout ``requests`` would wait indefinitely.

    Returns:
        dict with ``access_token`` (prefixed with ``"Bearer "``) and
        ``refresh_token``, suitable for the ``cookies=`` argument of
        ``requests``.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request failed or timed out.
    """
    url = "https://annotationlab.johnsnowlabs.com/openid-connect/token"

    headers = {
        "Content-Type": "application/json",
        "accept": "*/*",
    }

    data = {
      "username": username,
      "password": password,
      "client_id": "annotator",
      "client_secret": client_secret
    }

    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    print (resp.status_code)
    # Stop on a failed login with the HTTP error itself, rather than with a
    # KeyError on the missing 'access_token' a few lines further down.
    resp.raise_for_status()
    auth_info = resp.json()

    cookies = {
        'access_token': f"Bearer {auth_info['access_token']}",
        'refresh_token': auth_info['refresh_token']
    }
    return cookies

# Log in with the credentials defined above.
cookies = get_cookies(username, password)
# Bare expression: echoes the dict when run as a notebook cell. It holds the
# bearer access token and the refresh token, so avoid sharing this output.
cookies

Insert the name of your project. It must be the exact project name used by the Annotation Lab, i.e. the name that appears in the project's URL.

In [None]:
# Upload the pre-annotated tasks to the import endpoint of the project.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print(url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again right before the upload (presumably to work with a fresh
# token).
cookies = get_cookies(username, password)

resp = requests.post(url, headers=headers, cookies=cookies, json=annotation_json)

# Print the status explicitly: a notebook cell only echoes its last
# expression, so a bare `resp.status_code` here would never be shown.
print(resp.status_code)
resp.text

## 3. Uploading Tasks Without Pre-annotations to the Annotation Lab

Start by providing your user credentials and making sure the `get_cookies` function prints a `200` response code. The base URL for this demo is https://annotationlab.johnsnowlabs.com - make sure to change it to match your own deployment. The code blocks below are based on the Annotation Lab [API documentation](https://nlp.johnsnowlabs.com/docs/en/alab/api).

In [None]:
# provide user credentials
# The values below are placeholders - replace them with your own account
# details. client_secret is read by get_cookies as a module-level variable,
# so it has to be defined before get_cookies is called.
username = 'user'
password = 'pass'
client_secret = "secret"

# helper function to get cookies
def get_cookies(username, password, timeout=30):
    """Log in to the Annotation Lab and return the cookies for API calls.

    Posts the credentials to the token endpoint and prints the HTTP status
    code of the response. The module-level ``client_secret`` variable is
    sent together with the credentials.

    Args:
        username: Annotation Lab user name.
        password: password of that user.
        timeout: seconds to wait for the server before giving up. Optional;
            without a timeout ``requests`` would wait indefinitely.

    Returns:
        dict with ``access_token`` (prefixed with ``"Bearer "``) and
        ``refresh_token``, suitable for the ``cookies=`` argument of
        ``requests``.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request failed or timed out.
    """
    url = "https://annotationlab.johnsnowlabs.com/openid-connect/token"

    headers = {
        "Content-Type": "application/json",
        "accept": "*/*",
    }

    data = {
      "username": username,
      "password": password,
      "client_id": "annotator",
      "client_secret": client_secret
    }

    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    print (resp.status_code)
    # Stop on a failed login with the HTTP error itself, rather than with a
    # KeyError on the missing 'access_token' a few lines further down.
    resp.raise_for_status()
    auth_info = resp.json()

    cookies = {
        'access_token': f"Bearer {auth_info['access_token']}",
        'refresh_token': auth_info['refresh_token']
    }
    return cookies

# Log in with the credentials defined above.
cookies = get_cookies(username, password)
# Bare expression: echoes the dict when run as a notebook cell. It holds the
# bearer access token and the refresh token, so avoid sharing this output.
cookies

For the purposes of this tutorial, we will be uploading the text samples from this csv.

In [None]:
# Downloading sample datasets.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv

In [None]:
# Load the CSV fetched by the wget cell above (expected in the working
# directory), print its (rows, columns) shape and preview the first rows.
sample_data = pd.read_csv('mt_samples.csv')
print (sample_data.shape)
sample_data.head()

(50, 1)


Unnamed: 0,text
0,Sample Type / Medical Specialty:\nHematology -...
1,Sample Type / Medical Specialty:\nHematology -...
2,Sample Type / Medical Specialty:\nHematology -...
3,Sample Type / Medical Specialty:\nHematology -...
4,Sample Type / Medical Specialty:\nHematology -...


In [None]:
def create_sample_data(text_list):
    """Wrap raw texts into task dicts ready for upload.

    Args:
        text_list: iterable of document texts.

    Returns:
        A list with one ``{'title': <position>, 'text': <text>}`` dict per
        input text, in input order; the title is the 0-based integer
        position of the text.
    """
    return [
        {'title': position, 'text': body}
        for position, body in enumerate(text_list)
    ]

# Build the upload payload from the `text` column of the sample CSV.
sample_data_for_upload = create_sample_data(sample_data['text'].values)

Insert the name of your project. It must be the exact project name used by the Annotation Lab, i.e. the name that appears in the project's URL.

In [None]:
# Upload the tasks (without pre-annotations) to the import endpoint of the
# project.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print(url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again right before the upload (presumably to work with a fresh
# token).
cookies = get_cookies(username, password)

resp = requests.post(url, headers=headers, cookies=cookies, json=sample_data_for_upload)

# Print the status explicitly: a notebook cell only echoes its last
# expression, so a bare `resp.status_code` here would never be shown.
print(resp.status_code)
resp.text