![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Annotation_Lab/AL_API_import_export_pre_annotate.ipynb)


# Connect to Annotation Lab via API.
## This tutorial provides instructions and code for the following operations:
- Uploading Pre-annotations to Alab
- Exporting a project from Alab and converting it into CoNLL and assertion training files.
- Uploading tasks without pre-annotations.

In [1]:
import json
import os

from google.colab import files

# Ask for the license JSON through the Colab upload widget; the returned
# mapping is indexed by uploaded file name below.
license_keys = files.upload()

# Open the first uploaded file by name and parse it. From here on
# `license_keys` is the parsed JSON content, not the upload result.
with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)

# Adding license key-value pairs to environment variables
# (the `!pip` commands in the next cell read $PUBLIC_VERSION, $JSL_VERSION and $SECRET).
os.environ.update(license_keys)

Saving jsl_keys.json to jsl_keys.json


In [2]:
# Installing pyspark and spark-nlp
# ($PUBLIC_VERSION is one of the license values exported to the environment in the previous cell)
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# (pulled from the John Snow Labs package index; the index URL embeds the license $SECRET)
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the two installs above, this one is not version-pinned.
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 72 kB/s 
[K     |████████████████████████████████| 130 kB 44.1 MB/s 
[K     |████████████████████████████████| 198 kB 48.1 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 136 kB 5.0 MB/s 
[K     |████████████████████████████████| 95 kB 2.4 MB/s 
[K     |████████████████████████████████| 66 kB 3.7 MB/s 
[?25h

In [11]:
import pandas as pd
import requests
import json
from zipfile import ZipFile
from io import BytesIO
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

# Spark configuration handed to the session start call below.
params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

# Report the library versions that are actually installed.
print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the licensed session using the SECRET value from the uploaded license file.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Last expression of the cell: displays the session object.
spark

Spark NLP Version : 3.3.2
Spark NLP_JSL Version : 3.3.2


**Note: The base url for this demo is: https://annotationlab.johnsnowlabs.com - you can change this accordingly**

**Provide your user credentials**

In [35]:
# Placeholder credentials: replace with your own Annotation Lab account values.
# Do not commit real credentials in a shared notebook.
username = 'user'
password = 'pass'
# Read as a module-level variable by get_cookies (it is not passed as an argument).
client_secret = "secret"

**Helper Function to get cookies**

In [36]:
def get_cookies(username, password, base_url="https://annotationlab.johnsnowlabs.com", timeout=None):
    """Log in to Annotation Lab and return the cookies needed by the API calls.

    Posts the credentials to ``<base_url>/openid-connect/token`` and builds the
    cookie dict from the tokens in the JSON response.

    Parameters
    ----------
    username, password : str
        Annotation Lab account credentials.
    base_url : str, optional
        Root URL of the Annotation Lab instance. Defaults to the public demo
        instance used throughout this notebook.
    timeout : float or tuple, optional
        Passed straight to ``requests.post``. ``None`` (the default) keeps the
        previous behaviour of waiting indefinitely.

    Returns
    -------
    dict
        ``{'access_token': 'Bearer <token>', 'refresh_token': <token>}``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. wrong credentials).

    Notes
    -----
    ``client_secret`` is read from the module-level variable of that name, so
    it must be defined before this function is called.
    """
    url = "{}/openid-connect/token".format(base_url.rstrip("/"))

    headers = {
        "Content-Type": "application/json",
        "accept": "*/*",
    }

    data = {
      "username": username,
      "password": password,
      "client_id": "annotator",
      "client_secret": client_secret
    }

    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    print (resp.status_code)
    # Stop on a failed login with the real HTTP error instead of failing later
    # on a missing 'access_token' key or an undecodable body.
    resp.raise_for_status()
    auth_info = resp.json()

    cookies = {
        'access_token': f"Bearer {auth_info['access_token']}",
        'refresh_token': auth_info['refresh_token']
    }
    return cookies

# Log in once to check the credentials; later cells call get_cookies again before each request.
cookies = get_cookies(username, password)
# Uncomment the next line to inspect the returned tokens (avoid sharing that output).
#cookies

200


# Download sample data for uploading to Alab

In [25]:
# Downloading sample datasets.
# The file is saved as mt_samples.csv in the working directory, where the next cell reads it.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv
    

In [26]:
# Load the downloaded samples; the 'text' column is what gets uploaded / pre-annotated later.
sample_data = pd.read_csv('mt_samples.csv')
# (rows, columns) of the loaded frame.
print (sample_data.shape)
# Preview of the first rows.
sample_data.head()

(50, 1)


Unnamed: 0,text
0,Sample Type / Medical Specialty:\nHematology -...
1,Sample Type / Medical Specialty:\nHematology -...
2,Sample Type / Medical Specialty:\nHematology -...
3,Sample Type / Medical Specialty:\nHematology -...
4,Sample Type / Medical Specialty:\nHematology -...


# 1. Pre-annotate, and upload to a project on Alab

**Note: Your project configuration should be coherent with your pre-annotation pipeline**

**1.1 Pipeline for pre-annotation. You can change according to requirements.**

In [27]:
# Stage 1: wrap the raw "text" column into the "document" annotation column.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Stage 2: sentence splitting, with a newline registered as a custom bound.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')\
    .setCustomBounds(['\n'])

# Stage 3: tokenization. Raw strings are used for the bracket patterns so the
# backslashes are kept literally without relying on Python's deprecated
# handling of invalid escape sequences ('\[' and r'\[' are the same value).
tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")\
    .setSplitChars([r'\[', r'\]'])\
    .setContextChars([".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'","+","%","-"])

# Stage 4: pretrained clinical word embeddings.
word_embeddings = WordEmbeddingsModel().pretrained('embeddings_clinical', 'en', 'clinical/models')\
    .setInputCols(["sentence", 'token']) \
    .setOutputCol("embeddings")

# Stage 5: pretrained NER model producing token-level tags in "ner".
ner_model = MedicalNerModel.pretrained('ner_jsl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

# Stage 6: group the token-level tags into chunks ("ner_chunk").
converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")

# Stage 7: pretrained assertion model applied to each NER chunk.
assertion_model = AssertionDLModel().pretrained('assertion_dl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "ner_chunk", 'embeddings'])\
    .setOutputCol("assertion_res")

ner_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        tokenizer,
        word_embeddings,
        ner_model,
        converter,
        assertion_model
    ])

# Fit on a single empty-string row just to obtain a PipelineModel object.
empty_data = spark.createDataFrame([['']]).toDF("text")
pipeline_model = ner_pipeline.fit(empty_data)
# LightPipeline wrapper around the fitted model.
lmodel = LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14.5 MB
[OK!]
assertion_dl download started this may take some time.
Approximate size to download 1.3 MB
[OK!]


**1.2 Get Pre-Annotations using the pipeline above and convert to required format**

In [28]:
from pyspark.sql.types import *

def get_preannotations_from_NER (list_of_files, ner_prediction_model):
    """Run the prediction model over a list of texts and return a pandas frame.

    Each text becomes one task: its position in ``list_of_files`` is the task
    id and ``demo_mt_samples_<position>`` is the title shown on the UI. The
    module-level ``spark`` session is used to build the input DataFrame.

    Returns a pandas DataFrame with the columns "task_id" (cast to int),
    "title", "text", "ner_chunk" and "assertion_res".
    """
    print (len(list_of_files), " documents will be preannotated ...")

    # One (task id, UI title, document text) record per input text.
    records = [
        (doc_idx, 'demo_mt_samples_{}'.format(doc_idx), doc_text)
        for doc_idx, doc_text in enumerate(list_of_files)
    ]

    task_schema = StructType([
        StructField("task_id", StringType(), True),
        StructField("title", StringType(), True),
        StructField("text", StringType(), True)
    ])
    input_df = spark.createDataFrame(records, task_schema)

    print ("created spark dataframe ... transforming started")

    predictions = ner_prediction_model.transform(input_df)

    print ("pandas conversion started")

    # "ner_chunk" is the final chunk column of the pipeline; change it if your
    # pipeline names it differently. "assertion_res" is only needed when
    # assertion annotations are wanted as well.
    wanted_columns = ["task_id", 'title', "text", "ner_chunk", "assertion_res"]
    result_pdf = predictions.select(*wanted_columns).toPandas()

    # The schema declares task_id as a string column; convert it to integers.
    result_pdf['task_id'] = result_pdf['task_id'].astype(int)

    return result_pdf

# Pre-annotate every sample text with the fitted pipeline model (not the LightPipeline wrapper).
preds_df = get_preannotations_from_NER(sample_data['text'].values, pipeline_model)


50  documents will be preannotated ...
created spark dataframe ... transforming started
pandas conversion started


**1.3 Prepare the JSON to upload to Alab**

In [29]:
import random as rand
import datetime

def generate_hash(length=10):
    """Return a random string of ``length`` ASCII digits and letters."""
    # Digits, upper-case, then lower-case letters: the characters with ASCII
    # codes 48-57, 65-90 and 97-122, in that order.
    alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    last_index = len(alphabet) - 1
    picked = []
    for _ in range(length):
        picked.append(alphabet[rand.randint(0, last_index)])
    return "".join(picked)

def create_import_json (username, pandas_pred_df, project_id):
    """Build the Annotation Lab import payload from pre-annotation results.

    Parameters
    ----------
    username : str
        Stored as ``created_username`` of every generated prediction.
    pandas_pred_df : pandas.DataFrame
        One row per task with the columns "task_id", "title", "text",
        "ner_chunk" and "assertion_res". The two annotation columns hold
        iterables of objects exposing ``result``, ``begin``, ``end`` and (for
        chunks) ``metadata["entity"]``.
    project_id :
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    list of dict
        One ``{"predictions": [...], "data": {...}, "id": ...}`` entry per row.
    """

    def build_label(chunk, start, end, label):
        """Return one label entry; a fresh random id is generated per entry."""
        label_json = {
                "from_name": "label",
                "id": generate_hash(),
                "source": "$text",
                "to_name": "text",
                "type": "labels",
                "value": {
                  "end": end,
                  "labels": [label],
                  "start": start,
                  "text": chunk
                }
              }
        return label_json

    import_json = []

    for _, row in pandas_pred_df.iterrows():

        results_jsons = []

        # Chunk text by start offset, used to attach text to assertion labels.
        chunk_text_by_begin = {}
        for x in row["ner_chunk"]: # assign proper column name
            if not pd.isna(x):
                # end+1: the uploaded "end" is one past the annotation's own end offset.
                results_jsons.append(build_label(x.result, x.begin, x.end+1, x.metadata["entity"]))
                chunk_text_by_begin[x.begin] = x.result

        # comment out this loop if assertion is not required
        for x in row["assertion_res"]:
            if not pd.isna(x):
                if x.begin in chunk_text_by_begin:
                    chunk_text = chunk_text_by_begin[x.begin]
                else:
                    # No chunk starts at this offset: take the text from the
                    # document instead of failing with a KeyError.
                    chunk_text = row["text"][x.begin:x.end+1]
                results_jsons.append(build_label(chunk_text, x.begin, x.end+1, x.result))

        # numpy scalars (e.g. int64) are not JSON serializable; convert them
        # to the equivalent built-in Python value.
        task_id = row['task_id']
        if hasattr(task_id, "item"):
            task_id = task_id.item()

        import_json.append({"predictions": [{
            'created_username': username,
                "result":results_jsons
            }],
            "data":{
                "text":row["text"],
                "title":row['title']
            },
                            'id':task_id
                           })

    print ("Annotations payload is ready")

    return import_json

# Build the payload. The first argument ('ner_jsl') is stored as `created_username`
# of each prediction; the third argument is not used by create_import_json.
annotation_json = create_import_json('ner_jsl', preds_df, 'demo_100')

# One payload entry is produced per pre-annotated document.
print ('Annotated Documents:' , len(annotation_json))

Annotations payload is ready
Annotated Documents: 50


**1.4 Upload pre-annotations to Alab**

In [30]:
# Target project; it must already exist on the Annotation Lab instance — TODO confirm.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print (url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again so the request carries fresh tokens (this also prints the login status code).
cookies = get_cookies(username, password)

# Send the whole pre-annotation payload as the JSON body of a single request.
resp = requests.post(url, headers = headers, cookies = cookies, json = annotation_json)

# Only the last expression of a cell is displayed, so `resp.text` is what shows up;
# the bare `resp.status_code` line produces no output.
resp.status_code
resp.text

https://annotationlab.johnsnowlabs.com/api/projects/demo_100/import
200




# 2. Download / Export a project as json from Alab

**2.1 Export project from Alab**

In [38]:
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/export?format=JSON".format(project_name)
print (url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again so the request carries fresh tokens (this also prints the login status code).
cookies = get_cookies(username, password)

resp = requests.post(url, headers = headers, cookies = cookies)
# Surface a failed export as an HTTP error here; otherwise the error body
# would be handed to ZipFile and fail with an unrelated "bad zip file" error.
resp.raise_for_status()

# The response body is a zip archive; the export JSON is its first member.
# The context managers make sure both the archive and the member are closed.
with ZipFile(BytesIO(resp.content)) as zipfile:
    with zipfile.open(zipfile.namelist()[0]) as f:
        data = f.read()
project_json = json.loads(data)

print ('Total tasks in the project with completions:', len(project_json))

# Keep a local copy of the export; the parsing cell below reads this file.
with open('project_export.json', 'w') as f_:
    f_.write(json.dumps(project_json, indent=4))

https://annotationlab.johnsnowlabs.com/api/projects/demo_100/export?format=JSON
200
Total tasks in the project with completions: 4


**2.1 Parse the project json**

In [39]:
from sparknlp_jsl.training import *
from pyspark.sql import functions as F

# File written by the export cell above.
json_path = './project_export.json'

# Labels listed here are handed to the reader as assertion labels.
# 'possible' was previously misspelled as 'possbile', so that label could not match.
# NOTE(review): the names must match the labels configured in your project — confirm.
rdr = AnnotationToolJsonReader(assertion_labels = ['present', 'absent', 'possible', 'hypothetical', 'conditional', 'associated_with_someone_else'])

# Read the export and tag every row with the file it came from ("json" column),
# which df_to_conll uses to group rows.
df_anns = rdr.readDataset(spark, json_path).withColumn("json",F.lit(json_path))

# Add a 'ner_chunks' column built from the token-level 'ner_label' column.
df_anns = NerConverter().setInputCols(['sentence', 'token', 'ner_label']).setOutputCol('ner_chunks').transform(df_anns)


**2.2 Generate conll file**

In [43]:
def df_to_conll (json_df, project_id):
    """Write the token-level NER labels of ``json_df`` to a CoNLL file.

    The tokens are flattened to one row each, then emitted grouped by source
    file ("json" column), task and sentence, ordered by their begin offset
    within a sentence. Every token line is ``<token> -X- -X- <label>`` and a
    blank line follows each sentence.

    Returns the path of the written file, ``./project_<project_id>_ner.conll``.
    """
    # Zip the parallel token/label arrays and explode to one row per token.
    zipped_tokens = F.arrays_zip('token.begin','token.end','token.result','ner_label.result',"token.metadata")
    exploded = json_df.select("json","task_id",F.explode(zipped_tokens).alias("cols"))
    tokens_pdf = exploded.select(
        "json","task_id",
        F.expr("cols['0']").alias("begin"),
        F.expr("cols['1']").alias("end"),
        F.expr("cols['2']").alias("token"),
        F.expr("cols['3']").alias("ner"),
        F.expr("cols['4'].sentence").alias("sentence"),
    ).toPandas()

    out_path = './project_{}_ner.conll'.format(project_id)
    lines = ["-DOCSTART- -X- -X- O\n\n"]

    for source_idx, source in enumerate(tokens_pdf["json"].unique()):
        per_source = tokens_pdf[tokens_pdf["json"]==source].reset_index(drop=True)
        print ("project ", source_idx)
        for task_idx, task in enumerate(per_source.task_id.unique()):
            print (source_idx, task_idx)
            per_task = per_source[per_source.task_id==task].reset_index(drop=True)
            for sent in per_task.sentence.unique():
                per_sent = (
                    per_task[per_task.sentence==sent]
                    .sort_values(by=["begin"])
                    .reset_index(drop=True)
                )
                for tok, tag in zip(per_sent["token"], per_sent["ner"]):
                    lines.append(tok+" -X- -X- "+tag+"\n")
                # Blank line marks the end of the sentence.
                lines.append("\n")

    with open(out_path, 'w') as f:
        f.writelines(lines)
    return out_path

# Writes ./project_demo_100_ner.conll (the path is built from the second argument).
df_to_conll(df_anns, 'demo_100')

# Show the first lines of the generated file.
!head -n 20 ./project_demo_100_ner.conll


project  0
0 0
0 1
0 2
0 3
-DOCSTART- -X- -X- O

Sample -X- -X- O
Type -X- -X- O
/ -X- -X- O
Medical -X- -X- O
Specialty -X- -X- O
: -X- -X- O
Hematology -X- -X- B-Clinical_Dept
- -X- -X- O
Oncology -X- -X- B-Clinical_Dept
Sample -X- -X- O
Name -X- -X- O
: -X- -X- O
Consult -X- -X- O
- -X- -X- O
Breast -X- -X- B-Oncological
Cancer -X- -X- I-Oncological
Description -X- -X- B-Section_Header
: -X- -X- I-Section_Header


**2.3 Generate Data for Training Assertion Model**

In [44]:
# Bring the parsed export into pandas for the per-task processing below.
rel_pds = df_anns.toPandas()
print (rel_pds.shape)
# Number of distinct tasks in the export.
print (rel_pds['task_id'].nunique())
# Keep a single row per task: the one with the highest completion_id.
rel_pds = rel_pds.sort_values('completion_id').drop_duplicates('task_id', keep='last')
rel_pds.head()

(4, 14)
4


Unnamed: 0,title,task_id,text,completion_id,tool_chunk,assertion_label,relations,document,sentence,token,ner_label,ner_chunk,json,ner_chunks
1,demo_mt_samples_46,46,Sample Type / Medical Specialty:\nHematology -...,46001,"[(chunk, 33, 42, Hematology, {'entity': 'Clini...","[(assertion, 33, 42, present, {'chunk_id': 'IY...",[],"[(document, 0, 10211, Sample Type / Medical Sp...","[(document, 0, 189, Sample Type / Medical Spec...","[(token, 0, 5, Sample, {'sentence': '0'}, []),...","[(named_entity, 0, 5, O, {'sentence': '0', 'wo...","[(chunk, 33, 42, Hematology, {'sentence': '0',...",./project_export.json,"[(chunk, 33, 42, Hematology, {'sentence': '0',..."
2,demo_mt_samples_47,47,Sample Type / Medical Specialty:\nHematology -...,47001,"[(chunk, 33, 42, Hematology, {'entity': 'Clini...","[(assertion, 33, 42, present, {'chunk_id': '8P...",[],"[(document, 0, 3491, Sample Type / Medical Spe...","[(document, 0, 144, Sample Type / Medical Spec...","[(token, 0, 5, Sample, {'sentence': '0'}, []),...","[(named_entity, 0, 5, O, {'sentence': '0', 'wo...","[(chunk, 33, 42, Hematology, {'sentence': '0',...",./project_export.json,"[(chunk, 33, 42, Hematology, {'sentence': '0',..."
3,demo_mt_samples_48,48,Sample Type / Medical Specialty:\nHematology -...,48001,"[(chunk, 33, 42, Hematology, {'entity': 'Clini...","[(assertion, 33, 42, present, {'chunk_id': '3b...",[],"[(document, 0, 2863, Sample Type / Medical Spe...","[(document, 0, 126, Sample Type / Medical Spec...","[(token, 0, 5, Sample, {'sentence': '0'}, []),...","[(named_entity, 0, 5, O, {'sentence': '0', 'wo...","[(chunk, 33, 42, Hematology, {'sentence': '0',...",./project_export.json,"[(chunk, 33, 42, Hematology, {'sentence': '0',..."
0,demo_mt_samples_49,49,Sample Type / Medical Specialty:\nHematology -...,49001,"[(chunk, 33, 42, Hematology, {'entity': 'Clini...","[(assertion, 33, 42, present, {'chunk_id': 'aZ...",[],"[(document, 0, 5889, Sample Type / Medical Spe...","[(document, 0, 319, Sample Type / Medical Spec...","[(token, 0, 5, Sample, {'sentence': '0'}, []),...","[(named_entity, 0, 5, O, {'sentence': '0', 'wo...","[(chunk, 33, 42, Hematology, {'sentence': '0',...",./project_export.json,"[(chunk, 33, 42, Hematology, {'sentence': '0',..."


In [55]:

# Collects one assertion table per task; concatenated into a single frame after the loop.
all_tasks_assertions = []
# NOTE(review): all_tasks_relations is never filled or read in this cell.
all_tasks_relations = []
for index, group in rel_pds.groupby('task_id'):
    
    # `index` is the task id of the current group.
    print (index)
    
    # Annotated chunks from the export ('tool_chunk'): id, text, offsets and entity label.
    ann_chunks = pd.DataFrame( [{'chunk_id': i.metadata['chunk_id'], 'chunk': i.result, 'begin': int(i.begin), 'end': i.end, 'entity': i.metadata['entity']} for i in group['tool_chunk'].explode() ] )
        
    # Chunks produced by NerConverter ('ner_chunks'); these carry the sentence number.
    ner_chunks = pd.DataFrame( [{'chunk_num': ii, 'chunk': i.result, 'begin': int(i.begin), 'end': i.end, 'sentence_id': int(i.metadata['sentence'])} for ii, i in enumerate(group['ner_chunks'].explode()) ] )
    
    # Pair each annotated chunk with the converter chunk that starts at the same offset
    # (inner join: chunks without a partner are dropped).
    with_sent = pd.merge(ann_chunks[['chunk_id', 'begin', 'entity']], ner_chunks, on=['begin'], how='inner')
    
    # Sentence text and document-level offsets, keyed by sentence number.
    sentences_df = pd.DataFrame( [{'sentence_id': int(i.metadata['sentence']), 'sentence': i.result, 'sent_begin': int(i.begin), 'sent_end': int(i.end)} for i in group['sentence'].explode() ] )
    
    # Attach the sentence to every chunk.
    with_sent = pd.merge(with_sent, sentences_df, on=['sentence_id'], how='inner')

    # Assertion labels with their start offset.
    assertion_chunks = pd.DataFrame( [{'assertion_label': i.result, 'begin': int(i.begin) } for i in group['assertion_label'].explode() ] )
    
    # Join assertion labels to chunks by start offset only (the end offset is not compared).
    assertion_chunks = pd.merge(assertion_chunks, with_sent, on=['begin'], how='inner')
    
    assertion_chunks['task_id'] = index
    all_tasks_assertions.append(assertion_chunks)
        
# From here on all_tasks_assertions is a DataFrame, no longer a list.
all_tasks_assertions = pd.concat(all_tasks_assertions, axis=0)
print (all_tasks_assertions['assertion_label'].value_counts())

# Convert document-level offsets into offsets relative to the start of the sentence.
all_tasks_assertions['begin'] = all_tasks_assertions['begin'] - all_tasks_assertions['sent_begin']
all_tasks_assertions['end'] = all_tasks_assertions['end'] - all_tasks_assertions['sent_begin']

print (all_tasks_assertions.shape)               


46
47
48
49
present                         634
absent                          300
hypothetical                     48
associated_with_someone_else     36
conditional                      11
Name: assertion_label, dtype: int64
(1029, 12)


In [56]:
from itertools import groupby

def split_get_ind(str_):
    """Return the (start, end) offsets of each space-delimited piece of ``str_``.

    Only the plain space character ' ' separates pieces (tabs and newlines do
    not). ``start`` is the index of the first character of a piece and ``end``
    is one past its last character.
    """
    spans = []
    run_start = None   # start index of the piece being scanned, if any
    run_len = 0        # accumulated length of that piece
    for pos, ch in enumerate(str_):
        if ch != ' ':
            if run_start is None:
                run_start = pos
                run_len = 0
            run_len += len(ch)
        elif run_start is not None:
            spans.append((run_start, run_start + run_len))
            run_start = None
    # Close a piece that runs up to the end of the input.
    if run_start is not None:
        spans.append((run_start, run_start + run_len))
    return spans

# For every assertion row, find the indices of the space-separated tokens of its
# sentence that contain the chunk's begin and end offsets.
tkn_st = []
tkn_ed = []
for i, row in all_tasks_assertions.iterrows():
    # (start, end) offsets of each space-separated token of the sentence.
    ass_tkns = split_get_ind(row['sentence'])
    # -1 means "no token found".
    st = -1
    ed = -1
    for tkn_ind, tkn in enumerate(ass_tkns):
        # 'begin' / 'end' were made relative to the sentence start in the previous cell.
        if int(row['begin']) in range(*tkn):
            st = tkn_ind
        if int(row['end']) in range(*tkn):
            ed = tkn_ind
    # If either offset falls outside every token, record None for both so the
    # row is removed by dropna below.
    if st < 0 or ed < 0:
        tkn_st.append(None)
        tkn_ed.append(None)
    else:
        tkn_st.append(st)
        tkn_ed.append(ed)
    
    #print (st, ed)
all_tasks_assertions['tkn_start'] = tkn_st
all_tasks_assertions['tkn_end'] = tkn_ed
print (all_tasks_assertions.shape)
# Drops every row containing a missing value in any column (not only the two new ones).
all_tasks_assertions.dropna(inplace=True)
all_tasks_assertions.reset_index(inplace=True, drop=True)
print (all_tasks_assertions.shape)
# Keep only the columns selected for the assertion training data.
all_tasks_assertions = all_tasks_assertions[['task_id', 'sentence', 'tkn_start', 'tkn_end', 'chunk', 'entity', 'assertion_label']]
all_tasks_assertions.head(50)


(1029, 14)
(1029, 14)


Unnamed: 0,task_id,sentence,tkn_start,tkn_end,chunk,entity,assertion_label
0,46,Sample Type / Medical Specialty:\nHematology -...,4,4,Hematology,Clinical_Dept,present
1,46,Sample Type / Medical Specialty:\nHematology -...,6,6,Oncology,Clinical_Dept,present
2,46,Sample Type / Medical Specialty:\nHematology -...,7,10,Non-Small Cell Lung Cancer,Oncological,present
3,46,Sample Type / Medical Specialty:\nHematology -...,12,12,Description:,Section_Header,present
4,46,Sample Type / Medical Specialty:\nHematology -...,15,18,non-small cell lung cancer,Oncological,present
5,46,Sample Type / Medical Specialty:\nHematology -...,19,20,stage IV,Modifier,present
6,46,Sample Type / Medical Specialty:\nHematology -...,21,22,metastatic disease,Oncological,present
7,46,"At this point, he and his wife ask about wheth...",3,3,he,Gender,present
8,46,"At this point, he and his wife ask about wheth...",5,5,his,Gender,present
9,46,"At this point, he and his wife ask about wheth...",6,6,wife,Gender,present


# 3. Simply Upload data to an Alab Project (without pre-annotations)

In [31]:
def create_sample_data(text_list):
    """Turn a sequence of texts into upload entries without pre-annotations.

    Each entry is a dict whose 'title' is the position of the text in
    ``text_list`` and whose 'text' is the text itself.
    """
    return [
        {'title': position, 'text': body}
        for position, body in enumerate(text_list)
    ]

# One {'title', 'text'} entry per sample document, without any predictions.
sample_data_for_upload = create_sample_data(sample_data['text'].values)

project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print (url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again so the request carries fresh tokens (this also prints the login status code).
cookies = get_cookies(username, password)

# NOTE(review): the output saved with this notebook shows a 500 "Internal Server Error"
# body for this request (and a different project name in the printed URL) —
# TODO confirm the payload shape the import endpoint expects for tasks without predictions.
resp = requests.post(url, headers = headers, cookies = cookies, json = sample_data_for_upload)

# Only the last expression of a cell is displayed, so `resp.text` is what shows up;
# the bare `resp.status_code` line produces no output.
resp.status_code
resp.text

https://annotationlab.johnsnowlabs.com/api/projects/dummy/import
200


'{"code":500,"description":"The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.","error":"Internal Server Error"}\n'