![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Annotation_Lab/AL_API_import_export_pre_annotate.ipynb)


# Connect to Annotation Lab via API.
## This tutorial provides instructions and code for the following operations:
- Uploading Pre-annotations to Alab
- Exporting a project from Alab and converting it to CoNLL and assertion training files.
- Uploading tasks without pre-annotations.

In [None]:
import json
import os

from google.colab import files

# Ask for the license JSON through the Colab upload widget; the result is
# used below as a mapping keyed by the uploaded file name.
license_keys = files.upload()

# Open the first uploaded file by name and parse it. This rebinds
# license_keys from the upload result to the parsed JSON content.
with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
# (at the top level of a notebook/module, locals() is the global namespace)
locals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [3]:
import pandas as pd
import requests
import json
from zipfile import ZipFile
from io import BytesIO
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

# Spark configuration overrides passed to sparknlp_jsl.start() below.
params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the session with the SECRET entry of the license JSON loaded in the first cell.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Bare expression: shows the session object when run as a notebook cell.
spark

Spark NLP Version : 3.4.4
Spark NLP_JSL Version : 3.5.3


**Note: The base url for this demo is: https://annotationlab.johnsnowlabs.com - you can change this accordingly**

**Provide your user credentials**

In [None]:
# Placeholder credentials - replace with the values of your own Annotation Lab account.
username = 'user'
password = 'pass'
# Read as a global by get_cookies() and sent as the "client_secret" field of the token request.
client_secret = "secret"

**Helper Function to get cookies**

In [None]:
def get_cookies(username, password):
    """Log in to Annotation Lab and return the cookies that authenticate API calls.

    Posts the credentials to the OpenID Connect token endpoint and wraps the
    returned tokens in the cookie dict that the other cells pass to
    ``requests`` calls.

    Args:
        username: Annotation Lab user name.
        password: Password of that user.

    Returns:
        dict with an ``access_token`` entry (prefixed with ``"Bearer "``) and
        a ``refresh_token`` entry.

    Raises:
        requests.HTTPError: if the token endpoint answers with a 4xx/5xx status.

    Note:
        The client secret is read from the module-level ``client_secret``.
    """
    url = "https://annotationlab.johnsnowlabs.com/openid-connect/token"

    headers = {
        "Content-Type": "application/json",
        "accept": "*/*",
    }

    data = {
        "username": username,
        "password": password,
        "client_id": "annotator",
        "client_secret": client_secret,
    }

    resp = requests.post(url, headers=headers, json=data)
    print (resp.status_code)

    # Fail here with the HTTP status instead of a KeyError on the missing
    # 'access_token' below when the login is rejected.
    resp.raise_for_status()
    auth_info = resp.json()

    cookies = {
        'access_token': f"Bearer {auth_info['access_token']}",
        'refresh_token': auth_info['refresh_token']
    }
    return cookies

# Authenticate once up front; get_cookies() prints the HTTP status of the token request.
cookies = get_cookies(username, password)
#cookies

200


# Download sample data for uploading to Alab

In [None]:
# Downloading sample datasets.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv
    

In [None]:
# Load the sample notes fetched by the wget cell; the 'text' column is what gets
# pre-annotated and uploaded in the following sections.
sample_data = pd.read_csv('mt_samples.csv')
print (sample_data.shape)
sample_data.head()

(50, 1)


Unnamed: 0,text
0,Sample Type / Medical Specialty:\nHematology -...
1,Sample Type / Medical Specialty:\nHematology -...
2,Sample Type / Medical Specialty:\nHematology -...
3,Sample Type / Medical Specialty:\nHematology -...
4,Sample Type / Medical Specialty:\nHematology -...


# 1. Pre-annotate, and upload to a project on Alab

**Note: Your project configuration should be coherent with your pre-annotation pipeline**

**1.1 Pipeline for pre-annotation. You can change according to requirements.**

In [None]:
# Pre-annotation pipeline: document -> sentence -> token -> embeddings -> NER
# -> NER chunks -> assertion status. The output columns "ner_chunk" and
# "assertion_res" are the ones read when the import payload is built.
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Every newline is treated as an additional sentence boundary.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')\
    .setCustomBounds(['\n'])

# Raw strings keep the backslashes of the split characters without relying on
# Python passing unknown escape sequences through unchanged.
tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")\
    .setSplitChars([r'\[', r'\]'])\
    .setContextChars([".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'","+","%","-"])

word_embeddings = WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')\
    .setInputCols(["sentence", 'token']) \
    .setOutputCol("embeddings")

ner_model = MedicalNerModel.pretrained('ner_jsl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")

converter = NerConverter()\
    .setInputCols(["sentence", "token", "ner"])\
    .setOutputCol("ner_chunk")

assertion_model = AssertionDLModel.pretrained('assertion_dl', 'en', 'clinical/models')\
    .setInputCols(["sentence", "ner_chunk", 'embeddings'])\
    .setOutputCol("assertion_res")

ner_pipeline = Pipeline(
    stages = [
        document,
        sentence,
        tokenizer,
        word_embeddings,
        ner_model,
        converter,
        assertion_model
    ])

# All stages are pretrained or rule based, so fitting on an empty frame only
# assembles them into a PipelineModel.
empty_data = spark.createDataFrame([['']]).toDF("text")
pipeline_model = ner_pipeline.fit(empty_data)
lmodel = LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14.5 MB
[OK!]
assertion_dl download started this may take some time.
Approximate size to download 1.3 MB
[OK!]


**1.2 Get Pre-Annotations using the pipeline above and convert to required format**

In [None]:
from pyspark.sql.types import *

def get_preannotations_from_NER(list_of_files, ner_prediction_model):
    """Run the pre-annotation pipeline over raw texts and collect the result in pandas.

    Args:
        list_of_files: sequence of document texts; the position of each text
            becomes its task id.
        ner_prediction_model: fitted pipeline model whose ``transform`` output
            contains the ``ner_chunk`` and ``assertion_res`` columns.

    Returns:
        pandas.DataFrame with the columns task_id (int), title, text,
        ner_chunk and assertion_res.

    Note:
        Uses the module-level ``spark`` session.
    """
    print (len(list_of_files), " documents will be preannotated ...")

    # task_id is declared as a string column below, so the ids are stringified
    # here instead of relying on Spark to coerce Python ints.
    file_text_tuples = [
        (
            str(index),  # id of the file
            'demo_mt_samples_{}'.format(index),  # this is the title that appears on the UI
            file_text,  # text of the file
        )
        for index, file_text in enumerate(list_of_files)
    ]

    # Define schema
    schema = StructType([
        StructField("task_id", StringType(), True),
        StructField("title", StringType(), True),
        StructField("text", StringType(), True)
    ])

    # Create dataframe
    spark_df = spark.createDataFrame(file_text_tuples, schema)

    print ("created spark dataframe ... transforming started")

    pred_df = ner_prediction_model.transform(spark_df)

    print ("pandas conversion started")

    view_df = pred_df.select("task_id",
                             'title',
                             "text",
                             "ner_chunk", ## you can change this to any column name (the final chunk column in your pp)
                             "assertion_res" # - if you want assertion annotations as well
                            ).toPandas()

    # Task ids are uploaded as integers, so convert them back from strings.
    view_df['task_id'] = view_df['task_id'].astype(int)

    return view_df

# Pre-annotate every sample note with the fitted pipeline model (not the LightPipeline).
preds_df = get_preannotations_from_NER(sample_data['text'].values, pipeline_model)


50  documents will be preannotated ...
created spark dataframe ... transforming started
pandas conversion started


**1.3 Prepare the JSON to upload to Alab**

In [None]:
import random as rand
import datetime

def generate_hash(length=10):
    """Return a random identifier of ``length`` characters drawn from 0-9, A-Z and a-z."""
    # 62 code points in a fixed order: digits, upper-case letters, lower-case letters.
    code_points = [*range(48, 58), *range(65, 91), *range(97, 123)]
    last_index = len(code_points) - 1

    picked = []
    for _ in range(length):
        picked.append(chr(code_points[rand.randint(0, last_index)]))
    return "".join(picked)

def create_import_json(username, pandas_pred_df, project_id):
    """Convert pipeline predictions into the task list sent to the project import endpoint.

    Args:
        username: stored as ``created_username`` of every prediction.
        pandas_pred_df: DataFrame with the columns task_id, title, text,
            ner_chunk and assertion_res (as returned by
            ``get_preannotations_from_NER``).
        project_id: not used; kept so existing calls keep working.

    Returns:
        list with one dict per row, each holding "predictions", "data" and "id".
    """

    def build_label(chunk, start, end, label):
        """Build one label entry of a prediction's "result" list."""
        return {
            "from_name": "label",
            "id": generate_hash(),
            "source": "$text",
            "to_name": "text",
            "type": "labels",
            "value": {
                "end": end,
                "labels": [label],
                "start": start,
                "text": chunk
            }
        }

    import_json = []

    for _, row in pandas_pred_df.iterrows():

        results_jsons = []

        # Chunk text keyed by its start offset, so assertions can reuse it.
        assertion_mapper = {}
        for x in row["ner_chunk"]: # assign proper column name
            if not pd.isna(x):
                # Annotation ends are inclusive; the payload uses an exclusive end.
                results_jsons.append(build_label(x.result, x.begin, x.end+1, x.metadata["entity"]))
                assertion_mapper[x.begin] = x.result

        # comment out this loop if assertion is not required
        for x in row["assertion_res"]:
            if not pd.isna(x):
                chunk_text = assertion_mapper.get(x.begin)
                if chunk_text is None:
                    # No chunk starts at this offset: take the span straight
                    # from the document instead of failing with a KeyError.
                    chunk_text = row["text"][x.begin:x.end+1]
                results_jsons.append(build_label(chunk_text, x.begin, x.end+1, x.result))

        import_json.append({
            "predictions": [{
                'created_username': username,
                "result": results_jsons
            }],
            "data": {
                "text": row["text"],
                "title": row['title']
            },
            # Plain int so the payload stays JSON serializable.
            'id': int(row['task_id'])
        })

    print ("Annotations payload is ready")

    return import_json

# 'ner_jsl' is recorded as the creator of the predictions; the third argument
# is not used by create_import_json.
annotation_json = create_import_json('ner_jsl', preds_df, 'demo_100')

print ('Annotated Documents:' , len(annotation_json))

Annotations payload is ready
Annotated Documents: 50


**1.4 Upload pre-annotations to Alab**

In [None]:
# Target project; its name is part of the import endpoint URL.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print (url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again right before the request to obtain fresh auth cookies.
cookies = get_cookies(username, password)

# The whole pre-annotation payload is sent as the JSON body of a single POST.
resp = requests.post(url, headers = headers, cookies = cookies, json = annotation_json)

# In a notebook only the last bare expression (resp.text) is displayed.
resp.status_code
resp.text

https://annotationlab.johnsnowlabs.com/api/projects/demo_100/import
200




# 2. Download / Export a project as json from Alab

**2.1 Export project from Alab**

In [None]:
# Project to export; its name is part of the export endpoint URL.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/export?format=JSON".format(project_name)
print (url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

cookies = get_cookies(username, password)

resp = requests.post(url, headers = headers, cookies = cookies)

# Stop with the HTTP error instead of a confusing BadZipFile when the export is refused.
resp.raise_for_status()

# The response body is a zip archive; its first member holds the project JSON.
with ZipFile(BytesIO(resp.content)) as zipfile:
    with zipfile.open(zipfile.namelist()[0]) as f:
        data = f.read()
project_json = json.loads(data)

print ('Total tasks in the project with completions:', len(project_json))

# Keep a pretty-printed copy on disk; the conversion cells below read this file.
with open('project_export.json', 'w') as f_:
    f_.write(json.dumps(project_json, indent=4))

https://annotationlab.johnsnowlabs.com/api/projects/demo_100/export?format=JSON
200
Total tasks in the project with completions: 4


**2.1 Parse the project json and generate conll file**

In [None]:
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
from pyspark.ml import Pipeline, PipelineModel

def get_nlp_pipeline():
    """Build the light pipeline (document -> sentence -> token -> POS) used for the CoNLL export.

    Downloads the pretrained sentence detector and POS models and fits the
    pipeline on an empty frame created from the module-level ``spark`` session.

    Returns:
        LightPipeline wrapping the fitted stages.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_splitter = (
        SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    # Split on whitespace and directly before/after each listed punctuation character.
    token_pattern = "\\s+|(?=[-.:;*+,$&%\\[\\]\\(\\)\\/])|(?<=[-.:;*+,$&%\\[\\]\\(\\)\\/])"

    tokenizer_stage = (
        RegexTokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
        .setPositionalMask(True)
        .setPattern(token_pattern)
    )

    pos_tagger = (
        PerceptronModel.pretrained("pos_clinical","en","clinical/models")
        .setInputCols(["sentence", "token"])
        .setOutputCol("pos")
    )

    stages = [assembler, sentence_splitter, tokenizer_stage, pos_tagger]

    # Fitting on an empty frame is enough to obtain a PipelineModel from these stages.
    blank_frame = spark.createDataFrame([[""]]).toDF("text")
    fitted = Pipeline(stages=stages).fit(blank_frame)

    light = LightPipeline(fitted)
    print ("Spark NLP lightpipeline is created")
    return light

# Module-level pipeline instance; get_conll_from_json_manual() reads this global.
lp_pipeline =  get_nlp_pipeline()

def get_token_pipeline():
    """Build a tokenizer-only light pipeline that treats the whole input as one sentence.

    The document assembler writes straight to the "sentence" column, so token
    offsets are relative to the start of the text that is passed in.

    Returns:
        LightPipeline wrapping the fitted assembler and regex tokenizer.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("sentence")
    )

    # Split on whitespace and directly before/after each listed punctuation character.
    token_pattern = "\\s+|(?=[-.:;*+,$&%\\[\\]\\(\\)\\/])|(?<=[-.:;*+,$&%\\[\\]\\(\\)\\/])"

    tokenizer_stage = (
        RegexTokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
        .setPositionalMask(True)
        .setPattern(token_pattern)
    )

    stages = [assembler, tokenizer_stage]

    blank_frame = spark.createDataFrame([[""]]).toDF("text")
    fitted = Pipeline(stages=stages).fit(blank_frame)

    light = LightPipeline(fitted)
    print("Spark NLP lightpipeline is created")
    return light

# Module-level tokenizer pipeline; get_conll_manual() reads this global to tokenize annotated chunks.
token_lp_pipeline =  get_token_pipeline()

In [None]:
def get_conll_manual(lp_pipeline, output, excludedDocs=('',),
                     excluded=('present', 'absent', 'possible', 'hypothetical', 'conditional', 'associated_with_someone_else'), # set all assertions here
                     included_phrase=''):
    """Turn one exported task into CoNLL lines tagged from its latest completion.

    Args:
        lp_pipeline: light pipeline whose ``fullAnnotate`` result exposes
            'sentence', 'token' and 'pos' annotations for the task text.
        output: one task dict of the project export, with 'data', 'id' and
            'completions' entries.
        excludedDocs: task titles (whitespace stripped) that are skipped.
        excluded: labels, compared with spaces removed, that must not become
            NER tags; by default the assertion statuses.
        included_phrase: currently unused.

    Returns:
        list of "token pos pos tag" lines with a blank line between
        sentences, or '' when the task has no completions or is excluded.

    Note:
        Annotated chunks are tokenized with the module-level
        ``token_lp_pipeline``.
    """
    conll_lines = []

    try:
        new_content = output['data']['text']
    except KeyError:
        new_content = output['data']['longText']

    try:
        title = output['data']['title']
    except KeyError:
        title = "task_id" + str(output['id'])
    print(title)

    # Decide whether to skip before paying for the annotation; str() keeps
    # numeric titles from failing on .strip().
    if not output['completions'] or str(title).strip() in excludedDocs:
        return ''

    n = lp_pipeline.fullAnnotate(new_content)

    sent_tuples = {str(x.metadata['sentence']): (x.begin, x.end) for x in n[0]['sentence']}
    parsed = [(int(x.metadata['sentence']), x.result, x.begin, x.end, y.result, sent_tuples[x.metadata['sentence']][0]) for x,y in zip(n[0]["token"],n[0]["pos"])]

    # Collect (chunk, label, start, end) from the most recent completion.
    ents = []
    for d in output['completions'][-1]['result']:
        if d['type'] != 'labels':
            continue
        label = d['value']['labels'][0].replace(' ','')
        if label in excluded:
            continue

        temp_text = d['value']['text']
        start = d['value']['start']
        end = d['value']['end']

        # Shrink the span so it does not cover surrounding whitespace.
        right_trimmed = temp_text.rstrip()
        end = end - (len(temp_text) - len(right_trimmed))
        temp_text = right_trimmed

        left_trimmed = temp_text.lstrip()
        start = start + (len(temp_text) - len(left_trimmed))
        temp_text = left_trimmed

        ents.append((temp_text, label, start, end))

    # Process chunks in document order; on identical keys the later chunk wins.
    ents.sort(key=lambda ent: (ent[2], ent[3]))

    # (absolute token start, token text) -> IOB tag
    tag_dict = {}
    for chunk, label, base_ix, _ in ents:
        chunk_tokens = token_lp_pipeline.fullAnnotate(chunk)[0]['token']
        for position, token in enumerate(chunk_tokens):
            iob = 'B-' if position == 0 else 'I-'
            tag_dict[(base_ix + token.begin, token.result)] = iob + label

    s = 0
    for sent_id, token_text, token_begin, _, pos_tag, sent_begin in parsed:
        if sent_id != s:
            # Sentence changed: CoNLL separates sentences with an empty line.
            conll_lines.append("\n")
            s += 1
        # The lookup key is the token start shifted by its sentence start.
        tag = tag_dict.get((token_begin + sent_begin, token_text), "O")
        conll_lines.append("{} {} {} {}\n".format(token_text, pos_tag, pos_tag, tag))

    conll_lines.append("\n")

    return conll_lines


def get_conll_from_json_manual(pid, bulk_json_file_path):
    """Convert an exported project JSON file into one CoNLL file.

    Args:
        pid: project identifier, used only in the output file name.
        bulk_json_file_path: path of the exported project JSON (a list of tasks).

    Returns:
        list of all lines written to ``exported_conlls/project_<pid>.conll``.

    Note:
        Tasks are annotated with the module-level ``lp_pipeline``.
    """
    with open(bulk_json_file_path, 'r', encoding='utf-8') as json_file:
        json_outputs = json.load(json_file)

    bulk_conll_lines = ["-DOCSTART- -X- -X- O\n\n"]

    print (len(json_outputs))

    for o, output in enumerate(json_outputs):
        # Skipped tasks come back as '', which extends the list by nothing.
        bulk_conll_lines.extend(get_conll_manual(lp_pipeline, output))
        print (o)

    # Only "directory already exists" is tolerated; other errors are raised.
    os.makedirs('exported_conlls', exist_ok=True)

    conll_path = 'exported_conlls/project_{}.conll'.format(pid)
    with open(conll_path, 'w', encoding='utf-8') as f:
        f.writelines(bulk_conll_lines)

    print (conll_path)

    return bulk_conll_lines

In [None]:
# Writes exported_conlls/project_demo_100.conll from the export saved in section 2.1.
get_conll_from_json_manual('demo_100','./project_export.json')

**2.2 Generate Data for Training Assertion Model**

In [None]:
from sparknlp_jsl.training import *
from pyspark.sql import functions as F

json_path = './project_export.json'

# assertion_labels names the project labels that are assertion statuses.
# 'possible' was misspelled as 'possbile' here, so that status was never matched.
rdr = AnnotationToolJsonReader(assertion_labels = ['present', 'absent', 'possible', 'hypothetical', 'conditional', 'associated_with_someone_else'])

# The extra "json" column records which export file each row came from.
df_anns = rdr.readDataset(spark, json_path).withColumn("json",F.lit(json_path))

In [None]:
# Bring the parsed annotations to pandas for the row-wise processing below.
rel_pds = df_anns.toPandas()
print (rel_pds.shape)
print (rel_pds['task_id'].nunique())
# Keep a single row per task: the one with the highest completion_id.
rel_pds = rel_pds.sort_values('completion_id').drop_duplicates('task_id', keep='last')

In [None]:
# Only tasks that carry at least one assertion annotation are processed.
rel_pds = rel_pds[rel_pds['assertion_label'].apply(lambda x: len(x)) > 0]

all_tasks_assertions = []
# NOTE(review): all_tasks_relations is not used anywhere in this cell.
all_tasks_relations = []
for index, group in rel_pds.groupby('task_id'):
    
    print (index)
    
    # Chunks as stored in the 'tool_chunk' column (id, text, offsets, entity label).
    ann_chunks = pd.DataFrame( [{'chunk_id': i.metadata['chunk_id'], 'chunk': i.result, 'begin': int(i.begin), 'end': i.end, 'entity': i.metadata['entity']} for i in group['tool_chunk'].explode() ] )
        
    # Chunks from the 'ner_chunk' column, which additionally carry the sentence index.
    ner_chunks = pd.DataFrame( [{'chunk_num': ii, 'chunk': i.result, 'begin': int(i.begin), 'end': i.end, 'sentence_id': int(i.metadata['sentence'])} for ii, i in enumerate(group['ner_chunk'].explode()) ] )
    
    # Match the two chunk views on their start offset to attach sentence ids to chunk ids.
    with_sent = pd.merge(ann_chunks[['chunk_id', 'begin', 'entity']], ner_chunks, on=['begin'], how='inner')
    
    sentences_df = pd.DataFrame( [{'sentence_id': int(i.metadata['sentence']), 'sentence': i.result, 'sent_begin': int(i.begin), 'sent_end': int(i.end)} for i in group['sentence'].explode() ] )
    
    # Add the sentence text and sentence offsets to every chunk.
    with_sent = pd.merge(with_sent, sentences_df, on=['sentence_id'], how='inner')

    # Assertion annotations are linked to their chunk through the shared start offset.
    assertion_chunks = pd.DataFrame( [{'assertion_label': i.result, 'begin': int(i.begin) } for i in group['assertion_label'].explode() ] )
    
    assertion_chunks = pd.merge(assertion_chunks, with_sent, on=['begin'], how='inner')
    
    assertion_chunks['task_id'] = index
    all_tasks_assertions.append(assertion_chunks)
        
all_tasks_assertions = pd.concat(all_tasks_assertions, axis=0)
print (all_tasks_assertions['assertion_label'].value_counts())

# Shift chunk offsets so they are relative to the start of their sentence.
all_tasks_assertions['begin'] = all_tasks_assertions['begin'] - all_tasks_assertions['sent_begin']
all_tasks_assertions['end'] = all_tasks_assertions['end'] - all_tasks_assertions['sent_begin']

print (all_tasks_assertions.shape)

46
47
48
49
present                         634
absent                          300
hypothetical                     48
associated_with_someone_else     36
conditional                      11
Name: assertion_label, dtype: int64
(1029, 12)


In [None]:
from itertools import groupby

def split_get_ind(str_):
    """Return the (start, end) spans, end exclusive, of every run of non-space characters.

    Only the space character ' ' separates runs; tabs and newlines count as
    part of a run.
    """
    spans = []
    run_start = None
    for idx, ch in enumerate(str_):
        if ch == ' ':
            # A space closes the run that is currently open, if any.
            if run_start is not None:
                spans.append((run_start, idx))
                run_start = None
        elif run_start is None:
            run_start = idx
    # A run that reaches the end of the string is closed here.
    if run_start is not None:
        spans.append((run_start, idx + 1))
    return spans

# For every assertion row, find the indices of the space-separated tokens that
# contain the chunk's 'begin' and 'end' offsets (made sentence-relative in the
# previous cell).
tkn_st = []
tkn_ed = []
for i, row in all_tasks_assertions.iterrows():
    ass_tkns = split_get_ind(row['sentence'])
    # -1 marks "no token found yet".
    st = -1
    ed = -1
    for tkn_ind, tkn in enumerate(ass_tkns):
        # tkn is a (start, end) span with an exclusive end, so range(*tkn) covers its characters.
        if int(row['begin']) in range(*tkn):
            st = tkn_ind
        if int(row['end']) in range(*tkn):
            ed = tkn_ind
    # Rows whose offsets fall into no token get None and are removed by dropna() below.
    if st < 0 or ed < 0:
        tkn_st.append(None)
        tkn_ed.append(None)
    else:
        tkn_st.append(st)
        tkn_ed.append(ed)
    
    #print (st, ed)
all_tasks_assertions['tkn_start'] = tkn_st
all_tasks_assertions['tkn_end'] = tkn_ed
print (all_tasks_assertions.shape)
all_tasks_assertions.dropna(inplace=True)
all_tasks_assertions.reset_index(inplace=True, drop=True)
print (all_tasks_assertions.shape)
# Keep only the columns shown as the assertion dataset.
all_tasks_assertions = all_tasks_assertions[['task_id', 'sentence', 'tkn_start', 'tkn_end', 'chunk', 'entity', 'assertion_label']]
all_tasks_assertions.head(50)


(1029, 14)
(1029, 14)


Unnamed: 0,task_id,sentence,tkn_start,tkn_end,chunk,entity,assertion_label
0,46,Sample Type / Medical Specialty:\nHematology -...,4,4,Hematology,Clinical_Dept,present
1,46,Sample Type / Medical Specialty:\nHematology -...,6,6,Oncology,Clinical_Dept,present
2,46,Sample Type / Medical Specialty:\nHematology -...,7,10,Non-Small Cell Lung Cancer,Oncological,present
3,46,Sample Type / Medical Specialty:\nHematology -...,12,12,Description:,Section_Header,present
4,46,Sample Type / Medical Specialty:\nHematology -...,15,18,non-small cell lung cancer,Oncological,present
5,46,Sample Type / Medical Specialty:\nHematology -...,19,20,stage IV,Modifier,present
6,46,Sample Type / Medical Specialty:\nHematology -...,21,22,metastatic disease,Oncological,present
7,46,"At this point, he and his wife ask about wheth...",3,3,he,Gender,present
8,46,"At this point, he and his wife ask about wheth...",5,5,his,Gender,present
9,46,"At this point, he and his wife ask about wheth...",6,6,wife,Gender,present


**2.3 Generate Data for Training Relation Extraction Model**

In [None]:
from sparknlp_jsl.training import *
from pyspark.sql import functions as F

json_path = './project_export.json'

# assertion_labels names the project labels that are assertion statuses.
# 'possible' was misspelled as 'possbile' here, so that status was never matched.
rdr = AnnotationToolJsonReader(assertion_labels = ['present', 'absent', 'possible', 'hypothetical', 'conditional', 'associated_with_someone_else'])

# The extra "json" column records which export file each row came from.
df_anns = rdr.readDataset(spark, json_path).withColumn("json",F.lit(json_path))

In [None]:
# Bring the parsed annotations to pandas for the row-wise processing below.
rel_pds = df_anns.toPandas()
print (rel_pds.shape)
print (rel_pds['task_id'].nunique())
# Keep a single row per task: the one with the highest completion_id.
rel_pds = rel_pds.sort_values('completion_id').drop_duplicates('task_id', keep='last')

In [None]:
# Only tasks that carry at least one relation annotation are processed.
rel_pds = rel_pds[rel_pds['relations'].apply(lambda x: len(x)) > 0]

all_tasks_relations = []
for index, group in rel_pds.groupby('task_id'):
    
    print (index)
    
    # Chunks as stored in the 'tool_chunk' column (id, text, offsets, entity label).
    ann_chunks = pd.DataFrame( [{'chunk_id': i.metadata['chunk_id'], 'chunk': i.result, 'begin': int(i.begin), 'end': i.end, 'entity': i.metadata['entity']} for i in group['tool_chunk'].explode() ] )
        
    # Chunks from the 'ner_chunk' column, which additionally carry the sentence index.
    ner_chunks = pd.DataFrame( [{'chunk_num': ii, 'chunk': i.result, 'begin': int(i.begin), 'end': i.end, 'sentence_id': int(i.metadata['sentence'])} for ii, i in enumerate(group['ner_chunk'].explode()) ] )
    
    # Match the two chunk views on their start offset to attach sentence ids to chunk ids.
    with_sent = pd.merge(ann_chunks[['chunk_id', 'begin', 'entity']], ner_chunks, on=['begin'], how='inner')
    
    sentences_df = pd.DataFrame( [{'sentence_id': int(i.metadata['sentence']), 'sentence': i.result, 'sent_begin': int(i.begin), 'sent_end': int(i.end)} for i in group['sentence'].explode() ] )
    
    # Add the sentence text and sentence offsets to every chunk.
    with_sent = pd.merge(with_sent, sentences_df, on=['sentence_id'], how='inner')
    
    relations_df = pd.DataFrame( [ i.metadata for i in group['relations'].explode() ] )
    
    # Attach the relation label BEFORE merging: the inner merge can drop or
    # repeat rows, so a positional assignment afterwards can fail on a length
    # mismatch or pair a label with the wrong row.
    relations_df['relations'] = [i.result for i in group['relations'].explode()]
    
    relations_df = relations_df.merge(with_sent.rename(columns={'chunk_id': 'chunk_id_1'}),
                   on=['chunk_id_1'], how='inner')
     
    # Move 'relations' behind the merged columns, where it was placed before.
    relations_df['relations'] = relations_df.pop('relations')
    
    relations_df['task_id'] = index
    
    all_tasks_relations.append(relations_df)
    
all_tasks_relations = pd.concat(all_tasks_relations, axis=0)

# Shift both entities' offsets so they are relative to the start of the
# sentence that was matched for the first entity.
all_tasks_relations['entity_begin1'] = all_tasks_relations['entity_begin1'].astype(int) - all_tasks_relations['sent_begin']
all_tasks_relations['entity_begin2'] = all_tasks_relations['entity_begin2'].astype(int) - all_tasks_relations['sent_begin']
all_tasks_relations['entity_end1'] = all_tasks_relations['entity_end1'].astype(int) - all_tasks_relations['sent_begin']
all_tasks_relations['entity_end2'] = all_tasks_relations['entity_end2'].astype(int) - all_tasks_relations['sent_begin']

# 3. Simply Upload data to an Alab Project (without pre-annotations)

In [None]:
def create_sample_data(text_list):
    """Wrap each text in a task dict whose title is the text's position in ``text_list``.

    Returns:
        list of ``{'title': <int position>, 'text': <text>}`` dicts, in input order.
    """
    # NOTE(review): titles are ints here, while the pre-annotation upload uses
    # string titles - confirm which type the import endpoint expects.
    return [
        {'title': position, 'text': body}
        for position, body in enumerate(text_list)
    ]

# Build one task per sample note, without any predictions attached.
sample_data_for_upload = create_sample_data(sample_data['text'].values)

# Target project; its name is part of the import endpoint URL.
project_name = 'demo_100'
url = "https://annotationlab.johnsnowlabs.com/api/projects/{}/import".format(project_name)
print (url)

headers = {
        "Content-Type": "application/json",
        "accept": "*/*"
    }

# Log in again right before the request to obtain fresh auth cookies.
cookies = get_cookies(username, password)

# The task list is sent as the JSON body of a single POST.
resp = requests.post(url, headers = headers, cookies = cookies, json = sample_data_for_upload)

# In a notebook only the last bare expression (resp.text) is displayed.
resp.status_code
resp.text

https://annotationlab.johnsnowlabs.com/api/projects/dummy/import
200


'{"code":500,"description":"The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.","error":"Internal Server Error"}\n'