![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.3.prepare_CoNLL_from_annotations_for_NER.ipynb)

# Prepare CoNLL file from annotations for NER

In [None]:
import os

# Install a headless Java 8 JDK via apt (Debian/Ubuntu hosts such as Colab);
# install output is discarded to keep the cell quiet.
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Point JAVA_HOME at the JDK installed above and put its bin/ first on PATH
# so that the `java` resolved by later cells is this one.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pinned versions of pyspark and Spark NLP
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.6.1

In [7]:
import pandas as pd

# Entity annotations: one row per annotated chunk, keyed by `text_id`, with the
# entity label, `begin`/`end` offsets and the chunk text (see the preview below).
# NOTE(review): `end` looks inclusive (a 9-character chunk spans 242-250) — confirm.
train_entities_df = pd.read_csv('data/ChemProt/chemprot_train_entities.csv')
train_entities_df.head()

Unnamed: 0,text_id,entity,begin,end,chunk
0,11319232,CHEMICAL,242,250,acyl-CoAs
1,11319232,CHEMICAL,1193,1200,triacsin
2,11319232,CHEMICAL,1441,1447,sucrose
3,11319232,CHEMICAL,1637,1651,triacylglycerol
4,11319232,CHEMICAL,1702,1710,acyl-CoAs


In [8]:
# Raw documents: one row per `text_id` with its full `text`; joined to the
# entity annotations through `text_id` when the CoNLL file is built.
train_text_df = pd.read_csv('data/ChemProt/chemprot_train_text.csv')
train_text_df.head()

Unnamed: 0,text_id,text
0,16357751,Selective costimulation modulators: a novel ap...
1,14967461,Emerging role of epidermal growth factor recep...
2,23468099,Effects of chronic social defeat stress on beh...
3,23293962,Hepatocyte growth factor activator inhibitor t...
4,7678677,Alprenolol and bromoacetylalprenololmenthane a...


In [9]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import sparknlp
from sparknlp.annotator import *

from sparknlp.base import *

# Start the Spark session through Spark NLP's helper; `get_nlp_pipeline` below
# reads this module-level `spark` to build the empty DataFrame it fits on.
spark = sparknlp.start()


def get_nlp_pipeline ():
    """Build a Spark NLP LightPipeline that produces sentences, tokens and POS tags.

    The stages are: document assembler -> sentence detector -> tokenizer ->
    pretrained perceptron POS tagger. The pipeline is fitted on a one-row
    DataFrame holding an empty string (none of the stages is trained here) and
    then wrapped in a ``LightPipeline``.

    Uses the module-level ``spark`` session.

    Returns:
        The ``LightPipeline`` wrapping the fitted pipeline.
    """

    doc_stage = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol('document')
    )

    sentence_stage = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
        .setDetectLists(False)
    )

    # modify the tokenizer as you wish depending on your data specs
    token_stage = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    pos_stage = (
        PerceptronModel.pretrained()
        .setInputCols(["sentence", "token"])
        .setOutputCol("pos")
    )

    nlp_stages = [doc_stage, sentence_stage, token_stage, pos_stage]
    nlp_pipeline = Pipeline(stages=nlp_stages)

    # A single empty document is enough to obtain a fitted PipelineModel.
    seed_df = spark.createDataFrame([[""]]).toDF("text")
    light = LightPipeline(nlp_pipeline.fit(seed_df))

    print ("Spark NLP lightpipeline is created")

    return light


# Build the pipeline once; `get_Conll_file` reads this module-level `lp_pipeline`
# for every document it converts.
lp_pipeline =  get_nlp_pipeline()


pos_anc download started this may take some time.
Approximate size to download 4.3 MB
[OK!]
Spark NLP lightpipeline is created


In [29]:
import pandas as pd
import json
import string

def get_conll_per_doc (lp_pipeline, single_entities_df, text, first_doc=True):
    """Convert one annotated document into a list of CoNLL-formatted lines.

    The text is tokenized and POS-tagged with ``lp_pipeline``. Each entity chunk
    is split on whitespace (a trailing punctuation character becomes its own
    token) and every resulting piece is matched to a pipeline token by the pair
    ``(begin offset, token text)``. Matched tokens get ``B-<label>`` /
    ``I-<label>`` tags; all other tokens get ``O``.

    Args:
        lp_pipeline: object whose ``fullAnnotate(text)`` returns a list; its
            first element maps ``"token"`` and ``"pos"`` to parallel sequences of
            annotations exposing ``.result`` (and, for tokens, ``.begin`` and
            ``.metadata['sentence']``).
        single_entities_df: DataFrame with the annotations of this document;
            the columns ``chunk``, ``begin`` and ``entity`` are read.
        text: the document text.
        first_doc: when true, the ``-DOCSTART-`` header line is emitted first.

    Returns:
        list[str]: lines of the form ``"<token> <pos> <pos> <tag>\\n"``, with a
        ``"\\n"`` entry between sentences and one after the last token.
    """

    conll_lines = ["-DOCSTART- -X- -X- O\n\n"] if first_doc else []

    annotated = lp_pipeline.fullAnnotate(text)[0]

    # (sentence id, token text, begin offset, POS tag) for every token
    parsed = [
        (int(tok.metadata['sentence']), tok.result, tok.begin, pos.result)
        for tok, pos in zip(annotated["token"], annotated["pos"])
    ]

    # (begin offset, token text) -> IOB tag; later annotations overwrite earlier ones
    tag_dict = {}

    for _, row in single_entities_df.iterrows():

        raw_chunk = row['chunk']
        chunk = raw_chunk.strip()

        # Leading whitespace in the annotated chunk shifts where the entity really starts.
        start = row['begin'] + (len(raw_chunk) - len(raw_chunk.lstrip()))

        # Locate every whitespace-delimited word inside the chunk. Searching for the
        # real position (instead of assuming exactly one space between words) keeps
        # the offsets right for double spaces, tabs or newlines inside the chunk.
        pieces = []
        cursor = 0
        for word in chunk.split():
            cursor = chunk.index(word, cursor)
            pieces.append((cursor, word))
            cursor += len(word)

        # A trailing punctuation character is expected as a separate token, so split
        # it off the last word and keep the offset it actually has in the text.
        if pieces and pieces[-1][1][-1] in string.punctuation:
            offset, word = pieces.pop()
            if len(word) > 1:
                pieces.append((offset, word[:-1]))
            pieces.append((offset + len(word) - 1, word[-1]))

        for position, (offset, word) in enumerate(pieces):
            iob = "B-" if position == 0 else "I-"
            tag_dict[(start + offset, word)] = iob + row['entity']

    current_sentence = 0

    for sent_id, token, begin, pos_tag in parsed:

        # Track the real sentence id: one blank line per sentence change, even if
        # a sentence id is skipped because a sentence produced no tokens.
        if sent_id != current_sentence:
            conll_lines.append("\n")
            current_sentence = sent_id

        conll_lines.append("{} {} {} {}\n".format(token, pos_tag, pos_tag, tag_dict.get((begin, token), "O")))

    conll_lines.append("\n")

    return conll_lines



In [30]:
from datetime import datetime
from tqdm import tqdm

def get_Conll_file (text_df, entities_df, path=None, limit = None):
    """Build CoNLL lines for every document in ``text_df`` and optionally save them.

    Each row of ``text_df`` is converted with ``get_conll_per_doc`` using the
    module-level ``lp_pipeline`` and the rows of ``entities_df`` that share its
    ``text_id``. Only the first processed document carries the ``-DOCSTART-``
    header.

    Args:
        text_df: DataFrame with ``text_id`` and ``text`` columns.
        entities_df: DataFrame with a ``text_id`` column plus the annotation
            columns that ``get_conll_per_doc`` reads.
        path: directory to write ``ner_annotations_<today>.conll`` into. When
            ``None`` nothing is written and the lines are returned instead.
        limit: if given, only the first ``limit`` rows of ``text_df`` are used.

    Returns:
        list[str] of CoNLL lines when ``path`` is ``None``; otherwise ``None``
        (the file name is printed after the file has been written).
    """

    if limit is not None:
        text_df = text_df[:limit]

    conll_lines_list = []

    progress = tqdm(text_df.iterrows(), total=text_df.shape[0])

    for position, (_, row) in enumerate(progress):

        single_entities_df = entities_df[entities_df.text_id==row['text_id']]

        # Decide "first document" from the row position, not from the index label:
        # a filtered or shuffled frame need not start at (or even contain) label 0.
        first_doc = position == 0

        lines = get_conll_per_doc (lp_pipeline, single_entities_df, row['text'], first_doc)

        conll_lines_list.extend(lines)

    if path is None:
        return conll_lines_list

    conll_filename = '{}/ner_annotations_{}.conll'.format(path, str(datetime.now().date()))

    # Explicit UTF-8 so non-ASCII characters in the texts do not depend on the
    # platform's default encoding.
    with open(conll_filename, 'w', encoding='utf-8') as f:
        f.writelines(conll_lines_list)

    print (conll_filename,  'is saved.')
    

In [31]:
# With `path` set the lines are written to disk and the function returns None,
# so `conll` is None after this cell.
conll = get_Conll_file (train_text_df, train_entities_df, path='data/ChemProt', limit=10)

100%|██████████| 10/10 [00:04<00:00,  2.37it/s]

data/ChemProt/ner_annotations_2020-09-24.conll is saved.





In [32]:
# With `path=None` nothing is written; the CoNLL lines are returned as a list.
conll = get_Conll_file (train_text_df, train_entities_df, path=None, limit=10)

100%|██████████| 10/10 [00:04<00:00,  2.46it/s]


In [34]:
# Peek at the first lines; each token line is "<token> <pos> <pos> <NER tag>".
conll[:20]

['-DOCSTART- -X- -X- O\n\n',
 'Selective NNP NNP O\n',
 'costimulation NN NN O\n',
 'modulators NNS NNS O\n',
 ': : : O\n',
 'a DT DT O\n',
 'novel NN NN O\n',
 'approach NN NN O\n',
 'for IN IN O\n',
 'the DT DT O\n',
 'treatment NN NN O\n',
 'of IN IN O\n',
 'rheumatoid NN NN O\n',
 'arthritis NN NN O\n',
 '. . . O\n',
 '\n',
 'T NN NN O\n',
 'cells NNS NNS O\n',
 'have VBP VBP O\n',
 'a DT DT O\n']