# Pre Annotate Training Data with CRF and Labelled Data

In [1]:
%reset -f

In [2]:
import pandas as pd
import json
from seqeval.metrics import f1_score,classification_report
from sklearn.model_selection import train_test_split
from nltk import pos_tag
from sklearn_crfsuite import CRF

This is a pipeline for using previously annotated dataturks .tsv data to preannotate unlabelled sentances which can then be uploaded to dataturks and fixed accordingly. The purpose is to speed up the annotation process by annotating tags which the model can consistently predict as a human would and using human annotations to occasionally fix these tags and label more difficult tags.

---

## Contents

#### 1. Format Annotated Turks TSV File for CRF Model
#### 2. Train / Eval CRF Model
#### 3. Format Unannotated Sentances for Prediction
#### 4. Create Annotations
#### 5. Format Annotations for Dataturks Ingestion

---

## 1. Format Annotated Turks TSV File for CRF Model

In [3]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>0:
                if ents[0][-1] == ".":
                    seq.append([ents[0][:-1],ents[1]])
                if len(ents[0])>1:
                    seq.append([ents[0].replace(",",""),ents[1]])
                else:
                    seq.append(ents)
        else:
            seqs.append([i for i in seq if len(i[0])>0])
            seq=[]
    return seqs

def clean_tags(word_ents):
    '''adds IOB scheme to tags'''
    new_ents = []
    for i in range(0,len(word_ents)):
        if word_ents[i][1] == "O":
            tag = word_ents[i][1]
        else:
            if not i:
                tag = "B-"+word_ents[i][1]
            else:
                if (word_ents[i][1] != word_ents[i-1][1]):
                    tag = "B-"+word_ents[i][1]
                else:
                    tag = "I-"+word_ents[i][1]

        new_ents.append([word_ents[i][0],tag])
    return new_ents

def add_pos(seqs):
    new_seqs = []
    for sentance in seqs:
        words = [word[0] for word in sentance]
        pos = pos_tag(words)        
        new_seq = [pos[i]+(sentance[i][1],) for i in range(len(sentance))]
        new_seqs.append(new_seq)
    return new_seqs

def word2features(sent, i):
    '''
    
    From:
    https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/
    
    '''
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

def create_crf_data(file):
    seqs = read_turks(file)
    seqs = create_seqs(seqs)
    seqs = [clean_tags(ents) for ents in seqs]
    seqs = add_pos(seqs)
    x = [sent2features(s) for s in seqs]
    y = [sent2labels(s) for s in seqs]
    tokens = [sent2tokens(s) for s in seqs]
    return x,y,tokens

In [4]:
annotated_file = "./data/Medical NER V2 4000.tsv"
x,y,tokens = create_crf_data(annotated_file)
print("X-Sample (single value in sequence):")
print(x[0][0])
print("\nFull Sequence Tokens:")
print(tokens[0])
print("\nFull Sequence Labels:")
print(y[0])

X-Sample (single value in sequence):
{'bias': 1.0, 'word.lower()': '"history', 'word[-3:]': 'ORY', 'word[-2:]': 'RY', 'word.isupper()': True, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'NN', 'postag[:2]': 'NN', 'BOS': True, '+1:word.lower()': 'of', '+1:word.istitle()': False, '+1:word.isupper()': True, '+1:postag': 'NNP', '+1:postag[:2]': 'NN'}

Full Sequence Tokens:
['"HISTORY', 'OF', 'PRESENT', 'ILLNESS', 'This', 'is', 'an', '81-year-old', 'female', 'with', 'a', 'history', 'of', 'emphysema', 'not', 'on', 'home', 'O2', ',', 'who', 'presents', 'with', 'three', 'days', 'of', 'shortness', 'of', 'breath', 'thought', 'by', 'her', 'primary', 'care', 'doctor', 'to', 'be', 'a', 'COPD', 'flare', '"']

Full Sequence Labels:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Age', 'B-Gender', 'O', 'O', 'O', 'O', 'B-Condition', 'O', 'O', 'B-Drug', 'I-Drug', 'O', 'O', 'O', 'O', 'B-Duration', 'I-Duration', 'O', 'B-DOS', 'I-DOS', 'I-DOS', 'O', 'O', 'O', 'B-POI', 'I-POI', 'I-POI', 'O', 'O', 'O',

## 2. Train / Eval CRF Model

In [5]:
def train_crf(x,y):
    crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

    crf.fit(x, y)
    
    return crf

def eval_crf(x,y,split_size = 0.1):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_size, random_state=42)
    crf = train_crf(x_train,y_train)

    # Evaluate Test Performance
    pred = crf.predict(x_test)
    report = classification_report(y_test,pred)
    print("Test Results:\n","-"*60)
    print(report)
    print("-"*60)
    f1 = f1_score(y_test,pred)
    return f1

In [6]:
eval_crf(x,y)
crf = train_crf(x,y)

Test Results:
 ------------------------------------------------------------
                            precision    recall  f1-score   support

                       DOS       0.69      0.66      0.67       357
                 Procedure       0.66      0.57      0.61        91
        Patient Relocation       0.81      0.81      0.81        83
                      Date       0.85      0.81      0.83        42
                      BODY       0.47      0.46      0.47        56
Other Measurement / Result       0.73      0.63      0.68        81
                      Drug       0.91      0.86      0.88       146
                     Route       0.90      0.86      0.88        51
                       GEO       0.87      0.86      0.86       118
                      Time       0.66      0.70      0.68        67
                 Condition       0.66      0.65      0.65       138
               Temperature       0.82      0.69      0.75        13
                  Duration       0.37  

## 3. Format Unannotated Sentances for Prediction

In [7]:
def load_sentances(file):
    with open(file) as f:
            lines = [i.rstrip().split(" ") for i in f.readlines()]
    return lines

def add_pos_sentances(seqs):
    new_seqs = []
    for sentance in seqs:
        pos = pos_tag(sentance)        
        new_seqs.append(pos)
    return new_seqs

def remove_null_words(seq):
    return [word for word in seq if len(word)>0]

def prep_unannotated_data(file):
    # Load Data 
    seqs = load_sentances(file)
    
    # Remove null words
    seqs = [remove_null_words(seq) for seq in seqs]
    
    # Add CRF Features
    seqs = add_pos_sentances(seqs)
    return seqs

In [8]:
unannot_file = "./data/sentances_0-10.txt"
seqs = prep_unannotated_data(unannot_file)
seqs[0][15:30]

[('on', 'IN'),
 ('home', 'NN'),
 ('O2', 'NNP'),
 (',', ','),
 ('who', 'WP'),
 ('presents', 'VBZ'),
 ('with', 'IN'),
 ('three', 'CD'),
 ('days', 'NNS'),
 ('of', 'IN'),
 ('shortness', 'NN'),
 ('of', 'IN'),
 ('breath', 'NN'),
 ('thought', 'VBN'),
 ('by', 'IN')]

## 4. Create Annotations

Data is ready to be used by CRF, tags will needed to be formatted post prediciton in order to capture multi word entities

In [9]:
def create_annotations(crf,seqs):
    x = [sent2features(s) for s in seqs]
    ner_tags = crf.predict(x)
    return ner_tags

def combine_B_I_tags(seq):
    combi_seq = []
    phrase = ""
    for word,tag in seq:
        if tag == "O":
            if len(phrase)>0:
                combi_seq.append([phrase,phrase_tag])
                phrase = ""
            combi_seq.append([word,tag])
        else:
            if tag[0] == "B":
                if len(phrase)>0:
                    combi_seq.append([phrase,phrase_tag])
                    phrase = ""
                phrase = word
                phrase_tag = tag[2:]
            if tag[0] == "I":
                phrase += " "+word
    if len(phrase)>0:
        combi_seq.append([phrase,phrase_tag])
    return combi_seq

def create_dataset(crf,seqs):
    ner_tags = create_annotations(crf,seqs)
    data = []
    for i in range(len(seqs)):
        data_entry = [[seqs[i][j][0],ner_tags[i][j]] for j in range(len(seqs[i]))]
        data.append(data_entry)
    data = [combine_B_I_tags(seq) for seq in data]
    return data

In [10]:
data = create_dataset(crf,seqs)
data[0][15:25]

[['on', 'O'],
 ['home O2', 'Drug'],
 [',', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three days', 'Duration'],
 ['of', 'O'],
 ['shortness of breath', 'DOS'],
 ['thought', 'O']]

## 5. Format Annotations for DataTurks Ingestion:

Each line in .txt file will be in the following format:

{"content":"cd players and tuners","annotation":[{"label":["Category"],"points":[{"start":0,"end":1,"text":"cd"}]},{"label":["Category"],"points":[{"start":3,"end":9,"text":"players"}]},{"label":["Category"],"points":[{"start":15,"end":20,"text":"tuners"}]}],"extras":{"Name":"columnName","Class":"ColumnValue"}}

In [11]:
def get_points(seq):
    start = 0
    end = 0
    starts = []
    ends = []
    for entry in seq:
        starts.append(start)
        start += len(entry[0])+1
        ends.append(start-2)
    new_seq = [[seq[i][0],seq[i][1],starts[i],ends[i]] for i in range(len(seq))]
    return new_seq

def get_annotation(seq,ignore_ents):
    new_seq = get_points(seq)
    annotations = []
    for text,tag,start,end in new_seq:
        if tag not in ignore_ents:
            annot = {}
            annot["label"] = [tag]
            annot["points"] = [{"start":start,"end":end,"text":text}]
            annotations.append(annot)
    return annotations

def write_json(file,data,ignore_tags):
    with open(file,"w") as f:
        for seq in data:
            line = {}
            line["content"] = " ".join([i[0] for i in seq])
            line["annotation"]  = get_annotation(seq,ignore_tags)
            line["extras"] = {"Name":"ColumnName","Class":"ColumnValue"}
            f.write(json.dumps(line))
            f.write("\n")
    print("~~~Annotations Saved~~~")
    return

In [12]:
# Cutoff Precision of 0.6 when evaluating Tags
ignore_tags = ["O",
               "Duration",
               "BODY"]

save_file = "./data/pre_annot_sentances_0-10.txt"

write_json(save_file,data,ignore_tags)

~~~Annotations Saved~~~


## Full Pipeline


In [13]:
def smart_predict(annotated_file,unannotated_file,save_file,ignore_tags=["O"],eval_only=False):
    '''
    Inputs:
    
    annotated_file - file path to labelled data in .tsv format outputed from dataturks
    
    unannotated_file - file path to .txt file with each line containing an unannotated sentance
    
    save_file - file path to annotations generatated by crf model for upload to dataturks
    
    eval_only - if set to True will print a classification report using 10% of the annotated data as
                a test set. Can be used to determine which tags to ignore.
    
    Outputs:
    
    None
    
    Desc:
    
    Loads annotated data, trains crf model. Makes predictions on unannottated sentances and saves
    output for upload to dataturks annotation service. 
    
    
    '''
    
    x,y,_ = create_crf_data(annotated_file)
    
    if eval_only:
        eval_crf(x,y)
        return
    
    crf = train_crf(x,y)

    seqs = prep_unannotated_data(unannotated_file)
    data = create_dataset(crf,seqs)

    write_json(save_file,data,ignore_tags)
    return

def show_crf_eval(annotated_file):
    x,y,tokens = create_crf_data(annotated_file)
    if show_eval:
        eval_crf(x,y)
    return

In [14]:
annotated_file = "./data/Medical NER V2 4000.tsv"
unannot_file = "./data/sentances_0-10.txt"
save_file = "./data/pre_annot_sentances_0-10.txt"

smart_predict(annotated_file,unannot_file,save_file,eval_only=True)

Test Results:
 ------------------------------------------------------------
                            precision    recall  f1-score   support

                       DOS       0.69      0.66      0.67       357
                 Procedure       0.66      0.57      0.61        91
        Patient Relocation       0.81      0.81      0.81        83
                      Date       0.85      0.81      0.83        42
                      BODY       0.47      0.46      0.47        56
Other Measurement / Result       0.73      0.63      0.68        81
                      Drug       0.91      0.86      0.88       146
                     Route       0.90      0.86      0.88        51
                       GEO       0.87      0.86      0.86       118
                      Time       0.66      0.70      0.68        67
                 Condition       0.66      0.65      0.65       138
               Temperature       0.82      0.69      0.75        13
                  Duration       0.37  

Precision is the most important for annotations we want to make sure that the highest percentage of annotations are relevant. If the model misses an annotation its no difference than if it was unannotated before but if it gives us a false positive we will have to delete it which will cost time.

Here for example if we set a precision cut off of 0.6 we will ignore the "Duration" and "BODY" tags as we will likely have to delete >40% of them.

In [15]:
ignore_tags = ["O",
               "Duration",
               "BODY",
              ]

smart_predict(annotated_file,unannot_file,save_file,ignore_tags)

~~~Annotations Saved~~~


---