# Pre Annotate DataTurks Data with CRF

In [1]:
%reset -f

In [201]:
import pandas as pd
import re
import numpy as np
import random
import time
import json
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input,concatenate,SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from nltk import pos_tag
import eli5
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

In [3]:
def read_turks(file):
    with open(file) as f:
        lines = [i.rstrip().split("\t") for i in f.readlines()]
    return lines

file = "./data/Medical NER V2 4000.tsv"
word_ents = read_turks(file)
word_ents[:20]

[['"HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'Age'],
 ['female', 'Gender'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'Condition'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'Drug'],
 ['O2', 'Drug'],
 [',', 'O'],
 ['who', 'O']]

In [5]:
def clean_words(word_ents):
    '''removes quote and comma characters from'''
    new_word_ents = []
    for ents in word_ents:
        word = ents[0]
        word = word.replace('"','')
        ents[0] = word
        new_word_ents.append(ents)
    return new_word_ents

In [6]:
new_ents = clean_words(word_ents)
new_ents[:20]

[['HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'Age'],
 ['female', 'Gender'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'Condition'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'Drug'],
 ['O2', 'Drug'],
 [',', 'O'],
 ['who', 'O']]

In [253]:
def create_seqs(word_ents):
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents)>1:
            if len(ents[0])>0:
                if ents[0][-1] == ".":
                    seq.append([ents[0][:-1],ents[1]])
                if len(ents[0])>1:
                    seq.append([ents[0].replace(",",""),ents[1]])
                else:
                    seq.append(ents)
        else:
            seqs.append([i for i in seq if len(i[0])>0])
            seq=[]
    return seqs

In [254]:
seqs = create_seqs(new_ents)
seqs[0]

[['HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'Age'],
 ['female', 'Gender'],
 ['with', 'O'],
 ['a', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'Condition'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'Drug'],
 ['O2', 'Drug'],
 [',', 'O'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'Duration'],
 ['days', 'Duration'],
 ['of', 'O'],
 ['shortness', 'DOS'],
 ['of', 'DOS'],
 ['breath', 'DOS'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'POI'],
 ['care', 'POI'],
 ['doctor', 'POI'],
 ['to', 'O'],
 ['be', 'O'],
 ['a', 'O'],
 ['COPD', 'Condition'],
 ['flare', 'Condition']]

In [259]:
def add_pos(seqs):
    new_seqs = []
    for sentance in seqs:
        words = [word[0] for word in sentance]
        pos = pos_tag(words)        
        new_seq = [pos[i]+(sentance[i][1],) for i in range(len(sentance))]
        new_seqs.append(new_seq)
    return new_seqs

In [260]:
pos_seqs = add_pos(seqs)
print(pos_seqs[0])
print(len(pos_seqs))

[('HISTORY', 'NN', 'O'), ('OF', 'NNP', 'O'), ('PRESENT', 'NNP', 'O'), ('ILLNESS', 'NNP', 'O'), ('This', 'DT', 'O'), ('is', 'VBZ', 'O'), ('an', 'DT', 'O'), ('81-year-old', 'JJ', 'Age'), ('female', 'NN', 'Gender'), ('with', 'IN', 'O'), ('a', 'DT', 'O'), ('history', 'NN', 'O'), ('of', 'IN', 'O'), ('emphysema', 'NN', 'Condition'), ('not', 'RB', 'O'), ('on', 'IN', 'O'), ('home', 'NN', 'Drug'), ('O2', 'NNP', 'Drug'), (',', ',', 'O'), ('who', 'WP', 'O'), ('presents', 'VBZ', 'O'), ('with', 'IN', 'O'), ('three', 'CD', 'Duration'), ('days', 'NNS', 'Duration'), ('of', 'IN', 'O'), ('shortness', 'NN', 'DOS'), ('of', 'IN', 'DOS'), ('breath', 'NN', 'DOS'), ('thought', 'VBN', 'O'), ('by', 'IN', 'O'), ('her', 'PRP$', 'O'), ('primary', 'JJ', 'POI'), ('care', 'NN', 'POI'), ('doctor', 'NN', 'POI'), ('to', 'TO', 'O'), ('be', 'VB', 'O'), ('a', 'DT', 'O'), ('COPD', 'NNP', 'Condition'), ('flare', 'NN', 'Condition')]
4002


In [261]:
def word2features(sent, i):
    '''
    
    From:
    https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/
    
    '''
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [262]:
x = [sent2features(s) for s in pos_seqs]
y = [sent2labels(s) for s in pos_seqs]

In [263]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [264]:
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

crf.fit(x_train, y_train)
pred = crf.predict(x_test)
tr_pred = crf.predict(x_train)

In [265]:
report = classification_report(y_test,pred)
print("Test Results:\n","-"*60)
print(report)
print("-"*60)

Test Results:
 ------------------------------------------------------------
                    precision    recall  f1-score   support

               Age       1.00      1.00      1.00        20
              Dose       1.00      0.92      0.96        65
               DOS       0.67      0.62      0.64       357
              Drug       0.92      0.89      0.90       146
  Test / Screening       0.79      0.73      0.76        71
             Route       0.90      0.84      0.87        51
         Condition       0.65      0.64      0.64       138
              Date       0.85      0.81      0.83        42
               GEO       0.82      0.84      0.83       118
              Time       0.69      0.72      0.70        67
              BODY       0.63      0.56      0.59       101
  Respiratory Rate       1.00      0.70      0.82        10
         Frequency       0.67      0.50      0.57        16
         Procedure       0.70      0.59      0.64        91
Patient Relocation     

In [266]:
model_f1 = f1_score(y_test,pred)
print("F1-Score:",model_f1)

F1-Score: 0.7282535401213756


In [267]:
idx = 2
sample_sentance = [i["word.lower()"] for i in x_test[idx]]

print("{:20} {:20} {:20}".format("Word:","Prediction:","Real Label:"))
for i in range(len(sample_sentance)):
    print("{:20} {:20} {:20}".format(sample_sentance[i],pred[idx][i],y_test[idx][i]))

Word:                Prediction:          Real Label:         
her                  O                    O                   
chest                Test / Screening     Test / Screening    
x-ray                Test / Screening     Test / Screening    
was                  O                    O                   
consistent           O                    O                   
with                 O                    O                   
increasing           O                    O                   
congestive           Condition            Condition           
heart                Condition            Condition           
failure              Condition            Condition           
compared             O                    O                   
with                 O                    O                   
earlier              Time                 Time                
in                   Time                 Time                
the                  Time                 Time         

In [268]:
sample_seqs = pos_seqs[:10]

In [278]:
sample_seqs[0]

[('HISTORY', 'NN', 'O'),
 ('OF', 'NNP', 'O'),
 ('PRESENT', 'NNP', 'O'),
 ('ILLNESS', 'NNP', 'O'),
 ('This', 'DT', 'O'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'O'),
 ('81-year-old', 'JJ', 'Age'),
 ('female', 'NN', 'Gender'),
 ('with', 'IN', 'O'),
 ('a', 'DT', 'O'),
 ('history', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('emphysema', 'NN', 'Condition'),
 ('not', 'RB', 'O'),
 ('on', 'IN', 'O'),
 ('home', 'NN', 'Drug'),
 ('O2', 'NNP', 'Drug'),
 (',', ',', 'O'),
 ('who', 'WP', 'O'),
 ('presents', 'VBZ', 'O'),
 ('with', 'IN', 'O'),
 ('three', 'CD', 'Duration'),
 ('days', 'NNS', 'Duration'),
 ('of', 'IN', 'O'),
 ('shortness', 'NN', 'DOS'),
 ('of', 'IN', 'DOS'),
 ('breath', 'NN', 'DOS'),
 ('thought', 'VBN', 'O'),
 ('by', 'IN', 'O'),
 ('her', 'PRP$', 'O'),
 ('primary', 'JJ', 'POI'),
 ('care', 'NN', 'POI'),
 ('doctor', 'NN', 'POI'),
 ('to', 'TO', 'O'),
 ('be', 'VB', 'O'),
 ('a', 'DT', 'O'),
 ('COPD', 'NNP', 'Condition'),
 ('flare', 'NN', 'Condition')]

In [269]:
def create_annotations(crf,seqs):
    x = [sent2features(s) for s in seqs]
    ner_tags = crf.predict(x)
    return ner_tags

In [270]:
def create_dataset(crf,seqs):
    ner_tags = create_annotations(crf,seqs)
    data = []
    for i in range(len(seqs)):
        data_entry = [[seqs[i][j][0],ner_tags[i][j]] for j in range(len(seqs[i]))]
        data.append(data_entry)
    return data

In [271]:
data = create_dataset(crf,sample_seqs)
data[0][:10]

[['HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'Age'],
 ['female', 'Gender'],
 ['with', 'O']]

In [272]:
sample_row = data[0]

In [273]:
def get_points(seq):
    start = 0
    end = 0
    starts = []
    ends = []
    for entry in seq:
        starts.append(start)
        start += len(entry[0])+1
        ends.append(start-2)
    new_seq = [[seq[i][0],seq[i][1],starts[i],ends[i]] for i in range(len(seq))]
    return new_seq

In [274]:
def get_annotation(seq):
    new_seq = get_points(seq)
    annotations = []
    for text,tag,start,end in new_seq:
        if tag != "O":
            annot = {}
            annot["label"] = [tag]
            annot["points"] = [{"start":start,"end":end,"text":text}]
            annotations.append(annot)
    return annotations

In [275]:
get_points(data[3])

[['Review', 'Test / Screening', 0, 5],
 ['of', 'Test / Screening', 7, 8],
 ['systems', 'Test / Screening', 10, 16],
 ['is', 'O', 18, 19],
 ['negative', 'Other Measurement / Result', 21, 28],
 ['for', 'O', 30, 32],
 ['the', 'O', 34, 36],
 ['following', 'O', 38, 46],
 ['Fevers', 'DOS', 48, 53],
 ['chills', 'DOS', 55, 60],
 ['nausea', 'DOS', 62, 67],
 ['vomiting', 'DOS', 69, 76],
 ['night', 'DOS', 78, 82],
 ['sweats', 'DOS', 84, 89],
 ['change', 'DOS', 91, 96],
 ['in', 'DOS', 98, 99],
 ['weight', 'DOS', 101, 106],
 ['gastrointestinal', 'DOS', 108, 123],
 ['complaints', 'DOS', 125, 134],
 ['neurologic', 'DOS', 136, 145],
 ['changes', 'DOS', 147, 153],
 ['rashes', 'DOS', 155, 160],
 ['palpitations', 'DOS', 162, 173],
 ['orthopnea', 'DOS', 175, 183]]

In [276]:
def write_json(file,data):
    with open(file,"w") as f:
        for seq in data:
            line = {}
            line["content"] = " ".join([i[0] for i in seq])
            line["annotation"]  = get_annotation(seq)
            line["extras"] = {"Name":"ColumnName","Class":"ColumnValue"}
            f.write(json.dumps(line))
            f.write("\n")
    print("~~~Annotations Saved~~~")
    return

In [277]:
save_file = "./data/sample_annots.txt"
write_json(save_file,data)

~~~Annotations Saved~~~
