# Training a CRF with DataTurks Annotated .tsv File.

In [None]:
# IPython magic: reset the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [None]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input,concatenate,SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from nltk import pos_tag
import eli5
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Adding POS and Other CRF Features
4. Train Test Split
5. Training Model
6. Evaluating Model Performance
7. Saving Model Results

### 1. Reading and Formatting File:

In [None]:
def read_turks(file, encoding="utf-8"):
    '''Read a tab-separated annotation file into a list of rows.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        A list with one entry per line of the file. Each entry is the list of
        tab-separated fields of that line after trailing whitespace has been
        stripped, so a blank line yields ``['']``.
    '''
    with open(file, encoding=encoding) as f:
        # Iterate the file object directly instead of materialising every
        # line with readlines() first.
        return [line.rstrip().split("\t") for line in f]

In [None]:
# Path of the annotated .tsv export to train on.
file = "./data/Medical NER V2 2000.tsv"
# Parse the file into one list of tab-separated fields per line.
word_ents = read_turks(file)
# Preview the first 20 parsed rows as the cell output.
word_ents[:20]

In [None]:
def clean_words(word_ents):
    '''Remove double-quote characters from the word of every row.

    Only the first field (the word) of each row is changed; all other fields
    are carried over untouched. Commas are deliberately left in place.

    Args:
        word_ents: Iterable of rows, each a list whose first item is the word.

    Returns:
        A new list of new row lists. The input rows are not modified.
    '''
    new_word_ents = []
    for ents in word_ents:
        # Copy the row so the caller's data is not mutated as a side effect.
        row = list(ents)
        if row:
            row[0] = row[0].replace('"', '')
        new_word_ents.append(row)
    return new_word_ents

In [None]:
# Strip quote characters from the words of the parsed rows.
new_ents = clean_words(word_ents)
# Preview the first 20 cleaned rows as the cell output.
new_ents[:20]

In [None]:
def create_seqs(word_ents):
    '''Group flat word/tag rows into sequences.

    A row with at most one field acts as a separator: the sequence collected
    so far is closed and a new one is started. Rows with more than one field
    are appended to the current sequence, except rows whose word is empty,
    which are skipped. A single trailing "." is removed from a word.

    Args:
        word_ents: List of rows, each a list of fields with the word first and
            the tag second.

    Returns:
        A list of sequences, each a list of rows. A sequence still open when
        the input ends is included as long as it is not empty.
    '''
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents) <= 1:
            # Separator row: close the current sequence.
            seqs.append(seq)
            seq = []
            continue
        word = ents[0]
        if not word:
            continue
        if word.endswith("."):
            seq.append([word[:-1], ents[1]])
        else:
            seq.append(ents)
    # Without this flush the final sequence is lost whenever the input does
    # not end with a separator row.
    if seq:
        seqs.append(seq)
    return seqs

In [None]:
# Group the cleaned rows into sequences.
seqs = create_seqs(new_ents)
# Show the first sequence as the cell output.
seqs[0]

### Expand Special Characters

In [None]:
# Characters that become tokens of their own: / - % > <
# The capturing group makes re.split keep the separators in its result.
_SPECIAL_CHARS = re.compile(r"([/\-%><])")


def expand_word(ent):
    '''Split a word at special characters, keeping each one as its own item.

    Args:
        ent: A row whose first item is the word and second item is its tag.

    Returns:
        A list of ``[piece, tag]`` lists, one per non-empty piece of the word,
        in their original order. Every piece carries the tag of ``ent``.
    '''
    return [[piece, ent[1]] for piece in _SPECIAL_CHARS.split(ent[0]) if piece]

def expand_special_chars(seqs):
    '''Run expand_word over every entry of every sequence.

    Returns a new list with one list per input sequence; each holds the
    concatenated expand_word results for that sequence's entries, in order.
    '''
    return [
        [piece for entry in seq for piece in expand_word(entry)]
        for seq in seqs
    ]

In [None]:
# Split special characters out of the words of every sequence.
ex_seqs = expand_special_chars(seqs)
# Show the first expanded sequence as the cell output.
ex_seqs[0]

### Encode Common Numeric Values

In [None]:
# (pattern, placeholder) pairs; each pattern must match the whole token.
# NOTE(review): the 4-digit placeholder used to be emitted as "4DigitNum"
# without angle brackets, unlike every other placeholder. Confirm nothing
# downstream depends on the unbracketed spelling.
_NUMERIC_ENCODINGS = (
    (re.compile(r"\d"), "<1DigitNum>"),
    (re.compile(r"\d\d"), "<2DigitNum>"),
    (re.compile(r"\d{3}"), "<3DigitNum>"),
    (re.compile(r"\d{4}"), "<4DigitNum>"),
    # At least one digit is required so that a lone "." stays a "." token.
    (re.compile(r"\d+\.\d*|\.\d+"), "<DecimalNum>"),
    (re.compile(r"\d+,\d+"), "<CommaNum>"),
    (re.compile(r"\d+'?s"), "<RangeNum>"),
)


def encode_numerics(seq):
    '''Replace common numeric tokens with placeholder encodings.

    Each word is stripped of surrounding whitespace and compared against the
    numeric patterns; the first pattern that matches the whole stripped word
    decides the placeholder. Words matching no pattern are kept (stripped).

    Args:
        seq: List of rows whose first item is the word and second the tag.

    Returns:
        A new list of ``[encoded_word, tag]`` lists in the same order.
    '''
    enc_seq = []
    for ent in seq:
        token = ent[0].strip()
        enc = token
        for pattern, placeholder in _NUMERIC_ENCODINGS:
            if pattern.fullmatch(token) is not None:
                enc = placeholder
                break
        enc_seq.append([enc, ent[1]])
    return enc_seq

In [None]:
# Replace numeric tokens with placeholder encodings in every sequence.
enc_seqs = [encode_numerics(seq) for seq in ex_seqs]
# Show the first encoded sequence as the cell output.
enc_seqs[0]

### 2. Formatting Entities to IOB (Inside, Outside, Beginning) Scheme

In [None]:
def clean_tags(word_ents):
    '''Rewrite the tags of one sequence in the IOB scheme.

    "O" tags are kept as they are. Any other tag is prefixed with "B-" when
    it is the first row or differs from the previous row's tag, and with
    "I-" when it repeats the previous row's tag.

    Returns a new list of ``[word, iob_tag]`` lists.
    '''
    tagged = []
    previous = None
    for position, entry in enumerate(word_ents):
        label = entry[1]
        if label == "O":
            iob = label
        elif position == 0 or label != previous:
            iob = "B-" + label
        else:
            iob = "I-" + label
        tagged.append([entry[0], iob])
        previous = label
    return tagged

In [None]:
# Convert the tags of every sequence to the IOB scheme.
cleaned_tag_seqs = [clean_tags(ents) for ents in enc_seqs]
# Show the sequence at index 3 as the cell output.
cleaned_tag_seqs[3]

### 3. Adding POS and Other CRF Specific Features

In [None]:
def add_pos(seqs):
    '''Attach part-of-speech tags to every sequence.

    For each sequence the words are passed to ``pos_tag`` and the entry's
    label (its second item) is appended to the tuple returned for that
    position.

    Returns a new list of sequences, each a list of tuples.
    '''
    tagged_seqs = []
    for sentence in seqs:
        tokens = [entry[0] for entry in sentence]
        tagged = pos_tag(tokens)
        tagged_seqs.append(
            [tagged[idx] + (entry[1],) for idx, entry in enumerate(sentence)]
        )
    return tagged_seqs

In [None]:
# Add part-of-speech tags to every IOB-tagged sequence.
pos_seqs = add_pos(cleaned_tag_seqs)
# Inspect the first tagged sequence and the total number of sequences.
print(pos_seqs[0])
print(len(pos_seqs))

In [None]:
def word2features(sent, i):
    '''
    Build the feature dictionary for the entry at position ``i`` of ``sent``.

    From:
    https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/

    Each entry of ``sent`` is read as ``entry[0]`` (word) and ``entry[1]``
    (POS tag). The result describes the entry itself plus its left and right
    neighbours; a missing neighbour is signalled with ``BOS`` or ``EOS``.
    '''
    def neighbour_features(offset, entry):
        # Features of an adjacent entry, keyed with its relative offset.
        token, tag = entry[0], entry[1]
        return {
            f'{offset}:word.lower()': token.lower(),
            f'{offset}:word.istitle()': token.istitle(),
            f'{offset}:word.isupper()': token.isupper(),
            f'{offset}:postag': tag,
            f'{offset}:postag[:2]': tag[:2],
        }

    current, current_tag = sent[i][0], sent[i][1]
    feats = dict([
        ('bias', 1.0),
        ('word.lower()', current.lower()),
        ('word[-3:]', current[-3:]),
        ('word[-2:]', current[-2:]),
        ('word.isupper()', current.isupper()),
        ('word.istitle()', current.istitle()),
        ('word.isdigit()', current.isdigit()),
        ('postag', current_tag),
        ('postag[:2]', current_tag[:2]),
    ])

    has_left = i > 0
    has_right = i < len(sent) - 1

    if has_left:
        feats.update(neighbour_features('-1', sent[i - 1]))
    else:
        feats['BOS'] = True

    if has_right:
        feats.update(neighbour_features('+1', sent[i + 1]))
    else:
        feats['EOS'] = True

    return feats

def sent2features(sent):
    '''Return the word2features dictionary for every position of ``sent``.'''
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    '''Return the third item (the label) of every 3-item entry of ``sent``.'''
    labels = []
    for _word, _pos, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    '''Return the first item (the token) of every 3-item entry of ``sent``.'''
    tokens = []
    for word, _pos, _tag in sent:
        tokens.append(word)
    return tokens

In [None]:
# Model inputs: one list of feature dictionaries per sequence.
x = [sent2features(s) for s in pos_seqs]
# Model targets: the matching list of labels per sequence.
y = [sent2labels(s) for s in pos_seqs]

### 4. Train Test Split

In [None]:
# Hold out 10% of the sequences for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 5. Training Model

In [None]:
# Configure the CRF: L-BFGS training, c1/c2 regularisation coefficients of 0.1
# (L1 and L2 respectively, per the sklearn-crfsuite docs) and at most 100
# iterations. all_possible_transitions=False leaves out transition features
# for label pairs that never occur in the training data.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [None]:
# Train on the training split.
crf.fit(x_train, y_train)
# Predict labels for the held-out split ...
pred = crf.predict(x_test)
# ... and for the training split, so both can be reported below.
tr_pred = crf.predict(x_train)

### 6. Evaluating Model Performance

In [None]:
# classification_report here is the one imported from seqeval.metrics.
report = classification_report(y_test,pred)
# Print the report for the held-out split between divider lines.
print("Test Results:\n","-"*60)
print(report)
print("-"*60)

In [None]:
# Same report for the training split, for comparison with the test results.
tr_report = classification_report(y_train,tr_pred)
print("Train Results:\n","-"*60)
print(tr_report)
print("-"*60)

Here we can see some of the weights used by the CRF in determining label classes.

In [None]:
# Render the weights learned by the CRF with eli5, limited via top=30.
eli5.show_weights(crf,top=30)

In [None]:
# F1 score on the held-out split; f1_score is the seqeval.metrics import.
model_f1 = f1_score(y_test,pred)
print("F1-Score:",model_f1)

The model yields a respectable F1-Score compared to the other models. Only the BiLSTM with Custom Word2Vec Embeddings is able to outperform it and it does so only marginally.

### 7. Saving Model Results

In [None]:
# Metadata recorded next to the score in the results file.
n_samples = len(x)  # number of sequences across the train and test splits
model_desc = "CRF"
results_file = "./data/model_results.csv"
note = "Simple CRF model with same encodings as Word2Vec Models"
def append_model_results(model_f1,n_samples,model_desc,file,note):
    '''Append one result row to a CSV file.

    The row holds, in order: the F1 score, the sample count as an integer,
    the model description, the current time from ``time.ctime()`` and the
    note. Fields are quoted by the csv module where needed, so commas,
    quotes or line breaks in ``model_desc`` or ``note`` do not break the row.

    Args:
        model_f1: Score to record.
        n_samples: Number of samples; converted with ``int``.
        model_desc: Short description of the model.
        file: Path of the CSV file to append to.
        note: Free-text note.
    '''
    # Imported locally so this cell does not depend on the import cell.
    import csv
    import io

    row = io.StringIO()
    csv.writer(row, lineterminator="").writerow(
        [str(model_f1), str(int(n_samples)), str(model_desc), time.ctime(), str(note)]
    )
    with open(file, 'a', encoding="utf-8") as f:
        # The row is prefixed (not suffixed) with a newline, matching how
        # earlier rows were written to this file.
        f.write("\n" + row.getvalue())
    print("~~~Results Successfully Saved")

In [None]:
# Append this run's score and metadata to the results file.
append_model_results(model_f1,n_samples,model_desc,results_file,note)

In [None]:
# Read the results file back and show its last rows as the cell output.
pd.read_csv(results_file).tail()

---