# Training a CRF with DataTurks Annotated .tsv File.

In [6]:
# Clear the interactive namespace without prompting (-f) before the notebook runs.
%reset -f

In [7]:
import pandas as pd
import re
import numpy as np
import random
import time
from seqeval.metrics import f1_score,classification_report
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Input,concatenate,SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from nltk import pos_tag
import eli5
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score

## Formatting Data to Sequences

### Contents:

1. Reading File and formatting as sequences
2. Formatting Entities to IOB Scheme
3. Adding POS and Other CRF Features
4. Train Test Split
5. Training Model
6. Evaluating Model Performance
7. Saving Model Results

### 1. Reading and Formatting File:

In [8]:
def read_turks(file):
    '''
    Reads a tab-separated annotation file into a list of rows.

    Each line has its trailing whitespace stripped and is then split on
    tabs, so a blank line becomes [''] (a single empty field).

    file: path of the file to open.
    returns: list of lists of strings, one inner list per line.
    '''
    rows = []
    with open(file) as handle:
        for raw_line in handle:
            stripped = raw_line.rstrip()
            rows.append(stripped.split("\t"))
    return rows

In [9]:
# Path of the tab-separated annotation export to train on.
file = "./data/Medical NER V2 2000.tsv"
word_ents = read_turks(file)
# Preview the first 20 parsed rows.
word_ents[:20]

[['"In', 'O'],
 ['the', 'O'],
 ['Hospital3', 'GEO'],
 ['Emergency', 'GEO'],
 ['Room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['CPAP', 'O2 Saturation'],
 ['"', 'O'],
 [''],
 ['She', 'O'],
 ['was', 'O'],
 ['not', 'DOS'],
 ['able', 'DOS'],
 ['to', 'DOS']]

In [10]:
def clean_words(word_ents):
    '''
    Returns cleaned copies of the rows with quote characters and comma
    prefixes removed from each token.

    For every row the first element (the token) is cleaned as follows:
      * if a comma occurs after the first character, everything up to and
        including the first comma is discarded; a token that starts with
        a comma (such as a lone ',') is left unchanged by this step;
      * every double-quote character is removed.
    All remaining elements of the row (e.g. the tag) are carried over as is.

    The input rows are not modified: a new list is built for each row, so
    the caller's word_ents still holds the raw tokens afterwards.

    word_ents: iterable of rows, each a non-empty sequence whose first
        element is the token string.
    returns: list of new rows (lists) in the same order as the input.
    '''
    cleaned_rows = []
    for ents in word_ents:
        word = ents[0]
        comma_at = word.find(',')
        # > 0 (not >= 0) so a token that begins with a comma is kept.
        if comma_at > 0:
            word = word[comma_at + 1:]
        word = word.replace('"', '')
        # Build a fresh row instead of assigning into the caller's list.
        cleaned_rows.append([word] + list(ents[1:]))
    return cleaned_rows

In [11]:
# Clean the token of every parsed row, then preview the first 20 results.
new_ents = clean_words(word_ents)
new_ents[:20]

[['In', 'O'],
 ['the', 'O'],
 ['Hospital3', 'GEO'],
 ['Emergency', 'GEO'],
 ['Room', 'GEO'],
 [',', 'O'],
 ['her', 'O'],
 ['oxygen', 'O'],
 ['saturation', 'O'],
 ['was', 'O'],
 ['100%', 'O2 Saturation'],
 ['on', 'O2 Saturation'],
 ['CPAP', 'O2 Saturation'],
 ['', 'O'],
 [''],
 ['She', 'O'],
 ['was', 'O'],
 ['not', 'DOS'],
 ['able', 'DOS'],
 ['to', 'DOS']]

In [7]:
def create_seqs(word_ents):
    '''
    Groups flat token rows into sequences, splitting on separator rows.

    A row with more than one element is treated as a token row; it is
    added to the current sequence only if its token is longer than one
    character (single-character tokens are dropped). A row with a single
    element (as a blank line produces) closes the current sequence.

    A trailing sequence that is not followed by a separator row is also
    emitted, so the last sentence is kept when the input does not end
    with a blank row.

    word_ents: iterable of rows (sequences of strings).
    returns: list of sequences, each a list of the original token rows.
    '''
    seqs = []
    seq = []
    for ents in word_ents:
        if len(ents) > 1:
            if len(ents[0]) > 1:
                seq.append(ents)
        else:
            seqs.append(seq)
            seq = []
    # Flush whatever was collected after the last separator row.
    if seq:
        seqs.append(seq)
    return seqs

In [8]:
# Group the cleaned rows into sequences and inspect the first one.
seqs = create_seqs(new_ents)
seqs[0]

[['HISTORY', 'O'],
 ['OF', 'O'],
 ['PRESENT', 'O'],
 ['ILLNESS', 'O'],
 ['This', 'O'],
 ['is', 'O'],
 ['an', 'O'],
 ['81-year-old', 'CONDITION/SYMPTOM'],
 ['female', 'CONDITION/SYMPTOM'],
 ['with', 'O'],
 ['history', 'O'],
 ['of', 'O'],
 ['emphysema', 'CONDITION/SYMPTOM'],
 ['not', 'O'],
 ['on', 'O'],
 ['home', 'DRUG'],
 ['O2', 'DRUG'],
 ['who', 'O'],
 ['presents', 'O'],
 ['with', 'O'],
 ['three', 'AMOUNT'],
 ['days', 'AMOUNT'],
 ['of', 'O'],
 ['shortness', 'CONDITION/SYMPTOM'],
 ['of', 'CONDITION/SYMPTOM'],
 ['breath', 'CONDITION/SYMPTOM'],
 ['thought', 'O'],
 ['by', 'O'],
 ['her', 'O'],
 ['primary', 'O'],
 ['care', 'O'],
 ['doctor', 'O'],
 ['to', 'O'],
 ['be', 'O'],
 ['COPD', 'CONDITION/SYMPTOM'],
 ['flare', 'CONDITION/SYMPTOM']]

### 2. Formatting Entities to IOB (Inside, Outside, Beginning) Scheme

In [9]:
def clean_tags(word_ents):
    '''
    Rewrites the labels of one sequence using the IOB scheme.

    "O" labels are kept unchanged. Any other label gets a "B-" prefix when
    it is the first row of the sequence or differs from the previous row's
    label, and an "I-" prefix when it repeats the previous row's label.

    word_ents: sequence of rows with the token at index 0 and the label
        at index 1.
    returns: new list of [token, tag] pairs.
    '''
    labels = [row[1] for row in word_ents]
    tagged = []
    for position, row in enumerate(word_ents):
        label = labels[position]
        if label == "O":
            tagged.append([row[0], label])
            continue
        # An entity continues only when the preceding row carries the same label.
        continues_entity = position > 0 and label == labels[position - 1]
        prefix = "I-" if continues_entity else "B-"
        tagged.append([row[0], prefix + label])
    return tagged

In [10]:
# Convert the labels of every sequence to IOB tags and inspect one example.
cleaned_tag_seqs = [clean_tags(ents) for ents in seqs]
cleaned_tag_seqs[3]

[['In', 'O'],
 ['the', 'O'],
 ['Hospital3', 'O'],
 ['Emergency', 'O'],
 ['her', 'O'],
 ['oxygen', 'B-MEASUREMENT'],
 ['saturation', 'I-MEASUREMENT'],
 ['was', 'I-MEASUREMENT'],
 ['100%', 'I-MEASUREMENT'],
 ['on', 'O'],
 ['CPAP', 'O']]

### 3. Adding POS and Other CRF Specific Features

In [11]:
def add_pos(seqs):
    '''
    Adds a part-of-speech tag to every token of every sequence.

    The words of each sequence are passed to pos_tag together, and the
    label of each row is appended to the tuple pos_tag returned for that
    position.

    seqs: iterable of sequences of rows with the word at index 0 and the
        label at index 1.
    returns: list of sequences of tuples (pos_tag output + label).
    '''
    tagged_seqs = []
    for sentence in seqs:
        tokens = []
        for row in sentence:
            tokens.append(row[0])
        tagged = pos_tag(tokens)
        merged = []
        for position in range(len(sentence)):
            merged.append(tagged[position] + (sentence[position][1],))
        tagged_seqs.append(merged)
    return tagged_seqs

In [12]:
# Attach POS tags to the IOB-tagged sequences, then show the first
# sequence and the total number of sequences.
pos_seqs = add_pos(cleaned_tag_seqs)
print(pos_seqs[0])
print(len(pos_seqs))

[('HISTORY', 'NN', 'O'), ('OF', 'NNP', 'O'), ('PRESENT', 'NNP', 'O'), ('ILLNESS', 'NNP', 'O'), ('This', 'DT', 'O'), ('is', 'VBZ', 'O'), ('an', 'DT', 'O'), ('81-year-old', 'JJ', 'B-CONDITION/SYMPTOM'), ('female', 'NN', 'I-CONDITION/SYMPTOM'), ('with', 'IN', 'O'), ('history', 'NN', 'O'), ('of', 'IN', 'O'), ('emphysema', 'NN', 'B-CONDITION/SYMPTOM'), ('not', 'RB', 'O'), ('on', 'IN', 'O'), ('home', 'NN', 'B-DRUG'), ('O2', 'NNP', 'I-DRUG'), ('who', 'WP', 'O'), ('presents', 'VBZ', 'O'), ('with', 'IN', 'O'), ('three', 'CD', 'B-AMOUNT'), ('days', 'NNS', 'I-AMOUNT'), ('of', 'IN', 'O'), ('shortness', 'NN', 'B-CONDITION/SYMPTOM'), ('of', 'IN', 'I-CONDITION/SYMPTOM'), ('breath', 'NN', 'I-CONDITION/SYMPTOM'), ('thought', 'VBN', 'O'), ('by', 'IN', 'O'), ('her', 'PRP$', 'O'), ('primary', 'JJ', 'O'), ('care', 'NN', 'O'), ('doctor', 'NN', 'O'), ('to', 'TO', 'O'), ('be', 'VB', 'O'), ('COPD', 'NNP', 'B-CONDITION/SYMPTOM'), ('flare', 'NN', 'I-CONDITION/SYMPTOM')]
2592


In [13]:
def word2features(sent, i):
    '''
    Builds the CRF feature dictionary for the item at index i of sent.

    Every item of sent holds the word at index 0 and its POS tag at
    index 1. The features describe the word itself (lower-cased form,
    last three / two characters, case and digit flags, POS tag and its
    first two characters) plus the lower-cased form, case flags and POS
    tag of the previous and next items. 'BOS' is set when there is no
    previous item and 'EOS' when there is no next item.

    From:
    https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/

    sent: sequence of items indexable as item[0] (word), item[1] (POS tag).
    i: index of the item to describe.
    returns: dict mapping feature names to values.
    '''
    def context_features(prefix, neighbour):
        # Features of an adjacent item, keyed by its relative offset.
        near_word = neighbour[0]
        near_tag = neighbour[1]
        return {
            prefix + ':word.lower()': near_word.lower(),
            prefix + ':word.istitle()': near_word.istitle(),
            prefix + ':word.isupper()': near_word.isupper(),
            prefix + ':postag': near_tag,
            prefix + ':postag[:2]': near_tag[:2],
        }

    token = sent[i][0]
    tag = sent[i][1]

    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()
    features['postag'] = tag
    features['postag[:2]'] = tag[:2]

    # Left context, or the beginning-of-sequence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(context_features('-1', sent[i - 1]))

    # Right context, or the end-of-sequence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        features.update(context_features('+1', sent[i + 1]))

    return features

def sent2features(sent):
    '''Returns the word2features dictionary for every position of sent, in order.'''
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    '''Returns the label (third element) of every (token, postag, label) item of sent.'''
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    '''Returns the token (first element) of every (token, postag, label) item of sent.'''
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [14]:
# x: one list of feature dicts per sequence; y: the matching list of labels.
x = [sent2features(s) for s in pos_seqs]
y = [sent2labels(s) for s in pos_seqs]

### 4. Train Test Split

In [15]:
# Hold out 10% of the sequences for testing; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### 5. Training Model

In [16]:
# CRF trained with the L-BFGS algorithm for at most 100 iterations.
# c1 / c2 are the L1 / L2 regularisation coefficients (per the sklearn-crfsuite API).
# NOTE(review): all_possible_transitions=False presumably limits transition
# features to label pairs seen in the training data — confirm against the docs.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [17]:
# Fit on the training split, then predict label sequences for the held-out split.
crf.fit(x_train, y_train)
pred = crf.predict(x_test)

### 6. Evaluating Model Performance

In [18]:
# Build the classification report for the held-out predictions and print it
# between two 60-character rules.
report = classification_report(y_test,pred)
print("Test Results:\n","-"*60)
print(report)
print("-"*60)

Test Results:
 ------------------------------------------------------------
                   precision    recall  f1-score   support

CONDITION/SYMPTOM       0.63      0.57      0.60       305
         LOCATION       0.74      0.74      0.74       102
        FREQUENCY       1.00      0.23      0.38        13
      MEASUREMENT       0.62      0.54      0.58        65
             DRUG       0.82      0.72      0.77        86
            EVENT       0.60      0.49      0.54       120
              AGE       0.67      0.80      0.73        15
           AMOUNT       0.67      0.54      0.60        63
             TIME       0.58      0.39      0.47        36
           GENDER       0.58      0.78      0.67         9
     ORGANIZATION       0.80      0.62      0.70        13

        micro avg       0.66      0.59      0.62       827
        macro avg       0.67      0.59      0.62       827

------------------------------------------------------------


Here we can see some of the weights used by the CRF in determining label classes.

In [19]:
# Display the learned CRF weights, limited to the top 30 features.
eli5.show_weights(crf,top=30)

From \ To,O,B-AGE,I-AGE,B-AMOUNT,I-AMOUNT,B-CONDITION/SYMPTOM,I-CONDITION/SYMPTOM,B-DATE,B-DRUG,I-DRUG,B-EVENT,I-EVENT,B-FREQUENCY,I-FREQUENCY,B-GENDER,B-LOCATION,I-LOCATION,B-MEASUREMENT,I-MEASUREMENT,B-ORGANIZATION,I-ORGANIZATION,B-TIME,I-TIME
O,3.987,2.675,0.0,3.048,0.0,5.326,0.0,0.696,4.102,0.0,3.646,0.0,2.126,0.0,3.824,3.794,0.0,3.877,0.0,3.517,0.0,2.53,0.0
B-AGE,1.889,0.0,4.716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-AGE,0.757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-AMOUNT,-0.42,0.0,0.0,0.0,7.364,2.591,0.0,0.0,2.843,0.0,1.634,0.0,1.329,0.0,0.0,0.224,0.0,0.0,0.0,0.0,0.0,-0.249,0.0
I-AMOUNT,-0.418,0.0,0.0,0.0,6.641,2.218,0.0,0.0,2.443,0.0,0.403,0.0,1.501,0.0,0.0,0.0,0.0,0.0,0.0,1.095,0.0,-0.725,0.0
B-CONDITION/SYMPTOM,0.269,0.0,0.0,1.183,0.0,0.0,6.814,0.0,-0.323,0.0,0.368,0.0,-0.005,0.0,0.0,-0.374,0.0,0.755,0.0,0.0,0.0,0.501,0.0
I-CONDITION/SYMPTOM,0.416,0.0,0.0,1.12,0.0,0.0,7.011,0.0,0.0,0.0,0.219,0.0,-0.401,0.0,0.0,0.928,0.0,1.011,0.0,-0.16,0.0,0.8,0.0
B-DATE,-0.078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-DRUG,0.646,0.0,0.0,2.654,0.0,1.867,0.0,0.0,0.0,6.463,0.88,0.0,0.616,0.0,0.0,0.0,0.0,0.855,0.0,0.638,0.0,0.789,0.0
I-DRUG,0.13,0.0,0.0,2.052,0.0,0.296,0.0,0.0,0.0,6.454,0.0,0.0,1.156,0.0,0.0,0.0,0.0,0.706,0.0,0.0,0.0,0.422,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9,Unnamed: 20_level_9,Unnamed: 21_level_9,Unnamed: 22_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10,Unnamed: 20_level_10,Unnamed: 21_level_10,Unnamed: 22_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11,Unnamed: 20_level_11,Unnamed: 21_level_11,Unnamed: 22_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12,Unnamed: 20_level_12,Unnamed: 21_level_12,Unnamed: 22_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13,Unnamed: 20_level_13,Unnamed: 21_level_13,Unnamed: 22_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14,Unnamed: 20_level_14,Unnamed: 21_level_14,Unnamed: 22_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15,Unnamed: 20_level_15,Unnamed: 21_level_15,Unnamed: 22_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16,Unnamed: 20_level_16,Unnamed: 21_level_16,Unnamed: 22_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17,Unnamed: 20_level_17,Unnamed: 21_level_17,Unnamed: 22_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18,Unnamed: 20_level_18,Unnamed: 21_level_18,Unnamed: 22_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19,Unnamed: 20_level_19,Unnamed: 21_level_19,Unnamed: 22_level_19
Weight?,Feature,Unnamed: 2_level_20,Unnamed: 3_level_20,Unnamed: 4_level_20,Unnamed: 5_level_20,Unnamed: 6_level_20,Unnamed: 7_level_20,Unnamed: 8_level_20,Unnamed: 9_level_20,Unnamed: 10_level_20,Unnamed: 11_level_20,Unnamed: 12_level_20,Unnamed: 13_level_20,Unnamed: 14_level_20,Unnamed: 15_level_20,Unnamed: 16_level_20,Unnamed: 17_level_20,Unnamed: 18_level_20,Unnamed: 19_level_20,Unnamed: 20_level_20,Unnamed: 21_level_20,Unnamed: 22_level_20
Weight?,Feature,Unnamed: 2_level_21,Unnamed: 3_level_21,Unnamed: 4_level_21,Unnamed: 5_level_21,Unnamed: 6_level_21,Unnamed: 7_level_21,Unnamed: 8_level_21,Unnamed: 9_level_21,Unnamed: 10_level_21,Unnamed: 11_level_21,Unnamed: 12_level_21,Unnamed: 13_level_21,Unnamed: 14_level_21,Unnamed: 15_level_21,Unnamed: 16_level_21,Unnamed: 17_level_21,Unnamed: 18_level_21,Unnamed: 19_level_21,Unnamed: 20_level_21,Unnamed: 21_level_21,Unnamed: 22_level_21
Weight?,Feature,Unnamed: 2_level_22,Unnamed: 3_level_22,Unnamed: 4_level_22,Unnamed: 5_level_22,Unnamed: 6_level_22,Unnamed: 7_level_22,Unnamed: 8_level_22,Unnamed: 9_level_22,Unnamed: 10_level_22,Unnamed: 11_level_22,Unnamed: 12_level_22,Unnamed: 13_level_22,Unnamed: 14_level_22,Unnamed: 15_level_22,Unnamed: 16_level_22,Unnamed: 17_level_22,Unnamed: 18_level_22,Unnamed: 19_level_22,Unnamed: 20_level_22,Unnamed: 21_level_22,Unnamed: 22_level_22
+4.309,word.lower():history,,,,,,,,,,,,,,,,,,,,,
+4.180,word.lower():patient,,,,,,,,,,,,,,,,,,,,,
+3.720,word.lower():baseline,,,,,,,,,,,,,,,,,,,,,
+3.479,word.lower():possible,,,,,,,,,,,,,,,,,,,,,
+3.463,word.lower():similar,,,,,,,,,,,,,,,,,,,,,
+3.197,+1:word.lower():inital,,,,,,,,,,,,,,,,,,,,,
+3.172,word.lower():diagnosis,,,,,,,,,,,,,,,,,,,,,
+3.163,word.lower():postdialysis,,,,,,,,,,,,,,,,,,,,,
+3.068,word.lower():pt,,,,,,,,,,,,,,,,,,,,,
+3.054,word.lower():access,,,,,,,,,,,,,,,,,,,,,

Weight?,Feature
+4.309,word.lower():history
+4.180,word.lower():patient
+3.720,word.lower():baseline
+3.479,word.lower():possible
+3.463,word.lower():similar
+3.197,+1:word.lower():inital
+3.172,word.lower():diagnosis
+3.163,word.lower():postdialysis
+3.068,word.lower():pt
+3.054,word.lower():access

Weight?,Feature
+3.199,-1:word.lower():illness
+2.667,-1:word.lower():is
+2.532,-1:word.lower():age
+2.288,+1:word.lower():resident
+1.706,+1:word.lower():underwent
+1.694,word[-3:]:ale
+1.688,-1:word.lower():elderly
+1.670,word.lower():female
+1.619,-1:word.lower():this
+1.595,-1:word.lower():old

Weight?,Feature
+2.412,-1:word.lower():69
+1.252,word[-3:]:old
+1.157,word[-2:]:ld
+1.154,+1:word.lower():right-handed
+1.141,-1:word.lower():over
+1.128,word[-3:]:90
+1.128,word.lower():90
+1.118,word[-2:]:90
+1.113,+1:word.lower():status
+1.093,+1:postag:NN

Weight?,Feature
+3.701,word[-2:]:mg
+3.219,-1:word.lower():cough
+3.004,word.lower():none
+2.800,word.lower():1-25
+2.742,-1:word.lower():sytems
+2.626,+1:word.lower():of
+2.624,+1:word.lower():bare
+2.571,+1:word.lower():ng
+2.507,word.lower():days
+2.390,word.lower():liter

Weight?,Feature
+2.389,word.lower():day
+2.015,-1:word.lower():hours
+2.012,word.lower():minute
+1.839,-1:word.lower():moderate
+1.788,+1:word.lower():completed
+1.786,+1:word.lower():history
+1.773,+1:word.lower():course
+1.760,word.lower():x1
+1.705,-1:word.lower():evening
+1.654,word[-2:]:x1

Weight?,Feature
+4.214,word[-2:]:ia
+4.079,word[-3:]:tis
+3.967,word.lower():vomited
+3.927,word.lower():swelling
+3.866,word.lower():vomiting
+3.822,word.lower():seizure
+3.818,word.lower():lightheaded
+3.681,word.lower():afebrile
+3.602,word.lower():unresponsive
+3.553,word.lower():somnolent

Weight?,Feature
+3.840,+1:word.lower():until
+2.988,+1:word.lower():wheezing
+2.875,+1:word.lower():fever/chills/cough
+2.815,+1:word.lower():growth
+2.731,word.lower():fever/chills/cough
+2.631,word.lower():pain
+2.623,-1:word.lower():dilated
+2.606,word.lower():vomiting
+2.530,-1:word.lower():speech
+2.513,-1:word.lower():eyes

Weight?,Feature
2.013,word.lower():2149-11-17
1.858,-1:word.lower():on
1.762,word.lower():2171-11-4
1.762,word[-3:]:1-4
1.568,word[-2:]:-4
1.281,word[-2:]:17
1.225,word[-3:]:-17
1.207,postag:JJ
1.083,postag[:2]:JJ
0.76,+1:word.lower():and

Weight?,Feature
+3.485,word[-2:]:ol
+3.273,word[-3:]:rin
+3.261,word[-2:]:NS
+3.229,word.lower():nebulizers
+3.093,word.lower():versed
+3.035,word[-2:]:yl
+2.920,word.lower():medications
+2.888,-1:word.lower():received
+2.882,word.lower():pressors
+2.827,word[-3:]:ium

Weight?,Feature
+2.134,-1:word.lower():125
+1.866,word.lower():meds
+1.843,word[-2:]:in
+1.809,word[-3:]:eds
+1.752,word[-2:]:en
+1.706,word.lower():gtt
+1.624,+1:word.lower():gtt
+1.528,word[-3:]:cin
+1.503,word[-2:]:yl
+1.494,-1:word.lower():iv

Weight?,Feature
+5.915,word.lower():intubated
+4.654,word.lower():extubated
+4.511,word.lower():hemodialysis
+3.964,word.lower():hospitalization
+3.592,word.lower():intubation
+3.365,word.lower():noncompliant
+3.055,word.lower():arrived
+3.034,word.lower():ultrasound
+2.869,word[-3:]:ube
+2.839,word[-2:]:py

Weight?,Feature
+2.983,-1:word.lower():review
+2.501,word.lower():removed
+2.470,+1:word.lower():feeling
+2.419,+1:word.lower():sent
+2.277,word[-3:]:sty
+2.272,+1:word.lower():pending
+2.238,word[-2:]:py
+2.112,+1:word.lower():hospitalizations
+2.088,word.lower():stents
+2.080,-1:word.lower():ua

Weight?,Feature
+4.128,word.lower():intermittently
+3.200,word.lower():frequent
+3.012,word.lower():chronic
+2.643,word.lower():daily
+2.632,word.lower():constant
+2.630,word.lower():intermittent
+2.303,word.lower():occasional
+2.112,word.lower():occasionally
+2.110,-1:word.lower():only
+2.023,word[-2:]:nt

Weight?,Feature
+2.545,-1:word.lower():every
+1.527,-1:word.lower():per
+1.500,+1:word.lower():off
+1.431,word[-3:]:day
+1.371,word.lower():day
+1.341,word.lower():times
+1.327,word[-3:]:mes
+1.308,word[-2:]:ay
+1.244,word[-3:]:off
+1.244,word.lower():off

Weight?,Feature
+3.588,word[-3:]:man
+3.212,word[-3:]:ale
+2.170,word[-2:]:an
+2.155,word.lower():male
+2.059,word[-2:]:le
+1.924,word.lower():man
+1.736,-1:word.lower():yo
+1.690,-1:word.lower():young
+1.233,+1:word.lower():patient
+1.210,word.lower():lady

Weight?,Feature
+3.545,word.lower():home
+3.247,word.lower():substernal
+3.033,word.lower():rehabilitation
+2.992,-1:word.lower():discharged
+2.911,-1:word.lower():twi
+2.908,word[-3:]:ded
+2.887,word.lower():c-spine
+2.731,+1:word.lower():signout
+2.509,+1:word.lower():colon
+2.494,word.lower():lungs

Weight?,Feature
+2.590,-1:word.lower():left
+2.490,-1:word.lower():upper
+2.390,word.lower():laboratory
+2.385,+1:word.lower():756
+2.091,+1:word.lower():garden
+2.055,word.lower():hospital
+2.048,word[-3:]:rso
+2.048,word.lower():torso
+1.984,+1:word.lower():calf
+1.920,+1:word.lower():ca

Weight?,Feature
+3.016,word.lower():hypotension
+2.918,-1:word.lower():initial
+2.726,BOS
+2.664,word.lower():elevated
+2.512,-1:word.lower():runs
+2.436,+1:word.lower():showing
+2.427,word.lower():negative
+2.226,-1:word.lower():intubation
+2.195,word.lower():unremarkable
+2.124,+1:word.lower():scc

Weight?,Feature
+3.318,word[-2:]:0s
+2.024,+1:word.lower():been
+1.917,-1:word.lower():pressure
+1.799,word[-2:]:'s
+1.722,word.lower():percent
+1.698,-1:word.lower():hr
+1.595,word.lower():range
+1.517,-1:word.lower():hematocrit
+1.511,+1:word.lower():2l
+1.485,-1:word.lower():fever

Weight?,Feature
+3.562,word[-3:]:ist
+3.177,-1:word.lower():per
+2.895,-1:word.lower():called
+2.800,word.lower():ems
+2.609,word[-3:]:ogy
+2.563,word[-2:]:gy
+2.353,-1:word.lower():by
+2.314,word.lower():medicine
+2.233,word[-2:]:ro
+1.970,word.lower():ambulance

Weight?,Feature
+1.770,word.lower():service
+1.730,word.lower():physician
+1.701,-1:word.lower():primary
+1.668,word[-3:]:ian
+1.631,-1:word.lower():ed
+1.538,+1:word.lower():team
+1.354,word.lower():services
+1.283,+1:word.lower():service
+1.270,word[-3:]:ces
+1.236,word[-3:]:aff

Weight?,Feature
+4.635,word.lower():overnight
+3.221,word.lower():am
+3.185,word[-3:]:day
+3.119,+1:word.lower():summer
+3.046,word[-2:]:pm
+2.732,word.lower():yest
+2.664,word.lower():afternoon
+2.651,word.lower():evening
+2.633,word[-2:]:ay
+2.529,+1:word.lower():#3

Weight?,Feature
+2.747,word.lower():later
+2.711,word.lower():morning
+2.423,-1:word.lower():post
+2.416,-1:word.lower():last
+2.349,-1:word.lower():past
+2.162,-1:word.lower():before
+2.133,word.lower():ago
+2.133,word[-3:]:ago
+2.080,+1:word.lower():abg
+2.048,word[-2:]:am


In [20]:
# Single summary F1 score on the held-out split; stored for the results log below.
model_f1 = f1_score(y_test,pred)
print("F1-Score:",model_f1)

F1-Score: 0.6225080385852089


The model yields a respectable F1-Score compared to the other models. Only the BiLSTM with Custom Word2Vec Embeddings is able to outperform it and it does so only marginally.

### 7. Saving Model Results

In [21]:
# Metadata recorded next to the score in the results file.
n_samples = len(x)  # number of sequences before the train/test split
model_desc = "CRF"  # plain string: the original f-string had no placeholders
results_file = "./nlp_data/model_results.csv"
note = '''Simple CRF model'''
def append_model_results(model_f1,n_samples,model_desc,file,note):
    '''
    Appends one result row to the CSV file at the given path.

    The row holds, in order: the F1 score, the sample count (as an int),
    the model description, the current time from time.ctime(), and the
    note. Fields are written through the csv module so a comma or a
    double quote inside model_desc or note is quoted instead of
    splitting the row into extra columns.

    As before, the row is preceded by a newline and not followed by one,
    so the function can keep appending to a file that does not end with
    a line break.

    model_f1: score to record.
    n_samples: number of samples; converted with int().
    model_desc: short description of the model.
    file: path of the CSV file, opened in append mode.
    note: free-text note for the run.
    returns: None.
    '''
    import csv
    import io

    fields = [str(model_f1), str(int(n_samples)), str(model_desc), time.ctime(), str(note)]
    # Render the row without a line terminator; the newline goes in front.
    buffer = io.StringIO()
    csv.writer(buffer, lineterminator="").writerow(fields)
    with open(file,'a') as f:
        f.write("\n" + buffer.getvalue())
    print("~~~Results Successfully Saved")
    return

In [22]:
# Append this run's score and metadata to the results file.
append_model_results(model_f1,n_samples,model_desc,results_file,note)

~~~Results Successfully Saved


In [23]:
# Show the last rows of the results file, including the one just appended.
pd.read_csv(results_file).tail()

Unnamed: 0,F1-Score,N-Samples,Model Type,Date,Note
13,0.606472,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:30:13 2019,Reduced Max Sequence Length to 43 (95th percen...
14,0.606472,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:30:51 2019,Used Custom Word2Vec Embeddings of entire Disc...
15,0.622917,2592.0,BiLSTM-Word2Vec-EmbedSize-100,Thu Nov 7 13:46:03 2019,Used Custom Word2Vec Embeddings of entire Disc...
16,0.545845,2592.0,BiLSTM-Glove-EmbedSize-100,Thu Nov 7 14:34:22 2019,Max Len Reverted Back to 50 Words
17,0.622508,2592.0,CRF,Thu Nov 7 14:41:08 2019,Simple CRF model


---