Notebook for training NER on CoNLL-2003
Achieves 93-96 F1 measure


In [None]:
import numpy as np
import os
import tensorflow as tf
import string
import random
import math
import sys

from ner_model import NerModel
from dataset_encoder import DatasetEncoder
from ner_model_saver import NerModelSaver
from embeddings_resolver import *

In [None]:
# Width of the word vectors: 1024 or 768 for BERT; 50, 100, 200 or 300 for GloVe.
word_dim = 1024

# Embedding backend used by the cells below: 'bert' or 'glove'.
embeddings_type = 'bert'

# GloVe vectors file; the dimension is part of the file name.
glove_embeddings_file = 'glove.6B.{}d.txt'.format(word_dim)

# BERT checkpoint: the H-1024 one for 1024-d vectors, the H-768 one for any other width.
if word_dim == 1024:
    bert_embeddings_file = 'cased_L-24_H-1024_A-16'
else:
    bert_embeddings_file = 'cased_L-12_H-768_A-12'

# Locations of the train / validation / test files.
dataset_folder = './conll2003/'
train_file = dataset_folder + 'eng.train'
test_file_a = dataset_folder + 'eng.testa'
test_file_b = dataset_folder + 'eng.testb'

In [None]:
def read_conll(file):
    """Lazily yield the sentences stored in a CoNLL-style file.

    Every line is stripped and split on single spaces.  A line with at
    least four fields whose first field is not '-DOCSTART-' contributes a
    (word, tag) pair taken from fields 0 and 3.  Any other line (blank,
    too short, or a document marker) closes the sentence collected so far.

    Args:
        file: path of the file to read.

    Yields:
        Non-empty lists of (word, tag) tuples, one list per sentence.
    """
    current = []

    with open(file) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(' ')

            if len(fields) >= 4 and fields[0] != '-DOCSTART-':
                current.append((fields[0], fields[3]))
                continue

            # Separator line: emit the pending sentence, if there is one.
            if current:
                yield current
                current = []

    # The file may end without a trailing separator line.
    if current:
        yield current

In [None]:
import pandas as pd

def slice(items, batch, start = 0):
    """Yield consecutive lists of up to *batch* elements taken from *items*.

    The first *start* elements are consumed but not emitted.  The final
    list may be shorter than *batch*; nothing is yielded when no element
    remains after skipping.

    NOTE: this name shadows the builtin ``slice`` for the rest of the
    notebook; it is kept because other cells call it by this name.

    Args:
        items: any iterable (it is traversed exactly once).
        batch: number of elements per emitted list.
        start: number of leading elements to skip.

    Yields:
        Lists of elements in their original order.
    """
    chunk = []
    for position, element in enumerate(items):
        if position < start:
            continue

        chunk.append(element)
        if len(chunk) < batch:
            continue

        yield chunk
        chunk = []

    # Emit the incomplete tail, if any.
    if len(chunk) > 0:
        yield chunk
            
def read_dataset_incrementally(file, encoder, flush_size = 1000, embeddings_name = ''):
    """Encode a CoNLL file in batches, checkpointing progress to an HDF file.

    The checkpoint is named ``<basename of file><embeddings_name>.hdf`` and
    is a relative path, so it lives in the current working directory.  If
    it already exists, its rows are loaded and that many sentences are
    skipped before encoding continues.
    NOTE(review): resuming by row count assumes ``encoder.encode`` produces
    exactly one record per input sentence - confirm against DatasetEncoder.

    Args:
        file: path of the CoNLL file to encode.
        encoder: object whose ``encode(sentences)`` result can be passed to
            ``pd.DataFrame``.
        flush_size: number of sentences encoded between two checkpoints.
        embeddings_name: suffix that distinguishes checkpoint files.

    Returns:
        All encoded rows (loaded and new) as a list of dicts.
    """
    cache_path = os.path.basename(file) + embeddings_name + '.hdf'

    if not os.path.exists(cache_path):
        result = pd.DataFrame()
    else:
        result = pd.read_hdf(cache_path, key='dataset')

    # Rows already in the checkpoint (zero for a fresh, empty frame).
    already_encoded = result.shape[0]

    for chunk in slice(read_conll(file), flush_size, already_encoded):
        frame = pd.DataFrame(encoder.encode(chunk))
        result = pd.concat([result, frame])

        # Store the accumulated frame after every batch so a crash loses
        # at most one batch of work.
        result.to_hdf(cache_path, key='dataset')

        print('totaly read:', result.shape[0])

    return result.to_dict('records')

In [None]:
# Suffix that keeps checkpoint files of different embedding setups apart.
embeddings_name = '_' + embeddings_type + '_' + str(word_dim)

if embeddings_type != 'glove':
    # Every type other than 'glove' is handled with the BERT resolver.
    bert = BertEmbeddingsResolver(bert_embeddings_file, max_length=128)
    encoder = DatasetEncoder(bert)

    # Checkpoint every 100 sentences on this path.
    train = read_dataset_incrementally(
        train_file, encoder, flush_size=100, embeddings_name=embeddings_name)
    valid = read_dataset_incrementally(
        test_file_a, encoder, flush_size=100, embeddings_name=embeddings_name)
    test = read_dataset_incrementally(
        test_file_b, encoder, flush_size=100, embeddings_name=embeddings_name)
else:
    resolver = EmbeddingsDbResolver.read_from_file(
        glove_embeddings_file, word_dim, lowercase=True)
    encoder = DatasetEncoder(resolver)

    # Checkpoint every 5000 sentences on this path.
    train = read_dataset_incrementally(
        train_file, encoder, flush_size=5000, embeddings_name=embeddings_name)
    valid = read_dataset_incrementally(
        test_file_a, encoder, flush_size=5000, embeddings_name=embeddings_name)
    test = read_dataset_incrementally(
        test_file_b, encoder, flush_size=5000, embeddings_name=embeddings_name)
    

In [None]:
def is_zero(arr):
    """Return True when no element of *arr* differs from zero.

    An empty *arr* counts as zero.  Scanning stops at the first element
    that is not equal to ``0.``.
    """
    return not any(value != 0. for value in arr)

def words_in_embeddings(dataset):
    """Count all-zero and non-zero word vectors in an encoded dataset.

    Args:
        dataset: iterable of records; each record's "word_embeddings" entry
            is iterated as a sequence of vectors.

    Returns:
        A tuple ``(zero, other)``: how many vectors ``is_zero`` accepts and
        how many it rejects.
    """
    zero_flags = [
        is_zero(vector)
        for sentence in dataset
        for vector in sentence["word_embeddings"]
    ]

    # is_zero returns a bool, so summing the flags counts the True ones.
    zero = sum(zero_flags)
    other = len(zero_flags) - zero

    return (zero, other)

# Print, for each split, the fraction of word vectors that are not all-zero.
# NOTE(review): an all-zero vector presumably means the word had no
# embedding - confirm against the resolver implementation.
# NOTE(review): each division raises ZeroDivisionError if the split
# contains no word vectors at all.
(zero, other) = words_in_embeddings(train)
print('train word embeddings coverage: {}'.format(other / (zero + other)))

(zero, other) = words_in_embeddings(valid)
print('valid word embeddings coverage: {}'.format(other / (zero + other)))

(zero, other) = words_in_embeddings(test)
print('test word embeddings coverage: {}'.format(other / (zero + other)))

In [None]:
# Number of encoded training records.
print(len(train))

# Every distinct tag that occurs in the training file.
labels = set()
for item in read_conll(train_file):
    labels.update(label for (word, label) in item)

print(labels)

# Size of the encoder's character table ...
print('chars: {}'.format(len(encoder.char2id)))

# ... next to the number of distinct characters seen in training words.
all_chars = set()
for item in read_conll(train_file):
    for (word, label) in item:
        all_chars.update(word)

print('chars: {}'.format(len(all_chars)))

In [None]:
# Build the NER model.  The add_* calls are kept in this exact order.
# NOTE(review): the builder presumably depends on call order - confirm
# in NerModel before reordering anything here.

# Tags passed to the model as "dummy"; only those known to the encoder are
# translated to ids, unknown ones are silently dropped.
dummy_tags = ['O', '[X]']
dummy_tag_ids = [encoder.tag2id[tag] for tag in dummy_tags if tag in encoder.tag2id]

ner = NerModel(dummy_tags = dummy_tag_ids, use_contrib=True)
# NOTE(review): both a CNN and a BiLSTM character representation are added
# with the same arguments (101, 25, 30); confirm in NerModel whether they
# are combined or whether the second call replaces the first, and what the
# three numbers mean.
ner.add_cnn_char_repr(101, 25, 30)
ner.add_bilstm_char_repr(101, 25, 30)
# The pretrained word embedding width must match the encoded dataset.
ner.add_pretrained_word_embeddings(word_dim)
# NOTE(review): meaning of (10, 128, 3) is defined by NerModel - TODO document.
ner.add_context_repr(10, 128, 3)
ner.add_inference_layer(True)
ner.add_training_op(5.0)

ner.init_variables()

In [None]:
# Train for 100 rounds; each round asks ner.train for the single epoch
# [i, i + 1) and then prints precision / recall / F1 measured on `test`.
# `valid` is handed to ner.train; how it is used is up to NerModel.
# NOTE(review): `po` is presumably a learning-rate decay factor - confirm
# against NerModel.train.
for i in range(0, 100):
    ner.train(train, 
          valid,
          lr = 1e-3,
          po = 0.005,
          batch_size = 32,
          dropout = 0.5,
          epoch_start = i, 
          epoch_end = i + 1
    )
    # Test-set quality after this epoch.
    (prec, rec, f1) = ner.measure(test)
    print('Test quality prec: {}, rec: {}, f1: {}'.format(prec, rec, f1))
    

In [None]:
# Save the trained model together with its encoder under the name
# 'conll_model' (what is written, and where, is decided by NerModelSaver).
saver = NerModelSaver(ner, encoder)
saver.save('conll_model')