Notebook for training an NER model on CoNLL-2003
Achieves 91.5-97.7 F1 measure


In [None]:
# Dimensionality of the pre-trained GloVe vectors; it also selects which
# glove.6B.<dim>d.txt file is read.
word_dim = 100

# Location of the GloVe text file.
word_embeddings_folder = './'
word_embeddings_name = 'glove.6B.{}d.txt'.format(word_dim)
word_embeddings_file = word_embeddings_folder + word_embeddings_name

# Location of the three dataset splits (train / testa / testb).
dataset_folder = './'
train_file, test_file_a, test_file_b = (
    dataset_folder + split_name
    for split_name in ('eng.train', 'eng.testa', 'eng.testb')
)

In [None]:
import numpy as np
import os
import tensorflow as tf
import string
import random
import math
import sys

from ner_model import NerModel
from dataset_encoder import DatasetEncoder
from ner_model_saver import NerModelSaver

In [None]:
def read_glove(file, normalize=None):
    """Load GloVe-style text embeddings into a vocabulary and a vector list.

    Every non-blank line is expected to contain a token followed by its
    space-separated float components. Before the first real entry a
    placeholder token '__oov__' with an all-zero vector of the same length
    is registered, so it always receives id 0.

    Args:
        file: Path to the UTF-8 encoded embeddings text file.
        normalize: Optional callable applied to each token before it is used
            as a vocabulary key. Defaults to ``DatasetEncoder.normalize``.

    Returns:
        A ``(word2id, vectors)`` tuple. ``vectors[word2id[w]]`` is the vector
        stored for ``w``. When several tokens normalize to the same key, the
        key points at the vector of the last such token; the earlier vectors
        stay in ``vectors`` so that indices remain stable.
        An empty file yields ``({}, [])``.
    """
    if normalize is None:
        normalize = DatasetEncoder.normalize

    word2id = {}
    vectors = []

    def add_vector(word, vector):
        # Take the id from the vector list rather than from the dict size:
        # the dict does not grow when `word` is already present, which would
        # shift the ids of every word added afterwards.
        word2id[word] = len(vectors)
        vectors.append(vector)

    with open(file, encoding='utf-8') as f:
        for line in f:
            items = line.rstrip().split(' ')
            if len(items) < 2:
                # Blank line or a token without any components: nothing to add.
                continue

            word = items[0]
            vector = [float(x) for x in items[1:]]

            if not vectors:
                # The placeholder needs the vector length, which is only
                # known once the first real line has been parsed.
                add_vector('__oov__', [0] * len(vector))

            add_vector(normalize(word), vector)

    return word2id, vectors


# Yields sentences one by one; each sentence is a list of (word, tag) tuples
def read_conll(file):
    """Stream the sentences of a space-separated CoNLL-style file.

    A line with at least four fields whose first field is not '-DOCSTART-'
    contributes a token: field 0 is the word and field 3 is the tag. Any
    other line (blank, too short, or a '-DOCSTART-' marker) closes the
    sentence collected so far. Empty sentences are never yielded.
    """
    current = []

    with open(file) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(' ')
            is_token = len(fields) >= 4 and fields[0] != '-DOCSTART-'

            if is_token:
                current.append((fields[0], fields[3]))
                continue

            # Boundary line: emit the pending sentence, if there is one.
            if current:
                yield current
                current = []

    # The file may end without a trailing boundary line.
    if current:
        yield current

In [None]:
# Load the GloVe vocabulary (word -> id) and the list of vectors from disk.
word2id, embeddings = read_glove(word_embeddings_file)

In [None]:
# Number of distinct vocabulary keys, counting the '__oov__' placeholder.
print(len(word2id))

In [None]:
# Encoder built from the GloVe vocabulary and vectors; its encode() is applied
# to the (word, tag) sentences produced by read_conll.
# NOTE(review): the exact output format is defined in dataset_encoder.DatasetEncoder,
# which is not visible here.
encoder = DatasetEncoder(word2id, embeddings)

In [None]:
def _encode_file(path):
    """Read one CoNLL file and return its encoded sentences as a list."""
    sentences = read_conll(path)
    return list(encoder.encode(sentences))


# Materialize every split once so it can be iterated repeatedly later on.
train = _encode_file(train_file)
valid = _encode_file(test_file_a)
test = _encode_file(test_file_b)

In [None]:
def words_in_embeddings(dataset):
    """Count how many word ids in `dataset` are 0 and how many are not.

    `dataset` is an iterable of sentences, each exposing its ids under the
    "word_ids" key. Id 0 is presumably the out-of-vocabulary slot
    ('__oov__' is the first entry registered by read_glove).

    Returns a tuple (ids equal to 0, ids different from 0).
    """
    total = 0
    missing = 0

    for sentence in dataset:
        for word_id in sentence["word_ids"]:
            total += 1
            if word_id == 0:
                missing += 1

    return (missing, total - missing)

# Report, per split, the fraction of tokens whose word id is not 0.
for dataset, message in (
    (train, 'train word embeddings coverage: {}'),
    (valid, 'valid word embeddings coverage: {}'),
    (test, 'test word embeddings coverage: {}'),
):
    (zero, other) = words_in_embeddings(dataset)
    print(message.format(other / (zero + other)))

In [None]:
# Number of encoded training sentences (the training file is re-read and re-encoded here).
print(len(list(encoder.encode(read_conll(train_file)))))

# Distinct tags that occur in the training file.
labels = {
    tag
    for sentence in read_conll(train_file)
    for (_, tag) in sentence
}
print(labels)

# Size of the encoder's character vocabulary ...
print('chars: {}'.format(len(encoder.char2id)))

# ... next to the number of distinct characters actually seen in training words.
all_chars = {
    char
    for sentence in read_conll(train_file)
    for (token, _) in sentence
    for char in token
}
print('chars: {}'.format(len(all_chars)))

In [None]:
# Assemble the NER model piece by piece, then initialize its variables.
# NOTE(review): the meaning of the numeric/boolean arguments below is defined
# in ner_model.NerModel, which is not visible here — confirm before changing.
ner = NerModel()
ner.add_cnn_char_repr(101, 25, 30)  # presumably character-vocabulary / embedding / filter sizes — TODO confirm
ner.add_pretrained_word_embeddings(word_dim)  # same dimension that selected the GloVe file
ner.add_context_repr(10, 200)  # presumably tag count and hidden size — TODO confirm
ner.add_inference_layer(False)  # NOTE(review): flag semantics unknown from here — verify
ner.add_training_op(5.0)  # presumably a gradient-clipping threshold — TODO confirm

ner.init_variables()

In [None]:
# Train one epoch per ner.train call so the test-set metrics can be printed
# after every epoch.
for epoch in range(100):
    ner.train(
        train,
        valid,
        lr=0.2,
        po=0.05,
        batch_size=9,
        dropout=0.5,
        epoch_start=epoch,
        epoch_end=epoch + 1,
    )
    prec, rec, f1 = ner.measure(test)
    print('Test quality prec: {}, rec: {}, f1: {}'.format(prec, rec, f1))
    

In [None]:
# Save the trained model together with the encoder under the name 'conll_model'.
# NOTE(review): what exactly is written, and where, is decided by NerModelSaver,
# which is not visible here.
saver = NerModelSaver(ner, encoder)
saver.save('conll_model')