### P.S.
* The goal here is not to chase the highest possible F1 score but to give a quick example of how to set up the model, so all model hyper-parameters are set to deliberately simple values to keep the run fast.
* To build a more robust model, modify the settings in `config.py`.

* This example uses the **CoNLL2003** English NER task. Download the dataset and place the files as follows:
```
.
└───data
│   └───CoNLL2003
│       │   eng.testa
│       │   eng.testb
│       │   eng.train
```

In [1]:
# Silence all Python warnings for the rest of the notebook. The filter is
# installed before the heavy imports below, presumably so that warnings
# emitted while importing them are hidden as well.
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf  # used below for tf.reset_default_graph() between experiments
import numpy as np  # used below to allocate the contextual-embedding lookup tables

In [6]:
# Project-local imports shared by the experiments below.
from model import Model
# Reset TensorFlow's default graph before the first model of this notebook is built.
tf.reset_default_graph()
from utils import get_idx, get_inputs
# NOTE: every experiment section below re-imports its own Config from
# config_examples, which rebinds this name.
from config import Config

# GloVe

In [7]:
# GloVe experiment: configuration
from config_examples.config_glove import Config

# Pre-trained GloVe vectors, and the directory that receives the predictions,
# the saved model and the index files.
glove_file_path = 'data/glove/glove.6B.100d.txt'
save_path = 'test/glove_test/'

config = Config('glove')
config.init_glove(glove_file_path, save_path)

# Parse the corpus once to build the indexes, then generate the model inputs
# for the three splits in the order train -> eval -> test.
token2idx, char2idx, label2idx, lookup_table = get_idx(config)
(train_x, train_y), (eval_x, eval_y), (test_x, test_y) = [
    get_inputs(split, token2idx, char2idx, label2idx, config)
    for split in ('train', 'eval', 'test')
]

2019-03-26 17:14:45,310 config object Initialized
Building vocab...
vocabulary for this corpus: 12447 tokens, 85 chars, 8 labels
vocabulary construction time:  7.563478005002253


In [8]:
# Build the NER model for the current `config`, train it, then score it on the
# eval and test splits.
try:
    ner_model = Model(config)
    ner_model.build_graph()
    ner_model.initialize_session()
    try:
        # training and test
        ner_model.train(train_x, train_y, eval_x, eval_y)
        ner_model.test(eval_x, eval_y, 'eval')
        ner_model.test(test_x, test_y, 'test')
    finally:
        # Release the session even when training/testing fails or is interrupted.
        ner_model.close()
finally:
    # Always leave a clean default graph behind, so the next experiment can
    # build its own model even if this one failed half-way.
    tf.reset_default_graph()

2019-03-26 17:14:56,611 Initializing tf session
2019-03-26 17:14:56,848 Epoch 1 out of 5
2019-03-26 17:15:12,391 Epoch 1 's F1 =31.139110311133965, epoch_runing_time =15.541498899459839 .
2019-03-26 17:15:12,393 - new best F1, save new model.
2019-03-26 17:15:12,782 Epoch 2 out of 5
2019-03-26 17:15:26,976 Epoch 2 's F1 =62.235889296696755, epoch_runing_time =14.19306468963623 .
2019-03-26 17:15:26,978 - new best F1, save new model.
2019-03-26 17:15:27,249 Epoch 3 out of 5
2019-03-26 17:15:40,412 Epoch 3 's F1 =71.09647058823529, epoch_runing_time =13.162132740020752 .
2019-03-26 17:15:40,414 - new best F1, save new model.
2019-03-26 17:15:40,674 Epoch 4 out of 5
2019-03-26 17:15:54,906 Epoch 4 's F1 =75.21691973969631, epoch_runing_time =14.231114149093628 .
2019-03-26 17:15:54,908 - new best F1, save new model.
2019-03-26 17:15:55,156 Epoch 5 out of 5
2019-03-26 17:16:09,319 Epoch 5 's F1 =77.59048970901348, epoch_runing_time =14.161910057067871 .
2019-03-26 17:16:09,321 - new best F

# w2v

In [15]:
# word2vec experiment: configuration
from config_examples.config_w2v import Config
from gensim.models import KeyedVectors

# Binary word2vec file that is loaded with gensim and handed to the config.
path ="data/GoogleNews-vectors-negative300.bin"
config = Config('w2v')
w2v = KeyedVectors.load_word2vec_format(path, binary=True)
config.init_w2v(w2v)

# Parse the corpus once to build the indexes, then generate the model inputs
# for the three splits in the order train -> eval -> test.
token2idx, char2idx, label2idx, lookup_table = get_idx(config)
(train_x, train_y), (eval_x, eval_y), (test_x, test_y) = [
    get_inputs(split, token2idx, char2idx, label2idx, config)
    for split in ('train', 'eval', 'test')
]

# Instantiate the NER model for this config, build its graph and open a session;
# training happens in the next cell.
ner_model = Model(config)
ner_model.build_graph()
ner_model.initialize_session()

In [14]:
# Train the model built in the previous cell, then score it on the eval and
# test splits.
try:
    ner_model.train(train_x, train_y, eval_x, eval_y)
    ner_model.test(eval_x, eval_y, 'eval')
    ner_model.test(test_x, test_y, 'test')
finally:
    # Clean up even when training/testing fails or is interrupted: release the
    # session first, then reset the default graph for the next experiment.
    try:
        ner_model.close()
    finally:
        tf.reset_default_graph()

# Fasttext

In [12]:
# fastText experiment: configuration
from config_examples.config_fasttext import Config

# Presumably the fastText executable and the pre-trained binary model it
# should use; both are passed straight to config.init_fasttext.
command ='../fastText/fasttext'
bin_file ='../fastText/data/cc.en.300.bin'
config = Config('fasttext')
config.init_fasttext(command, bin_file)

# Parse the corpus once to build the indexes, then generate the model inputs
# for the three splits in the order train -> eval -> test.
token2idx, char2idx, label2idx, lookup_table = get_idx(config)
(train_x, train_y), (eval_x, eval_y), (test_x, test_y) = [
    get_inputs(split, token2idx, char2idx, label2idx, config)
    for split in ('train', 'eval', 'test')
]

# Instantiate the NER model for this config, build its graph and open a session;
# training happens in the next cell.
ner_model = Model(config)
ner_model.build_graph()
ner_model.initialize_session()

In [13]:
# Train the model built in the previous cell, then score it on the eval and
# test splits.
try:
    ner_model.train(train_x, train_y, eval_x, eval_y)
    ner_model.test(eval_x, eval_y, 'eval')
    ner_model.test(test_x, test_y, 'test')
finally:
    # Clean up even when training/testing fails or is interrupted: release the
    # session first, then reset the default graph for the next experiment.
    try:
        ner_model.close()
    finally:
        tf.reset_default_graph()

# Contextual Embedding

## flair + glove

In [16]:
# flair + GloVe experiment: configuration
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, FlairEmbeddings
from config_examples.config_contextual import Config
from utils import load_cropus, get_cropus_len, get_inputs_contextual
config = Config('flair_glove')

# Create a StackedEmbeddings object that combines the embeddings you want:
# here GloVe word vectors plus the forward and backward "fast" flair LMs.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
])

# Build the indexes and load the corpus into flair objects.
token2idx1, char2idx, label2idx = get_idx(config)
train, dev, test = load_cropus(config)

# The lookup table has shape
#   [number of tokens in the corpus, dimension of the stacked embedding].
# Both numbers depend on your own corpus and on the embedding combination you
# choose (for the CoNLL dataset the corpus length is 301418), so they are
# computed here instead of being hard-coded: a hard-coded width silently goes
# stale as soon as the list of stacked embeddings above is changed.
datasets = [config.path_train, config.path_eval, config.path_test]
cropus_len = get_cropus_len(datasets)
embedding_dim = stacked_embeddings.embedding_length
lookup_table = np.zeros([cropus_len, embedding_dim])
token2idx = []

# Embed the three splits one after another. Each call receives the offset
# returned by the previous one (starting at 0) together with the shared
# lookup_table / token2idx objects.
# NOTE(review): presumably get_inputs_contextual fills lookup_table and
# token2idx in place starting at the given offset -- confirm in utils.
train_x, train_y, offset = get_inputs_contextual(
    train, stacked_embeddings, 0,
    lookup_table, token2idx, char2idx, label2idx,
)
eval_x, eval_y, offset1 = get_inputs_contextual(
    dev, stacked_embeddings, offset,
    lookup_table, token2idx, char2idx, label2idx,
)
test_x, test_y, offset2 = get_inputs_contextual(
    test, stacked_embeddings, offset1,
    lookup_table, token2idx, char2idx, label2idx,
)

# Update the lookup_table and token2idx in the config according to the dataset,
# since with contextual embeddings they are context dependent.
config.init_contextual(lookup_table, token2idx)

In [17]:
# Build the NER model for the current `config`, train it, then score it on the
# eval and test splits.
try:
    ner_model = Model(config)
    ner_model.build_graph()
    ner_model.initialize_session()
    try:
        # training and test
        ner_model.train(train_x, train_y, eval_x, eval_y)
        ner_model.test(eval_x, eval_y, 'eval')
        ner_model.test(test_x, test_y, 'test')
    finally:
        # Release the session even when training/testing fails or is interrupted.
        ner_model.close()
finally:
    # Reset the default graph like the other experiment sections do, so that a
    # following experiment can build its own model on a clean graph.
    tf.reset_default_graph()

## elmo + w2v

In [None]:
# ELMo + word2vec experiment: configuration
from config_examples.config_contextual import Config
from utils import load_cropus, get_cropus_len, get_inputs_contextual
from flair.embeddings import ELMoEmbeddings,StackedEmbeddings,WordEmbeddings

# NOTE: machine-specific absolute path to the gensim-format word2vec vectors;
# point it to your own copy before running this cell.
w2v_gensim_path = '/home/semantic/Liang_NER/data/word_embedding/word2vec/w2v.gensim'

elmo_embedding = ELMoEmbeddings()
w2v_embedding = WordEmbeddings(w2v_gensim_path)
config = Config('elmo_w2v')

# Build the indexes and load the corpus into flair objects.
token2idx1, char2idx, label2idx = get_idx(config)
train, dev, test = load_cropus(config)

# Create a StackedEmbeddings object that combines the embeddings you want.
stacked_embeddings = StackedEmbeddings(embeddings=[w2v_embedding,elmo_embedding])

# The lookup table has shape
#   [number of tokens in the corpus, dimension of the stacked embedding].
# The width is read from the stacked embedding instead of being hard-coded, so
# it always matches the embedding combination chosen above.
datasets = [config.path_train, config.path_eval, config.path_test]
cropus_len = get_cropus_len(datasets)
embedding_dim = stacked_embeddings.embedding_length
lookup_table = np.zeros([cropus_len, embedding_dim])
token2idx = []

# Embed the three splits one after another. Each call receives the offset
# returned by the previous one (starting at 0) together with the shared
# lookup_table / token2idx objects.
# NOTE(review): presumably get_inputs_contextual fills lookup_table and
# token2idx in place starting at the given offset -- confirm in utils.
train_x, train_y, offset = get_inputs_contextual(
    train, stacked_embeddings, 0,
    lookup_table, token2idx, char2idx, label2idx,
)
eval_x, eval_y, offset1 = get_inputs_contextual(
    dev, stacked_embeddings, offset,
    lookup_table, token2idx, char2idx, label2idx,
)
test_x, test_y, offset2 = get_inputs_contextual(
    test, stacked_embeddings, offset1,
    lookup_table, token2idx, char2idx, label2idx,
)

# Update the lookup_table and token2idx in the config according to the dataset,
# since with contextual embeddings they are context dependent.
config.init_contextual(lookup_table, token2idx)