In [1]:
%reload_ext autoreload
%autoreload 2

In [18]:
import os
import pickle
import itertools

import tensorflow as tf
import numpy as np
from pprint import pprint

from loader import load_sentences, update_tag_scheme, load_or_create_maps
from loader import char_mapping, tag_mapping, augment_with_pretrained
from loader import prepare_dataset
from logger import get_logger

from utils import clean, make_path, save_config, load_config
from data_utils import load_word2vec

from model import Model

def estimate_accuracy(results):
    n_total = 0
    n_right = 0
    for sent in results:
        n = len(sent[0])
        n_total += n
        for i in range(n):
            if sent[1][i] == sent[2][i]:
                n_right += 1
    return n_right/n_total

def update_best_score(sess, model, score, dataset='dev_data'):
    model_best = model.best_dev_score if dataset=='dev_data' else model.best_test_score
    best_core = model_best.eval()
    if best_core < score:
        tf.assign(model_best, score).eval()
    return best_core < score

## Configuration

In [3]:
config = {}

config['crf'] = True # "Use CRF"
config['clean'] = False # "clean train folder"
config['train'] = True # "weither train the model"

config['train_file'] = os.path.join("../CEC-Corpus/news.train")
config['dev_file'] = os.path.join("../CEC-Corpus/news.dev")
config['test_file'] = os.path.join("../CEC-Corpus/news.test")

config['map_file'] = 'maps.pkl' # "file for maps"
config['emb_file'] = 'wiki_100.utf8' # "Path for pre_trained embedding"

config['ckpt_path'] = 'ckpt' # "Path to save model"
config['result_path'] = 'result'
config['config_file'] = 'config_file'
config['log_file'] = 'train.log' # "File for log"

config['pre_emb'] = True # "Wither use pre-trained embedding"
config['zeros'] = False  # "Wither replace digits with zero"
config['lower'] = True   # "Wither lower case"

config['seg_dim'] = 20 # "Embedding size for segmentation, 0 if not used"
config['char_dim'] = 100 # "Embedding size for characters"
config['lstm_dim'] = 200 # "Num of hidden units in LSTM"
config['tag_schema'] = 'iobes' # "tagging schema iobes or iob"

config['lr'] = 0.001 # "Initial learning rate"
config['batch_size'] = 64 # "Batch size"
config['optimizer'] = 'adam' # "Optimizer for training"
config['clip'] = 5 # "Gradient clip"
config['dropout'] = 0.5 # "Dropout rate"

## Load Dataset

In [4]:
# load sentences and update them to format we want
train_sentences = load_sentences(config['train_file'], config['lower'], config['zeros'])
dev_sentences = load_sentences(config['dev_file'], config['lower'], config['zeros'])
test_sentences = load_sentences(config['test_file'], config['lower'], config['zeros'])
update_tag_scheme(train_sentences, config['tag_schema'])
update_tag_scheme(test_sentences, config['tag_schema'])
# load or create maps
char_to_id, id_to_char, tag_to_id, id_to_tag = load_or_create_maps(train_sentences, test_sentences, config)

# prepare data, get a collection of list containing index
# data = [[chars], [idx_chars], [segments], [idx_tag]]
# where segments is {0: word with one sigle char, 1: begin of a word, 2: inside a word, 3: end of a word}
train_data = prepare_dataset(
    train_sentences, char_to_id, tag_to_id, config['lower']
)
dev_data = prepare_dataset(
    dev_sentences, char_to_id, tag_to_id, config['lower']
)
test_data = prepare_dataset(
    test_sentences, char_to_id, tag_to_id, config['lower']
)
print("%i / %i / %i sentences in train / dev / test." % (
    len(train_data), len(dev_data), len(test_data)))

2977 / 1489 / 1488 sentences in train / dev / test.


## Prepare before running the model
- load and save exist configuration
- initialize the logger
- set tf_config
- set dataset

In [5]:
# make path for store log and model if not exist
make_path(config)
if os.path.isfile(config['config_file']):
    config = load_config(config['config_file'])
else:
    config['num_chars'] = len(char_to_id)
    config['num_tags'] = len(tag_to_id)
    save_config(config, config['config_file'])
make_path(config)

log_path = os.path.join("log", config['log_file'])
mylogger = get_logger(config['log_file'])
# print config
for k, v in config.items():
    mylogger.info("{}:\t{}".format(k.ljust(15), v))

# limit GPU memory
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = False
model = Model(config)
model.set_dataset(train_data, 'train_data')
model.set_dataset(test_data, 'test_data')
model.set_dataset(dev_data, 'dev_data')

2018-03-17 23:48:51,718 - train.log - INFO - crf            :	True
2018-03-17 23:48:51,718 - train.log - INFO - clean          :	False
2018-03-17 23:48:51,718 - train.log - INFO - train          :	True
2018-03-17 23:48:51,718 - train.log - INFO - train_file     :	../CEC-Corpus/news.train
2018-03-17 23:48:51,718 - train.log - INFO - dev_file       :	../CEC-Corpus/news.dev
2018-03-17 23:48:51,718 - train.log - INFO - test_file      :	../CEC-Corpus/news.test
2018-03-17 23:48:51,734 - train.log - INFO - map_file       :	maps.pkl
2018-03-17 23:48:51,734 - train.log - INFO - emb_file       :	wiki_100.utf8
2018-03-17 23:48:51,734 - train.log - INFO - ckpt_path      :	ckpt
2018-03-17 23:48:51,734 - train.log - INFO - result_path    :	result
2018-03-17 23:48:51,734 - train.log - INFO - config_file    :	config_file
2018-03-17 23:48:51,734 - train.log - INFO - log_file       :	train.log
2018-03-17 23:48:51,734 - train.log - INFO - pre_emb        :	True
2018-03-17 23:48:51,734 - train.log - INFO -

## **TRAIN**

In [20]:
with tf.Session(config=tf_config) as sess:
    model.load(sess, config['ckpt_path'], load_word2vec, config, id_to_char, mylogger)
    
    train_init = model.make_dataset_init('train_data', shuffle=3000)
    
    for i in range(100):
        sess.run(train_init)
        loss = []
        n_batches = 0
        try:
            while True:
                step, batch_loss = model.run_step(sess, True)
                loss.append(batch_loss)
                n_batches += 1
                if step % 100 == 0:
                    mylogger.info("iteration:{} step:{} loss:{:>9.6f}".format(
                        i, step, np.mean(loss)))
        except tf.errors.OutOfRangeError:
            pass
        
        mylogger.info("iteration{} finished.".format(i))
        
        mylogger.info("evaluate:test")
        score = estimate_accuracy(model.evaluate(sess, 'test_data', id_to_tag, id_to_char))
        isbest = update_best_score(sess, model, score, dataset='test_data')
        mylogger.info("Accuracy: {}.".format(score))
        
        mylogger.info("evaluate:dev")
        score = estimate_accuracy(model.evaluate(sess, 'dev_data', id_to_tag, id_to_char))
        isbest = update_best_score(sess, model, score, dataset='dev_data')
        mylogger.info("Accuracy: {}.".format(score))
        if isbest:
            mylogger.info("New record, save current model.")
            model.save(sess, config['ckpt_path']) 

2018-03-18 00:03:33,223 - train.log - INFO - Reading model parameters from ckpt\ee.ckpt


INFO:tensorflow:Restoring parameters from ckpt\ee.ckpt


2018-03-18 00:03:43,876 - train.log - INFO - iteration:0 step:2200 loss: 0.039448
2018-03-18 00:03:46,193 - train.log - INFO - iteration0 finished.
2018-03-18 00:03:46,193 - train.log - INFO - evaluate:test
2018-03-18 00:03:49,643 - train.log - INFO - Accuracy: 0.7673322521792013.
2018-03-18 00:03:49,644 - train.log - INFO - evaluate:dev
2018-03-18 00:03:53,021 - train.log - INFO - Accuracy: 0.7814538676607642.


KeyboardInterrupt: 

In [16]:
with tf.Session(config=tf_config) as sess:
    model.load(sess, config['ckpt_path'], load_word2vec, config, id_to_char, mylogger)
    


2018-03-17 23:53:24,072 - train.log - INFO - Reading model parameters from ckpt\ee.ckpt


INFO:tensorflow:Restoring parameters from ckpt\ee.ckpt
0.78248936
False


In [None]:
clean(config)