In [1]:
from __future__ import print_function
import tensorflow as tf
import collections
from collections import Counter
import string
import re
import json
import sys
import modeling
from run_squad import FLAGS, convert_examples_to_features, SquadExample, write_predictions, input_fn_builder, model_fn_builder, FLAGS, validate_flags_or_throw, FeatureWriter, RawResult, get_final_text, _get_best_indexes, _compute_softmax
import tokenization
from tqdm import tqdm
import os
import random

In [2]:
tf.app.flags.DEFINE_string('f', '', 'kernel')
FLAGS.bert_config_file = './checkpoints/chinese_L-12_H-768_A-12/bert_config.json'
FLAGS.vocab_file = './checkpoints/chinese_L-12_H-768_A-12/vocab.txt'
FLAGS.init_checkpoint = './checkpoints/chinese_L-12_H-768_A-12/bert_model.ckpt'
FLAGS.output_dir = './checkpoints/CIPS/'
FLAGS.predict_file = './data/CIPS-sogou/valid.json'
FLAGS.do_predict = True
FLAGS.train_file = './data/CIPS-sogou/train.v1.json'
FLAGS.do_train = True
FLAGS.doc_stride = 256
FLAGS.max_seq_length = 512

In [3]:
def read_cips_unfactoid_examples(input_file, is_training):
    """Read a cips unfactoid json file into a list of SquadExample."""
    input_data = []
    with tf.gfile.Open(input_file, "r") as reader:
        for line in reader:
            result = json.loads(line.strip())
            input_data.append(result)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in tqdm(input_data):
        paragraphs = {}
        qas_id = entry["query_id"]
        question_text = entry["query"]
        for paragraph in entry["passages"]:
            passage_id = paragraph["passage_id"]
            start_position = -1
            end_position = -1
            orig_answer_text = ""
            is_impossible = False
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph["passage_text"]:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            not_find_answer_on_passage = False
            for answer in entry["answer"]:
                answer_id = answer["answer_id"]
                from_passage = answer["from_passage"]
                if from_passage == passage_id:
                    if is_training:
                        if FLAGS.version_2_with_negative:
                            is_impossible = entry["is_impossible"]
                        if (len(entry["answer"]) < 1) and (not is_impossible):
                            raise ValueError(
                                "For training, each question should have exactly 1 answer.")
                        if not is_impossible:
                            if not re.search(re.escape(answer["answer_text"]), paragraph["passage_text"]):
#                                 tf.logging.warning('can not find answer from corresponding paragraph: {}\t{}'.format(answer["answer_text"], paragraph["passage_text"]))
#                                 tf.logging.warning('can not find answer from corresponding paragraph: {}\t{}'.format(qas_id, passage_id))
                                not_find_answer_on_passage = True
                                continue
                            answer_offset = re.search(re.escape(answer["answer_text"]), paragraph["passage_text"]).span()[0]
                            orig_answer_text = answer["answer_text"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[answer_offset + answer_length -
                                                               1]
                            # Only add answers where the text can be exactly recovered from the
                            # document. If this CAN'T happen it's likely due to weird Unicode
                            # stuff so we will just skip the example.
                            #
                            # Note that this means for training mode, every example is NOT
                            # guaranteed to be preserved.
                            # 进行一些适合中文的预处理
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                                   actual_text, cleaned_answer_text)
                                not_find_answer_on_passage = True
                                continue
                        else:
                            start_position = -1
                            end_position = -1
                            orig_answer_text = ""
                    break
            if not_find_answer_on_passage:
                continue
            example = SquadExample(
                qas_id="{}-{}".format(qas_id, passage_id),
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible)
            examples.append(example)
    return examples    

In [4]:
tf.logging.set_verbosity(tf.logging.INFO)

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

validate_flags_or_throw(bert_config)

tf.gfile.MakeDirs(FLAGS.output_dir)

tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=is_per_host))

train_examples = None
num_train_steps = None
num_warmup_steps = None
if FLAGS.do_train:
    if not tf.gfile.Exists(os.path.join(FLAGS.output_dir, "train.tf_record")):
        train_examples = read_cips_unfactoid_examples(
            input_file=FLAGS.train_file, is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

        train_writer = FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
            is_training=True)
        convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature)
        train_writer.close()

        train_filename = os.path.join(FLAGS.output_dir, "train.tf_record")
        train_examples_number = len(train_examples)
        del train_examples
    else:
        with tf.gfile.Open(FLAGS.train_file, "r") as reader:
            input_data = json.load(reader)["data"]
        train_examples_number = 0
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                for qa in paragraph["qas"]:
                    train_examples_number += 1
        num_train_steps = int(
            train_examples_number / FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        train_filename = os.path.join(FLAGS.output_dir, "train.tf_record")

model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=FLAGS.use_tpu,
    use_one_hot_embeddings=FLAGS.use_tpu)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=FLAGS.use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=FLAGS.train_batch_size,
    predict_batch_size=FLAGS.predict_batch_size)

if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant tensors
    # in memory.

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", train_examples_number)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)


    train_input_fn = input_fn_builder(
        input_file=train_filename,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

if FLAGS.do_predict:
    eval_examples = read_cips_unfactoid_examples(
        input_file=FLAGS.predict_file, is_training=False)

    eval_writer = FeatureWriter(
        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    all_results = []

    predict_input_fn = input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # If running eval on the TPU, you will need to specify the number of
    # steps.
    all_results = []
    for result in estimator.predict(
            predict_input_fn, yield_single_examples=True):
        if len(all_results) % 1000 == 0:
            tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            RawResult(
                unique_id=unique_id,
                start_logits=start_logits,
                end_logits=end_logits))

    output_prediction_file = os.path.join(
        FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(
        FLAGS.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(
        FLAGS.output_dir, "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)

JSONDecodeError: Extra data: line 2 column 1 (char 2106)