In [1]:
import tensorflow as tf
from __future__ import print_function
import collections
from collections import Counter
import string
import re
import json
import sys
import modeling
from run_squad import convert_examples_to_features, SquadExample, write_predictions, input_fn_builder, model_fn_builder, FLAGS, validate_flags_or_throw, FeatureWriter, RawResult, get_final_text, _get_best_indexes, _compute_softmax
import tokenization
from tqdm import tqdm
import os

In [2]:
tf.app.flags.DEFINE_string('f', '', 'kernel')
FLAGS.bert_config_file = './checkpoints/chinese_L-12_H-768_A-12/bert_config.json'
FLAGS.vocab_file = './checkpoints/chinese_L-12_H-768_A-12/vocab.txt'
FLAGS.init_checkpoint = './checkpoints/DRCD/model.ckpt-13467'
FLAGS.output_dir = './checkpoints/CIPS/'
FLAGS.predict_file = './data/CIPS-sogou/train_factoid_1.json'
FLAGS.do_predict = True

In [None]:

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

validate_flags_or_throw(bert_config)

tf.gfile.MakeDirs(FLAGS.output_dir)

tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=FLAGS.master,
    model_dir='./checkpoints/CIPS/',
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=is_per_host))

train_examples = None
num_train_steps = None
num_warmup_steps = None

model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=FLAGS.use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=FLAGS.train_batch_size,
    predict_batch_size=FLAGS.predict_batch_size)

In [None]:
def read_cips_factoid_examples(input_file, is_training=False):
    """Read a cips json file into a list of SquadExample."""
    input_data = []
    with tf.gfile.Open(input_file, "r") as reader:
        for line in reader:
            result = json.loads(line.strip())
            input_data.append(result)

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in tqdm(input_data):
        qas_id = entry["query_id"]
        question_text = entry["query"]
        prev_is_whitespace = True
        paragraph_text = ""
        for idx, paragraph in enumerate(entry["passages"]):
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            paragraph_text = ""
            paragraph_text = paragraph["passage_text"]
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
            start_position = -1
            end_position = -1
            orig_answer_text = ""
            is_impossible = False
            example = SquadExample(
                qas_id='%s-%d' % (entry["query_id"], idx+1),
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible)
            examples.append(example)
    return examples

In [None]:
def write_predictions_multi(all_examples, all_features, all_results, n_best_size,
                            max_answer_length, do_lower_case, output_prediction_file,
                            output_nbest_file, output_null_log_odds_file):
    """Write final predictions to the json file and log-odds of null if needed."""
    tf.logging.info("Writing predictions to: %s" % (output_prediction_file))
    tf.logging.info("Writing nbest to: %s" % (output_nbest_file))

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()
    
    last_logits = -1000
    
    for (example_index, example) in enumerate(tqdm(all_examples)):
        features = example_index_to_features[example_index]
        query_id = int(example.qas_id.split('-')[0])
        
        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min mull score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if FLAGS.version_2_with_negative:
                feature_null_score = result.start_logits[0] + \
                    result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if FLAGS.version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(
                    pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(
                    orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't inlude the empty option in the n-best, inlcude it
        if FLAGS.version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="", start_logit=null_start_logit,
                        end_logit=null_end_logit))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not FLAGS.version_2_with_negative:
            if query_id in all_predictions:
                all_predictions[query_id] = nbest_json[0]["text"] if (nbest_json[0]["start_logit"] + nbest_json[0]["end_logit"]) > last_logits else last_text
                last_logits = (nbest_json[0]["start_logit"] + nbest_json[0]["end_logit"]) \
                              if (nbest_json[0]["start_logit"] + nbest_json[0]["end_logit"]) > last_logits else last_logits
                last_text = all_predictions[query_id]
            else:
                all_predictions[query_id] = nbest_json[0]["text"]
                last_logits = nbest_json[0]["start_logit"] + nbest_json[0]["end_logit"]
                last_text = all_predictions[query_id]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[query_id] = score_diff
            if score_diff > FLAGS.null_score_diff_threshold:
                all_predictions[query_id] = ""
            else:
                all_predictions[query_id] = best_non_null_entry.text
        
        if query_id in all_nbest_json:
            all_nbest_json[query_id].extend(nbest_json)
        else:
            all_nbest_json[query_id] = nbest_json

    with tf.gfile.GFile(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with tf.gfile.GFile(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if FLAGS.version_2_with_negative:
        with tf.gfile.GFile(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

In [None]:
#coding: utf-8
eval_examples = read_cips_factoid_examples(
            input_file=FLAGS.predict_file, is_training=False)

eval_writer = FeatureWriter(
    filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
    is_training=False)
eval_features = []

def append_feature(feature):
    eval_features.append(feature)
    eval_writer.process_feature(feature)

convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=FLAGS.max_seq_length,
    doc_stride=FLAGS.doc_stride,
    max_query_length=FLAGS.max_query_length,
    is_training=False,
    output_fn=append_feature)
eval_writer.close()

print("***** Running predictions *****")
print("  Num orig examples = %d", len(eval_examples))
print("  Num split examples = %d", len(eval_features))
print("  Batch size = %d", FLAGS.predict_batch_size)

all_results = []

predict_input_fn = input_fn_builder(
    input_file=eval_writer.filename,
    seq_length=FLAGS.max_seq_length,
    is_training=False,
    drop_remainder=False)

# If running eval on the TPU, you will need to specify the number of
# steps.
all_results = []
for result in estimator.predict(
        predict_input_fn, yield_single_examples=True):
    if len(all_results) % 1000 == 0:
        tf.logging.info("Processing example: %d" % (len(all_results)))
    unique_id = int(result["unique_ids"])
    start_logits = [float(x) for x in result["start_logits"].flat]
    end_logits = [float(x) for x in result["end_logits"].flat]
    all_results.append(
        RawResult(
            unique_id=unique_id,
            start_logits=start_logits,
            end_logits=end_logits))

output_prediction_file = os.path.join(
    FLAGS.output_dir, "predictions.json")
output_nbest_file = os.path.join(
    FLAGS.output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(
    FLAGS.output_dir, "null_odds.json")

write_predictions_multi(eval_examples, eval_features, all_results,
                        FLAGS.n_best_size, FLAGS.max_answer_length,
                        FLAGS.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file)

In [None]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    """caculate f1 score on prediction"""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    """cacualte exact match score on prediction"""
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(dataset, predictions):
    f1 = exact_match = total = 0
    for entry in dataset:
        total += 1
        qas_id = str(entry["query_id"])
        ground_truths = [entry["answer"]]
        if qas_id not in predictions:
            message = 'Unanswered question ' + str(qas_id) + \
                              ' will receive score 0.'
            print(message, file=sys.stderr)
            continue
        prediction = predictions[qas_id]
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [3]:
with open('./checkpoints/CIPS/predictions.json', 'r') as reader:
    predictions = json.load(reader)
with open('./data/CIPS-sogou/train_factoid_1.json', 'r') as reader:
    input_data = []
    for line in reader:
        result = json.loads(line.strip())
        input_data.append(result)
print(evaluate(input_data, predictions))

NameError: name 'evaluate' is not defined

In [4]:
with open('./checkpoints/CIPS/predictions.json', 'r') as reader:
    predictions = json.load(reader)
predictions = sorted(predictions.items(), key=lambda x: x[0])
for key, value in predictions:
    print(key + ':' + value)

10000:台湾作家林海音
10001:中国
10002:历史趣闻　ｗｗｗ．ｌｉｓｈｉｑｕｗｅｎ．ｃｏｍ　分享：　［导读］　导读：“围魏救赵”这句成语指避实就虚、袭击敌人后方以迫使进攻之敌撤回的战术。故事发生在战国时期的魏国国都大梁即现在的开封。　．．．
10003:年纪八十八，不上又不下。开门七件事，家家都有它。（打一字）　老油条　©　１９９５－２０１０　Ｗｕｈａｎ　Ｕｎｉｖｅｒｓｉｔｙ　Ｏｖｅｒｓｅａｓ　Ａｌｕｍｎｉ　Ｎｅｔｗｏｒｋ武汉大学海外校友网　．．．
10004:《未婚妻》
10005:empty
10006:疯狂猜成语一匹马后面一副弓箭是什么成语？　答案：　汗马功劳　【解释】：汗马：将士骑的马奔驰出汗，比喻征战劳苦。．．．
10007:圆明园
10008:　２０１５年２月１８日三十夜晚上１１点左右，使用腾讯微信红包包了５个２００元红包，对方只能领取４个红包，有一个２００元红包对方无法领取。腾讯承诺２４－７２小时予以退回未领取红包，２０１５年２月２１日至今我都在打电话咨询这个２００元的红包什么时候退回，腾讯官方一再承诺２４小时之内，今天３月２日却也没有退回红包。．．．
10009:马来西亚泰莱大学的英文缩写　０５年以前是Ｔａｙｌｏｒ　Ｃｏｌｌｅｇｅ，　．．．　ｉｎｔｅｒｎａｔｉｏｎａｌ　ＢＢＣ：ｂｒａｔｒａｎ　ｂｒｏａｄｃａｓｔ　ｃｏｍｐａｎｙ　英国电台　．．．
10010:成语是【一五一十】劳动节是五一　，国庆节是十一，成语就是【一五一十】建议提问的朋友遇到正确答案时，能够及时将最快回答正确的答案采纳，免得其他朋友以为前面还没正确的答案而费尽脑筋。采．．．
10011:１５年
10012:《恋恋不忘电视剧》
10013:在职干警中，党员共３８人　同义词　吴起县人民法院一般指延安市吴起县人民法院　吴起县人民法院位于吴起县城胜利大街中街。．．．　吴起法院近三年来先后被省高级人民法院评为“全省法院司法宣传调研工作先进单位”、“全省法院网络宣传工作先进集体”、“政法干警核心价值观宣传工作先进单位”、“档案先进工作单位”和“司法宣传工作先进集体”。．．．
10014:００８５２是哪里的电话区号　００８５２是香港的区号。　００８５２一６２１０６１２３是哪里的电话　……　是诈骗的号码，别上当　．．．
10015:长沙高桥大市场属于哪个区？　：属雨花区。我就住高桥北门

11466:ｄｏｃ
11467:北京移动的手机服务密码是几位数？　答：客服密码是中国移动客户的身份识别密码，北京移动的手机服务密码是６位数字，每一位均可以是０－９的任一阿拉伯数字。．．．
11468:时崎狂三头像　时崎狂三是哪部动漫　时崎狂三高清壁纸　时崎狂三电脑壁纸＿用户２８８１９７９３７０＿新浪博客，用户２８８１９７９３７０，
11469:上海市徐汇区虹漕路１１８号是　上海夜场演艺吧介绍：它位于上海市繁华的徐汇区，总投资六千万元，１００间豪华包厢，集餐饮、ＫＴＶ、综艺演出为一体的大型豪华娱乐．．．
11470:投稿：０　粉丝：－－　收藏　硬币　－　稍后看　马克一下～　用手机看　离线看更方便　【克鲁赛德战记】夜魔女麻美　用哔哩哔哩客户端或其他应用扫描二维码　点赞　自制　（２０１６．７．１５上传了麻美单Ｂ８）　《夜魔女麻美》正确培养后高爆发法系枪手（英雄评分：）半玩具（ＰＶＣ）＊ＢＵＧ较多、即便对方ＳＰ未满５０也不一定会触发炮击．＊不要学技能、因为这个关系特地练了两只．．．话说两只麻美可以共享标记的噢最佳词条：ＡＡ（法穿＋爆伤／法穿）一般只打ＪＪＣ所以穿爆比较实际妥协词条：ＡＦ（法穿＋暴击）ＡＤ（法穿＋物抗）技能推荐：不学　
11471:普贤菩萨
11472:empty
11473:《人间鬼事》作家妖九拐六“所著的一部科幻灵异小说。　简介：　这个世界上，有很多事情是你我没有遇见过的。　　　．．．　我叫程小凡，养父叫程真一。我是个弃婴，被他从外面捡回来的时候，差不多只剩下半条命了。．．．
11474:德国．柏林大学是德国的一所综合性高等学校．原名柏林弗里特里希－威廉大学，设于柏林．１８０９　年由普鲁士王国内务部文教总管　Ｆ．ｖｏｎＫ．Ｗ．洪堡负责筹建，１８１０年１０月正式开学．设哲学、法学、医学和神学４科．第一任校长为哲学家　Ｊ．．．
11475:巡音，年龄２０岁，声源是浅川悠，擅长曲风是拉丁音乐、爵士、民族系流行音乐、家庭到电子系舞曲，象征物是章鱼和鲔。　巡音的名字和构想呼应着的是“广阔的世界（不同文化间）中巡回的声音、空气．．．
11476:多久
11477:empty
11478:船山区
11479:地址：　济南市历城区山大路１４６号成大高科技市场１层０５３１－８８５１１８０３　交通：　注：地图位置标注仅供参考，具体情况以实际道路标识信息为准。　．．．
1148

13731:龙王菩萨
13732:王羲之
13733:那么袁世凯究竟当．．．［详细］百战专题文章：袁世凯当了　８３天皇帝　至１２月１１日，御用的参政院推戴他为中华帝国大皇帝。．．．
13734:长沙会战　长沙会战打了四次，在前两次会战中，中日双方都宣称自己是胜利方，在第三次会战中，中国战胜日本，在第四次会战中，日本攻陷长沙。　长沙会战共打了几次　长沙会战一共打了四次，史称为“长沙会战”，或称“长沙保卫战”。　．．．
13735:《分手大师》
13736:安徽省
13737:标题：　屁股标志只有一个张大嘴的豹子头是啥车？　贴数：　７　 分页：　分享到：　文章主题： 屁股标志只有一个张大嘴的豹子头是啥车？　南海鳄神　身份　用户　文章　星座　双子座　积分　．．．
13738:没听说过河南菜　．．　都是家常菜
13739:克尔维特跑车
13740:反恐部队
13741:《土地的誓言》是男作家端木蕻良（原名：曹汉文，又名：曹京平）所写，选自《中国新文学大系１９３７——１９４９　散文》卷一，入选人教版初一（七年级）下学期课本。．．．
13742:“破釜沉舟”讲的是历史上的项羽　项羽杀了宋义自封为上将军，立刻攻向鉅鹿，而且渡黄河之后“破釜沉舟”，下令全军只许带三日干粮，磨砺楚军之志，终以三万军大败章邯二十万
13743:速效救心丸的保质期是多久？：药品的有效期是指药品在规定的贮藏条件下质量能够符合规定要求的期限。药效损失１０％所需的时间：如有效期规定为２００２年１２月，就是指这批生产的药品至２００２年１２月３１日前仍然有效。　．．．
13744:谢滨
13745:德国我乐厨柜衣柜　意大利阿里斯顿中央采暖　美国霍尼韦尔中央净水　日本大金中央空调　美国霍尼韦尔中央新风　．．．
13746:闵行区
13747:美国
13748:河南省
13749:１０２４Ｍ
13750:日心说，也称为地动说，是关于天体运动的和地心说相立的学说，它认为太阳是银河系的中心，而不是地球。　日心说哥白尼提出的日心说，推翻了长期以来居于宗教统治地位的地心说，实现了天文学的根本变革。　．．．
13751:（中关村在线江苏行情）６４Ｇ苹果ｉＰｈｏｎｅ　５ｓ今日在商家“乐购时尚数码店”【陈美华：１８８１４４６７２６１　ＱＱ：５０６３６１８１５】处热促，　苹果ＩＰＨＯＮＥ７ＰＬＵＳ武汉最低报价４９８０元，支持０元分期付

In [14]:
#coding: utf-8
with open('./checkpoints/CIPS/nbest_predictions.json', 'r') as reader:
    nbest_predictions = json.load(reader)
with open('./data/CIPS-sogou/nbest_predictions_cn.txt', 'wb') as writer:    
    nbest_predictions = sorted(nbest_predictions.items(), key=lambda x: x[0])
    for number, nbest_json_results in tqdm(nbest_predictions):
        writer.write(number)
        for nbest_json in nbest_json_results:
            for key, result in nbest_json.items():
                if key == 'text':
                    writer.write('{}: {}'.format(key, result))
                else:
                    writer.write(key + ':' + str(result))

  0%|          | 0/5000 [00:00<?, ?it/s]


UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-6: ordinal not in range(128)

In [None]:
??sorted