In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np

import os

import run_classifier
import tokenization
import modeling
import optimization

# Test set to score. The visible cells read only its "id" and "comment_text"
# columns (see convert_input).
data = pd.read_csv("test.csv")

# --- Paths -------------------------------------------------------------------
# JSON file loaded with modeling.BertConfig.from_json_file.
BERT_CONFIG_FILE = "./bert/bert_config.json"
# Checkpoint handed to run_classifier.model_fn_builder as init_checkpoint.
INIT_CHECKPOINT = './output4/model.ckpt-51567'
# Vocabulary file handed to tokenization.FullTokenizer.
VOCAB_FILE = "./bert/vocab.txt"
# Used as model_dir of the RunConfig.
OUTPUT_DIR = './output4/'
# Record file read by file_based_input_fn_builder at prediction time.
TEST_FILE = "./output4/test.tf_record"

# --- Tokenization / run mode -------------------------------------------------
# Forwarded to FullTokenizer.
# NOTE(review): presumably has to match the casing of the pretrained model — confirm.
DO_LOWER_CASE = False
# The three DO_* flags are not referenced by any of the visible cells.
DO_TRAIN = False
DO_EVAL = False
DO_PREDICT = True
# Forwarded to both model_fn_builder and TPUEstimator.
USE_TPU = False
USE_ONE_HOT_EMBEDDING = False

# --- Model / batching hyper-parameters ---------------------------------------
# seq_length of the input function.
MAX_SEQ_LENGTH = 256
# Batch sizes passed to TPUEstimator; TRAIN_BATCH_SIZE also feeds NUM_TRAIN_STEPS.
TRAIN_BATCH_SIZE = 28
EVAL_BATCH_SIZE = 28
PREDICT_BATCH_SIZE = 28
# Optimizer-related values: model_fn_builder takes them as arguments, but the
# visible cells only ever call estimator.predict.
LEARNING_RATE = 1e-8
NUM_TRAIN_EPOCHS = 1.0
WARMUP_PROPORTION = 0.1

# --- Estimator / TPU plumbing ------------------------------------------------
# MASTER, SAVE_CHECKPOINTS_STEPS and TPU_CLUSTER_RESOLVER go into RunConfig;
# ITERATIONS_PER_LOOP, NUM_TPU_CORES and IS_PER_HOST go into TPUConfig.
MASTER = None
SAVE_CHECKPOINTS_STEPS = 5000
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 1
TPU_CLUSTER_RESOLVER = None
# Relies on tf.contrib; this cell's output prints the contrib deprecation notice.
IS_PER_HOST = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [3]:
# Tokenizer built from the vocabulary file and casing flag set in the config cell.
tokenizer = tokenization.FullTokenizer(
    vocab_file=VOCAB_FILE,
    do_lower_case=DO_LOWER_CASE,
)

def convert_input(x):
    """Wrap one row of the test frame in a ``run_classifier.InputExample``.

    Args:
        x: Mapping-like row; its ``"id"`` entry becomes the example guid and
            its ``"comment_text"`` entry becomes ``text_a``.

    Returns:
        An ``InputExample`` with no second segment (``text_b=None``) and a
        constant label of 0 (presumably a placeholder, since the rows come
        from the test set — confirm).
    """
    example_fields = dict(
        guid=x["id"],
        text_a=x["comment_text"],
        text_b=None,
        label=0,
    )
    return run_classifier.InputExample(**example_fields)

# One InputExample per row of the test frame (apply with axis=1 yields a Series).
test_InputExamples = data.apply(convert_input, axis=1)

# The prediction input function reads TEST_FILE, so the record file has to
# exist before the estimator is iterated. Build it when it is missing instead
# of relying on a file left behind by an earlier session; this keeps
# "Restart & Run All" working while still skipping the slow conversion when
# the file is already there.
# Delete TEST_FILE to force regeneration after changing test.csv,
# MAX_SEQ_LENGTH or the vocabulary.
if not os.path.exists(TEST_FILE):
    record_dir = os.path.dirname(TEST_FILE)
    if record_dir:
        os.makedirs(record_dir, exist_ok=True)
    run_classifier.file_based_convert_examples_to_features(
        list(test_InputExamples),   # plain list of InputExample objects
        list(range(11)),            # label ids 0..10, same list the original call used
        MAX_SEQ_LENGTH,
        tokenizer,
        TEST_FILE)

# model_fn_builder takes step counts as arguments; the visible cells only run
# prediction, so these are derived from the test-set size just to fill them in.
NUM_TRAIN_STEPS = int(len(test_InputExamples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
NUM_WARMUP_STEPS = int(NUM_TRAIN_STEPS * WARMUP_PROPORTION)

In [4]:
# --- Model -------------------------------------------------------------------
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG_FILE)

model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    num_labels=11,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=USE_ONE_HOT_EMBEDDING,
    # Optimizer-related arguments are required by the builder's signature.
    learning_rate=LEARNING_RATE,
    num_train_steps=NUM_TRAIN_STEPS,
    num_warmup_steps=NUM_WARMUP_STEPS,
)

# --- Runtime configuration ---------------------------------------------------
tpu_config = tf.contrib.tpu.TPUConfig(
    per_host_input_for_training=IS_PER_HOST,
    num_shards=NUM_TPU_CORES,
    iterations_per_loop=ITERATIONS_PER_LOOP,
)

run_config = tf.contrib.tpu.RunConfig(
    model_dir=OUTPUT_DIR,
    master=MASTER,
    cluster=TPU_CLUSTER_RESOLVER,
    tpu_config=tpu_config,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
)

estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=USE_TPU,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE,
)

# --- Input pipeline and prediction -------------------------------------------
# Non-training input function over TEST_FILE; the final partial batch is kept.
test_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=TEST_FILE,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False,
)

# The inference log lines only show up once this result is iterated in the
# next cell, so no prediction work has happened at this point.
predictions = estimator.predict(input_fn=test_input_fn)

INFO:tensorflow:Using config: {'_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_save_checkpoints_steps': 5000, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None, '_global_id_in_cluster': 0, '_save_checkpoints_secs': None, '_device_fn': None, '_experimental_distribute': None, '_num_ps_replicas': 0, '_is_chief': True, '_task_id': 0, '_task_type': 'worker', '_eval_distribute': None, '_log_step_count_steps': None, '_master': '', '_save_summary_steps': 100, '_model_dir': './output4/', '_keep_checkpoint_max': 5, '_service': None, '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa55b4cbc88>, '_train_distribute': None, '_protocol': None, '_evaluation_master': '', '_tf_random_seed': None, '_k

In [5]:
# Consume the prediction results, keeping each item's "probabilities" entry.
probs = [prediction["probabilities"] for prediction in predictions]

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 256)
INFO:tensorflow:  name = input_mask, shape = (?, 256)
INFO:tensorflow:  name = is_real_example, shape = (?,)
INFO:tensorflow:  name = label_ids, shape = (?,)
INFO:tensorflow:  name = segment_ids, shape = (?, 256)
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.print instead of tf.Print. Note that tf.print returns a no-output operator that directly prints the output. Outside of defuns or eager mode, this operator will not be executed unless it is directly specified in session.run or used as a control dependency for other operators. This is only a concern in graph mode. Below is an example of how to

In [None]:
# data["prediction"] = probs

In [None]:
# submission = data.drop(["comment_text"], axis =1)

In [None]:
# submission.to_csv('submission4.csv', index=False, header=True)

In [9]:
# Position of the largest probability within each collected entry of `probs`.
test_max = np.asarray(probs).argmax(axis=1)

In [14]:
# Histogram of predicted classes: class index -> number of examples.
dic = {}
for predicted_class in test_max:
    dic[predicted_class] = dic.get(predicted_class, 0) + 1

In [15]:
# Show the class -> count histogram.
# NOTE(review): in the recorded run the only key is class 0, with all 97320
# rows, i.e. every example received the same predicted class. Worth confirming
# that the checkpoint, the vocabulary file and DO_LOWER_CASE match the
# fine-tuned model before trusting these predictions.
dic

{0: 97320}