In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import json
import csv
import os
import random
from tempfile import gettempdir

import numpy as np
import pandas as pd
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import shutil

# tf.enable_eager_execution()
tf.executing_eagerly()

False

## Read dataset

In [None]:
ls -la ../maniac/arc/ARC-V1-Feb2018-2/ARC-Easy

In [None]:
file_loc = '../maniac/arc/ARC-V1-Feb2018-2/ARC-Easy/ARC-Easy-Dev.jsonl'

In [None]:
with open(file_loc) as f:
    content = f.readlines()
print(len(content))

In [None]:
content[0]

In [None]:
json_string = json.loads(content[0])

In [None]:
json_string

### Weird entries
Some entries use numbered answer keys (`1`-`5`) instead of letters (`A`-`E`), and some questions are fill-in-the-blank.

In [None]:
json_string = json.loads(content[8])

In [None]:
json_string

## Write jsonl data to csv as labeled dataset

In [None]:
import re

# Matches punctuation that is neither preceded nor followed by a digit, so
# numbers such as "3.5" keep their decimal point.
regex = r"(?<!\d)[.,;:?](?!\d)"

letter_answers = ['A', 'B', 'C', 'D', 'E']
number_answers = ['1', '2', '3', '4', '5']

# Compiled once when this cell runs, so that rebinding the global name `regex`
# elsewhere in the notebook cannot silently change how questions are cleaned.
_QA_PUNCTUATION = re.compile(regex)


def _strip_csv_unsafe_chars(text):
  """Removes punctuation, quotes, commas and semicolons from `text`.

  Commas and semicolons are dropped unconditionally because they are used as
  the column and choice separators of the CSV rows built from this text.
  """
  text = _QA_PUNCTUATION.sub("", text)
  for char in ("\"", "\'", ",", ";"):
    text = text.replace(char, "")
  return text


def extract_question_answer(str_line):
  """Parses one JSONL record into cleaned question, choices and answer index.

  Args:
    str_line: JSON string with a 'question' object (holding 'stem' and a list
      of 'choices', each with a 'text') and an 'answerKey'.

  Returns:
    Tuple (question, choices, answer_idx): the cleaned question stem, the
    cleaned choice texts joined by ';', and the zero-based position of the
    answer key within `letter_answers` or `number_answers`.

  Raises:
    ValueError: if 'answerKey' is in neither answer list.
    IndexError: if the answer key points past the last available choice.
  """
  json_line = json.loads(str_line)

  question = json_line['question']['stem']
  choices = [choice['text'] for choice in json_line['question']['choices']]

  answer = json_line['answerKey']
  if answer in letter_answers:
    answer_idx = letter_answers.index(answer)
  else:
    answer_idx = number_answers.index(answer)

  # Explicit bounds check (previously an unused lookup of the answer text).
  if answer_idx >= len(choices):
    raise IndexError(
      "answerKey %r refers to choice %d but only %d choices are present"
      % (answer, answer_idx, len(choices)))

  question_formated = _strip_csv_unsafe_chars(question)
  choices_formated = ";".join(
    [_strip_csv_unsafe_chars(choice) for choice in choices])

  return question_formated, choices_formated, answer_idx

In [None]:
import io
import codecs

filename = "train_data.csv"
file_loc = "../maniac/arc/ARC-V1-Feb2018-2/ARC-Easy/ARC-Easy-Train.jsonl"

# Export the training split as "question,choices,answer_idx" rows.
# Mode "w" truncates any previous export, so no separate delete is needed, and
# the output is opened exactly once instead of being re-opened in append mode
# for every record. newline="\n" keeps the line endings identical on every
# platform.
with io.open(file_loc, encoding='utf-8') as f:
  with io.open(filename, "w", encoding='utf-8', newline="\n") as out:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines (e.g. a trailing newline at EOF)
      question, choices, answer_idx = extract_question_answer(line)
      out.write(question + "," + choices + "," + str(answer_idx) + "\n")

In [None]:
import io
import codecs

filename = "eval_data.csv"
file_loc = "../maniac/arc/ARC-V1-Feb2018-2/ARC-Easy/ARC-Easy-Dev.jsonl"

# Export the dev split as "question,choices,answer_idx" rows.
# Mode "w" truncates any previous export, so no separate delete is needed, and
# the output is opened exactly once instead of being re-opened in append mode
# for every record. newline="\n" keeps the line endings identical on every
# platform.
with io.open(file_loc, encoding='utf-8') as f:
  with io.open(filename, "w", encoding='utf-8', newline="\n") as out:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines (e.g. a trailing newline at EOF)
      question, choices, answer_idx = extract_question_answer(line)
      out.write(question + "," + choices + "," + str(answer_idx) + "\n")

In [None]:
!head -5 train_data.csv

In [None]:
!head -5 eval_data.csv

## Learn embeddings from ARC corpus

In [None]:
!head -10 ARC_Corpus.txt

In [None]:
import re
regex = r"(?<!\d)[.,;:?`\"\'\(\)\[\]\{\}\\/“”](?!\d)"

# Step 1: Download the data.
filename = "ARC_Corpus.txt"

# Read the data into a list of strings.
def read_data(filename):
  """Reads a text corpus and returns it as a flat list of word strings.

  Newlines become spaces, punctuation matched by the module-level `regex` is
  removed, parentheses become spaces, "- " sequences and quote characters are
  dropped, and the result is split on single spaces with empty tokens
  discarded.

  Args:
    filename: path of the corpus file; it is decoded as UTF-8.

  Returns:
    List of non-empty tokens in corpus order.
  """
  import io  # local import so this cell does not depend on another cell's imports

  # Decode explicitly as UTF-8: the cleaning below targets non-ASCII quote
  # characters, so the result must not depend on the platform default encoding.
  with io.open(filename, 'r', encoding='utf-8') as myfile:
    data = myfile.read().replace('\n', ' ')

  data = re.sub(regex, "", data, 0)
  # Order matters: parentheses are turned into spaces before "- " sequences and
  # double spaces are handled.
  replacements = (
    ("(", " "), (")", " "), ("- ", ""), ("“", ""), ("”", ""),
    ("\"", ""), ("\'", ""), ("  ", " "))
  for old, new in replacements:
    data = data.replace(old, new)

  # After splitting on ' ' a token can never be " " or contain "\n" (newlines
  # were replaced above), so dropping empty strings is the only filter needed.
  return [word for word in data.split(' ') if word]

vocabulary = read_data(filename)
print(len(vocabulary))

In [None]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000

def build_dataset(words, n_words):
  """Encodes a word list as integer ids, mapping rare words to 'UNK' (id 0).

  Args:
    words: list of word strings in corpus order.
    n_words: vocabulary size including the 'UNK' entry.

  Returns:
    Tuple (data, count, dictionary, reversed_dictionary): the id of every word
    in `words`; a list starting with ['UNK', unk_count] followed by the
    (word, occurrences) pairs of the n_words - 1 most common words; the
    word -> id mapping; and the id -> word mapping.
  """
  count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

  # Ids are handed out in `count` order, so 'UNK' is 0 and frequent words get low ids.
  dictionary = {}
  for entry in count:
    dictionary[entry[0]] = len(dictionary)

  # Words outside the vocabulary fall back to id 0.
  data = [dictionary.get(word, 0) for word in words]
  count[0][1] = data.count(0)

  reversed_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reversed_dictionary

# Fill the four globals used by the rest of the skip-gram section:
# data - list of word ids (integers); the corpus with every word replaced by
#   its id, where id 0 stands for 'UNK'
# count - list of [word, occurrences] pairs, starting with the 'UNK' entry
# dictionary - map of words(strings) to their ids(integers)
# reverse_dictionary - map of ids(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Read position into `data`; generate_batch advances it through a `global` statement.
data_index = 0

In [None]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Generates one skip-gram training batch from the global `data` list.

  A window of 2 * skip_window + 1 ids slides over `data`; for each window
  position the centre id is paired with `num_skips` randomly chosen context
  ids from the same window. The global `data_index` is advanced so that
  successive calls continue through the corpus, wrapping to the start when
  the end is reached.

  Args:
    batch_size: number of (input, label) pairs; must be a multiple of num_skips.
    num_skips: context words sampled per centre word; at most 2 * skip_window.
    skip_window: number of words considered on each side of the centre word.

  Returns:
    Tuple (batch, labels): int32 array of shape (batch_size,) with the centre
    word ids and int32 array of shape (batch_size, 1) with the context ids.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape = (batch_size), dtype = np.int32)
  labels = np.ndarray(shape = (batch_size, 1), dtype = np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen = span)
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span

  # Window positions of the context words; independent of the loop variable.
  context_words = [w for w in range(span) if w != skip_window]
  for i in range(batch_size // num_skips):
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      # Wrap around to the start of the corpus. A deque does not support slice
      # assignment, so the window is refilled with extend(); maxlen == span
      # makes this replace the whole window.
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

# Smoke test: draw one small batch and print every (centre word -> context word) pair.
batch, labels = generate_batch(batch_size = 8, num_skips = 2, skip_window = 1)
# With skip_window = 1 and num_skips = 2 each centre word is paired with both of
# its neighbours; the order of the two pairs is chosen by random.sample.
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
    '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

In [None]:
# Step 4: Build and train a skip-gram model.
# This cell only defines hyperparameters and builds the TensorFlow graph; the
# graph is executed by the training cell that follows.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1     # How many words to consider left and right.
num_skips = 2     # How many times to reuse an input to generate a label.
num_sampled = 64    # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying nearest neighbours during training; they don't affect the loss.
valid_size = 16   # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Generate a uniform random sample from np.arange(valid_window) of size valid_size without replacement
# (no seed is set, so the chosen words differ from run to run)
valid_examples = np.random.choice(
  a = valid_window, 
  size = valid_size, 
  replace = False)

graph = tf.Graph()

with graph.as_default():
  # Input data.
  train_inputs = tf.placeholder(dtype = tf.int32, shape = [batch_size]) # batch_size vector
  train_labels = tf.placeholder(dtype = tf.int32, shape = [batch_size, 1]) # batch_size x 1 matrix
  valid_dataset = tf.constant(value = valid_examples, dtype = tf.int32) # valid_size vector
  #print("valid_dataset = ", valid_dataset)

  # Ops and variables pinned to the CPU because of missing GPU implementation
  with tf.device("/cpu:0"):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
      initial_value = tf.random_uniform(
        shape = [vocabulary_size, embedding_size], 
        minval = -1.0, 
        maxval = 1.0)) # embedding weights are vocabulary_size x embedding_size matrix
    #print("embeddings = ", embeddings)
    embed = tf.nn.embedding_lookup(
      params = embeddings, 
      ids = train_inputs) # batch_size x embedding_size matrix (one embedding row per input id)
    #print("embed = ", embed)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
      initial_value = tf.truncated_normal(
        shape = [vocabulary_size, embedding_size], 
        stddev = 1.0 / math.sqrt(embedding_size))) # nce weights are vocabulary_size x embedding_size matrix
    #print("nce_weights = ", nce_weights)
    nce_biases = tf.Variable(
      initial_value = tf.zeros(
        shape = [vocabulary_size])) # nce biases are a vocabulary_size vector
    #print("nce_biases = ", nce_biases)

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
           biases=nce_biases,
           labels=train_labels,
           inputs=embed,
           num_sampled=num_sampled,
           num_classes=vocabulary_size)) # scalar
    #print("loss = ", loss)

  # Construct the SGD optimizer using a learning rate of 1.0.
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

  # Compute the cosine similarity between the validation words and all embeddings.
  # First the L2 norm of every embedding row:
  norm = tf.sqrt(
    x = tf.reduce_sum(
      input_tensor = tf.square(
        x = embeddings), 
      axis = 1, 
      keepdims = True)) # vocabulary_size x 1 matrix, summed along the embedding_size axis
  #print("norm = ", norm)
  normalized_embeddings = embeddings / norm # vocabulary_size x embedding_size matrix
  #print("normalized_embeddings = ", normalized_embeddings)
  valid_embeddings = tf.nn.embedding_lookup(
    params = normalized_embeddings, 
    ids = valid_dataset) # valid_size x embedding_size matrix
  #print("valid_embeddings = ", valid_embeddings)
  # cosine similarity (dot product of unit-length rows)
  similarity = tf.matmul(
    a = valid_embeddings, 
    b = normalized_embeddings, 
    transpose_b = True) #  valid_size x vocabulary_size = (valid_size x embedding_size matrix) x (vocabulary_size x embedding_size matrix)^T
  #print("similarity = ", similarity)

  # Add variable initializer.
  init = tf.global_variables_initializer()

In [None]:
# Step 5: Begin training.
# Runs num_steps SGD updates on the graph built above, printing the running
# average loss every 2000 steps and the nearest neighbours of the validation
# words every 10000 steps.
num_steps = 100001

with tf.Session(graph = graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print("Initialized")

  average_loss = 0
  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()).
    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += loss_val

    if step % 2000 == 0:
      # At step 0 only a single batch has been accumulated, so no division is needed.
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print("Average loss at step ", step, ": ", average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in xrange(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        # Sort by descending similarity and skip position 0, which is the
        # validation word itself (similarity 1 with its own embedding).
        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
        log_str = "Nearest to %s:" % valid_word # finding the k nearest neighbors to the validation word
        for k in xrange(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log_str = "%s %s," % (log_str, close_word)
        print(log_str)

  # Get final (L2-normalized) embeddings after training is complete
  final_embeddings = normalized_embeddings.eval()

In [None]:
import io

# Write the vocabulary one word per line in ascending word-id order, so that a
# word's line number equals its row index in word_embeddings.csv. Iterating the
# ids explicitly avoids relying on dict key ordering, and the encoding is stated
# explicitly instead of being left to np.savetxt's default.
with io.open("vocab.tsv", "w", encoding = "utf-8", newline = "\n") as vocab_file:
  for word_id in sorted(reverse_dictionary):
    vocab_file.write(u"%s\n" % reverse_dictionary[word_id])

# Write the embedding matrix as comma-separated rows, one row per word id.
np.savetxt(
  fname = "word_embeddings.csv", 
  X = final_embeddings, 
  delimiter = ',')

In [None]:
!head -10 vocab.tsv

In [None]:
# !head -5 word_embeddings.csv

## Read in trained embeddings

In [2]:
pre_trained_embeddings_array = np.loadtxt(
  fname = "word_embeddings.csv", 
  dtype = np.float64, 
  delimiter = ',')

In [3]:
vocabulary_size = pre_trained_embeddings_array.shape[0]
embedding_size = pre_trained_embeddings_array.shape[1]
print("vocabulary_size = {} & embedding_size = {}".format(vocabulary_size, embedding_size))

vocabulary_size = 50000 & embedding_size = 128


## Create TensorFlow Continuous Bag of Words Model

In [12]:
arguments = {}
# File arguments
arguments["train_file_pattern"] = "train_data.csv"
arguments["eval_file_pattern"] = "eval_data.csv"
arguments["output_dir"] = "trained_model_cbow"

# Sequence shape hyperparameters
arguments["batch_size"] = 32

# DNN hyperparameters
arguments["dnn_hidden_units"] = [1024, 256, 64]

# Training parameters
arguments["train_steps"] = 10000
arguments["learning_rate"] = 0.1
arguments["start_delay_secs"] = 30
arguments["throttle_secs"] = 30

In [5]:
# Set logging to be level of INFO
tf.logging.set_verbosity(tf.logging.INFO)

In [6]:
# Column names of the CSV files, in file order, and the column used as the label
CSV_COLUMNS = "question,choices,answer_idx".split(',')
LABEL_COLUMN = "answer_idx"
VOCAB_FILE_PATH = "vocab.tsv" # where the vocabulary is saved (NOTE(review): nothing in this notebook reassigns it)
PADWORD = 'ZYXW' # filler token used when ragged word/choice lists are padded to a dense tensor

# Set default values for each CSV column (these also fix each column's dtype:
# string, string, int)
DEFAULTS = [[""], [""], [0]]

In [7]:
# Create an input function reading a file using the Dataset API
# Then provide the results to the Estimator API
def read_dataset(filename, mode, batch_size, params):
  """Builds an Estimator input_fn that reads labeled CSV rows.

  Args:
    filename: file name or glob pattern of the CSV file(s) to read.
    mode: a tf.estimator.ModeKeys value; TRAIN shuffles and repeats
      indefinitely, any other mode reads the data exactly once in order.
    batch_size: number of examples per batch.
    params: accepted for interface compatibility; not used here.

  Returns:
    A zero-argument function returning (features, labels), where features is a
    dict of batched string tensors keyed by the non-label CSV_COLUMNS and
    labels is a batched int64 tensor.
  """
  def _input_fn():
    def decode_csv(value_column):
      """Parses one CSV line into a features dict and an int64 label."""
      columns = tf.decode_csv(
        records = value_column, record_defaults = DEFAULTS, field_delim = ',')
      features = dict(zip(CSV_COLUMNS, columns))
      labels = tf.cast(x = features.pop(LABEL_COLUMN), dtype = tf.int64)

      return features, labels

    # Create list of files that match pattern
    file_list = tf.gfile.Glob(filename = filename)

    # Create dataset from file list
    dataset = tf.data.TextLineDataset(filenames = file_list)  # Read text file

    # Decode the CSV file into a features dictionary of tensors
    dataset = dataset.map(map_func = decode_csv)

    if mode == tf.estimator.ModeKeys.TRAIN:
      num_epochs = None # indefinitely
      # Shuffle individual examples BEFORE repeating and batching. Shuffling
      # after batch() would only reorder whole batches and leave the examples
      # inside each batch in file order.
      dataset = dataset.shuffle(buffer_size = 10 * batch_size)
    else:
      num_epochs = 1 # end-of-input after this

    # Repeat files num_epoch times
    dataset = dataset.repeat(count = num_epochs)

    # Group the data into batches
    dataset = dataset.batch(batch_size = batch_size)

    # Create an iterator and then pull the next batch of features and labels
    batch_features, batch_labels = dataset.make_one_shot_iterator().get_next()

    return batch_features, batch_labels
  return _input_fn

In [8]:
# Create our model function to be used in our custom estimator
def cbow_text_question_answer(features, labels, mode, params):
  """Estimator model_fn that answers multiple-choice questions with averaged word embeddings.

  The question's word embeddings are averaged (continuous bag of words), passed
  through a DNN and projected back to embedding space; the prediction is the
  choice whose averaged embedding has the highest cosine similarity with that
  projection.

  Args:
    features: dict with string tensors "question" (words separated by ' ') and
      "choices" (choices separated by ';', words separated by ' ').
    labels: per-example index of the correct choice; read in TRAIN/EVAL only.
    mode: a tf.estimator.ModeKeys value.
    params: dict providing 'dnn_hidden_units' (list of layer sizes) and
      'learning_rate'.

  Returns:
    tf.estimator.EstimatorSpec. TRAIN/EVAL carry loss, train_op and metrics;
    any other mode carries predictions and export outputs.
  """
  # Function to split string tensors into multiple substring tensors
  def split_strings(features):
    """Splits the question into words and the choices into sentences and words."""
    # Function to split string tensors into substring tensors
    def split_string_into_substrings(string_tensor, delimiter):
      """Splits each string on `delimiter`; returns (padded dense strings, substring counts)."""
      # Split string tensor into a sparse tensor based on delimiter
      # indices: shape = (total_num_substrings, 2), values: shape = (total_num_substrings,), dense_shape: shape = (2,)
      split_string_sparse_tensor = tf.string_split(
        source = string_tensor,
        delimiter = delimiter)

      # Create a dense string tensor, padding short rows with PADWORD
      # shape = (num_strings, max_substrings_across_batch)
      split_string_dense_tensor = tf.sparse_tensor_to_dense(
        sp_input = split_string_sparse_tensor,
        default_value = PADWORD)

      # Row (input string) index of every substring, the first column of the sparse tensor indices
      # shape = (total_num_substrings,)
      sequence_index = split_string_sparse_tensor.indices[:, 0]

      # Position of every substring within its row, the second column of the sparse tensor indices
      # shape = (total_num_substrings,)
      time_index = split_string_sparse_tensor.indices[:, 1]

      # Number of substrings in each row = largest position in the row + 1
      # shape = (num_strings,)
      lengths_vector = tf.segment_max(
        data = time_index,
        segment_ids = sequence_index) + 1

      return split_string_dense_tensor, lengths_vector

    # Function to split a single string of multiple sentences into words
    def split_multi_sentence_string_into_words(multi_sentence_string_tensor):
      """Splits ';'-separated sentences, then splits every sentence into words."""
      # split_multi_sentence_string_dense_tensor.shape = (cur_batch_size, max_num_choices_across_batch)
      # num_sentences_vector.shape = (cur_batch_size,)
      split_multi_sentence_string_dense_tensor, num_sentences_vector = \
        split_string_into_substrings(string_tensor = multi_sentence_string_tensor, delimiter = ';')

      cur_batch_size = tf.shape(
        input = split_multi_sentence_string_dense_tensor,
        out_type = tf.int64)[0] # shape = ()

      # Calculate the max number of choices across all questions in the batch
      max_num_choices_across_batch = tf.reduce_max(
        input_tensor = num_sentences_vector) # shape = ()

      # Flatten so that every choice (including PADWORD padding) is one string
      split_multi_sentence_string_dense_tensor_flattened = tf.reshape(
        tensor = split_multi_sentence_string_dense_tensor,
        shape = [cur_batch_size * max_num_choices_across_batch])

      # split_multi_sentence_word_string_dense_tensor_flattened.shape = (cur_batch_size * max_num_choices_across_batch, max_num_words_across_batch)
      # num_multi_sentence_word_tensor_flattened.shape = (cur_batch_size * max_num_choices_across_batch,)
      split_multi_sentence_word_string_dense_tensor_flattened, num_multi_sentence_word_tensor_flattened = \
        split_string_into_substrings(string_tensor = split_multi_sentence_string_dense_tensor_flattened, delimiter = ' ')

      max_num_words_across_batch = tf.reduce_max(
        input_tensor = num_multi_sentence_word_tensor_flattened) # shape = ()

      split_multi_sentence_word_string_dense_tensor = tf.reshape(
        tensor = split_multi_sentence_word_string_dense_tensor_flattened,
        shape = [cur_batch_size, max_num_choices_across_batch, max_num_words_across_batch])

      num_multi_sentence_word_tensor = tf.reshape(
        tensor = num_multi_sentence_word_tensor_flattened,
        shape = [cur_batch_size, max_num_choices_across_batch])

      return split_multi_sentence_word_string_dense_tensor, num_multi_sentence_word_tensor, split_multi_sentence_string_dense_tensor, num_sentences_vector

    # Start split_strings function
    # question_split_words_strings_tensor.shape = (cur_batch_size, max_num_question_words_across_batch)
    # question_num_words.shape = (cur_batch_size,)
    question_split_words_strings_tensor, question_num_words = \
      split_string_into_substrings(string_tensor = features["question"], delimiter = ' ')

    # choices_split_words_strings_tensor.shape = (cur_batch_size, max_num_choices_across_batch, max_num_choice_words_across_batch)
    # choices_num_words.shape = (cur_batch_size, max_num_choices_across_batch)
    # choices_split_sentences_strings_tensor.shape = (cur_batch_size, max_num_choices_across_batch)
    # choices_num_sentences.shape = (cur_batch_size,)
    choices_split_words_strings_tensor, choices_num_words, choices_split_sentences_strings_tensor, choices_num_sentences = \
      split_multi_sentence_string_into_words(features["choices"])

    return question_split_words_strings_tensor, question_num_words, choices_split_words_strings_tensor, choices_num_words, choices_split_sentences_strings_tensor, choices_num_sentences

  # question_split_words_strings_tensor.shape = (cur_batch_size, max_num_question_words_across_batch)
  # question_num_words.shape = (cur_batch_size,)
  # choices_split_words_strings_tensor.shape = (cur_batch_size, max_num_choices_across_batch, max_num_choice_words_across_batch)
  # choices_num_words.shape = (cur_batch_size, max_num_choices_across_batch)
  # choices_split_sentences_strings_tensor.shape = (cur_batch_size, max_num_choices_across_batch)
  # choices_num_sentences.shape = (cur_batch_size,)
  question_split_words_strings_tensor, question_num_words, choices_split_words_strings_tensor, choices_num_words, choices_split_sentences_strings_tensor, choices_num_sentences = \
    split_strings(features)

  # Map each word to respective integer index (its line number in the vocabulary file)
  word_to_id_lookup_table = tf.contrib.lookup.index_table_from_file(
    vocabulary_file = VOCAB_FILE_PATH,
    num_oov_buckets = 0,
    vocab_size = vocabulary_size,
    default_value = 0,  # for words not in vocabulary (OOV), including PADWORD
    delimiter = ' ')

  question_split_words_ids_tensor = word_to_id_lookup_table.lookup(
    question_split_words_strings_tensor) # shape = (cur_batch_size, max_num_question_words_across_batch)
  choices_split_words_ids_tensor = word_to_id_lookup_table.lookup(
    choices_split_words_strings_tensor) # shape = (cur_batch_size, max_num_choices_across_batch, max_num_choice_words_across_batch)

  # Load trained embeddings into variable. The placeholder is fed exactly once,
  # through the Scaffold's init_feed_dict at the bottom of this function.
  embeddings_placeholder = tf.placeholder(
    dtype = tf.float64,
    shape = [vocabulary_size, embedding_size],
    name = "embedding_placeholder") # shape = (vocabulary_size, embedding_size)

  embeddings_variable = tf.Variable(
    initial_value = embeddings_placeholder,
    trainable = True,
    name = "embedding_variable",
    dtype = tf.float64,
    expected_shape = [vocabulary_size, embedding_size]) # shape = (vocabulary_size, embedding_size)

  # Get dynamic batch size in case there was a partially filled batch
  cur_batch_size = tf.shape(input = question_split_words_ids_tensor, out_type = tf.int64)[0] # shape = ()

  # Calculate the max number of choices across all questions in the batch
  max_num_choices_across_batch = tf.reduce_max(input_tensor = choices_num_sentences) # shape = ()

  max_num_choice_words_across_batch = tf.reduce_max(input_tensor = choices_num_words) # shape = ()

  # Gather the embedding vectors for the question's words
  # shape = (cur_batch_size, max_num_question_words_across_batch, embedding_size)
  question_split_words_embeddings_tensor = tf.nn.embedding_lookup(
    params = embeddings_variable,
    ids = question_split_words_ids_tensor)

  # Gather the embedding vectors for the answer's words
  # shape = (cur_batch_size, max_num_choices_across_batch, max_num_choice_words_across_batch, embedding_size)
  choices_split_words_embeddings_tensor = tf.nn.embedding_lookup(
    params = embeddings_variable,
    ids = choices_split_words_ids_tensor)

  # Average only the real (non-padding) word embeddings of each question
  # shape = (cur_batch_size, embedding_size)
  question_cbow = tf.map_fn(
    fn = lambda x: tf.reduce_mean(
      input_tensor = tf.gather(
        params = question_split_words_embeddings_tensor[x[0], :, :],
        indices = tf.range(start = 0, limit = x[1], dtype = tf.int64),
        axis = 0),
      axis = 0),
    elems = [tf.range(start = 0, limit = cur_batch_size, dtype = tf.int64),
             question_num_words],
    dtype = tf.float64)

  ################################################################################

  # Create the input layer to our DNN
  network = question_cbow # shape = (cur_batch_size, embedding_size)

  # Add hidden layers with the given number of units/neurons per layer
  for units in params['dnn_hidden_units']:
    network = tf.layers.dense(
      inputs = network,
      units = units,
      activation = tf.nn.relu) # shape = (cur_batch_size, dnn_hidden_units[i])

  # Connect the final hidden layer to a dense layer with no activation to get the logits
  logits = tf.layers.dense(
    inputs = network,
    units = embedding_size,
    activation = None) # shape = (cur_batch_size, embedding_size)
  print("cbow_text_question_answer: logits = \n{}".format(logits))

  # shape = (cur_batch_size * max_num_choices_across_batch, max_num_choice_words_across_batch, embedding_size)
  choices_split_words_embeddings_tensor_flattened = tf.reshape(
    tensor = choices_split_words_embeddings_tensor,
    shape = [cur_batch_size * max_num_choices_across_batch, max_num_choice_words_across_batch, embedding_size])

  # Average only the real (non-padding) word embeddings of each choice
  # shape = (cur_batch_size * max_num_choices_across_batch, embedding_size)
  choices_flattened_cbow = tf.map_fn(
    fn = lambda x: tf.reduce_mean(input_tensor = tf.gather(
      params = choices_split_words_embeddings_tensor_flattened[x[0], :, :],
      indices = tf.range(start = 0, limit = x[1], dtype = tf.int64),
      axis = 0), axis = 0),
    elems = [tf.range(start = 0, limit = cur_batch_size * max_num_choices_across_batch, dtype = tf.int64),
             tf.reshape(tensor = choices_num_words, shape = [cur_batch_size * max_num_choices_across_batch])],
    dtype = tf.float64)

  # shape = (cur_batch_size, max_num_choices_across_batch, embedding_size)
  choices_cbow = tf.reshape(
    tensor = choices_flattened_cbow,
    shape = [cur_batch_size, max_num_choices_across_batch, embedding_size])

  # shape = (cur_batch_size, embedding_size)
  logits_normalized = tf.nn.l2_normalize(x = logits, axis = 1)
  # shape = (cur_batch_size, max_num_choices_across_batch, embedding_size)
  choices_cbow_normalized = tf.nn.l2_normalize(choices_cbow, axis = 2)
  # Cosine similarity between the projected question and every choice; map_fn
  # yields (max_num_choices_across_batch, cur_batch_size), hence the transpose
  # shape = (cur_batch_size, max_num_choices_across_batch)
  cosine_similarities = tf.transpose(
    a = tf.map_fn(
      fn = lambda x: tf.reduce_sum(
        input_tensor = tf.multiply(
          x = logits_normalized,
          y = choices_cbow_normalized[:, x, :]),
        axis = 1),
      elems = tf.range(start = 0, limit = max_num_choices_across_batch, dtype = tf.int64),
      dtype = tf.float64))

  # shape = (cur_batch_size,)
  predicted_answer_index = tf.argmax(input = cosine_similarities, axis = 1)

  # 3. Loss function, training/eval ops
  if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
    # Averaged embedding of the labeled (correct) choice of every example
    # shape = (cur_batch_size, embedding_size)
    answer_cbow = tf.map_fn(
      fn = lambda x: choices_cbow[x[0], x[1], :],
      elems = [tf.range(start = 0, limit = cur_batch_size, dtype = tf.int64), labels],
      dtype = tf.float64)

    loss = tf.losses.mean_squared_error(
      labels = answer_cbow, predictions = logits)
    train_op = tf.contrib.layers.optimize_loss(
      loss = loss,
      global_step = tf.train.get_global_step(),
      learning_rate = params['learning_rate'],
      optimizer = "Adam")
    eval_metric_ops = {
      "rmse": tf.metrics.root_mean_squared_error(
        labels = answer_cbow, predictions = logits),
      "accuracy": tf.metrics.accuracy(
        labels = labels, predictions = predicted_answer_index)
    }

    predictions_dict = None
    export_outputs = None
  else:
    # Pick, for every example, the text of its own predicted choice. Each row
    # needs a different column, so pair the row number with the predicted
    # column and use gather_nd; tf.gather along axis 1 would instead return a
    # (cur_batch_size, cur_batch_size) matrix.
    # shape = (cur_batch_size,)
    predicted_answer_text = tf.gather_nd(
      params = choices_split_sentences_strings_tensor,
      indices = tf.stack(
        values = [tf.range(start = 0, limit = cur_batch_size, dtype = tf.int64),
                  predicted_answer_index],
        axis = 1))

    loss = None
    train_op = None
    eval_metric_ops = None

    # 4. Create predictions
    predictions_dict = {
      "cosine_similarities": cosine_similarities,
      "predicted_answer_index": predicted_answer_index,
      "question_text": features["question"],
      "predicted_answer_text": predicted_answer_text}

    # 5. Create export outputs
    export_outputs = {
      "predict_export_outputs": tf.estimator.export.PredictOutput(
        outputs = predictions_dict)}

  # 6. Return EstimatorSpec
  return tf.estimator.EstimatorSpec(
  mode = mode,
  predictions = predictions_dict,
  loss = loss,
  train_op = train_op,
  eval_metric_ops = eval_metric_ops,
  export_outputs = export_outputs,
  scaffold = tf.train.Scaffold(init_feed_dict = {
    embeddings_placeholder: pre_trained_embeddings_array}))

In [13]:
# Create our serving input function to accept the data at serving and send it in the right format to our custom estimator
def serving_input_fn():
  """Builds the ServingInputReceiver used when the model is exported.

  One string placeholder of shape [None] (a batch of strings) is created for
  every CSV column except the last one (the label). The same placeholders are
  used both as the tensors that receive the request and as the features handed
  to the model function.
  """
  receiver_tensors = {}
  for feature_name in CSV_COLUMNS[0:-1]:
    receiver_tensors[feature_name] = tf.placeholder(
      dtype = tf.string,
      shape = [None])

  # No preprocessing happens at serving time: features are the raw inputs.
  return tf.estimator.export.ServingInputReceiver(
    features = receiver_tensors,
    receiver_tensors = receiver_tensors)

In [14]:
# Create estimator to train and evaluate
def train_and_evaluate(args):
  """Trains and periodically evaluates the custom CBOW estimator.

  Args:
    args: dict with the keys "output_dir", "batch_size", "dnn_hidden_units",
      "learning_rate", "train_file_pattern", "eval_file_pattern",
      "train_steps", "start_delay_secs" and "throttle_secs".
  """
  def make_input_fn(pattern_key, mode):
    """Returns the input function for the file pattern stored under `pattern_key`."""
    return read_dataset(
      filename = args[pattern_key],
      mode = mode,
      batch_size = args["batch_size"],
      params = args)

  # Only these hyperparameters are forwarded to the model function.
  model_params = {
    key: args[key]
    for key in ("batch_size", "dnn_hidden_units", "learning_rate")}

  # Custom estimator built around our model function
  estimator = tf.estimator.Estimator(
    model_fn = cbow_text_question_answer,
    model_dir = args["output_dir"],
    params = model_params)

  # Training data: read until max_steps is reached
  train_spec = tf.estimator.TrainSpec(
    input_fn = make_input_fn("train_file_pattern", tf.estimator.ModeKeys.TRAIN),
    max_steps = args["train_steps"])

  # Exporter that saves the complete model to disk
  exporter = tf.estimator.LatestExporter(
    name = "exporter",
    serving_input_receiver_fn = serving_input_fn)

  # Validation data: steps = None evaluates until the input is exhausted
  eval_spec = tf.estimator.EvalSpec(
    input_fn = make_input_fn("eval_file_pattern", tf.estimator.ModeKeys.EVAL),
    steps = None,
    start_delay_secs = args["start_delay_secs"], # start evaluating after N seconds
    throttle_secs = args["throttle_secs"],  # evaluate every N seconds
    exporters = exporter)

  # Run the combined train and evaluate loop
  tf.estimator.train_and_evaluate(
    estimator = estimator,
    train_spec = train_spec,
    eval_spec = eval_spec)

In [15]:
# Run the model
shutil.rmtree(arguments["output_dir"], ignore_errors = True) # start fresh each time
train_and_evaluate(arguments)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'trained_model_cbow', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb3e20c278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:S

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into trained_model_cbow/model.ckpt.
INFO:tensorflow:loss = 0.003289172, step = 1
INFO:tensorflow:global_step/sec: 3.10682
INFO:tensorflow:loss = 0.029770348, step = 101 (32.188 sec)
INFO:tensorflow:global_step/sec: 3.51616
INFO:tensorflow:loss = 0.01862548, step = 201 (28.440 sec)
INFO:tensorflow:global_step/sec: 3.76023
INFO:tensorflow:loss = 0.010306964, step = 301 (26.594 sec)
INFO:tensorflow:global_step/sec: 3.71767
INFO:tensorflow:loss = 0.009734049, step = 401 (26.899 sec)
INFO:tensorflow:global_step/sec: 3.76486
INFO:tensorflow:loss = 0.024016531, step = 501 (26.561 sec)
INFO:tensorflow:global_step/sec: 3.57167
INFO:tensorflow:loss = 0.0076896357, step = 601 (27.998 sec)
INFO:tensorflow:global_step/sec: 3.62561
INFO:tensorflow:loss = 0.009