Copyright 2016 The TF Codelab Contributors. All Rights Reserved.
Copyright 2016 The TensorFlow Authors. All Rights Reserved.
http://www.apache.org/licenses/LICENSE-2.0

This code was originaflly presented at GDGSpain DevFest.
Using character prediction from Tensorflow
https://github.com/bigpress/gameofthrones/blob/master/character-predictions.csv

Latest version is always available at: https://github.com/codelab-tf-got/code/
Codelab test is available at: https://codelab-tf-cot.github.io


#### Import Python 2-3 compatibility glue, ETL (pandas) and ML (TensorFlow/sklearn) libraries

In [122]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse, sys, tempfile

#### Import ETL tools (pandas)

In [123]:
import numpy as np
import pandas as pd

#### Import Machine Learning tools (TensorFlow/sklearn)

In [124]:
import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_runner
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils

from sklearn import cross_validation # to split the train/test cases

#### Uncomment the logging lines to see logs in the console to get to know better what this code does.

In [125]:
import logging
logger = logging.getLogger('net_mk1')
# logger.setLevel(logging.DEBUG)
# ch = logging.StreamHandler()
# ch.setLevel(logging.DEBUG)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# ch.setFormatter(formatter)
# logger.addHandler(ch)
## End set up logging


# Stop tensorflow from getting chatty with us
tf.logging.set_verbosity(tf.logging.ERROR)
# tf.logging.set_verbosity(tf.logging.WARN)
# tf.logging.set_verbosity(tf.logging.INFO)

FLAGS = None

#### Let's define some auxiliary functions:

Helper to filter elements not already on the haystack in O(n)

In [126]:
def only_existing(l, haystack):
  s = set(haystack)
  return [item for item in l if item in s]

Downloads a dataset for this codelab. Change the **local_path** if you want to store your dataset elsewhere from *'../dataset'* .The buckets here are managed by @ssice.

In [127]:
def get_dataset(filename, local_path='../dataset'):    
  gcs_base = 'https://storage.googleapis.com/'
  gcs_path = 'codelab-got.appspot.com/dataset/'
  return base.maybe_download(
    filename, local_path, gcs_base + gcs_path + filename
  )

#### We get the path to de dataset

In [128]:
dataset_file_name = get_dataset('character-predictions.csv', '../dataset')

#### The columns in the dataset are the following:

In [129]:

COLUMNS = 'S.No,actual,pred,alive,plod,name,title,male,culture,dateOfBirth,mother,father,heir,house,spouse,book1,book2,book3,book4,book5,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive'.split(',')

#### Target column is the actual isAlive variable

In [130]:
LABEL_COLUMN = 'isAlive'

#### We get the rest of columns and divide them in categorical, binary and continuous columns

In [131]:
CATEGORICAL_COLUMN_NAMES = only_existing([
    'male',
    'culture',
    'mother',
    'father',
    'title',
    'heir',
    'house',
    'spouse',
    'numDeadRelations',
    'boolDeadRelations',
], COLUMNS)

BINARY_COLUMNS = only_existing([
    'book1',
    'book2',
    'book3',
    'book4',
    'book5',
    'isAliveMother',
    'isAliveFather',
    'isAliveHeir',
    'isAliveSpouse',
    'isMarried',
    'isNoble',
    'isPopular',
], COLUMNS)

CONTINUOUS_COLUMNS = only_existing([
  'age',
  'popularity',
  'dateOfBirth',
], COLUMNS)

#### We also divide the columns in features or just unused.

In [132]:
FEATURE_COLUMNS = [
  col for col in COLUMNS
  if col in CONTINUOUS_COLUMNS \
  or col in BINARY_COLUMNS \
  or col in CATEGORICAL_COLUMN_NAMES
]

UNUSED_COLUMNS = [
  col
  for col in COLUMNS
  if col != LABEL_COLUMN \
  and col not in FEATURE_COLUMNS
]

print("We are using the feature colums: %s \n" % FEATURE_COLUMNS)
print("We are not using columns: %s" % UNUSED_COLUMNS)

We are using the feature colums: ['title', 'male', 'culture', 'dateOfBirth', 'mother', 'father', 'heir', 'house', 'spouse', 'book1', 'book2', 'book3', 'book4', 'book5', 'isAliveMother', 'isAliveFather', 'isAliveHeir', 'isAliveSpouse', 'isMarried', 'isNoble', 'age', 'numDeadRelations', 'boolDeadRelations', 'isPopular', 'popularity'] 

We are not using columns: ['S.No', 'actual', 'pred', 'alive', 'plod', 'name']


#### Load the base dataframe

In [133]:
# Load the base dataframe
df_base = pd.read_csv(dataset_file_name, sep=',', names=COLUMNS, skipinitialspace=True, skiprows=1)
df_base

Unnamed: 0,S.No,actual,pred,alive,plod,name,title,male,culture,dateOfBirth,...,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
1,0,0,0.054,0.946,Viserys II Targaryen,,1,,,,...,0.0,,0,0,,11,1,1,0.605351,0
2,1,0,0.387,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,,...,,1.0,1,1,97.0,1,1,1,0.896321,1
3,1,0,0.493,0.507,Addison Hill,Ser,1,,,,...,,,0,1,,0,0,0,0.267559,1
4,0,0,0.076,0.924,Aemma Arryn,Queen,0,,82.0,105.0,...,,0.0,1,1,23.0,0,0,0,0.183946,0
5,1,1,0.617,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,,...,,1.0,1,1,29.0,0,0,0,0.043478,1
6,1,0,0.021,0.979,Tommen Baratheon,,1,,,,...,1.0,,0,0,,5,1,1,1.000000,1
7,0,0,0.014,0.986,Valarr Targaryen,Hand of the King,1,Valyrian,183.0,209.0,...,,1.0,1,1,26.0,0,0,1,0.431438,0
8,0,0,0.036,0.964,Viserys I Targaryen,,1,,,,...,1.0,,0,0,,5,1,1,0.678930,0
9,0,1,0.724,0.276,Wilbert,Ser,1,,,298.0,...,,,0,1,,0,0,0,0.006689,0
10,1,0,0.391,0.609,Wilbert Osgrey,Ser,1,,,,...,,,0,1,,0,0,0,0.020067,1


#### We re-type the binary columns so that they are strings

In [134]:
# We re-type the binary columns so that they are strings
for col in BINARY_COLUMNS:
  df_base[col] = df_base[col].astype(str)

#### We get, for each categorical column, the number of unique elements it has

In [135]:
# We get, for each categorical column, the number of unique elements
# it has.
CATEGORICAL_COLUMNS = {
    col: len(df_base[col].unique()) + 1
    for col in CATEGORICAL_COLUMN_NAMES
}

In [136]:
# preset_deep_columns = [tf.contrib.layers.real_valued_column('age', ion=1, dtype=tf.int32)]
preset_deep_columns = []

#### Obtains the deep columns of the model. 
In our model, these are the binary columns (which are embedded with keys "0" and "1") and the categorical columns, which are embedded as 8-dimensional sparse columns in hash buckets.

In [137]:
def get_deep_columns():

  cc_input_var = {}
  cc_embed_var = {}
  cols = preset_deep_columns

  for cc in BINARY_COLUMNS:
    cols.append(
      tf.contrib.layers.embedding_column(
          tf.contrib.layers.sparse_column_with_keys(
            column_name=cc,
            keys=["0", "1"],
          ),
        dimension=8)
    )

  for cc, cc_size in CATEGORICAL_COLUMNS.items():
    cc_input_var[cc] = tf.contrib.layers.embedding_column(
      tf.contrib.layers.sparse_column_with_hash_bucket(
        cc,
        hash_bucket_size=cc_size,
      ),
      dimension=8
    )

    cols.append(cc_input_var[cc])

  for column in CONTINUOUS_COLUMNS:
    cols.append(tf.contrib.layers.real_valued_column(column, dimension=1, dtype=tf.float32))

  return cols

#### Get wide columns for our model.
In this case, wide columns are just the continuous columns.

In [138]:
def get_wide_columns():
  cols = []
  for column in CONTINUOUS_COLUMNS:
    cols.append(tf.contrib.layers.real_valued_column(column, dimension=1, dtype=tf.float32))

  logger.info("Got wide columns %s", cols)
  return cols

#### General estimator builder function.
The wide/deep part construction is below. This gathers both parts and joins the model into a single classifier.

In [139]:
def build_estimator(model_dir):
  wide_columns = get_wide_columns()
  deep_columns = get_deep_columns()

  if FLAGS.model_type == "wide":
    m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
                                          feature_columns=wide_columns)
  elif FLAGS.model_type == "deep":
    m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
                                       feature_columns=deep_columns,
                                       hidden_units=[100, 50])
  else:
    m = tf.contrib.learn.DNNLinearCombinedClassifier(
      model_dir=model_dir,
      linear_feature_columns=wide_columns,
      linear_optimizer=None, ## WATCH: Linear optimizer. By default, FTRL
      dnn_feature_columns=deep_columns,
      dnn_activation_fn=None, ## WATCH: Activation function for DNN (default: relu)
      dnn_hidden_units=[100, 50], ## WATCH: Hidden units for the DNN part
      dnn_dropout=None, ## WATCH: Dropout for the DNN
      dnn_optimizer=None, ## WATCH: Optimizer for DNN (Adagrad by default)
      fix_global_step_increment_bug = True,
    )
  return m

  #### Input builder function

In [140]:
def generate_input_fn(df):
  def _input_fn():
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}

    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {
      k: tf.SparseTensor(indices=[[i, 0] for i in range(df[k].size)],
                         values=df[k].values,
                         dense_shape=[df[k].size, 1])
      for k in (list(CATEGORICAL_COLUMNS.keys()) + BINARY_COLUMNS)
    }

    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)

    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)

    # Returns the feature columns and the label.
    return feature_cols, label
  return _input_fn

#### Return the type of each column

In [141]:
def column_to_dtype(column):
  if column == LABEL_COLUMN:
    return tf.int32
  if column in CATEGORICAL_COLUMNS \
     or column in BINARY_COLUMNS:
    return tf.string
  else:
    return tf.float32

In [142]:
def serving_input_fn():
  feature_placeholders = {
    column: tf.placeholder(column_to_dtype(column), [None])
    for column in FEATURE_COLUMNS
  }

  features = {
    key: tf.expand_dims(tensor, -1)
    for key, tensor in feature_placeholders.items()
  }

  return input_fn_utils.InputFnOps(
    features,
    None,
    feature_placeholders
  )

In [143]:
def generate_experiment(output_dir, df_train, df_test):
  def _experiment_fn(output_dir):
    my_model = build_estimator(output_dir)
    experiment = tf.contrib.learn.Experiment(
      my_model,
      train_input_fn=generate_input_fn(df_train),
      eval_input_fn=generate_input_fn(df_test),
      train_steps=FLAGS.steps,
      export_strategies=[saved_model_export_utils.make_export_strategy(
        serving_input_fn,
        default_output_alternative_key=None
      )]
    )
    return experiment
  return _experiment_fn
      

#### Fill with a NaN element of the correct type to have a valid label to use in the neuron pipeline

In [144]:
def fill_dataframe(df_base):
  for col in CATEGORICAL_COLUMN_NAMES:
    df_base[col] = np.where(df_base[col].isnull(), 'NULL', df_base[col])
  for col in BINARY_COLUMNS:
    df_base[col] = np.where(df_base[col].isnull(), "0", df_base[col])
  for col in CONTINUOUS_COLUMNS:
    df_base[col] = np.where(df_base[col].isnull(), 0., df_base[col])

  for col in UNUSED_COLUMNS:
    df_base[col] = np.where(df_base[col].isnull(), 0, df_base[col])

#### Train and evaluate the model.

In [151]:
def train_and_eval(job_dir=None):
  fill_dataframe(df_base)
  logger.debug("Number of columns after removing nulls: %d (before: %d)",
               len(df_base.dropna(how='any', axis=0)),
               len(df_base))

  df_base[LABEL_COLUMN] = (
      df_base[LABEL_COLUMN].apply(lambda x: x)).astype(int)

  df_train, df_test = cross_validation.train_test_split(df_base, test_size=0.2, random_state=42)

  model_dir = tempfile.mkdtemp() if not FLAGS.model_dir else FLAGS.model_dir
  print("model directory = %s" % model_dir)

  if FLAGS.training_mode == 'manual':
    m = build_estimator(model_dir)
    m.fit(
      input_fn=lambda: input_fn(df_train),
      steps=FLAGS.steps
    )
    results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
    for key in sorted(results):
      print("%s: %s" % (key, results[key]))

  elif FLAGS.training_mode == 'learn_runner':
    experiment_fn = generate_experiment(
      model_dir, df_train, df_test
    )

    metrics, output_folder = learn_runner.run(experiment_fn, model_dir)
    for key in sorted(metrics):
      print("%s: %s" % (key, metrics[key]))
    print('Model exported to {}'.format(output_folder))

#### Main function

In [153]:
def main(_):
  train_and_eval()


if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument(
    "--training_mode",
    type=str,
    default="learn_runner",
    help="Mode to use for training (learn_runner or manual).",
  )
  parser.add_argument(
    "--model_dir",
    type=str,
    default="",
    help="Base directory for output models.",
  )
  parser.add_argument(
    "--model_type",
    type=str,
    default="wide_n_deep",
    help="Valid model types: {'wide', 'deep', 'wide_n_deep'}.",
  )
  parser.add_argument(
    "--steps",
    type=int,
    default=200,
    help="Number of training steps.",
  )

  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

model directory = /tmp/tmpg0q06ba5
accuracy: 0.997436
accuracy/baseline_label_mean: 0.758974
accuracy/threshold_0.500000_mean: 0.997436
auc: 1.0
auc_precision_recall: 1.0
global_step: 200
labels/actual_label_mean: 0.758974
labels/prediction_mean: 0.761462
loss: 0.0120042
precision/positive_threshold_0.500000_mean: 0.996633
recall/positive_threshold_0.500000_mean: 1.0
Model exported to [b'/tmp/tmpg0q06ba5/export/Servo/1511866336']


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
