## Building a Classification Model using TF 1.7

In this notebook, we will show the following:
* How to use **facets** to visualise your dataset
* Preprocess and transform raw data using **tf.transform** to .tfrecords
* Create **feature columns** based on the **transformed_metadata** produced by tf.transform
* Build a classification model using **DNNLinearCombinedClassifier**
* Use the **transform_fn** produced by tf.transform in json **serving_fn**
* Visualise your model evaluation using **TensorFlow Model Analysis**
* Use the SavedModel for predictions

Please run **"00 - install tools and libraries.ipynb"** first to prepare your environment, then restart your kernel.

## Dataset: Bank Marketing - UCI Dataset Repository
https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be subscribed ('yes') or not ('no').
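* 41188 instances in the training file (`bank-additional-full.csv`) and 4119 in the evaluation file (`bank-additional.csv`)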
* 20 (mixed) features
* 2 classes

In [None]:
%%bash

# -p makes the cell re-runnable: existing directories are not an error
mkdir -p data models tmp

### Download and Save Raw Data

In [None]:
import urllib
import zipfile
import shutil
import os

# Set to True to (re-)download the raw UCI files into the local data directory.
DOWNLOAD_DATA = False

if DOWNLOAD_DATA:

    # urllib.URLopener is the Python 2 API this notebook is written against
    testfile = urllib.URLopener()
    testfile.retrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip",
                      "data/bank-data.zip")

    print("Zip data was downloaded.")

    # the with-block closes the archive even when extraction raises
    with zipfile.ZipFile("data/bank-data.zip", 'r') as zip_ref:
        zip_ref.extractall("data")

    print("Data was unzipped.")

    # the full file becomes the train set, the small sample file the eval set
    shutil.move('data/bank-additional/bank-additional-full.csv', 'data/bank-train-01.csv')
    shutil.move('data/bank-additional/bank-additional.csv', 'data/bank-eval-01.csv')

    # remove the leftovers of the archive
    shutil.rmtree('data/bank-additional/', ignore_errors=True)

    os.remove('data/bank-data.zip')

# Only claim the files are ready when they are actually on disk.
missing_raw_files = [
    raw_file
    for raw_file in ('data/bank-train-01.csv', 'data/bank-eval-01.csv')
    if not os.path.exists(raw_file)
]

if missing_raw_files:
    print("Raw data files are missing: {} (set DOWNLOAD_DATA = True to fetch them).".format(
        ", ".join(missing_raw_files)))
else:
    print("Raw data files are ready.")

In [None]:
%%bash 

ls data

In [None]:
%%bash 

head data/bank-train-01.csv

### Define Tutorial-wide Parameters

In [None]:
import os

class Params:
    """Tutorial-wide settings: data/model locations and the switches that gate each stage."""

    # Set to run on GCP
    GCP_PROJECT_ID = ''

    # change to GCS location to run on GCP
    DATA_DIR = 'data'
    TRANSFORMED_DATA_DIR = 'data/transformed'

    # raw CSV inputs
    RAW_DATA_DELIMITER = ';'
    RAW_TRAIN_DATA_FILE = os.path.join(DATA_DIR, 'bank-train-01.csv')
    RAW_EVAL_DATA_FILE = os.path.join(DATA_DIR, 'bank-eval-01.csv')

    # file prefixes of the transformed outputs
    TRANSFORMED_TRAIN_DATA_FILE_PREFIX = os.path.join(TRANSFORMED_DATA_DIR, 'bank-train')
    TRANSFORMED_EVAL_DATA_FILE_PREFIX = os.path.join(TRANSFORMED_DATA_DIR, 'bank-eval')

    # change to GCS location to run on GCP
    TEMP_DIR = 'tmp'

    # change to GCS location to run on GCP
    MODELS_DIR = 'models'

    # where the tf.transform outputs (transform_fn, metadata) are written
    TRANSFORM_ARTEFACTS_DIR = os.path.join(MODELS_DIR, 'transform')

    # stage switches read by the cells below
    TRANSFORM = True
    TRAIN = True
    EXTEND_FEATURE_COLUMNS = True
    RESUME_TRAINING = True
    DISPLAY_FACETS = False

## Load the data to Pandas Dataframe

In [None]:
import pandas as pd
# Descriptive column names that replace the header row of the raw CSV files.
headers = [
    'age', 'job', 'marital_status', 'education',
    'has_credit_default', 'has_housing_loan',
    'has_personal_loan', 'contact_type', 'last_contact_month',
    'last_contact_day_of_week', 'last_contact_duration',
    'campaign_contact_count','days_since_last_contact',
    'previous_campaign_contact_count', 'previous_campaign_outcome',
    'employment_variation_rate', 'consumer_price_index','consumer_confidence_index',
    'euribor3m', 'number_of_employees',
    'has_subscribed'
]

# Use the shared delimiter setting rather than a second hard-coded ";".
data_train = pd.read_csv(Params.RAW_TRAIN_DATA_FILE, sep=Params.RAW_DATA_DELIMITER)
data_train.columns = headers
data_train.head(5)

In [None]:
data_train.describe()

## Visualise Dataset using Facets - Big Picture
visit: https://research.google.com/bigpicture/

* Use Stacked with categorical features to test the distribution. If used with numerical features, it will bucketise them.
* Use Scatter with numerical values to test correlations.
* Use Facets to slice and dice (vertically and horizontally).
* Use colour with the target feature.

In [None]:
from IPython.core.display import display, HTML

if Params.DISPLAY_FACETS:

    # Page snippet that loads the Facets Dive element; {jsonstr} is filled with the rows below.
    HTML_TEMPLATE = """<link rel="import" href="/nbextensions/facets-dist/facets-jupyter.html">
            <facets-dive id="elem" height="600"></facets-dive>
            <script>
              var data = {jsonstr};
              document.querySelector("#elem").data = data;
            </script>"""

    # one JSON object per dataframe row
    jsonstr = data_train.to_json(orient='records')

    html = HTML_TEMPLATE.format(jsonstr=jsonstr)
    display(HTML(html))

### Baseline Accuracy

In [None]:
def compute_baseline_accuracy(frame):
    """Return the accuracy of always predicting the most frequent `has_subscribed` class."""
    class_counts = frame.has_subscribed.value_counts()
    return float(max(class_counts)) / sum(class_counts)


base_accuracy = compute_baseline_accuracy(data_train)
print("baseline accuracy - train: {}%".format(round(base_accuracy*100,2)))

# Use the shared delimiter setting rather than a second hard-coded ";".
data_eval = pd.read_csv(Params.RAW_EVAL_DATA_FILE, sep=Params.RAW_DATA_DELIMITER)
data_eval.columns = headers

base_accuracy = compute_baseline_accuracy(data_eval)
print("baseline accuracy - eval: {}%".format(round(base_accuracy*100,2)))

# The dataframes are not used below; release their memory.
# Re-running this cell therefore requires re-running the load cell first.
del data_train
del data_eval

## Import TensorFlow & related libraries

In [None]:
import tensorflow as tf
from tensorflow import data
import tensorflow_transform as tft
import tensorflow_model_analysis as tfma
import tensorflow_transform.coders as tft_coders

from tensorflow_transform.beam import impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow.contrib.learn.python.learn.utils import input_fn_utils

from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.saved import saved_transform_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

import apache_beam as beam

## 1. Define Raw Data Metadata

### 1.1 Declare raw features types

In [None]:
# Column names of the raw CSV files, passed to the CsvCoder as column_names.
RAW_FEATURE_NAMES = [
    'age', 'job', 'marital_status', 'education', 
    'has_credit_default', 'has_housing_loan',
    'has_personal_loan', 'contact_type', 'last_contact_month', 
    'last_contact_day_of_week', 'last_contact_duration', 
    'campaign_contact_count','days_since_last_contact', 
    'previous_campaign_contact_count', 'previous_campaign_outcome', 
    'employment_variation_rate', 'consumer_price_index','consumer_confidence_index', 
    'euribor3m', 'number_of_employees', 
    'has_subscribed'
]


# Features declared as tf.float32 in the raw schema; scaled and bucketized in preprocess().
NUMERIC_FEATURE_NAMES = ['age', 'last_contact_duration', 'campaign_contact_count',
    'days_since_last_contact', 'previous_campaign_contact_count',
    'employment_variation_rate', 'consumer_price_index',
    'consumer_confidence_index', 'euribor3m', 'number_of_employees'
]

# Features declared as tf.string in the raw schema; preprocess() passes them through.
CATEGORICAL_FEATURE_NAMES = ['contact_type','education','has_credit_default',
    'has_housing_loan', 'has_personal_loan', 'job', 'last_contact_day_of_week',
    'last_contact_month', 'marital_status','previous_campaign_outcome'
]

# Name of the label column (a tf.string column in the raw schema).
TARGET_FEATURE_NAME = 'has_subscribed'

# Label vocabulary; the list position is the class index used by the label lookup table.
TARGET_LABELS = ['yes', 'no']

### 1.2 Create a tf.transform metadata object (including Schema)

In [None]:
def create_raw_metadata():
    """Build the tf.transform DatasetMetadata describing the raw CSV columns.

    Numeric features are declared as tf.float32, categorical features and the
    target as tf.string; every column uses a fixed representation with shape [].

    Returns:
        A dataset_metadata.DatasetMetadata wrapping the raw schema.
    """

    def _scalar_column(dtype):
        # one ColumnSchema per column: given dtype, shape [], fixed representation
        return dataset_schema.ColumnSchema(
            dtype, [], dataset_schema.FixedColumnRepresentation())

    column_schemas = {}

    for numeric_name in NUMERIC_FEATURE_NAMES:
        column_schemas[numeric_name] = _scalar_column(tf.float32)

    for categorical_name in CATEGORICAL_FEATURE_NAMES:
        column_schemas[categorical_name] = _scalar_column(tf.string)

    column_schemas[TARGET_FEATURE_NAME] = _scalar_column(tf.string)

    return dataset_metadata.DatasetMetadata(dataset_schema.Schema(column_schemas))

#create_raw_metadata().schema.as_feature_spec()

## 2. Prepare raw data

### 2.1 Data processing functions

In [None]:
NUM_BUCKETS = 4

def preprocess(input_features):
    """tf.transform preprocessing function applied to the raw features.

    Args:
        input_features: dict mapping raw feature names to tensors.

    Returns:
        dict holding the unchanged target, a `<name>_scaled` and a
        `<name>_bucketized` entry per numeric feature, and each categorical
        feature passed through unchanged.
    """

    transformed = {TARGET_FEATURE_NAME: input_features[TARGET_FEATURE_NAME]}

    for numeric_name in NUMERIC_FEATURE_NAMES:
        raw_values = input_features[numeric_name]

        transformed[numeric_name + "_scaled"] = tft.scale_to_0_1(raw_values)

        # bucket boundaries come from the quantiles of the analyzed data
        boundaries = tft.quantiles(raw_values, num_buckets=NUM_BUCKETS, epsilon=0.01)
        transformed[numeric_name + "_bucketized"] = tft.apply_buckets(
            raw_values, bucket_boundaries=boundaries)

    for categorical_name in CATEGORICAL_FEATURE_NAMES:
        raw_values = input_features[categorical_name]

        # result is discarded: the call is made for the vocabulary file named after the feature
        tft.uniques(raw_values, vocab_filename=categorical_name)
        transformed[categorical_name] = raw_values

    return transformed

### 2.2 Transformation Beam pipeline

In [None]:
import os

def run_transformation_pipeline(runner, options):
    """Run the tf.transform Beam pipeline over the raw train and eval CSV files.

    The train data is analyzed and transformed with `preprocess`; the resulting
    transform_fn is then applied to the eval data. Both transformed datasets are
    written as .tfrecords files and the transform_fn is written to
    Params.TRANSFORM_ARTEFACTS_DIR.

    Args:
        runner: Beam runner name, e.g. 'DirectRunner' or 'DataflowRunner'.
        options: dict of pipeline options, expanded as keyword arguments into
            beam PipelineOptions.
    """

    options = beam.pipeline.PipelineOptions(flags=[], **options)

    print("Source raw train data files: {}".format(Params.RAW_TRAIN_DATA_FILE))
    print("Source raw eval data files: {}".format(Params.RAW_EVAL_DATA_FILE))

    print("Sink transformed train data files: {}".format(Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX))
    print("Sink transformed eval data files: {}".format(Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX))
    print("Sink transform artefacts directory: {}".format(Params.TRANSFORM_ARTEFACTS_DIR))

    print("Temporary directory: {}".format(Params.TEMP_DIR))
    print("")

    def _read_raw_dataset(pipeline, step, raw_data_file, converter, raw_metadata):
        """Read a raw CSV file and pair the decoded rows with the raw metadata."""
        # NOTE(review): every line of the file is decoded as data. The pandas cell treats
        # the first CSV line as a header; if these files still contain it, pass
        # skip_header_lines=1 to ReadFromText -- confirm against the actual files.
        raw_data = (
            pipeline
            | '{} - Read Raw Data'.format(step) >> beam.io.textio.ReadFromText(raw_data_file)
            | '{} - Remove Empty Rows'.format(step) >> beam.Filter(lambda line: line)
            | '{} - Decode CSV Data'.format(step) >> beam.Map(converter.decode)
        )
        return (raw_data, raw_metadata)

    def _write_transformed_data(transformed_data, step, file_path_prefix, schema):
        """Write transformed rows to .tfrecords files as tf.Example protos."""
        return (
            transformed_data
            | '{} - Write Transformed Data'.format(step) >> beam.io.tfrecordio.WriteToTFRecord(
                file_path_prefix=file_path_prefix,
                file_name_suffix=".tfrecords",
                coder=tft_coders.example_proto_coder.ExampleProtoCoder(schema))
        )

    # Leaving the `with beam.Pipeline` block runs the pipeline and waits for it to finish,
    # so no explicit pipeline.run() is needed (a second run would launch the job twice).
    with beam.Pipeline(runner, options=options) as pipeline:
        with impl.Context(Params.TEMP_DIR):

            raw_metadata = create_raw_metadata()
            converter = tft_coders.csv_coder.CsvCoder(column_names=RAW_FEATURE_NAMES,
                                                      delimiter=Params.RAW_DATA_DELIMITER,
                                                      schema=raw_metadata.schema)

            ###### analyze & transform train #########################################################
            if runner == 'DirectRunner':
                print("Transform training data....")

            step = 'train'

            raw_train_dataset = _read_raw_dataset(
                pipeline, step, Params.RAW_TRAIN_DATA_FILE, converter, raw_metadata)

            # analyze and transform raw_train_dataset to produce transformed_train_dataset and transform_fn
            transformed_train_dataset, transform_fn = (
                raw_train_dataset
                | '{} - Analyze & Transform'.format(step) >> impl.AnalyzeAndTransformDataset(preprocess)
            )

            # get data and schema separately from the transformed_train_dataset
            transformed_train_data, transformed_metadata = transformed_train_dataset

            _ = _write_transformed_data(
                transformed_train_data, step,
                Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX, transformed_metadata.schema)

            ###### transform eval ##################################################################
            if runner == 'DirectRunner':
                print("Transform eval data....")

            step = 'eval'

            raw_eval_dataset = _read_raw_dataset(
                pipeline, step, Params.RAW_EVAL_DATA_FILE, converter, raw_metadata)

            # transform eval data based on the transform_fn produced from analyzing the train data
            transformed_eval_dataset = (
                (raw_eval_dataset, transform_fn)
                | '{} - Transform'.format(step) >> impl.TransformDataset()
            )

            # only the data is needed; the schema is the one produced for the train data
            transformed_eval_data, _ = transformed_eval_dataset

            _ = _write_transformed_data(
                transformed_eval_data, step,
                Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX, transformed_metadata.schema)

            ###### write transformation metadata #######################################################
            if runner == 'DirectRunner':
                print("Saving transformation artefacts ....")

            # write transform_fn as tf.graph
            _ = (
                transform_fn
                | 'Write Transform Artefacts' >> transform_fn_io.WriteTransformFn(Params.TRANSFORM_ARTEFACTS_DIR)
            )

### 2.3 Run transformation pipeline

In [None]:
# %%writefile requirements.txt
# tensorflow-transform==0.6.0

In [None]:
import shutil
from datetime import datetime

if not Params.TRANSFORM:
    print("Transformation was skipped!")
else:

    tf.logging.set_verbosity(tf.logging.ERROR)

    runner = 'DirectRunner' # DirectRunner | DataflowRunner

    # timestamped so every launch gets a distinct job name
    job_name = 'preprocess-data-tft-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S'))
    print('Launching {} job {} ... hang on'.format(runner, job_name))
    print("")

    # passed as keyword arguments to the Beam PipelineOptions
    options = dict(
        region='europe-west1',
        staging_location=os.path.join(Params.DATA_DIR, 'tmp', 'staging'),
        temp_location=os.path.join(Params.DATA_DIR, 'tmp'),
        job_name=job_name,
        project=Params.GCP_PROJECT_ID,
        worker_machine_type='n1-standard-1',
        max_num_workers=20,
        teardown_policy='TEARDOWN_ALWAYS',
        no_save_main_session=True,
        requirements_file='requirements.txt',
    )

    # NOTE(review): the pipeline is only launched from this DirectRunner branch; with
    # runner = 'DataflowRunner' this cell launches nothing -- confirm that is intended.
    if runner == 'DirectRunner':

        # start from a clean slate: drop the outputs of any previous local run
        for stale_dir in (Params.TRANSFORM_ARTEFACTS_DIR,
                          Params.TRANSFORMED_DATA_DIR,
                          Params.TEMP_DIR):
            shutil.rmtree(stale_dir, ignore_errors=True)

        run_transformation_pipeline(runner, options)
        print("Transformation is done!")

In [None]:
%%bash

echo "** transformed data:"
ls data/transformed
echo ""

echo "** transform artefacts:"
ls models/transform
echo ""

echo "** transform assets:"
ls models/transform/transform_fn/assets

In [None]:
def get_vocabulary_file_by_name(transform_artefacts_dir, key):
    """Return the path of the vocabulary asset file for a feature.

    Args:
        transform_artefacts_dir: directory the transform_fn was written to.
        key: feature name; any '_integerized' substring is removed to get the file name.
    """
    vocabulary_name = key.replace('_integerized','')
    assets_dir = os.path.join(
        transform_artefacts_dir, transform_fn_io.TRANSFORM_FN_DIR, 'assets')
    return os.path.join(assets_dir, vocabulary_name)


def get_vocabulary_size_by_name(transform_artefacts_dir, key):
    """Return the number of lines in the vocabulary asset file of a feature."""
    vocabulary_path = get_vocabulary_file_by_name(transform_artefacts_dir, key)
    line_count = 0
    with tf.gfile.Open(vocabulary_path, 'r') as vocabulary_file:
        for _ in vocabulary_file:
            line_count += 1
    return line_count

In [None]:
import tensorflow as tf
from tensorflow import data

print "tensorflow version: {}".format(tf.__version__)

## 3. Define Data Input Function for the Model

In [None]:
# Metadata written next to the transform_fn describes the transformed features.
transformed_metadata = metadata_io.read_metadata(
    os.path.join(Params.TRANSFORM_ARTEFACTS_DIR,"transformed_metadata"))

transformed_feature_spec = transformed_metadata.schema.as_feature_spec()


def _transformed_names_with_suffix(suffix):
    """Return the transformed feature names that end with `suffix`."""
    return [name for name in transformed_feature_spec.keys() if name.endswith(suffix)]


# Transformed features are grouped by the suffix given to them in preprocess().
TRANSFORMED_NUMERIC_FEATURE_NAMES = _transformed_names_with_suffix('_scaled')

TRANSFORMED_BUCKETIZED_FEATURE_NAMES = _transformed_names_with_suffix('_bucketized')

# categorical features keep their raw names
TRANSFORMED_CATEGORICAL_FEATURE_NAMES = CATEGORICAL_FEATURE_NAMES

TRANSFORMED_INTEGERIZED_CATEGORICAL_FEATURE_NAMES = _transformed_names_with_suffix('_integerized')

print(TRANSFORMED_NUMERIC_FEATURE_NAMES)
print("")
print(TRANSFORMED_BUCKETIZED_FEATURE_NAMES)
print("")
print(TRANSFORMED_CATEGORICAL_FEATURE_NAMES)
print("")
print(TRANSFORMED_INTEGERIZED_CATEGORICAL_FEATURE_NAMES)

In [None]:
def parse_tf_example(tf_example):
    """Parse serialized tf.Example protos using the transformed feature spec.

    Returns:
        (features, target): the parsed feature dict without the target entry,
        and the target tensor that was removed from it.
    """
    features = tf.parse_example(serialized=tf_example, features=transformed_feature_spec)
    label = features.pop(TARGET_FEATURE_NAME)
    return features, label

In [None]:
# To be applied in both training and serving.
# Ideally, put this logic in the tf.transform `preprocess` function, to avoid re-processing the records on every pass during training.

def process_features(features):
    """Hook for feature processing shared by the training input_fn and the serving_fn.

    Currently the identity: the given features object is returned unchanged.
    """
    return features

In [None]:
def generate_tfrecords_input_fn(files_name_pattern, 
                                mode=tf.estimator.ModeKeys.EVAL,  
                                num_epochs=1, 
                                batch_size=500):
    """Create an input_fn that reads transformed .tfrecords files.

    Args:
        files_name_pattern: pattern of the TFRecord files to read.
        mode: records are shuffled only when this is ModeKeys.TRAIN.
        num_epochs: number of times the batched dataset is repeated.
        batch_size: number of records per batch.

    Returns:
        A zero-argument function returning a (features, target) pair of tensors.
    """

    def _input_fn():

        matching_files = data.Dataset.list_files(files_name_pattern)
        records = data.TFRecordDataset(filenames=matching_files)

        # shuffling happens on single records, before they are batched
        if mode == tf.estimator.ModeKeys.TRAIN:
            records = records.shuffle(buffer_size=2 * batch_size + 1)

        # parse whole batches of serialized examples at once
        batches = (
            records
            .batch(batch_size)
            .map(lambda tf_example: parse_tf_example(tf_example))
            .map(lambda features, target: (process_features(features), target))
            .repeat(num_epochs)
        )

        features, target = batches.make_one_shot_iterator().get_next()
        return features, target

    return _input_fn

## 4. Create Feature Columns

In [None]:
from tensorflow.python.feature_column import feature_column

def extend_feature_columns(feature_columns, hparams):
    """Hook for adding derived feature columns.

    Currently the identity: `feature_columns` is returned unchanged and
    `hparams` is not used.
    """
    return feature_columns

In [None]:
def create_feature_columns(hparams):
    """Create one feature column per transformed feature, keyed by feature name.

    Args:
        hparams: passed on to extend_feature_columns when
            Params.EXTEND_FEATURE_COLUMNS is set.

    Returns:
        dict mapping feature name to feature column.
    """

    feature_columns = {}

    # scaled numeric features are used as plain numeric columns
    for name in TRANSFORMED_NUMERIC_FEATURE_NAMES:
        feature_columns[name] = tf.feature_column.numeric_column(name)

    # bucketized features already hold integer ids
    for name in TRANSFORMED_BUCKETIZED_FEATURE_NAMES:
        feature_columns[name] = tf.feature_column.categorical_column_with_identity(
            name, num_buckets=NUM_BUCKETS+2)

    # integerized features: identity column sized by the number of vocabulary lines
    for name in TRANSFORMED_INTEGERIZED_CATEGORICAL_FEATURE_NAMES:
        feature_columns[name] = tf.feature_column.categorical_column_with_identity(
            key=name,
            num_buckets=get_vocabulary_size_by_name(Params.TRANSFORM_ARTEFACTS_DIR, name))

    # string features are looked up in the vocabulary files written by tf.transform
    for name in TRANSFORMED_CATEGORICAL_FEATURE_NAMES:
        feature_columns[name] = tf.feature_column.categorical_column_with_vocabulary_file(
            key=name,
            vocabulary_file=get_vocabulary_file_by_name(Params.TRANSFORM_ARTEFACTS_DIR, name))

    if Params.EXTEND_FEATURE_COLUMNS:
        return extend_feature_columns(feature_columns, hparams)

    return feature_columns

In [None]:
from tensorflow.python.feature_column import feature_column

def get_wide_deep_columns(hparams):
    """Split the feature columns into the wide (linear) and deep (DNN) inputs.

    Args:
        hparams: reads `use_indicators` (add indicator columns of the categorical
            columns to the deep side) and `use_wide_columns` (return the
            categorical and sparse columns as the wide side, otherwise none).

    Returns:
        (wide_feature_columns, deep_feature_columns) as two lists.
    """

    dense_types = (
        feature_column._NumericColumn,
        feature_column._EmbeddingColumn,
    )
    categorical_types = (
        feature_column._VocabularyListCategoricalColumn,
        feature_column._VocabularyFileCategoricalColumn,
        feature_column._IdentityCategoricalColumn,
        feature_column._BucketizedColumn,
    )
    sparse_types = (
        feature_column._HashedCategoricalColumn,
        feature_column._CrossedColumn,
    )

    all_columns = list(create_feature_columns(hparams).values())

    dense_columns = [column for column in all_columns if isinstance(column, dense_types)]
    categorical_columns = [column for column in all_columns if isinstance(column, categorical_types)]
    sparse_columns = [column for column in all_columns if isinstance(column, sparse_types)]

    # categorical columns only reach the DNN when wrapped as indicator columns
    if hparams.use_indicators:
        indicator_columns = [
            tf.feature_column.indicator_column(column) for column in categorical_columns
        ]
    else:
        indicator_columns = []

    deep_feature_columns = dense_columns + indicator_columns

    if hparams.use_wide_columns:
        wide_feature_columns = categorical_columns + sparse_columns
    else:
        wide_feature_columns = []

    return wide_feature_columns, deep_feature_columns

# get_wide_deep_columns(tf.contrib.training.HParams(
#     use_indicators = False,
#     use_wide_columns = True))

## 5. Define Estimators

### Define evaluation metrics

In [None]:
def parse_label_column(label_string_tensor):
    """Look up each label string in TARGET_LABELS and return the resulting indices."""
    label_vocabulary = tf.constant(TARGET_LABELS)
    lookup_table = tf.contrib.lookup.index_table_from_tensor(label_vocabulary)
    return lookup_table.lookup(label_string_tensor)


def metric_fn(labels, predictions):
    """Extra evaluation metric added to the estimator.

    Args:
        labels: string label tensor; converted to class indices via TARGET_LABELS.
        predictions: estimator predictions dict; only 'class_ids' is read.

    Returns:
        dict with a single mean-per-class-accuracy metric.
    """
    label_ids = parse_label_column(labels)
    predicted_ids = predictions['class_ids']

    # NOTE(review): the key is spelled 'mirco_accuracy'; kept as-is because renaming it
    # would change the name under which the metric is reported.
    return {
        'mirco_accuracy': tf.metrics.mean_per_class_accuracy(
            labels=label_ids,
            predictions=predicted_ids,
            num_classes=len(TARGET_LABELS)
        )
    }

### DNN Classification Model

In [None]:
def create_dnn_estimator(run_config, hparams):
    """Build the DNNLinearCombinedClassifier and attach the metrics of metric_fn.

    Args:
        run_config: estimator run configuration.
        hparams: reads `hidden_units`, `learning_rate` and `dropout_prob`, and is
            passed on to get_wide_deep_columns.

    Returns:
        The estimator returned by tf.contrib.estimator.add_metrics.
    """

    print("creating a dnn linear combined estimator...")
    print("")

    wide_columns, deep_columns = get_wide_deep_columns(hparams)

    classifier = tf.estimator.DNNLinearCombinedClassifier(
        # target: string labels mapped through the label vocabulary
        n_classes=len(TARGET_LABELS),
        label_vocabulary=TARGET_LABELS,
        # inputs
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        # deep part
        dnn_hidden_units=hparams.hidden_units,
        dnn_optimizer=tf.train.AdamOptimizer(learning_rate=hparams.learning_rate),
        dnn_activation_fn=tf.nn.elu,
        dnn_dropout=hparams.dropout_prob,
        config=run_config
    )

    return tf.contrib.estimator.add_metrics(
        estimator=classifier,
        metric_fn=metric_fn
    )

## 6. Train, Evaluate, and Export Experiment

### 6.1 Set HParams and RunConfig

In [None]:
TRAIN_SIZE = 41188
NUM_EPOCHS = 10
BATCH_SIZE = 500
# Floor division: the same value Python 2's integer `/` produced, and max_steps stays an int.
TOTAL_STEPS = (TRAIN_SIZE//BATCH_SIZE)*NUM_EPOCHS
EVAL_EVERY_SEC = 30

hparams  = tf.contrib.training.HParams(
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    
    embedding_size = 3,
    
    use_indicators = False,
    use_wide_columns = True,
    learning_rate = 0.01,
    
    hidden_units=[16, 12, 8],
    dropout_prob = 0.0,
    
    max_steps = TOTAL_STEPS,

)

MODEL_NAME = 'dnn_estimator' # 'tree_estimator' | 'dnn_estimator'
model_dir = os.path.join(Params.MODELS_DIR, MODEL_NAME)

# checkpoints are saved every EVAL_EVERY_SEC seconds; only the 3 most recent are kept
run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    log_step_count_steps=1000,
    save_checkpoints_secs=EVAL_EVERY_SEC,
    keep_checkpoint_max=3,
    model_dir=model_dir
)


# Single-argument print calls: `print("a", b)` would print a tuple under Python 2.
print(hparams)
print("")
print("Model Directory: {}".format(run_config.model_dir))
print("Dataset Size: {}".format(TRAIN_SIZE))
print("Batch Size: {}".format(BATCH_SIZE))
print("Steps per Epoch: {}".format(TRAIN_SIZE//BATCH_SIZE))
print("Total Steps: {}".format(TOTAL_STEPS))

### 6.2 Define serving function

In [None]:
def generate_json_serving_fn():
    """Create a serving input receiver fn that accepts raw (untransformed) features.

    The raw feature placeholders (without the target) are passed through the saved
    transform_fn and then through process_features before reaching the model.

    Returns:
        A zero-argument function returning a ServingInputReceiver.
    """

    # batched placeholders for every raw column except the target
    raw_placeholder_spec = create_raw_metadata().schema.as_batched_placeholders()
    raw_placeholder_spec.pop(TARGET_FEATURE_NAME)

    saved_transform_dir = os.path.join(
        Params.TRANSFORM_ARTEFACTS_DIR, transform_fn_io.TRANSFORM_FN_DIR)

    def _serving_fn():

        raw_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            raw_placeholder_spec)
        raw_features, _unused_receiver_tensors, _ = raw_receiver_fn()

        # apply the saved transform_fn to the raw features
        _, transformed_features = saved_transform_io.partially_apply_saved_transform(
            saved_transform_dir, raw_features)

        # same feature processing hook as used by the training input_fn
        model_features = process_features(transformed_features)

        return tf.estimator.export.ServingInputReceiver(model_features, raw_features)

    return _serving_fn

### 6.3 Create TrainSpec and EvalSpec

In [None]:
# Training reads the transformed train files for hparams.num_epochs epochs,
# and stops at hparams.max_steps.
train_spec = tf.estimator.TrainSpec(
    max_steps=hparams.max_steps,
    hooks=None,
    input_fn=generate_tfrecords_input_fn(
        Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX+"*",
        mode=tf.estimator.ModeKeys.TRAIN,
        batch_size=hparams.batch_size,
        num_epochs=hparams.num_epochs
    )
)

# Evaluation makes one pass over the transformed eval files (steps=None) and
# exports the serving model through the LatestExporter.
eval_spec = tf.estimator.EvalSpec(
    steps=None,
    throttle_secs=EVAL_EVERY_SEC,
    input_fn=generate_tfrecords_input_fn(
        Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX+"*",
        mode=tf.estimator.ModeKeys.EVAL,
        batch_size=hparams.batch_size,
        num_epochs=1
    ),
    exporters=[
        tf.estimator.LatestExporter(
            name="estimate", # the name of the folder in which the model will be exported to under export
            serving_input_receiver_fn=generate_json_serving_fn(),
            exports_to_keep=1,
            as_text=False
        )
    ]
)

### 6.4 Run experiment

In [None]:
if not Params.TRAIN:
    print("Training was skipped!")
else:
    # either continue from the checkpoints in model_dir or start from scratch
    if Params.RESUME_TRAINING:
        print("Resuming training...")
    else:
        print("Removing previous training artefacts...")
        shutil.rmtree(model_dir, ignore_errors=True)

    tf.logging.set_verbosity(tf.logging.INFO)

    time_start = datetime.utcnow()
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................")

    estimator = create_dnn_estimator(run_config, hparams)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow()
    time_elapsed = time_end - time_start

    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

In [None]:
%%bash

ls models/dnn_estimator/export

## 7. Evaluate the model

In [None]:
TRAIN_SIZE = 41188
VALID_SIZE = 4119

tf.logging.set_verbosity(tf.logging.ERROR)

estimator = create_dnn_estimator(run_config, hparams)

# Each split is evaluated in one step, with a batch size equal to the split size.
train_metrics = estimator.evaluate(
    input_fn = generate_tfrecords_input_fn(
        files_name_pattern= Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX+"*", 
        mode= tf.estimator.ModeKeys.EVAL,
        batch_size= TRAIN_SIZE), 
    steps=1
)


print("############################################################################################")
print("# Train Measures: {}".format(train_metrics))
print("############################################################################################")

# The eval split is sized by VALID_SIZE (previously TRAIN_SIZE was reused here).
eval_metrics = estimator.evaluate(
    input_fn=generate_tfrecords_input_fn(
        files_name_pattern= Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX+"*", 
        mode= tf.estimator.ModeKeys.EVAL,
        batch_size= VALID_SIZE), 
    steps=1
)
print("")
print("############################################################################################")
print("# Valid Measures: {}".format(eval_metrics))
print("############################################################################################")


## 8. Model Evaluation Analysis Using TFMA

In [None]:
import tensorflow_model_analysis as tfma

### 8.1 Define Evaluate input function

In [None]:
def generate_eval_receiver_fn(transform_artefacts_dir):
    """Create the TFMA eval input receiver fn for serialized, transformed examples.

    Args:
        transform_artefacts_dir: directory containing the "transformed_metadata" folder.

    Returns:
        A zero-argument function returning a tfma.export.EvalInputReceiver whose
        labels are the target entry of the parsed features.
    """

    metadata_dir = "/".join([transform_artefacts_dir, "transformed_metadata"])
    feature_spec = metadata_io.read_metadata(metadata_dir).schema.as_feature_spec()

    def _eval_receiver_fn():

        # the exported graph receives a batch of serialized tf.Example protos
        examples_placeholder = tf.placeholder(
            dtype=tf.string, shape=[None], name='input_example_placeholder')

        parsed_features = tf.parse_example(examples_placeholder, feature_spec)

        return tfma.export.EvalInputReceiver(
            features=parsed_features,
            receiver_tensors={'examples': examples_placeholder},
            labels=parsed_features[TARGET_FEATURE_NAME])

    return _eval_receiver_fn

### 8.2 Export Evaluation Saved Model

In [None]:
# The evaluation SavedModel is exported next to the serving export.
eval_model_dir = "/".join([model_dir, "export", "evaluate"])

# drop any previous evaluation export before writing a new one
shutil.rmtree(eval_model_dir, ignore_errors=True)

tfma.export.export_eval_savedmodel(
    eval_input_receiver_fn=generate_eval_receiver_fn(Params.TRANSFORM_ARTEFACTS_DIR),
    export_dir_base=eval_model_dir,
    estimator=estimator
)

In [None]:
%%bash

ls models/dnn_estimator/export

### 8.3 Define Data SliceSpecs for Model Analysis

In [None]:
# An empty SingleSliceSpec first, then one single-column slice spec per transformed feature.
slice_spec = [tfma.SingleSliceSpec()] + [
    tfma.SingleSliceSpec(columns=[feature_name])
    for feature_name in (TRANSFORMED_NUMERIC_FEATURE_NAMES
                         + TRANSFORMED_BUCKETIZED_FEATURE_NAMES
                         + TRANSFORMED_CATEGORICAL_FEATURE_NAMES)
]

# the evaluation export lives in a sub-directory of eval_model_dir
model_location = os.path.join(eval_model_dir, os.listdir(eval_model_dir)[0])
data_location = Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX+"*.tfrecords"

eval_result = tfma.run_model_analysis(
    slice_spec=slice_spec,
    file_format='tfrecords',
    data_location=data_location,
    model_location=model_location
)

# print eval_result

### 8.4 Visualise TF Model Analysis

In [None]:
# Render the analysis result sliced by the 'job' column.
tfma.view.render_slicing_metrics(
    result=eval_result, 
    slicing_column='job'
)

## 9. Predict Using the Serving Saved Model

In [None]:
%%bash

ls models/dnn_estimator/export/estimate

In [None]:
saved_model_base_dir=os.path.join(model_dir,'export/estimate')

# os.listdir() returns entries in arbitrary order, so pick the export explicitly:
# only all-digit (timestamp) directory names are considered, and the newest one wins.
SAVED_MODEL_DIR=os.path.join(
    saved_model_base_dir,
    max(
        [export_name for export_name in os.listdir(saved_model_base_dir) if export_name.isdigit()],
        key=int
    )
)

def predict(instance):
    """Run the exported serving model on a single raw instance.

    Args:
        instance: dict mapping raw feature names to a single value each.

    Returns:
        The output of the predictor for a batch holding that one instance.
    """

    # the SavedModel is loaded on every call; fine for a one-off prediction
    predictor_fn = tf.contrib.predictor.from_saved_model(
        export_dir=SAVED_MODEL_DIR,
        signature_def_key="predict"
    )

    # wrap every value in a list: the predictor is fed a batch of size one
    instance = dict((k, [v]) for k, v in instance.items())
    value = predictor_fn(instance)
    return value

# one raw record, without the target column
instance = {
    "age":56,
    "job":"housemaid",
    "marital_status":"married",
    "education":"basic.4y",
    "has_credit_default":"no",
    "has_housing_loan":"no",
    "has_personal_loan":"no",
    "contact_type":"telephone",
    "last_contact_month":"may",
    "last_contact_day_of_week":"mon",
    "last_contact_duration":261,
    "campaign_contact_count":1,
    "days_since_last_contact":999,
    "previous_campaign_contact_count":0,
    "previous_campaign_outcome":"nonexistent",
    "employment_variation_rate":1.1,
    "consumer_price_index":93.994,
    "consumer_confidence_index":-36.4,
    "euribor3m":4.857,
    "number_of_employees":5191.0,
    #"has_subscribed":"no"
}

prediction = predict(instance)
print(prediction)

## Congratulations! You have finished the tutorial.