In [1]:
!ls

sample_data


In [2]:
# Mount Google Drive at /content/drive; this prompts for an authorization code.
from google.colab import drive
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
import tensorflow as tf
from tensorflow import data
from datetime import datetime
import time
import multiprocessing
import shutil
import os
# Every relative path used below (data files, model directory) is resolved
# against this Drive folder.
os.chdir("/content/drive/My Drive/lower_level_api")

print(tf.__version__)

# Model name = fixed prefix + month/day/hour/minute/second of the local time
# at which this cell is executed.
MODEL_NAME = 'estimator-demo-'+ time.strftime('%m%d%H%M%S', time.localtime())

# Glob patterns for the training and validation TSV files.
TRAIN_DATA_FILES_PATTERN = 'data/sms-spam/train-*.tsv'
VALID_DATA_FILES_PATTERN = 'data/sms-spam/valid-*.tsv'

# Vocabulary file (used by the word => id lookup table) and the file holding
# the word count as a single integer.
VOCAB_LIST_FILE = 'data/sms-spam/vocab_list.tsv'
N_WORDS_FILE = 'data/sms-spam/n_words.tsv'

# When False, the run cell deletes the model directory before training.
RESUME_TRAINING = False
# When True, input_fn parses rows with one parallel call per CPU core.
MULTI_THREADING = True

# dataset metadata
# Every document is padded/truncated to this many word ids.
MAX_DOCUMENT_LENGTH = 50

# Filler token used when the sparse word tensor is converted to a dense one.
PAD_WORD = '#=KS=#'

# Column names of the TSV files and the per-column defaults passed to
# tf.decode_csv (both columns are parsed as strings).
HEADER = ['class', 'sms']
HEADER_DEFAULTS = [['NA'], ['NA']]

TEXT_FEATURE_NAME = 'sms'

TARGET_NAME = 'class'

# NOTE: the name is misspelled ("COLUNM") but other cells reference it as is.
WEIGHT_COLUNM_NAME = 'weight'

# The position in this list is the integer class id (spam -> 0, ham -> 1).
TARGET_LABELS = ['spam', 'ham']

# Embedding vocabulary size: the count stored on disk plus 2.
# NOTE(review): the +2 presumably covers the padding word and the
# out-of-vocabulary bucket -- TODO confirm against how the vocab file is built.
with open(N_WORDS_FILE) as file:
    N_WORDS = int(file.read())+2
print(N_WORDS)

# data input func

def parse_tsv_row(tsv_row):
    """Parse a single tab-separated line into ``(features, target)``.

    Args:
        tsv_row: scalar string tensor holding one line of a TSV file whose
            columns follow ``HEADER``.

    Returns:
        A tuple ``(features, target)``: ``features`` is a dict containing the
        remaining column(s) plus a float weight under ``WEIGHT_COLUNM_NAME``;
        ``target`` is the string tensor of the ``TARGET_NAME`` column.
    """
    parsed_fields = tf.decode_csv(tsv_row, record_defaults=HEADER_DEFAULTS, field_delim='\t')
    features = {name: tensor for name, tensor in zip(HEADER, parsed_fields)}

    label = features.pop(TARGET_NAME)

    # Give more weight to "spam" records, as they are only 13% of the training set.
    is_spam = tf.equal(label, 'spam')
    features[WEIGHT_COLUNM_NAME] = tf.cond(is_spam, lambda: 6.6, lambda: 1.0)

    return features, label
  
def parse_label_column(label_string_tensor):
    """Translate string labels into their integer position in ``TARGET_LABELS``.

    Args:
        label_string_tensor: string tensor of class labels.

    Returns:
        The tensor produced by looking the labels up in an index table built
        from ``TARGET_LABELS``.
    """
    label_vocab = tf.constant(TARGET_LABELS)
    label_index_table = tf.contrib.lookup.index_table_from_tensor(label_vocab)
    label_ids = label_index_table.lookup(label_string_tensor)
    return label_ids

def input_fn(files_name_pattern, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=1, 
                 batch_size=200):
    """Build the ``tf.data`` input pipeline for the SMS spam TSV files.

    Args:
        files_name_pattern: glob pattern of the TSV files to read.
        mode: a ``tf.estimator.ModeKeys`` value; rows are shuffled only in
            ``TRAIN`` mode.
        skip_header_lines: number of leading lines to drop from EACH matched
            file.
        num_epochs: how many times the batched dataset is repeated
            (``None`` repeats indefinitely).
        batch_size: number of rows per batch.

    Returns:
        A tuple ``(features, labels)`` of tensors for one batch: ``features``
        is the dict produced by ``parse_tsv_row`` and ``labels`` are the
        integer class ids produced by ``parse_label_column``.
    """
    shuffle = mode == tf.estimator.ModeKeys.TRAIN

    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1

    buffer_size = 2 * batch_size + 1

    print("")
    print("* data input_fn:")
    print("================")
    print("Input file(s): {}".format(files_name_pattern))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    file_names = tf.matching_files(files_name_pattern)

    # Read the files one after another (in the order returned by
    # matching_files) and drop the header lines of every file. Calling skip()
    # on the concatenated dataset would only remove the header of the first
    # file, letting the remaining headers through as data rows.
    dataset = data.Dataset.from_tensor_slices(file_names).flat_map(
        lambda file_name: data.TextLineDataset(file_name).skip(skip_header_lines))

    if shuffle:
        dataset = dataset.shuffle(buffer_size)

    dataset = dataset.map(parse_tsv_row, num_parallel_calls=num_threads)

    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.prefetch(buffer_size)

    iterator = dataset.make_one_shot_iterator()

    features, target = iterator.get_next()
    # Labels are converted to integer ids outside the dataset graph because
    # the lookup table cannot be created inside a one-shot iterator's map().
    return features, parse_label_column(target)

1.11.0-rc1
11332


## 3. Define Model Function

In [0]:
def process_text(text_feature):
    """Turn a batch of raw text strings into fixed-length word-id rows.

    Args:
        text_feature: 1-D string tensor, one document per entry.

    Returns:
        A 2-D tensor of word ids with ``MAX_DOCUMENT_LENGTH`` columns: longer
        documents are truncated, shorter ones are right-padded.
    """
    # Vocabulary lookup table mapping word => word_id.
    word_to_id = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=VOCAB_LIST_FILE, num_oov_buckets=1, default_value=-1)

    # Splitting into words yields a sparse tensor, since word counts vary per row.
    tokens_sparse = tf.string_split(text_feature)

    # Densify: shorter rows are filled with PAD_WORD up to the longest row in the batch.
    tokens_dense = tf.sparse_tensor_to_dense(tokens_sparse, default_value=PAD_WORD)

    # Map every word (including the filler) to its id.
    token_ids = word_to_id.lookup(tokens_dense)

    # Append MAX_DOCUMENT_LENGTH zero columns on the right so that every row
    # is long enough, then keep only the first MAX_DOCUMENT_LENGTH columns.
    right_padding = tf.constant([[0, 0], [0, MAX_DOCUMENT_LENGTH]])
    padded_ids = tf.pad(token_ids, right_padding)
    return tf.slice(padded_ids, [0, 0], [-1, MAX_DOCUMENT_LENGTH])


def model_fn(features, labels, mode, params):
    """Estimator model function: word embeddings -> stacked LSTM -> dense logits.

    Args:
        features: dict of tensors. Must contain ``TEXT_FEATURE_NAME`` (raw
            text) and, in TRAIN/EVAL modes, ``WEIGHT_COLUNM_NAME`` (per-example
            weights).
        labels: integer class ids; not used in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: hyper-parameters object exposing ``hidden_units`` (one LSTM
            layer per entry), ``embedding_size``, ``forget_bias`` and
            ``learning_rate``. ``keep_prob`` may be present but no dropout is
            applied by this model.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.
    """
    # Read every hyper-parameter from `params` so that the model does not
    # depend on a notebook-level `hparams` global being defined.
    hidden_units = params.hidden_units
    output_layer_size = len(TARGET_LABELS)
    embedding_size = params.embedding_size
    forget_bias = params.forget_bias

    # word_id_vector: (?, MAX_DOCUMENT_LENGTH)
    word_id_vector = process_text(features[TEXT_FEATURE_NAME])

    # Layer that converts each word_id into a vector (embedding):
    # (?, MAX_DOCUMENT_LENGTH, embedding_size)
    word_embeddings = tf.contrib.layers.embed_sequence(word_id_vector, vocab_size=N_WORDS,
                                                 embed_dim=embedding_size)

    # Configure the RNN: one LSTM cell per entry of hidden_units.
    rnn_layers = [tf.nn.rnn_cell.LSTMCell(
        num_units=size,
        forget_bias=forget_bias,
        activation=tf.nn.tanh) for size in hidden_units]

    # Create an RNN cell composed sequentially of a number of RNNCells.
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)

    # List of MAX_DOCUMENT_LENGTH tensors, each of shape (?, embedding_size).
    input_layer = tf.unstack(word_embeddings, axis=1)

    outputs, _ = tf.nn.static_rnn(cell=multi_rnn_cell,
                                inputs=input_layer,
                                dtype=tf.float32)

    # Keep only the output of the last time step.
    rnn_output = outputs[-1]

    # Connect the output layer (logits) to the hidden layer (no activation fn):
    # (?, output_layer_size)
    logits = tf.layers.dense(inputs=rnn_output,
                             units=output_layer_size,
                             activation=None)

    # Provide an estimator spec for `ModeKeys.PREDICT`.
    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

        # Convert predicted_indices back into strings.
        predictions = {
            'class': tf.gather(TARGET_LABELS, predicted_indices),
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }

        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    # Per-example weights; only available (and needed) outside PREDICT mode.
    weights = features[WEIGHT_COLUNM_NAME]

    # Calculate loss using weighted softmax cross entropy.
    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels,
        weights=weights
    )

    tf.summary.scalar('loss', loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Create optimiser.
        optimizer = tf.train.AdamOptimizer(params.learning_rate)

        # Create training operation.
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_global_step())

        # Provide an estimator spec for `ModeKeys.TRAIN` modes.
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

        # One-hot boolean labels are required for the AUC metric, which is
        # computed over the per-class probabilities.
        labels_one_hot = tf.one_hot(
            labels,
            depth=len(TARGET_LABELS),
            on_value=True,
            off_value=False,
            dtype=tf.bool
        )

        # Weighted accuracy and area under the ROC curve.
        eval_metric_ops = {
            'accuracy': tf.metrics.accuracy(labels, predicted_indices, weights=weights),
            'auroc': tf.metrics.auc(labels_one_hot, probabilities, weights=weights)
        }

        # Provide an estimator spec for `ModeKeys.EVAL` modes.
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)

def create_estimator(run_config, hparams):
    """Instantiate the custom Estimator around ``model_fn``.

    Args:
        run_config: ``tf.estimator.RunConfig`` passed as the estimator config.
        hparams: hyper-parameters handed to ``model_fn`` as ``params``.

    Returns:
        The constructed ``tf.estimator.Estimator``.
    """
    custom_estimator = tf.estimator.Estimator(
        model_fn=model_fn, params=hparams, config=run_config)

    # Report the estimator type, framed by blank lines.
    for line in ("", "Estimator Type: {}".format(type(custom_estimator)), ""):
        print(line)

    return custom_estimator

## 4. Run Experiment

### a. Set HParam and RunConfig

In [33]:
# Training schedule constants.
TRAIN_SIZE = 4179
NUM_EPOCHS = 100
BATCH_SIZE = 250
EVAL_AFTER_SEC = 60
# Steps per epoch (TRAIN_SIZE/BATCH_SIZE, fractional) times the number of
# epochs; int() truncates the result (1671.6 -> 1671 with the values above).
TOTAL_STEPS = int((TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS)

# Hyper-parameters handed to model_fn as `params`.
# NOTE: keep_prob is defined here but model_fn applies no dropout.
hparams  = tf.contrib.training.HParams(
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    embedding_size = 200,
    forget_bias=1.0,
    keep_prob = 0.8,
    hidden_units=[24, 16],
    max_steps = TOTAL_STEPS,
    learning_rate = 0.01
)

# Checkpoints, summaries and exports for this run are written under this
# directory (relative to the working directory set at the top of the notebook).
model_dir = 'trained_models/{}'.format(MODEL_NAME)

# Fixed random seed for the TensorFlow graph.
run_config = tf.estimator.RunConfig(
    log_step_count_steps=5000,
    tf_random_seed=19830610,
    model_dir=model_dir
)

print(hparams)
print("Model Directory:", run_config.model_dir)
print("")
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)
print("That is 1 evaluation step after each",EVAL_AFTER_SEC,"training seconds")

[('batch_size', 250), ('embedding_size', 200), ('forget_bias', 1.0), ('hidden_units', [24, 16]), ('keep_prob', 0.8), ('learning_rate', 0.01), ('max_steps', 1671), ('num_epochs', 100)]
Model Directory: trained_models/estimator-demo-0925030157

Dataset Size: 4179
Batch Size: 250
Steps per Epoch: 16.716
Total Steps: 1671
That is 1 evaluation step after each 60 training seconds


In [0]:
def serving_input_fn():
    """Describe the inputs accepted by the exported SavedModel.

    Returns:
        A ``tf.estimator.export.ServingInputReceiver`` whose single input,
        ``'sms'``, is a 1-D string placeholder; the features passed to the
        model are the same tensors as the receiver tensors.
    """
    sms_placeholder = tf.placeholder(tf.string, [None])
    receiver_tensor = {'sms': sms_placeholder}

    # No preprocessing at the serving boundary: features are a shallow copy
    # of the receiver tensors.
    features = dict(receiver_tensor)

    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)

In [0]:
# Training spec: reads the training files in TRAIN mode (which enables
# shuffling in input_fn) and stops after hparams.max_steps steps.
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda: input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

# Evaluation spec: a single pass over the validation files (input_fn defaults
# to one epoch; steps=None evaluates until the input is exhausted), followed
# by an export of the model through serving_input_fn.
eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: input_fn(
        VALID_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        batch_size=hparams.batch_size
    ),
    exporters=[tf.estimator.LatestExporter(
        name="predict", # the name of the folder in which the model will be exported to under export
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=1,
        as_text=True)],
    steps=None,
    throttle_secs = EVAL_AFTER_SEC
)

### d. Run Experiment via train_and_evaluate

In [36]:
# Start from a clean model directory unless training is being resumed.
# ignore_errors=True makes this a no-op when the directory does not exist yet.
if not RESUME_TRAINING:
    print("Removing previous artifacts...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...") 

    
tf.logging.set_verbosity(tf.logging.INFO)

# Wall-clock timing of the whole experiment (UTC).
time_start = datetime.utcnow() 
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

estimator = create_estimator(run_config, hparams)

# Runs training and evaluation according to train_spec and eval_spec.
tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec, 
    eval_spec=eval_spec
)

time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
    

Removing previous artifacts...
Experiment started at 03:19:03
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/estimator-demo-0925030157', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff47bef1fd0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Estimator Type: <class 'tensorflow.python.estim

## 5. Evaluate the Model

In [37]:
# Dataset sizes, used as batch sizes below so that one evaluation step
# (steps=1) consumes a batch of that many records.
TRAIN_SIZE = 4179
TEST_SIZE = 1393

# Both input functions run in EVAL mode, so input_fn does not shuffle.
train_input_fn = lambda: input_fn(files_name_pattern= TRAIN_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TRAIN_SIZE)

test_input_fn = lambda: input_fn(files_name_pattern= VALID_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

# A new estimator over the same run_config.
# NOTE(review): presumably evaluate() restores the latest checkpoint found in
# run_config.model_dir -- confirm the training cell has been run first.
estimator = create_estimator(run_config, hparams)

train_results = estimator.evaluate(input_fn=train_input_fn, steps=1)
print()
print("######################################################################################")
print("# Train Measures: {}".format(train_results))
print("######################################################################################")

test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print()
print("######################################################################################")
print("# Test Measures: {}".format(test_results))
print("######################################################################################")

INFO:tensorflow:Using config: {'_model_dir': 'trained_models/estimator-demo-0925030157', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff47bef1898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Estimator Type: <class 'tensorflow.python.estimator.estimator.Estimator'>


* data input_fn:
Input file(s): data/sms-spam/train-*.tsv
Batch size: 417

## 6. Predict Using Serving Function

In [38]:
import os

# Directory into which the "predict" exporter writes its SavedModels.
export_dir = model_dir +"/export/predict/"

# Each export lives in a sub-directory named after its numeric timestamp.
# os.listdir() returns entries in arbitrary order and the directory may also
# contain non-export entries, so keep only the all-digit names and pick the
# numerically largest (i.e. most recent) one instead of relying on [-1].
export_versions = [entry for entry in os.listdir(path=export_dir) if entry.isdigit()]
if not export_versions:
    raise IOError("No exported SavedModel found under {}".format(export_dir))

# os.path.join avoids the duplicated "/" that plain concatenation produced.
saved_model_dir = os.path.join(export_dir, max(export_versions, key=int))

print(saved_model_dir)
print("")

# Load the exported model and bind the "prediction" signature, which is the
# key used for export_outputs in model_fn.
predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
    signature_def_key="prediction"
)

# The input key must match the receiver tensor name declared in serving_input_fn.
output = predictor_fn(
    {
        'sms':[
            'ok, I will be with you in 5 min. see you then',
            'win 1000 cash free of charge promo hot deal sexy',
            'hot girls sexy tonight call girls waiting for chat'
        ]
        
    }
)
print(output)

trained_models/estimator-demo-0925030157/export/predict//1537845702

INFO:tensorflow:Restoring parameters from trained_models/estimator-demo-0925030157/export/predict//1537845702/variables/variables
{'class': array([b'ham', b'spam', b'spam'], dtype=object), 'probabilities': array([[8.4779622e-06, 9.9999154e-01],
       [9.9998772e-01, 1.2297025e-05],
       [9.9998629e-01, 1.3683886e-05]], dtype=float32)}


# Understanding memory and time usage

In [39]:
import numpy as np

# Print the shape of every variable the estimator reports, to see where the
# model's memory goes.
variables = estimator.get_variable_names()
for var_name in variables:
  print(var_name, ": ", estimator.get_variable_value(var_name).shape)

EmbedSequence/embeddings :  (11332, 200)
EmbedSequence/embeddings/Adam :  (11332, 200)
EmbedSequence/embeddings/Adam_1 :  (11332, 200)
beta1_power :  ()
beta2_power :  ()
dense/bias :  (2,)
dense/bias/Adam :  (2,)
dense/bias/Adam_1 :  (2,)
dense/kernel :  (16, 2)
dense/kernel/Adam :  (16, 2)
dense/kernel/Adam_1 :  (16, 2)
global_step :  ()
rnn/multi_rnn_cell/cell_0/lstm_cell/bias :  (96,)
rnn/multi_rnn_cell/cell_0/lstm_cell/bias/Adam :  (96,)
rnn/multi_rnn_cell/cell_0/lstm_cell/bias/Adam_1 :  (96,)
rnn/multi_rnn_cell/cell_0/lstm_cell/kernel :  (224, 96)
rnn/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam :  (224, 96)
rnn/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam_1 :  (224, 96)
rnn/multi_rnn_cell/cell_1/lstm_cell/bias :  (64,)
rnn/multi_rnn_cell/cell_1/lstm_cell/bias/Adam :  (64,)
rnn/multi_rnn_cell/cell_1/lstm_cell/bias/Adam_1 :  (64,)
rnn/multi_rnn_cell/cell_1/lstm_cell/kernel :  (40, 64)
rnn/multi_rnn_cell/cell_1/lstm_cell/kernel/Adam :  (40, 64)
rnn/multi_rnn_cell/cell_1/lstm_cell/ker

In [40]:
# Total number of elements across ALL variables reported by the estimator.
# NOTE(review): per the listing printed by the previous cell this includes the
# Adam slot variables (".../Adam", ".../Adam_1") and scalars such as
# global_step, so it is larger than the number of trainable model parameters.
# The result is displayed as a float because np.prod of an empty shape () is 1.0.
param_count = np.sum([np.prod(estimator.get_variable_value(var_name).shape) for var_name in estimator.get_variable_names()])
param_count

6871977.0