In [7]:
import os
import string
import tempfile
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
from tensorboard import summary as summary_lib

# Emit TensorFlow log messages at INFO level and above (the estimator's
# per-step loss lines in the training output are logged at this level).
tf.logging.set_verbosity(tf.logging.INFO)
# Record which TensorFlow version this notebook was executed with.
print(tf.__version__)

1.5.0


In [8]:
# Load the vocabulary: the file is expected to hold one token per line.
# The context manager closes the file even if reading fails.
with open('simple-engish-wiki-10-article-vocabulory.txt', 'r') as vocabuloryFile:
    fileContent = vocabuloryFile.read()

# Drop empty entries (e.g. the one produced by a trailing newline) so that the
# empty string never becomes a "word" that can be sampled below.
vocabulory_read = [word for word in fileContent.split('\n') if word]

# Two-way lookup between a word and its position in the vocabulary file.
# Ids start at 0 and follow file order.
vocab_to_id_dict = {word: idx for idx, word in enumerate(vocabulory_read)}
id_to_vocab_dict = dict(enumerate(vocabulory_read))
#print(id_to_vocab_dict)

In [9]:
def _random_id_rows(num_rows, row_length):
    """Sample `num_rows` rows of `row_length` random vocabulary words as id lists.

    Words are drawn uniformly (with replacement) from `vocabulory_read` using
    NumPy's global random state and mapped to ids through `vocab_to_id_dict`.
    """
    sampled_words = np.random.choice(vocabulory_read, size=(num_rows, row_length))
    return [[vocab_to_id_dict[word] for word in row] for row in sampled_words]


def _make_split(rows_per_class, long_length=10, short_length=5):
    """Build one synthetic data split.

    The task is to tell long rows (label 1) from short rows (label 0); the
    words themselves are random.

    Returns:
        (features, labels): `features` is a 1-D object array of id lists
        (the rows are ragged), `labels` is an integer array of 1s then 0s.
    """
    long_rows = _random_id_rows(rows_per_class, long_length)
    short_rows = _random_id_rows(rows_per_class, short_length)
    # The rows have different lengths, so the array must be created with
    # dtype=object; recent NumPy versions refuse to infer a ragged array.
    features = np.array(long_rows + short_rows, dtype=object)
    labels = np.array([1] * rows_per_class + [0] * rows_per_class)
    return features, labels


# NOTE(review): no random seed is set, so the generated data differs on
# every run of the notebook.
train_features, train_labels = _make_split(10000)
test_features, test_labels = _make_split(1000)

print(test_features)
print(test_labels)

# Model and data hyper-parameters.
vocab_size = len(vocabulory_read)   # number of rows in the embedding table
sentence_size = 10                  # every example is padded/truncated to this length
embedding_size = 50                 # width of each word embedding
model_dir = tempfile.mkdtemp()      # fresh scratch directory for checkpoints and summaries

# Ids intended for special tokens: padding, sequence start, unknown words.
# NOTE(review): vocab_to_id_dict numbers the words from 0 without applying any
# offset, so these ids coincide with real vocabulary entries; within the cells
# visible here only pad_id is actually used - confirm this is intended.
pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2

print("Pad sequences (samples x time)")
# Both splits are padded (and, if needed, truncated) at the end of the sequence.
pad_options = dict(maxlen=sentence_size,
                   truncating='post',
                   padding='post',
                   value=pad_id)
x_train = sequence.pad_sequences(train_features, **pad_options)
x_test = sequence.pad_sequences(test_features, **pad_options)

y_train, y_test = train_labels, test_labels

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

[list([1671, 3671, 3109, 687, 1413, 3272, 3637, 1680, 2158, 4236])
 list([1653, 342, 1875, 498, 1709, 3842, 2213, 2003, 3986, 2234])
 list([1670, 3646, 3898, 481, 3304, 3433, 2412, 1661, 314, 517]) ...
 list([2328, 1469, 1652, 786, 4370]) list([4236, 1701, 1472, 3341, 180])
 list([3090, 2370, 2581, 975, 45])]
[1 1 1 ... 0 0 0]
Pad sequences (samples x time)
x_train shape: (20000, 10)
x_test shape: (2000, 10)


In [10]:
# Length of every example before padding, capped at sentence_size because
# longer examples are truncated to that length by the padding step.
x_len_train = np.minimum([len(row) for row in train_features], sentence_size)
x_len_test = np.minimum([len(row) for row in test_features], sentence_size)

def parser(x, length, y):
    """Repack an (x, length, y) triple as a (features, label) pair.

    Returns:
        A tuple whose first item is a dict with keys "x" and "len" and whose
        second item is `y`, unchanged.
    """
    return {"x": x, "len": length}, y

def train_input_fn():
    """Input function for training.

    Slices the padded training arrays into examples, shuffles them with a
    buffer as large as the training set, groups them into batches of 100,
    converts each batch to (features, labels) via `parser`, and repeats
    indefinitely.

    Returns:
        The next-element tensors of a one-shot iterator over that dataset.
    """
    num_examples = len(train_features)
    batches = (
        tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
        .shuffle(buffer_size=num_examples)
        .batch(100)
        .map(parser)
        .repeat()
    )
    return batches.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Input function for evaluation and prediction.

    Makes a single, unshuffled pass over the padded test arrays in batches
    of 100, converting each batch to (features, labels) via `parser`.

    Returns:
        The next-element tensors of a one-shot iterator over that dataset.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
        .batch(100)
        .map(parser)
    )
    return batches.make_one_shot_iterator().get_next()

In [11]:
# Trained estimators keyed by their model directory, kept so that predictions
# can be run against them later.
all_classifiers = {}


def train_and_evaluate(classifier):
    """Train `classifier`, evaluate it on the test set and log a PR curve.

    Args:
        classifier: an estimator exposing `model_dir`, `train`, `evaluate`
            and `predict`; its predictions must contain a 'logistic' entry.

    Returns:
        The value returned by `classifier.evaluate` for the test set.

    Side effects:
        Registers the classifier in `all_classifiers`, resets the default
        TensorFlow graph, and writes a precision/recall summary to the
        'eval' sub-directory of the classifier's model directory.
    """
    # Save a reference to the classifier to run predictions later
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps=2500)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    predictions = np.array(
        [p['logistic'][0] for p in classifier.predict(input_fn=eval_input_fn)])

    # Reset the graph to be able to reuse name scopes
    tf.reset_default_graph()
    # Add a PR summary in addition to the summaries that the classifier writes
    pr = summary_lib.pr_curve('precision_recall',
                              predictions=predictions,
                              labels=y_test.astype(bool),
                              num_thresholds=21)
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(
            os.path.join(classifier.model_dir, 'eval'), sess.graph)
        try:
            writer.add_summary(sess.run(pr), global_step=0)
        finally:
            # Close the event file even if evaluating the summary fails.
            writer.close()
#     # Un-comment code to download experiment data from Colaboratory
#     from google.colab import files
#     model_name = os.path.basename(os.path.normpath(classifier.model_dir))
#     ! zip -r {model_name + '.zip'} {classifier.model_dir}
#     files.download(model_name + '.zip')

    # Hand the metrics back instead of discarding them.
    return eval_results

In [12]:
# Head used by the model function to build the EstimatorSpec for a
# binary classification problem.
head = tf.contrib.estimator.binary_classification_head()


def lstm_model_fn(features, labels, mode):
    """Model function: embedding -> single LSTM layer -> one-unit dense layer.

    Args:
        features: dict with the padded token ids under 'x' and the
            per-example sequence lengths under 'len'.
        labels: label tensor, or None when predicting.
        mode: the estimator mode, passed through to the head.

    Returns:
        The EstimatorSpec created by `head` for the given mode.
    """
    # Embed every token id: [batch_size x sentence_size x embedding_size]
    embedded_tokens = tf.contrib.layers.embed_sequence(
        features['x'],
        vocab_size,
        embedding_size,
        initializer=tf.random_uniform_initializer(-1.0, 1.0))

    # One LSTM cell with 100 units, run over the embedded sequence; the
    # per-example lengths are passed so the RNN knows where each one ends.
    cell = tf.nn.rnn_cell.BasicLSTMCell(100)
    rnn_result = tf.nn.dynamic_rnn(
        cell,
        embedded_tokens,
        sequence_length=features['len'],
        dtype=tf.float32)
    final_state = rnn_result[1]

    # Hidden part of the final LSTM state: [batch_size x 100] (one value per
    # LSTM unit), projected to a single logit per example.
    last_hidden = final_state.h
    logits = tf.layers.dense(inputs=last_hidden, units=1)

    # Labels are absent when predicting; otherwise give them a trailing
    # dimension of 1 so they line up with the logits.
    labels = None if labels is None else tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def minimize_loss(loss):
        """Build the training op that minimises `loss` and bumps the global step."""
        return optimizer.minimize(loss=loss,
                                  global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        train_op_fn=minimize_loss)


# Wrap the LSTM model function in an Estimator that stores its checkpoints
# under <model_dir>/lstm, then train and evaluate it.
lstm_model_dir = os.path.join(model_dir, 'lstm')
lstm_classifier = tf.estimator.Estimator(
    model_fn=lstm_model_fn,
    model_dir=lstm_model_dir)
train_and_evaluate(lstm_classifier)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp2k4pfc94/lstm', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9a515e4e48>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp2k4pfc94/lstm/model.ckpt.
INFO:tensorflow:loss = 69.71657, step = 1
INFO:tensorflow:global_step/sec: 18.6358
INFO:tensorflow:loss = 0.7091995, step = 101 (5.367 sec)
INFO:tensorflow:global_step/sec: 17.2314
INFO:tensorflow:loss = 0.09826494, step = 201 (5.803 sec)
INFO:tensorflow:global_step/sec: 20.3255
INFO:tensorf