In [60]:
import os
import string
import tempfile
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
from tensorboard import summary as summary_lib

# Show INFO-level TensorFlow log messages (the training-progress lines in the
# cell output of this notebook are logged at that level).
tf.logging.set_verbosity(tf.logging.INFO)
# Record which TensorFlow version this notebook was executed with.
print(tf.__version__)

1.5.0


In [61]:
# Load the labelled sentences; the columns used by this notebook are
# 'class' (the label) and 'sentence' (the raw text).
wiki_read = pd.read_csv('simple-wikipedia-sentences-10-article.csv')
print(wiki_read.head(10))

# Read the vocabulary, one token per line. The context manager guarantees the
# file handle is released even if read() raises.
with open('simple-engish-wiki-10-article-vocabulory.txt', 'r') as vocabuloryFile:
    fileContent = vocabuloryFile.read()
# NOTE: split('\n') keeps a final '' entry when the file ends with a newline;
# that entry is counted in len(vocabulory_read) like any other token.
vocabulory_read = fileContent.split('\n')

# Token -> integer id (the token's position in the vocabulary file, from 0).
# If a token occurs more than once, its last position wins.
vocab_to_id_dict = {word: idx for idx, word in enumerate(vocabulory_read)}
# Integer id -> token (inverse lookup, e.g. for decoding model inputs).
id_to_vocab_dict = dict(enumerate(vocabulory_read))

   class                                           sentence
0      1  Roanoke Island is an island that is part of Da...
1      1  Roanoke was first settled around 8,000 B.C. Vi...
2      1  At time of European Contact the island was hom...
3      1  It was located on what is now the coast of Nor...
4      1  Walter Raleigh and others lived there from 158...
5      1  It was eventually given up, and nobody knows why.
6      1  Virginia Dare, the first English child born in...
7      1  There was a Civil War battle fought on this is...
8      1  Union forces had the advantage because they ou...
9      1  The Confederate Army beveling that the swamps ...


In [66]:
# Encode every sentence as the list of vocabulary ids of its space-separated
# tokens, and collect the matching class labels.
# NOTE: a token that is missing from vocab_to_id_dict raises KeyError here.
train_features = [
    [vocab_to_id_dict[word] for word in sentence.split(' ')]
    for sentence in wiki_read['sentence']
]
train_labels = list(wiki_read['class'])

# NOTE(review): the "test" split is rows 10..199 of the training rows, so the
# model is evaluated on sentences it was trained on -- confirm this is intended.
test_features = train_features[10:200]
test_labels = train_labels[10:200]

# The encoded sentences have different lengths (ragged). Request an object
# array explicitly: recent NumPy releases raise ValueError when asked to infer
# one from ragged nested lists, while older releases produced it implicitly.
train_features = np.array(train_features, dtype=object)
train_labels = np.array(train_labels)

test_features = np.array(test_features, dtype=object)
test_labels = np.array(test_labels)

print(test_features)
print(test_labels)

vocab_size = len(vocabulory_read)
sentence_size = 25      # sequences are padded / truncated to this many tokens
embedding_size = 50
model_dir = tempfile.mkdtemp()

# Ids intended for special tokens (padding, start-of-sequence, unknown word).
# NOTE(review): vocab_to_id_dict numbers real words starting at 0, so pad_id
# coincides with the id of the first vocabulary entry; start_id, oov_id and
# index_offset are not referenced by the cells shown here -- confirm whether
# the vocabulary ids were meant to be shifted to make room for these tokens.
pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2

print("Pad sequences (samples x time)")
# Right-pad with pad_id / right-truncate every sequence to sentence_size ids.
x_train = sequence.pad_sequences(train_features,
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
y_train = train_labels
x_test = sequence.pad_sequences(test_features,
                                maxlen=sentence_size,
                                truncating='post',
                                padding='post',
                                value=pad_id)

y_test = test_labels
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

[list([1362, 2423, 3422, 1646, 3274, 4409, 1296, 4020, 3192, 423, 371, 3274, 4409, 4379, 759, 4281, 3065, 3373])
 list([1362, 2423, 3591, 4020, 3274, 4347, 3305, 3274, 4305])
 list([2402, 1373, 23, 3069, 1203, 3906, 3274, 1273, 3397, 2765, 3290, 771, 1373, 1698, 3956, 3569])
 list([3130, 3274, 1655, 3274, 1441, 3397, 1373, 2383])
 list([1985, 2765, 3274, 244, 4419, 246, 826, 1154, 3192, 2318, 1211, 122, 2124, 3065, 1441, 4113])
 list([3121, 604, 3274, 4050, 2765, 4408, 1373, 3481, 1698, 3956, 2204, 1211, 1530, 551, 3274, 2094, 2765, 1028, 4116])
 list([4408, 2412, 3823, 4433, 4029, 290, 3054, 3627, 2701, 1856, 551, 1942, 1848, 721, 1028, 17, 539, 2541, 3790])
 list([4408, 3163, 3274, 4457, 4050, 4020, 3274, 4347, 3956, 3274, 1777, 2514, 2765, 44, 2540])
 list([3273, 3274, 1122, 2765, 3274, 4347, 3194, 3163, 4029, 4050, 1544, 2499, 3055, 3163, 2976, 1266, 3571, 3881])
 list([2513, 1848, 1721, 659, 2339, 3274, 1352, 81, 2412, 717, 3274, 1030, 810, 12, 2169, 2736, 721, 3274, 3891, 3956, 3

In [67]:
# Number of tokens of each sentence that survive truncation: the sentence's
# own length, capped at sentence_size. Passed to the model as 'len'.
x_len_train = np.minimum([len(token_ids) for token_ids in train_features], sentence_size)
x_len_test = np.minimum([len(token_ids) for token_ids in test_features], sentence_size)

def parser(x, length, y):
    """Repack one dataset element into the (features, label) pair.

    Args:
        x: the token-id input.
        length: the sequence length that accompanies ``x``.
        y: the label.

    Returns:
        A tuple ``(features, y)`` where ``features`` is the dict
        ``{"x": x, "len": length}``.
    """
    return dict(x=x, len=length), y

def train_input_fn():
    """Input function for training.

    Slices the module-level (x_train, x_len_train, y_train) arrays into
    examples, shuffles them with a buffer as large as the training set,
    groups them into batches of 100, converts each batch to the
    (features, label) form via ``parser`` and repeats without a count.

    Returns:
        The next-element tensors of a one-shot iterator over that dataset.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
        .shuffle(buffer_size=len(train_features))
        .batch(100)
        .map(parser)
        .repeat()
    )
    return batches.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Input function for evaluation and prediction.

    Slices the module-level (x_test, x_len_test, y_test) arrays into examples,
    groups them (in their original order, no shuffling or repeating) into
    batches of 100 and converts each batch to the (features, label) form via
    ``parser``.

    Returns:
        The next-element tensors of a one-shot iterator over that dataset.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
        .batch(100)
        .map(parser)
    )
    return batches.make_one_shot_iterator().get_next()

In [68]:
# Trained estimators, keyed by their model_dir, kept for later predictions.
all_classifiers = {}
def train_and_evaluate(classifier):
    """Train ``classifier``, evaluate it and write a precision/recall summary.

    Steps:
      1. register the classifier in ``all_classifiers`` under its model_dir;
      2. train for 2500 steps on ``train_input_fn``;
      3. run ``evaluate`` and ``predict`` on ``eval_input_fn``;
      4. write a PR-curve summary (21 thresholds) of the predicted
         'logistic' values against ``y_test`` to ``<model_dir>/eval``.

    Args:
        classifier: an estimator exposing ``model_dir``, ``train``,
            ``evaluate`` and ``predict``; its predictions must contain a
            'logistic' entry.

    Returns:
        None.
    """
    # Save a reference to the classifier to run predictions later
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps=2500)
    # The metrics returned by evaluate() are not used further in this function.
    classifier.evaluate(input_fn=eval_input_fn)
    # First 'logistic' value of every prediction, in eval_input_fn order.
    predictions = np.array([p['logistic'][0] for p in classifier.predict(input_fn=eval_input_fn)])

    # Reset the graph to be able to reuse name scopes
    tf.reset_default_graph()
    # Add a PR summary in addition to the summaries that the classifier writes
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool), num_thresholds=21)
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
        try:
            writer.add_summary(sess.run(pr), global_step=0)
        finally:
            # Close the event file even if evaluating or writing the summary fails.
            writer.close()
#     # Un-comment code to download experiment data from Colaboratory
#     from google.colab import files
#     model_name = os.path.basename(os.path.normpath(classifier.model_dir))
#     ! zip -r {model_name + '.zip'} {classifier.model_dir}
#     files.download(model_name + '.zip')

In [69]:
# Binary-classification head; lstm_model_fn uses it to build the EstimatorSpec
# from the logits.
head = tf.contrib.estimator.binary_classification_head()

def lstm_model_fn(features, labels, mode):
    """Estimator model function: embedding -> LSTM -> single-logit dense layer.

    Args:
        features: dict with 'x' (the token-id input) and 'len' (the sequence
            length passed to the RNN as ``sequence_length``).
        labels: the labels, or None when predicting.
        mode: the estimator mode, forwarded unchanged to the head.

    Returns:
        The estimator spec built by ``head.create_estimator_spec``.
    """
    # Embedded tokens: [batch_size x sentence_size x embedding_size]
    embedded = tf.contrib.layers.embed_sequence(
        features['x'],
        vocab_size,
        embedding_size,
        initializer=tf.random_uniform_initializer(-1.0, 1.0))

    # LSTM cell with 100 units, unrolled over each sequence's 'len' steps.
    cell = tf.nn.rnn_cell.BasicLSTMCell(100)
    _, final_states = tf.nn.dynamic_rnn(
        cell, embedded, sequence_length=features['len'], dtype=tf.float32)

    # Final hidden state of the 100-unit cell: [batch_size x 100]
    last_hidden = final_states.h

    # One logit per example.
    logits = tf.layers.dense(inputs=last_hidden, units=1)

    # Labels are absent when predicting; otherwise shape them as a column.
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        # Given the loss, build the op that minimizes it and advances the
        # global step.
        train_op_fn=lambda loss: optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step()))


# Build the LSTM estimator with its own 'lstm' sub-directory of model_dir,
# then train and evaluate it.
lstm_classifier = tf.estimator.Estimator(
    model_dir=os.path.join(model_dir, 'lstm'),
    model_fn=lstm_model_fn,
)
train_and_evaluate(lstm_classifier)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpj6ka397k/lstm', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4c44379780>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpj6ka397k/lstm/model.ckpt.
INFO:tensorflow:loss = 69.166145, step = 1
INFO:tensorflow:global_step/sec: 9.61782
INFO:tensorflow:loss = 8.005314, step = 101 (10.399 sec)
INFO:tensorflow:global_step/sec: 10.12
INFO:tensorflow:loss = 0.9029907, step = 201 (9.882 sec)
INFO:tensorflow:global_step/sec: 10.0151
INFO:tensorflo