In [5]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
import os
from collections import Counter
# Turn on TensorFlow's JIT compilation option for the optimizer.
tf.config.optimizer.set_jit(True)

# Location of the labelled training data.
trainingDataFilename = "data/Train.csv"

# Load only the review text and its label, then keep the first 10,000 rows
# of each column.
df = pd.read_csv(
    trainingDataFilename,
    usecols=['text', 'label'],
    dtype={'text': 'str', 'label': 'int64'},
)
dfX = df['text'][:10000]
dfY = df['label'][:10000]

# Count every whitespace-separated token of at most 14 characters across all
# kept reviews. Longer tokens are left out of the vocabulary.
vocabulary = Counter(
    token
    for review in dfX
    for token in review.split()
    if len(token) <= 14
)

# Keep only the most frequent tokens, ordered from most to least common.
# (To inspect the head of the distribution: vocabulary.most_common(5).)
vocab_size = 5000
truncatedVocabulary = [token for token, _ in vocabulary.most_common(vocab_size)]

# Keys (tokens) and values (consecutive ids starting at 0) for the lookup table.
words = tf.constant(truncatedVocabulary)
word_ids = tf.range(len(truncatedVocabulary), dtype=tf.int64)

# Static token -> id table. Tokens that are not in the vocabulary are assigned
# to one of `num_oov_buckets` additional ids by the table.
num_oov_buckets = 1000
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

# Quick manual check of the table, if needed:
#   table.lookup(tf.constant("China and Iraq are in the dataset".split()))

print("done")

done


In [None]:
# Save vocabulary to disk

# Output path. It was previously referenced without ever being defined, so the
# cell raised NameError; the name matches the one announced in the log line.
vocabularyFilename = "latest_vocabulary.csv"

# One row per vocabulary entry: the token and the id the lookup table maps it
# to. Built from the plain Python word list and the id values instead of
# np.array([words, word_ids]), which tries to pack a string tensor and an int64
# tensor into one array. A separate name is used so the training frame `df`
# is not overwritten.
vocab_df = pd.DataFrame({
    'word': truncatedVocabulary,
    'id': word_ids.numpy(),
})
print('Vocabulary shape:', vocab_df.shape)
print(f'Saving vocabulary to {vocabularyFilename}.')

# Tab-separated despite the .csv extension; read it back with sep='\t'.
vocab_df.to_csv(vocabularyFilename, sep='\t', index=False)

In [7]:
# Convert sentences to arrays of word ids

# Tokenise every review in a single batched op (whitespace split) and map the
# tokens to ids through the lookup table. The result is a ragged tensor with
# one variable-length row of int64 ids per review. This replaces a Python loop
# that ran one eager split/lookup per review and round-tripped through NumPy.
tokens = tf.strings.split(tf.constant(dfX.tolist(), dtype=tf.string))
token_ids = tf.ragged.map_flat_values(table.lookup, tokens)

# Right-pad each row with zeros up to the length of the longest review.
# NOTE(review): id 0 is also the id of the first vocabulary word, and the
# Embedding layer is built with mask_zero=True, so that word is masked exactly
# like padding. Reserving 0 for padding needs a coordinated change to the id
# assignment and the embedding input size -- TODO confirm and fix together.
features = token_ids.to_tensor(default_value=0)
print(features.shape)
labels = tf.constant(dfY.to_numpy())
print(labels.shape)

# Find the largest feature vector length (width of the padded matrix)
featureLen = tf.shape(features)[1]
print('Length of longest entry (to be used for the max padding of zeroes later):', featureLen)

# Convert to a dataset of (features, label) pairs, already batched by 32
train_set = tf.data.Dataset.from_tensor_slices((features, labels)).batch(32).prefetch(1)

print('done')

(10000, 2470)
(10000,)
Length of longest title (featureLen) which will be used for the max padding of zeroes later: tf.Tensor(2470, shape=(), dtype=int32)
done


In [3]:
# Create and train the model

embed_size = 128
model = keras.models.Sequential([
    # Input ids cover the vocabulary plus the out-of-vocabulary buckets.
    # NOTE(review): mask_zero=True treats id 0 as padding, but the lookup table
    # also assigns id 0 to a real vocabulary word -- TODO confirm this is
    # intended.
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None], mask_zero=True),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    # Single sigmoid unit: binary classification score in [0, 1].
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# train_set is a tf.data.Dataset that is already batched (batch(32)), so no
# batch_size is passed here: Keras does not accept batch_size for dataset inputs.
model.fit(train_set, epochs=5)
print('done')

In [3]:
# Save the model to disk

# The closing quote was missing, which made this cell a SyntaxError.
model.save("saved_models/latest")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
done


In [13]:
# Test the model

# Tokenise the sample review the same way as the training data and map the
# tokens to ids.
testArr = tf.strings.split("Not sure if I understood the premise at all. It was sort of weird. Probably won't see this again.")
test = table.lookup(testArr)

# Pad with zeros up to the width of the training feature matrix. The ids are
# truncated first so the padding length can never go negative for a review
# longer than the longest training entry.
max_len = tf.shape(features)[1]
test = test[:max_len]
pad_len = max_len - tf.shape(test)[0]
zero_padding = tf.zeros([pad_len], dtype=tf.int64)
padded = tf.concat([test, zero_padding], 0)

# Shape (1, max_len): a batch containing the single review.
# (Previously read from an undefined name `a_padded`, raising NameError.)
padded = padded.numpy().reshape(1, -1)
prediction = model.predict(padded)
print(prediction)

# NOTE(review): 0.94 is an unusually high cut-off for a sigmoid output --
# TODO confirm it is intentional rather than the conventional 0.5.
print("Review is positive:", prediction[0] > 0.94)


[[0.12612972]]
Review is positive: [False]
