In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import tensorflow.keras as keras
import os
from collections import Counter
tf.config.optimizer.set_jit(True)

filename = "./Train.csv"
df = pd.read_csv(filename, usecols=['text', 'label'], dtype={'text': 'str', 'label': 'int64'})
dfX = df.loc[:, 'text']
dfY = df.loc[:, 'label']
dfX = dfX[:10000]
dfY = dfY[:10000]

vocabulary = Counter()
for title in dfX:
    words = title.split()
    validWords = filter(lambda x: len(x) <= 14, words)
    vocabulary.update(validWords)

# Truncate vocabulary
vocab_size = 5000
truncatedVocabulary = [word for word, count in vocabulary.most_common()[:vocab_size]]

# Print out the 10 most common words and the number of times they occur
print("Most common words:", vocabulary.most_common()[:10])

# Convert words to tensor
words = tf.constant(truncatedVocabulary)

# Assign each word an ID
word_ids = tf.range(len(truncatedVocabulary), dtype=tf.int64)

# Create KeyValueTensor
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

# Create lookup table
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

# Test the lookup table
#testArr = "China and Iraq are in the dataset".split()
#testRes = table.lookup(tf.constant(testArr))
#print("Test result:", testRes)

print("done");

Most common words: [('the', 112780), ('a', 61522), ('and', 60362), ('of', 56663), ('to', 52397), ('is', 40621), ('in', 34059), ('I', 26135), ('that', 25365), ('this', 23018)]
done


In [2]:
# Convert sentences to arrays of word ids
data = []
for title in dfX:
    sample = tf.strings.split(title)
    processed = table.lookup(sample)
    data.append(processed.numpy())

# Create a ragged tensor and then convert it to a padded dense tensor
ragged = tf.ragged.constant(data)
ragged = ragged.to_tensor(default_value=0)

# Make dataset
features = tf.constant(ragged)
print(features.shape)
labels = tf.constant(dfY)
print(labels.shape)

train_set = tf.data.Dataset.from_tensor_slices((features, labels)).batch(32).prefetch(1)
#print(train_set)

#print(next(train_set.batch(32).as_numpy_iterator())[0][0])
print('done')


(10000, 2470)
(10000,)
done


In [3]:
# Create model
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None], mask_zero=True),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, batch_size=32, epochs=5)
print('done')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
done


In [13]:
testArr = tf.strings.split("Not sure if I understood the premise at all. It was sort of weird. Probably won't see this again.")
test = table.lookup(testArr)
zero_padding = tf.zeros(tf.shape(features)[1] - tf.shape(test)[0], dtype=tf.int64)
a_padded = tf.concat([test, zero_padding],0)
a_padded = a_padded.numpy().reshape(1,-1);
#print("Prediction input: ", a_padded)
prediction = model.predict(a_padded)
print(prediction)
print("Review is positive:", prediction[0] > 0.94)


[[0.12612972]]
Review is positive: [False]


In [5]:
# Save model
model.save('C:/Users/JP/Desktop/Projects/imdb_sentiment/saved_models/latest')




INFO:tensorflow:Assets written to: C:/Users/JP/Desktop/Projects/imdb_sentiment/saved_models/latest\assets


INFO:tensorflow:Assets written to: C:/Users/JP/Desktop/Projects/imdb_sentiment/saved_models/latest\assets
