# In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

# The IMDB movie-review dataset that ships with Keras.
data = keras.datasets.imdb

# num_words=10000 keeps only the 10000 most frequent words and leaves out
# the rest.
(train_data, train_labels), (test_data, test_labels) = data.load_data(
    num_words=10000
)

# Each review is a list of integers and every integer points to one word;
# integers are easier for the computer to work with than raw text.
# print(train_data)

# Normally we would have to build the word -> integer lookup table ourselves,
# but IMDB already provides it.  Every index is shifted up by 3 so that the
# lowest indices stay free for the special tokens registered below.
word_index = {
    word: rank + 3 for word, rank in data.get_word_index().items()
}

# Special tokens with fixed indices so the encoded reviews are consistent:
#   <PAD>    fills up the empty space so every review has the same length
#   <START>  reserved index 1
#   <UNK>    stands for "unknown"
#   <UNUSED> reserved index 3
word_index.update({
    '<PAD>': 0,
    '<START>': 1,
    '<UNK>': 2,
    '<UNUSED>': 3,
})

# Swap keys and values so that an integer points to its word (instead of a
# word pointing to its integer); this is what lets us turn an encoded review
# back into readable text.
reverse_word_index = {index: token for token, index in word_index.items()}

# Pad after the end of each review ('post') with the <PAD> index and cap the
# length at 250 so that every review ends up exactly the same size.
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        reviews, value=word_index['<PAD>'], padding='post', maxlen=250
    )
    for reviews in (train_data, test_data)
)

def decode_review(text):
    """Turn an encoded review back into a human-readable string.

    Each integer in *text* is looked up in ``reverse_word_index``; integers
    that have no entry are shown as "?".  The resulting words are joined
    with single spaces.
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in text)
    return " ".join(words)

# print(decode_review(test_data[0]))
# the reviews are no longer different lengths, as you can see here
# print(len(test_data[0]), len(test_data[1]))

# this is the model

# how these layers work:
# - Embedding: looks up a 16-number vector for each of the 10000 possible
#   word indices (10000 matches the num_words used when loading the data)
# - GlobalAveragePooling1D: averages those vectors over the whole review
# - Dense(16, relu): a small hidden layer
# - Dense(1, sigmoid): a single output neuron between 0 and 1 that gives us a
#   percentage of how positive or negative the review is
#   (1 = most positive and vice versa)

# full explanation is in pycharm under the text classification file
model = keras.Sequential([
    keras.layers.Embedding(10000, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# there are 25000 reviews in the training data
# The first 10000 are held back as a validation set and the model is trained
# only on the rest.  After each epoch the model is then checked on reviews it
# has not been trained on, so an improving score reflects real learning
# rather than the network just memorising the training data.
x_val, x_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]

# batch_size is how many movie reviews are fed to the model at a time
fitModel = model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=512,
    validation_data=(x_val, y_val),
    verbose=1,
)

# Final check on the test set, which was never used during training.
results = model.evaluate(test_data, test_labels)

print(results)

test_review = test_data[0]
# predict() expects a batch of reviews.  Wrapping the 1-D review in a plain
# list does not add a batch axis, so Keras can end up treating each of the
# 250 token ids as its own sample and predict[0] would then be the score of
# the first token only.  Add the batch axis explicitly: shape (1, 250).
predict = model.predict(np.expand_dims(test_review, axis=0))
print('Review: ')
print(decode_review(test_review))
print('Prediction: ' + str(predict[0]))
print('Actual: ' + str(test_labels[0]))
# Printed a second time so the metrics also appear after the sample output.
print(results)