In [1]:
# Dataset
# https://drive.google.com/file/d/192jeGRTCZZfet8ufHPfaMn05T7Biklfw/view?usp=sharing

In [1]:
import tensorflow as tf
from tensorflow import keras

In [2]:
from tensorflow.keras.preprocessing import text_dataset_from_directory

# Assumes you're in the root level of the dataset directory.
# If you aren't, you'll need to change the relative paths here.
train_data = text_dataset_from_directory("D:\\Training\\Movie Review\\movie-reviews-dataset\\train")
test_data = text_dataset_from_directory("D:\\Training\\Movie Review\\movie-reviews-dataset\\test")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [4]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace

def prepareData(dir):
  data = text_dataset_from_directory(dir)
  return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

In [5]:
train_data = prepareData("D:\\Training\\Movie Review\\movie-reviews-dataset\\train")
test_data = prepareData("D:\\Training\\Movie Review\\movie-reviews-dataset\\test")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
for text_batch, label_batch in train_data.take(1):
    print(text_batch.numpy()[0])
    print(label_batch.numpy()[0]) # 0 = negative, 1 = positive

b'The best film on the battle of San Antonio, Texas in March 1836, was John Wayne\'s 1960 epic THE ALAMO. In a one shot job as director producer, that temporarily financially strapped him, Wayne demonstrated that he was talented in movie making outside of his icon-like acting ability personifying the West.  I have commented on that film in a review the other night, and I pointed out that Wayne and James Edward Grant (the screenwriter) tackled some points that were barely mentioned in earlier films about the battle. They did bring in the issue of slavery. They also finally discussed the contribution of local Mexican land owner Juan Seguin as an important leader in the War for Independence on par with Crockett, Bowie, Travis, Austin, and Houston.   But there was one weakness (though well hidden) in the film. Wayne worked hard to cast it properly, thinking of many people for lead roles in it. But, he did not properly handle the leader of the enemy forces, General Antonio De Santa Anna. Th

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

model = Sequential()
model.add(Input(shape=(1,), dtype="string"))

In [8]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_tokens = 1000
max_len = 100
vectorize_layer = TextVectorization(
  # Max vocab size. Any words outside of the max_tokens most common ones
  # will be treated the same way: as "out of vocabulary" (OOV) tokens.
  max_tokens=max_tokens,
  # Output integer indices, one per string token
  output_mode="int",
  # Always pad or truncate to exactly this many tokens
  output_sequence_length=max_len,
)

In [9]:
# Call adapt(), which fits the TextVectorization layer to our text dataset.
# This is when the max_tokens most common words (i.e. the vocabulary) are selected.
train_texts = train_data.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)

In [10]:
model.add(vectorize_layer)

In [11]:
from tensorflow.keras.layers import Embedding
model.add(Embedding(max_tokens + 1, 128))

In [12]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
model.add(LSTM(64))

In [13]:
model.add(Embedding(max_tokens + 1, 128))

# ----- 4. RECURRENT LAYER
model.add(LSTM(64))

# ----- 5. DENSE HIDDEN LAYER
model.add(Dense(64, activation="relu"))

# ----- 6. OUTPUT
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 128)          128128    
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 embedding_1 (Embedding)     (None, 64, 128)           128128    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                        

In [14]:
# from keras.layers import SimpleRNN
# # build model
# model.add(SimpleRNN(128, return_sequences=True))
# # model.add(SimpleRNN(128, return_sequences=True))
# model.add(SimpleRNN(128, return_sequences=False))
# model.add(Dense(20))
# model.add(Dense(64, activation="relu"))
# model.add(Dense(1, activation="sigmoid"))
# model.build()
# model.summary()

In [15]:
# model.add(Dense(64, activation="relu"))

In [16]:
# model.add(Dense(1, activation="sigmoid"))

In [17]:
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [18]:
model.fit(train_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18a7a142b50>

In [19]:
model.save('D:\\Training\\Movie Review\\movie-reviews-dataset')



INFO:tensorflow:Assets written to: D:\Training\Movie Review\movie-reviews-dataset\assets


INFO:tensorflow:Assets written to: D:\Training\Movie Review\movie-reviews-dataset\assets


In [20]:
from tensorflow import keras
model = keras.models.load_model('D:\\Training\\Movie Review\\movie-reviews-dataset')

In [21]:
# Should print a very high score like 0.98.
print(model.predict([
  "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
]))

# Should print a very low score like 0.01.
print(model.predict([
  "this was awful! i hated it so much, nobody should watch this. the acting was terrible, the music was terrible, overall it was just bad.",
]))

[[0.49607146]]
[[0.49607146]]
