In [34]:
# Dataset
# https://drive.google.com/file/d/192jeGRTCZZfet8ufHPfaMn05T7Biklfw/view?usp=sharing

In [27]:
from tensorflow.keras.preprocessing import text_dataset_from_directory

# Assumes you're in the root level of the dataset directory.
# If you aren't, you'll need to change the relative paths here.
train_data = text_dataset_from_directory("movie-reviews-dataset/test")
test_data = text_dataset_from_directory("movie-reviews-dataset/train")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [28]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace

def prepareData(dir):
  data = text_dataset_from_directory(dir)
  #for filtering
  return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

In [29]:
train_data = prepareData("movie-reviews-dataset/test")
test_data = prepareData("movie-reviews-dataset/train")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [30]:
for text_batch, label_batch in train_data.take(1):
    print(text_batch.numpy()[0])
    print("\n")
    print(label_batch.numpy()[0]) # 0 = negative, 1 = positive

b'Hope the summary line won\'t irritate you that much (it\'s a little homage to the Chappelle Show/Charlie Murphy, but also to the character Daywalker). But I\'ll try to put all the things I liked about the movie in one paragraph and everything I didn\'t like in another paragraph, so it will be easier to read!   Let\'s start with the good things! The quote "strong bloody violence" (which is used by rating boards, to describe the content of a movie, does fit here very well. This is not a movie for kids! Or for the faint of hearted! It has Blade as a central character (Wesley Snipes is phenomenal) and a crazy enough story thread to hold/justify the action scenes! The original idea is also very engaging and intelligent. The action scenes are great here too.  OK over to the things I didn\'t like. The overall story is too thin. It\'s enough as I\'ve written above to hold the action scenes together, but there could be more. And a character like Blade deserves more (imo). The drama therefor i

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

model = Sequential()
model.add(Input(shape=(1,), dtype="string"))

In [32]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_tokens = 1000
max_len = 100
vectorize_layer = TextVectorization(
  # Max vocab size. Any words outside of the max_tokens most common ones
  # will be treated the same way: as "out of vocabulary" (OOV) tokens.
  max_tokens=max_tokens,
  # Output integer indices, one per string token
  output_mode="int",
  # Always pad or truncate to exactly this many tokens
  output_sequence_length=max_len,
)

In [33]:
# Call adapt(), which fits the TextVectorization layer to our text dataset.
# This is when the max_tokens most common words (i.e. the vocabulary) are selected.
train_texts = train_data.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)

In [35]:
model.add(vectorize_layer)

In [37]:
from tensorflow.keras.layers import Embedding
model.add(Embedding(max_tokens + 1, 128))

In [13]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
model.add(LSTM(64))

In [14]:
model.add(Embedding(max_tokens + 1, 128))

# ----- 4. RECURRENT LAYER
model.add(LSTM(64))

# ----- 5. DENSE HIDDEN LAYER
model.add(Dense(64, activation="relu"))

# ----- 6. OUTPUT
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 128)          128128    
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 embedding_1 (Embedding)     (None, 64, 128)           128128    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                        

In [15]:
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [16]:
#LSTM
model.fit(train_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21b166b6670>

In [18]:
model.save('LSTM')



INFO:tensorflow:Assets written to: LSTM\assets


INFO:tensorflow:Assets written to: LSTM\assets


In [22]:
from tensorflow import keras
model_LSTM = keras.models.load_model('LSTM')

In [24]:
# Should print a very high score like 0.98.
print(model_LSTM.predict([
  "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
]))

# Should print a very low score like 0.01.
print(model_LSTM.predict([
  "this was awful! i hated it so much, nobody should watch this. the acting was terrible, the music was terrible, overall it was just bad.",
]))

[[0.50081134]]
[[0.50081134]]


In [38]:
from keras.layers import SimpleRNN
# # build model
model.add(SimpleRNN(128, return_sequences=True))
model.add(SimpleRNN(128, return_sequences=True))
model.add(SimpleRNN(128, return_sequences=False))
model.add(Dense(20))
model.add(Dense(64, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.build()
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 100, 128)          128128    
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 100, 128)          32896     
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 100, 128)          32896     
                                                                 
 simple_rnn_4 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 20)                2580      
                                                      

In [39]:
model.add(Dense(64, activation="relu"))

In [40]:
model.add(Dense(1, activation="sigmoid"))

In [41]:
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [42]:
#Simple RNN
model.fit(train_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21b37158610>

In [43]:
model.save('SimpleRNN')

INFO:tensorflow:Assets written to: SimpleRNN\assets


INFO:tensorflow:Assets written to: SimpleRNN\assets


In [44]:
from tensorflow import keras
model = keras.models.load_model('SimpleRNN')

In [45]:
# Should print a very high score like 0.98.
print(model.predict([
  "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
]))

# Should print a very low score like 0.01.
print(model.predict([
  "this was awful! i hated it so much, nobody should watch this. the acting was terrible, the music was terrible, overall it was just bad.",
]))

[[0.50095457]]
[[0.50095457]]
