In [1]:
# Dataset
# https://drive.google.com/file/d/192jeGRTCZZfet8ufHPfaMn05T7Biklfw/view?usp=sharing

In [3]:
from tensorflow.keras.preprocessing import text_dataset_from_directory

# Assumes you're in the root level of the dataset directory.
# If you aren't, you'll need to change the relative paths here.
train_data = text_dataset_from_directory("movie-reviews-dataset/test")
test_data = text_dataset_from_directory("/movie-reviews-dataset\\train")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [4]:
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace

def prepareData(dir):
  data = text_dataset_from_directory(dir)
  return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

In [5]:
train_data = prepareData("E:\\rnn_c2c\\movie-reviews-dataset\\test")
test_data = prepareData("E:\\rnn_c2c\\movie-reviews-dataset\\train")

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
for text_batch, label_batch in train_data.take(1):
    print(text_batch.numpy()[0])
    print(label_batch.numpy()[0]) # 0 = negative, 1 = positive

b'Although this was a film of only less than forty minutes, it is one of best directed and acted stories I have ever seen. It accomplishes in less than 45 minutes what most films cannot in more than 90.  It is the story of two brothers, one 18 and the other 10. They come from a poor farm family in Mississippi. Both are caught up in war and the conflict of duty verses love of family.  It brought tears to my eyes especially because the entire film is so well acted and directed, plus it tells the story of so many wars where one serves and the other left behind.  I can fully recommend this film as beyond superb !'
1


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

model = Sequential()
model.add(Input(shape=(1,), dtype="string"))

In [8]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

max_tokens = 1000
max_len = 100
vectorize_layer = TextVectorization(
  # Max vocab size. Any words outside of the max_tokens most common ones
  # will be treated the same way: as "out of vocabulary" (OOV) tokens.
  max_tokens=max_tokens,
  # Output integer indices, one per string token
  output_mode="int",
  # Always pad or truncate to exactly this many tokens
  output_sequence_length=max_len,
)

In [9]:
# Call adapt(), which fits the TextVectorization layer to our text dataset.
# This is when the max_tokens most common words (i.e. the vocabulary) are selected.
train_texts = train_data.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)

In [10]:
model.add(vectorize_layer)

In [11]:
from tensorflow.keras.layers import Embedding
model.add(Embedding(max_tokens + 1, 128))

In [12]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
model.add(LSTM(64))

In [13]:
model.add(Embedding(max_tokens + 1, 128))

# ----- 4. RECURRENT LAYER
model.add(LSTM(64))

# ----- 5. DENSE HIDDEN LAYER
model.add(Dense(64, activation="relu"))

# ----- 6. OUTPUT
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 128)          128128    
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 embedding_1 (Embedding)     (None, 64, 128)           128128    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                        

In [14]:
# from keras.layers import SimpleRNN
# # build model
# model.add(SimpleRNN(128, return_sequences=True))
# # model.add(SimpleRNN(128, return_sequences=True))
# model.add(SimpleRNN(128, return_sequences=False))
# model.add(Dense(20))
# model.add(Dense(64, activation="relu"))
# model.add(Dense(1, activation="sigmoid"))
# model.build()
# model.summary()

In [15]:
# model.add(Dense(64, activation="relu"))

In [16]:
# model.add(Dense(1, activation="sigmoid"))

In [17]:
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

In [None]:
model.fit(train_data, epochs=10)

Epoch 1/10
Epoch 2/10

In [None]:
model.save('E:\\rnn_c2c\\movie-reviews-dataset')

In [None]:
from tensorflow import keras
model = keras.models.load_model('E:\\rnn_c2c\\movie-reviews-dataset')

In [None]:
# Should print a very high score like 0.98.
print(model.predict([
  "i loved it! highly recommend it to anyone and everyone looking for a great movie to watch.",
]))

# Should print a very low score like 0.01.
print(model.predict([
  "this was awful! i hated it so much, nobody should watch this. the acting was terrible, the music was terrible, overall it was just bad.",
]))