In [None]:
import tensorflow as tf
import pandas as pd
import re
import string

from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, GlobalMaxPool1D
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential

# Load the data
data = pd.read_csv(r'..\Data\archive\IMDB Dataset.csv')

# The first 100 reviews are going to be used for testing

data=data.sample(frac=1)
data_train = data.iloc[100:]
data_test = data.iloc[:100]

# Preprocess the data

def custom_standardization(input_data):     #format the text removing HTML
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,'[%s]' % re.escape(string.punctuation), '') # Removes all punctuation from the text.

max_features = 10000

tokenizer=tokens = Tokenizer(num_words=max_features)

tokenizer.fit_on_texts(data_train['review'])

# convert the text to sequences
x_train = tokenizer.texts_to_sequences(data_train['review'])

x_test = tokenizer.texts_to_sequences(data_test['review'])

# adding padding to get all to the same length

maxlen= max([len(x) for x in x_train])

x_train= sequence.pad_sequences(x_train, maxlen=maxlen, padding ='post')
x_test= sequence.pad_sequences(x_test, maxlen=maxlen, padding ='post')


# obtaining the labels (transforming them into numbers)

y_train = data_train['sentiment'].map({'positive':1, 'negative':0}).values

y_test = data_test['sentiment'].map({'positive':1, 'negative':0}).values

# Define the model

embed_size = 128

input= Input(shape=(maxlen,))

model= Sequential([
    Embedding(max_features, embed_size), #embedding layer: 1000 words and 128 features (converts the input into vectors of fixed size and vocabulary)
    LSTM(60, return_sequences=True),  # identify the important features in the text and ignoring the unimportant ones (60 neurons, each token generates an output)
    GlobalMaxPool1D(), # takes only the biggest value for each feature coming from the output of the LSTM
    Dense(50, activation='relu'), #50 neurons in the hidden layer with relu activation function
    Dropout(0.1), #to prevent overfitting
    Dense(2, activation='sigmoid') # 2 classes in the final layer: positive and negative
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model

batch_size = 100
epochs = 3

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))

Epoch 1/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3475s[0m 7s/step - accuracy: 0.7759 - loss: 0.4370 - val_accuracy: 0.8800 - val_loss: 0.3187
Epoch 2/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3658s[0m 7s/step - accuracy: 0.9309 - loss: 0.1805 - val_accuracy: 0.8800 - val_loss: 0.3371
Epoch 3/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5434s[0m 11s/step - accuracy: 0.9610 - loss: 0.1161 - val_accuracy: 0.9000 - val_loss: 0.3534
Epoch 4/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6427s[0m 13s/step - accuracy: 0.9778 - loss: 0.0733 - val_accuracy: 0.8700 - val_loss: 0.4705
Epoch 5/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5025s[0m 10s/step - accuracy: 0.9868 - loss: 0.0439 - val_accuracy: 0.8700 - val_loss: 0.5129


<keras.src.callbacks.history.History at 0x24494d953c0>

In [10]:
batch_size = 100
epochs = 3

# split the test set into evaluation and validation set

validation_split = 0.5
validation_size= int(len(x_test)*validation_split)

x_val = x_test[:validation_size]
y_val = y_test[:validation_size]

x_eval = x_test[validation_size:]
y_eval = y_test[validation_size:]

# model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_val, y_val))

# Evaluate the model

score = model.evaluate(x_eval, y_eval, batch_size=batch_size)    #easy to do since dataset already formatted

print(f'Test loss: {score[0]} - test accuracy: {score[1]}')


# Manual test with some handwriten reviews

sample_reviews_plain = [
    'This movie is fantastic! I really enjoyed it and I would recommend it to everyone!',
    'That acting was terrible! I can\'t believe I wasted my time watching this movie!',
    'The film didn\'t meet my expectations. I was very disappointed with it.',
    'The film didn\'t start for 30 minutes but it was worth the wait.',
    'The movie was good but the ending was terrible.',
    'I love this movie',
    'I hate this movie',
    'I don\'t like this movie, it was boring',
    'I don\'t hate this movie, it was almost good'
]

# Format the reviews

sample_reviews = tokenizer.texts_to_sequences(sample_reviews_plain)
sample_reviews = sequence.pad_sequences(sample_reviews, maxlen=maxlen, padding='post')

predictions = model.predict(sample_reviews)
predicted_sentiments = (predictions > 0.5).astype(int)

for review, prediction in zip(sample_reviews_plain, predicted_sentiments):
    print(f'Review: {review} -- Sentiment: {"Positive" if prediction[1] == 1 else "Negative"}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416ms/step - accuracy: 0.8600 - loss: 0.3673
Test loss: 0.3673033118247986 - test accuracy: 0.8600000143051147
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
Review: This movie is fantastic! I really enjoyed it and I would recommend it to everyone! -- Sentiment: Positive
Review: That acting was terrible! I can't believe I wasted my time watching this movie! -- Sentiment: Negative
Review: The film didn't meet my expectations. I was very disappointed with it. -- Sentiment: Negative
Review: The film didn't start for 30 minutes but it was worth the wait. -- Sentiment: Positive
Review: The movie was good but the ending was terrible. -- Sentiment: Negative
Review: I love this movie -- Sentiment: Positive
Review: I hate this movie -- Sentiment: Negative
Review: I don't like this movie, it was boring -- Sentiment: Negative
Review: I don't hate this movie, it was almost good -- Sentiment: Positive
