In [32]:
#Import all the libraries needed
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [33]:
#Preview dataset

In [34]:
import pathlib
from google.colab import drive
drive.mount('/content/drive')
data_dir = pathlib.Path('/content/drive/My Drive/MSC/DataSet/IMDB Dataset_10000.csv')
data = pd.read_csv(data_dir)

print(data.head)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<bound method NDFrame.head of                                                  review sentiment
0     One of the other reviewers has mentioned that ...  positive
1     A wonderful little production. <br /><br />The...  positive
2     I thought this was a wonderful way to spend ti...  positive
3     Basically there's a family where a little boy ...  negative
4     Petter Mattei's "Love in the Time of Money" is...  positive
...                                                 ...       ...
9994  First off, this is the worst movie I've ever s...  negative
9995  Fun, entertaining movie about WWII German spy ...  positive
9996  Give me a break. How can anyone say that this ...  negative
9997  This movie is a bad movie. But after watching ...  negative
9998  This is a movie that was probably made to ente...  negative

[9999 rows x 2 columns]>


In [6]:
#Declaring the english stop words

In [35]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#Preprocessing and Encoding labels

In [36]:
def load_dataset():
    x_data = data['review']       # Reviews/Input
    y_data = data['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0       [one, reviewers, mentioned, watching, oz, epis...
1       [a, wonderful, little, production, the, filmin...
2       [i, thought, wonderful, way, spend, time, hot,...
3       [basically, family, little, boy, jake, thinks,...
4       [petter, mattei, love, time, money, visually, ...
                              ...                        
9994    [first, worst, movie, i, ever, seen, that, may...
9995    [fun, entertaining, movie, wwii, german, spy, ...
9996    [give, break, how, anyone, say, good, hockey, ...
9997    [this, movie, bad, movie, but, watching, endle...
9998    [this, movie, probably, made, entertain, middl...
Name: review, Length: 9999, dtype: object 

Sentiment
0       1
1       1
2       1
3       0
4       1
       ..
9994    0
9995    1
9996    0
9997    0
9998    0
Name: sentiment, Length: 9999, dtype: int64


  y_data = y_data.replace('negative', 0)


In [37]:
#train to test split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
9919    [unbelievable, great, cast, fair, acting, inte...
9518    [maybe, need, head, examined, thought, pretty,...
5230    [jean, pierre, melville, le, cercle, rouge, fo...
3322    [by, string, solid, wwii, propaganda, pieces, ...
5770    [from, today, point, view, quite, ridiculous, ...
                              ...                        
8955    [this, highly, underrated, film, good, writing...
4357    [brokedown, palace, story, two, best, friends,...
7476    [this, documentary, attempts, comedy, never, q...
7351    [actress, patty, duke, wrote, insightful, funn...
8276    [maybe, i, looked, history, irish, troubles, s...
Name: review, Length: 7999, dtype: object 

7810    [contains, spoilers, the, british, director, j...
2966    [why, this, insipid, uninspired, embarrassing,...
5799    [though, i, saw, movie, years, ago, impact, ne...
9078    [love, characters, story, line, very, funny, p...
8180    [sorry, say, disappointed, film, it, rushed, i...
                  

In [11]:
#Function for getting the maximum review length, by calculating the mean of all the reviews length (using numpy.mean)

In [38]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [14]:
#Tokenize and Pad/Truncate Reviews
#post, pad or truncate the words in the back of a sentence
#pre, pad or truncate the words in front of a sentence

In [41]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()
#pad_sequences enforces fixed-length inputs for neural nets. Uses mean (rounded up) to balance padding/truncation:
#    padding='post': Adds zeros after content

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

AttributeError: 'numpy.ndarray' object has no attribute 'translate'

In [16]:
#Build the model

In [27]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))

model.build(input_shape=(None, max_length))
print(total_words)
print(model.summary())


47420




None


In [18]:
#Set hyperparameters
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
#Model Training

In [19]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
#Model Training

In [20]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m62/63[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 89ms/step - accuracy: 0.5008 - loss: 0.6933
Epoch 1: accuracy improved from -inf to 0.50481, saving model to models/LSTM.h5




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 89ms/step - accuracy: 0.5010 - loss: 0.6933
Epoch 2/5
[1m62/63[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 91ms/step - accuracy: 0.5941 - loss: 0.6927
Epoch 2: accuracy improved from 0.50481 to 0.60333, saving model to models/LSTM.h5




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step - accuracy: 0.5944 - loss: 0.6927
Epoch 3/5
[1m62/63[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 91ms/step - accuracy: 0.6317 - loss: 0.6778
Epoch 3: accuracy improved from 0.60333 to 0.63533, saving model to models/LSTM.h5




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step - accuracy: 0.6318 - loss: 0.6776
Epoch 4/5
[1m62/63[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 90ms/step - accuracy: 0.6629 - loss: 0.6020
Epoch 4: accuracy improved from 0.63533 to 0.69271, saving model to models/LSTM.h5




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.6639 - loss: 0.6012
Epoch 5/5
[1m62/63[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 95ms/step - accuracy: 0.8508 - loss: 0.4397
Epoch 5: accuracy improved from 0.69271 to 0.82510, saving model to models/LSTM.h5




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 95ms/step - accuracy: 0.8500 - loss: 0.4461


<keras.src.callbacks.history.History at 0x7bffa6a84b50>

In [42]:
#Model testing

In [22]:
pred = model.predict(x=x_test)
y_pred = (pred >= 0.5) * 1

#y_pred = model.predict(x_test)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1


print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Correct Prediction: 1158
Wrong Prediction: 842
Accuracy: 57.9
