# Sentence Classification

classify movie review into positive or negative.

In [21]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
import pandas as pd

tf.random.set_seed(1)
np.random.seed(1)

In [3]:
movie_reviews = [
         {'review': 'this is the best movie', 'sentiment': 'positive'},
         {'review': 'i recommend you watch this movie', 'sentiment': 'positive'},
         {'review': 'it was waste of money and time', 'sentiment': 'negative'},
         {'review': 'the worst movie ever', 'sentiment': 'negative'}
    ]
df = pd.DataFrame(movie_reviews)

In [4]:
df

Unnamed: 0,review,sentiment
0,this is the best movie,positive
1,i recommend you watch this movie,positive
2,it was waste of money and time,negative
3,the worst movie ever,negative


In [8]:
def get_vocab2int(df):
    d = {}
    vocab = set()
    df['review'].str.split().apply(vocab.update)
    for idx, word in enumerate(vocab):
        d[word] = idx

    return d

vocab2_int = get_vocab2int(df)
vocab_size = len(vocab2_int)

print(vocab2_int)

{'movie': 0, 'and': 1, 'money': 2, 'it': 3, 'was': 4, 'time': 5, 'is': 6, 'the': 7, 'recommend': 8, 'ever': 9, 'worst': 10, 'i': 11, 'of': 12, 'you': 13, 'this': 14, 'waste': 15, 'best': 16, 'watch': 17}


In [7]:
# encode words into integer
reviews = df['review'].tolist()
encoded_reviews = []
for review in reviews:
    tokens = review.split(" ")
    review_encoding = []
    for token in tokens:
        review_encoding.append(vocab2_int[token])
    encoded_reviews.append(review_encoding)

print(encoded_reviews)

[[14, 6, 7, 16, 0], [11, 8, 13, 17, 14, 0], [3, 4, 15, 12, 2, 1, 5], [7, 10, 0, 9]]


In [9]:
# encoded reviews
print(encoded_reviews[0])
print(encoded_reviews[1])
print(encoded_reviews[2])
print(encoded_reviews[3])

[14, 6, 7, 16, 0]
[11, 8, 13, 17, 14, 0]
[3, 4, 15, 12, 2, 1, 5]
[7, 10, 0, 9]


In [10]:
def get_max_length(df):
    max_length = 0
    for row in df['review']:
        if len(row.split(" ")) > max_length:
            max_length = len(row.split(" "))
    return max_length

# max_length is used for max sequence of input
max_length = get_max_length(df)

In [11]:
# if review is short, fill in zero padding and make all sentence length to be same as max_length
padded_reviews_encoding = pad_sequences(encoded_reviews, maxlen = max_length, padding = 'post')

In [14]:
sentiments = df['sentiment'].tolist()
def sentiment_encode(sentiment):
    if sentiment == 'positive':
        return [1, 0]
    else:
        return [0, 1]
    
# encoded sentiment
encoded_sentiments = [sentiment_encode(smt) for smt in sentiments]

print(encoded_sentiments)

[[1, 0], [1, 0], [0, 1], [0, 1]]


In [16]:
# RNN model
model = Sequential()
model.add(Embedding(vocab_size, 3, input_length = max_length))
model.add(SimpleRNN(32))
model.add(Dense(2, activation = 'softmax'))



In [31]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [32]:
callbacks = [EarlyStopping(monitor = 'loss', patience = 3, mode = 'min'),
             ModelCheckpoint(filepath = 'RNN2_tf.keras', monitor = 'loss', save_best_only = True)]

In [33]:
train_X = np.array(padded_reviews_encoding)
train_Y = np.array(encoded_sentiments)

In [34]:
print("Train..")
model.fit(train_X, train_Y, epochs = 1000, callbacks = callbacks)

Train..
Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 1.0000 - loss: 0.4160
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step - accuracy: 1.0000 - loss: 0.3930
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - accuracy: 1.0000 - loss: 0.3693
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - accuracy: 1.0000 - loss: 0.3450
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step - accuracy: 1.0000 - loss: 0.3204
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step - accuracy: 1.0000 - loss: 0.2954
Epoch 7/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - accuracy: 1.0000 - loss: 0.2705
Epoch 8/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 320ms/step - accuracy: 1.0000 - loss: 0.2457
Epoch 9/1000
[1m1/1[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x252a8692690>

In [35]:
from tensorflow.keras.models import load_model
model = load_model("RNN2_tf.keras")

In [36]:
score, acc = model.evaluate(train_X, train_Y, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

1/1 - 1s - 872ms/step - accuracy: 1.0000 - loss: 2.8193e-05
Test score: 2.8192594982101582e-05
Test accuracy: 1.0


In [37]:
model.predict(train_X)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 449ms/step


array([[9.9996865e-01, 3.1385240e-05],
       [9.9997532e-01, 2.4627907e-05],
       [2.5265286e-05, 9.9997473e-01],
       [3.1458323e-05, 9.9996853e-01]], dtype=float32)