In [30]:
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
import string

In [31]:
dataset = pd.read_csv(r'dataset\IMDB Dataset.csv')
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [32]:
dataset.isna().sum()

review       0
sentiment    0
dtype: int64

In [33]:
max_tokens = dataset['review'].apply(lambda x: len(x.split())).max()
print(f"Max tokens in a review: {max_tokens}")

Max tokens in a review: 2470


In [34]:
split = int(len(dataset['review']) * 0.8)
train_data = dataset[: split]
valid_data = dataset[split: ]
batch_size = 32
seed = 42
sequence_length = 1500
max_feature = 10000

In [39]:
def custom_standard(inp):
    lower = tf.strings.lower(inp)
    lower = tf.strings.regex_replace(lower, '<br />', ' ')
    return tf.strings.regex_replace(lower,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [40]:
vectorizer_train = tf.keras.layers.TextVectorization(
    standardize=custom_standard,
    max_tokens=max_feature,
    output_mode='int',
    output_sequence_length=sequence_length
)
vectorizer_val = tf.keras.layers.TextVectorization(
    standardize=custom_standard,
    max_tokens=max_feature,
    output_mode='int',
    output_sequence_length=sequence_length
)

vectorizer_train.adapt(train_data['review'])
vectorizer_val.adapt(valid_data['review'])

In [43]:
def create_data_train(inp, label):
    text = vectorizer_train(inp)
    label = tf.where(label == 'positive', 1, 0)

    return text, label

def create_data_val(inp, label):
    text = vectorizer_val(inp)
    label = tf.where(label == 'positive', 1, 0)

    return text, label

In [47]:
train_Data = tf.data.Dataset.from_tensor_slices(
    (list(train_data['review']), list(train_data['sentiment']))
)
valid_Data = tf.data.Dataset.from_tensor_slices(
    (list(valid_data['review']), list(list(valid_data['sentiment'])))
)

valid_Data = (
    valid_Data.map(create_data_val, num_parallel_calls = tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
    .padded_batch(batch_size = batch_size)
)
train_Data = (
    train_Data.map(create_data_train, num_parallel_calls = tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
    .padded_batch(batch_size = batch_size)
)

In [56]:
for batch in train_Data.take(1):
    text, label = batch[0][0], batch[1][1]
    # label = tf.strings.reduce_join(label).numpy().decode('utf-8')
    print(label)
    print(text)


tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor([28  5  2 ...  0  0  0], shape=(1500,), dtype=int64)


In [57]:
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(max_feature, 16),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1, activation='sigmoid')])

model.summary()

In [59]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics = ['accuracy'])

In [60]:
epochs = 10
history = model.fit(
    train_Data,
    validation_data=valid_Data,
    epochs=epochs)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.5193 - loss: 0.6928 - val_accuracy: 0.5063 - val_loss: 0.6895
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5780 - loss: 0.6763 - val_accuracy: 0.5768 - val_loss: 0.6801
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6489 - loss: 0.6414 - val_accuracy: 0.5496 - val_loss: 0.6819
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.7005 - loss: 0.5902 - val_accuracy: 0.5250 - val_loss: 0.7361
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.7532 - loss: 0.5326 - val_accuracy: 0.5840 - val_loss: 0.6777
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7902 - loss: 0.4824 - val_accuracy: 0.5840 - val_loss: 0.6861
Epoch 7/10
[1m1

In [63]:
prediction_model = tf.keras.Sequential([
    vectorizer_train,
    model,
    tf.keras.layers.Activation('sigmoid')
])

prediction_model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy'])

In [66]:
sen = tf.constant(["The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."])
print(prediction_model.predict(sen))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[[0.5824875]
 [0.559584 ]
 [0.5508228]]
