### Training a binary classifier with the IMDB Reviews Dataset

In [1]:
#importing libraries

import tensorflow as tf
import tensorflow_datasets as tfds
import io


In [2]:
#downloading the dataset

imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
print(info)



[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\mouni\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\mouni\tensorflow_datasets\imdb_reviews\plain_text\incomplete.U78U5P_1.0.0\imdb_reviews-trai…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\mouni\tensorflow_datasets\imdb_reviews\plain_text\incomplete.U78U5P_1.0.0\imdb_reviews-test…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\mouni\tensorflow_datasets\imdb_reviews\plain_text\incomplete.U78U5P_1.0.0\imdb_reviews-unsu…

[1mDataset imdb_reviews downloaded and prepared to C:\Users\mouni\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m
tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='C:\\Users\\mouni\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shap

In [3]:
print(imdb)

{Split('train'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, Split('test'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>, Split('unsupervised'): <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}


In [9]:
#few sample reviews
for element in imdb['train'].take(4):
    print(element)

(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on

we can see that each example is a 2-element tuple of tensors containing the text first, then the label.

In [10]:
#getting the train and test sets
train_dataset, test_dataset = imdb['train'], imdb['test']

##### Generating padded sequences

we will convert the strings into integer sequences, then pad them to a uniform length.

In [13]:
#parameters

vocab_size = 10000
max_length = 120
embedding_dim = 16
padding_type = 'pre'
trucn_type = 'post'


We will generate vocabulary using training set only, we should not include the test set because that is meant to represent data that the model hasn't seen before.

In [19]:
#instantiate the vectorization layer

vectorize_layer = tf.keras.layers.TextVectorization(max_tokens = vocab_size)

#getting the review and label for training dataset and testing datasets
train_reviews = train_dataset.map(lambda review, label : review)
test_reviews = test_dataset.map(lambda review, label : review)

train_labels = train_dataset.map(lambda review, label: label)
test_labels = test_dataset.map(lambda review, label: label)

#generating vocabulary based on training reviews
vectorize_layer.adapt(train_reviews)



In [20]:
def padding_func(sequences):
    """generates padded sequences from a tf.data.Dataset"""

    #put all elements in a single ragged batch
    sequences = sequences.ragged_batch(batch_size=sequences.cardinality())
    #need to specify the batch size and it has to match the number of all elements in the dataset.
    #nstead of specifying a specific number, you can also use the cardinality() method. This computes the number of elements in a tf.data.Dataset.

    #output a tensor from the single batch
    sequences = sequences.get_single_element()

    #pad the sequences
    padded_sequences = tf.keras.utils.pad_sequences(sequences.numpy(),
                                                    maxlen=max_length,
                                                    padding=padding_type,
                                                    truncating=trucn_type)
    
    #converting back to tf.data.Dataset
    padded_sequences = tf.data.Dataset.from_tensor_slices(padded_sequences)
    
    return padded_sequences


In [21]:
#applying layer to train and test data

train_sequences = train_reviews.map(lambda text: vectorize_layer(text)).apply(padding_func)
test_sequences = test_reviews.map(lambda text: vectorize_layer(text)).apply(padding_func)

In [25]:
# view 2 examples of train_sequences

for i in train_sequences.take(2):
    print(i)


tf.Tensor(
[   0    0    0    0   11   14   34  412  384   18   90   28    1    8
   33 1322 3560   42  487    1  191   24   85  152   19   11  217  316
   28   65  240  214    8  489   54   65   85  112   96   22 5596   11
   93  642  743   11   18    7   34  394 9522  170 2464  408    2   88
 1216  137   66  144   51    2    1 7558   66  245   65 2870   16    1
 2860    1    1 1426 5050    3   40    1 1579   17 3560   14  158   19
    4 1216  891 8040    8    4   18   12   14 4059    5   99  146 1241
   10  237  704   12   48   24   93   39   11 7339  152   39 1322    1
   50  398   10   96 1155  851  141    9], shape=(120,), dtype=int32)
tf.Tensor(
[   0    0    0    0    0    0    0    0   10   26   75  617    6  776
 2355  299   95   19   11    7  604  662    6    4 2129    5  180  571
   63 1403  107 2410    3 3905   21    2    1    3  252   41 4781    4
  169  186   21   11 4259   10 1507 2355   80    2   20   14 1973    2
  114  943   14 1740 1300  594    3  356  180  446    6 

In [26]:
#re-combining the sequences with the labels for training

train_dataset_vectorized = tf.data.Dataset.zip(train_sequences, train_labels)
test_dataset_vectorized = tf.data.Dataset.zip(test_sequences, test_labels)

In [27]:
#viewing 2 exomaples from train_dataset_vectorized

for example in train_dataset_vectorized.take(2):
    print(example)
    print()


(<tf.Tensor: shape=(120,), dtype=int32, numpy=
array([   0,    0,    0,    0,   11,   14,   34,  412,  384,   18,   90,
         28,    1,    8,   33, 1322, 3560,   42,  487,    1,  191,   24,
         85,  152,   19,   11,  217,  316,   28,   65,  240,  214,    8,
        489,   54,   65,   85,  112,   96,   22, 5596,   11,   93,  642,
        743,   11,   18,    7,   34,  394, 9522,  170, 2464,  408,    2,
         88, 1216,  137,   66,  144,   51,    2,    1, 7558,   66,  245,
         65, 2870,   16,    1, 2860,    1,    1, 1426, 5050,    3,   40,
          1, 1579,   17, 3560,   14,  158,   19,    4, 1216,  891, 8040,
          8,    4,   18,   12,   14, 4059,    5,   99,  146, 1241,   10,
        237,  704,   12,   48,   24,   93,   39,   11, 7339,  152,   39,
       1322,    1,   50,  398,   10,   96, 1155,  851,  141,    9],
      dtype=int32)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)

(<tf.Tensor: shape=(120,), dtype=int32, numpy=
array([   0,    0,    0,    0,    0,    0

In [28]:
#lastlu optimizing dataset for training

shuffle_buffer_size =1000
prefetch_buffer_size = tf.data.AUTOTUNE

train_dataset_final = (train_dataset_vectorized
                       .cache()
                       .shuffle(shuffle_buffer_size)
                       .prefetch(prefetch_buffer_size))

test_dataset_final =(test_dataset_vectorized
                     .cache()
                     .prefetch(prefetch_buffer_size))

##### Building and compiling the model

In [30]:
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=['accuracy'])

model.summary()

#### Training the model

In [32]:
model.fit(train_dataset_final,
          epochs=5,
          validation_data = test_dataset_final,
    )

Epoch 1/5


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_1/Cast:0", shape=(120,), dtype=float32) with name 'keras_tensor' and path ''. Expected shape (None, 120), but input has incompatible shape (120,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(120,), dtype=int32)
  • training=True
  • mask=None
  • kwargs=<class 'inspect._empty'>