In [1]:
#Donwloading dataset
import requests
import tarfile
import os

output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

tar_path = os.path.join(output_dir, 'aclImdb_v1.tar.gz')

response = requests.get(url, stream=True)
if response.status_code  == 200:
    with open(tar_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    print(f'Downloaded file: {tar_path}')
else:
    print(f'Donwloaded error {response.status_code}')
    
with tarfile.open(tar_path, 'r:gz') as tar_ref:
    tar_ref.extractall(output_dir)
    print(f'Extracted files in: {output_dir}')
    
imdb_dir = os.path.join(output_dir, 'aclImdb')
if os.path.exists(imdb_dir):
    for item in os.listdir(imdb_dir):
        print(f' - {item}')
else:
    print('Error: The folder aclImdb not found')

Downloaded file: ../data\aclImdb_v1.tar.gz
Extracted files in: ../data
 - imdb.vocab
 - imdbEr.txt
 - README
 - test
 - train


In [4]:
import os, pathlib, shutil, random

base_dir = pathlib.Path('../data/aclImdb')
val_dir = base_dir/'val'
train_dir = base_dir/'train'
for category in ('neg', 'pos'):
    os.makedirs(val_dir/category)
    files = os.listdir(train_dir/category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir/category/fname,
                    val_dir/category/fname)

In [6]:
from tensorflow import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    '../data/aclImdb/train', batch_size=batch_size
)

val_ds = keras.utils.text_dataset_from_directory(
    '../data/aclImdb/val', batch_size=batch_size
)

test_ds = keras.utils.text_dataset_from_directory(
    '../data/aclImdb/test', batch_size=batch_size
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [11]:
for inputs, targets in train_ds:
    print('inputs.shape:', inputs.shape)
    print('inputs.dtype:', inputs.dtype)
    print('targets.shape:', targets.shape)
    print('targets.dtype:', targets.dtype)
    print('inputs[0]:', inputs[0])
    print('inputs[0]:', targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'The name of this film alone made me want to see just what it was all about, so I taped this film during the early hours of the AM. If you ever wanted to see what miners had to go through during the early days and actually see a dramatic scene when the mine crumbles in on the men. This film clearly wants to show that Germany and France can work together and be friends after WW I and how the Germans came to the aid of the French miners much to the unbelief of the French townsfolk. The actors were all outstanding, with unusual scenes in the mine with a horse and a small young boy who worked in the mine. There is an old old retired miner who manges to go down the mine by ladder when the elevator breaks down. If you are a real film buff, this is a film you will not want to miss.', shape=(), dtype=string)
inputs[0]: tf.Tensor(1, shape=(), dtype=int32)


In [15]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
text_vectorization = TextVectorization(
    max_tokens = 20000,
    output_mode = 'multi_hot'
)
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_lgram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_lgram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_lgram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)


In [18]:
for inputs, targets in binary_lgram_train_ds:
    print('inputs.shape:', inputs.shape)
    print('inputs.dtype:', inputs.dtype)
    print('targets.shape:', targets.shape)
    print('targets.dtype:', targets.dtype)
    print('inputs[0]', inputs[0])
    print('target[0]', targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0] tf.Tensor([1 1 1 ... 0 0 0], shape=(20000,), dtype=int64)
target[0] tf.Tensor(1, shape=(), dtype=int32)


In [19]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [22]:
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_lgram.keras',
                                    save_best_only=True)
]
model.fit(binary_lgram_train_ds.cache(),
          validation_data=binary_lgram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model('binary_lgram.keras')
print(f'Test acc: {model.evaluate(binary_lgram_test_ds)[1]:.3f}')

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8285 - loss: 0.4053 - val_accuracy: 0.8782 - val_loss: 0.3028
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9010 - loss: 0.2644 - val_accuracy: 0.8868 - val_loss: 0.2985
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9190 - loss: 0.2351 - val_accuracy: 0.8852 - val_loss: 0.3183
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9269 - loss: 0.2214 - val_accuracy: 0.8862 - val_loss: 0.3386
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9295 - loss: 0.2082 - val_accuracy: 0.8854 - val_loss: 0.3578
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9334 - loss: 0.2065 - val_accuracy: 0.8846 - val_loss: 0.3690
Epoch 7/10
[1m625/625[0m 

In [23]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='multi_hot')

In [25]:
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)
binary_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)
binary_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4
)

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_2gram.keras',
                                   save_best_only=True)
]
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model('binary_2gram.keras')
print(f'Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}')

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8439 - loss: 0.3761 - val_accuracy: 0.8840 - val_loss: 0.2882
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9135 - loss: 0.2414 - val_accuracy: 0.8900 - val_loss: 0.2914
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9319 - loss: 0.2056 - val_accuracy: 0.8906 - val_loss: 0.3131
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9415 - loss: 0.1852 - val_accuracy: 0.8922 - val_loss: 0.3307
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9457 - loss: 0.1789 - val_accuracy: 0.8926 - val_loss: 0.3426
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9520 - loss: 0.1637 - val_accuracy: 0.8932 - val_loss: 0.3651
Epoch 7/10
[1m625/625[0m 

In [28]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf",
)

In [30]:
text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint('tfidf_2gram.keras',
                                    save_best_only=True)
]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model('tfidf_2gram.keras')
print(f'Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}')

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7327 - loss: 0.5572 - val_accuracy: 0.8670 - val_loss: 0.3591
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8262 - loss: 0.4007 - val_accuracy: 0.8608 - val_loss: 0.3277
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8601 - loss: 0.3502 - val_accuracy: 0.8800 - val_loss: 0.3443
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8792 - loss: 0.3029 - val_accuracy: 0.8774 - val_loss: 0.3341
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8888 - loss: 0.2768 - val_accuracy: 0.8798 - val_loss: 0.3395
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8989 - loss: 0.2655 - val_accuracy: 0.8696 - val_loss: 0.3690
Epoch 7/10
[1m625/625[0m 

In [31]:
from tensorflow.keras import layers

max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_length
)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [34]:
for inputs, targets in int_train_ds:
    print('inputs.shape:', inputs.shape)
    print('inputs.dtype:', inputs.dtype)
    print('targets.shape:', targets.shape)
    print('targets.dtype:', targets.dtype)
    print('inputs[0]:', inputs[0])
    print('inputs[0]:', targets[0])

inputs.shape: (32, 600)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(
[ 1017  2197    58    27    53  2608     5    11   796   298     5    25
   300    22    63     7     9     2   112    18   298     5     9    19
    30    79     2    63   566   298     5  3186  4133  7184  3186     7
   318  1290    22    63    80     9    14   420    33  2197    19    79
    80     9    67     2  3121     6    81     2   219   148    54    44
     9   413    42   127   694 17094    13    72    32   118     2    64
     5     2  1809     5 10567     3    25   114     6  8391    25  2065
   338    38    10   468   138    82     2  1340     5     2    64    10
    76   187   357    23    12     2   112   171     5    11  3186   298
     7    22     2  2678   733    41     2  1325  1357    19     2   150
  7552     5   251   108    10   789   467   165     4   124     1    71
   715     1  6908  1118   926    23   175   138   359    17  2851 

In [39]:
import tensorflow as tf
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"])
model.summary()

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.ops`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
