* What's the topic of this text? (text classification)
* Does this text contain abuse? (moderation)
* Does this text sound positive or negative? (sentiment analysis)
* What should be the next word in this incomplete sentence? (language modelling)
* How would you say this in Dutch? (translation)
* Produce a summary of this article in one paragraph. (summarization)

# What needs to be done to process text for neural networks?
* Standardizing; convert to lower case, remove punctuation
* Split the text into units (tokens), such as characters, words, groups of words, clauses in sentences, etc
* Convert all tokens to a tensor. This means (typically) indexing the tokens.

### Example
The cat sat on the mat.
the cat sat on the mat
["cat", "sat", "on", "mat"]
[2, 34, 53, 8]
(one-hot encoding very common)

é -> e
è -> e

# Three ways of handling tokens
## Word-level tokenization
Tokens are space-separated substrings (or puncuation-separated if appropriate). A variant also splits into subwords, which is especially important for agglutinating and composing lanugages, such as Finnish or Swedish. 
## N-gram tokenization
Tokens are groups of N consecutive words. For example, "the cat", "he was", "over there" -- these are 2-grams or "bigrams".
## Character-level tokenization
Each character is its own token. In practice, useful for languages with rich writing systems or pictographic writing (cyrillic, chinese)

Dataset to use:
https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [2]:
import os, pathlib, shutil, random
base_dir = pathlib.Path("../../Data/aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category, exist_ok=True)
    files = os.listdir(train_dir/category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir/category/fname)

In [3]:
import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(train_dir, batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory(val_dir, batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory(base_dir / "test", batch_size=batch_size)

2024-12-13 12:44:45.187618: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-13 12:44:45.236919: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-13 12:44:45.273238: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734090285.311765   14177 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734090285.324097   14177 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-13 12:44:45.404229: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

Found 10240 files belonging to 2 classes.


2024-12-13 12:44:47.703752: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Found 14760 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [3]:
for inputs, targets in train_ds:
    print(f"inputs: {inputs.shape}, {inputs.dtype}")
    print(f"targets: {targets.shape}, {targets.dtype}")
    break

inputs: (32,), <dtype: 'string'>
targets: (32,), <dtype: 'int32'>


In [7]:
from keras import layers
text_vectorization = layers.TextVectorization(max_tokens=20000, output_mode="multi_hot")
text_only_train_ds = train_ds.map(lambda x, _: x)
text_vectorization.adapt(text_only_train_ds)


In [None]:

binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

In [5]:
def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [6]:
model = get_model()
model.summary()

In [7]:
callbacks = [ 
    keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True) 
]
model.fit(binary_1gram_train_ds.cache(), validation_data=binary_1gram_val_ds.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7417 - loss: 0.5249 - val_accuracy: 0.8845 - val_loss: 0.3007
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8899 - loss: 0.2871 - val_accuracy: 0.8884 - val_loss: 0.2819
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9151 - loss: 0.2288 - val_accuracy: 0.8913 - val_loss: 0.2859
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9302 - loss: 0.1973 - val_accuracy: 0.8871 - val_loss: 0.3014
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9354 - loss: 0.1829 - val_accuracy: 0.8857 - val_loss: 0.3199
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9437 - loss: 0.1720 - val_accuracy: 0.8868 - val_loss: 0.3318
Epoch 7/10
[1m400/400[0m 

<keras.src.callbacks.history.History at 0x7f06f38c5c10>

In [8]:
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8831 - loss: 0.2907
Test acc: 0.883


In [9]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([0 1 1 ... 0 0 0], shape=(20000,), dtype=int64)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [10]:
text_vectorization = layers.TextVectorization(ngrams=2, max_tokens=20000, output_mode="tf_idf")

text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


2024-12-13 12:43:22.470434: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
model = get_model()
callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram.keras", save_best_only=True)]
model.fit(tfidf_2gram_train_ds.cache(), validation_data=tfidf_2gram_val_ds.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6985 - loss: 0.7429 - val_accuracy: 0.8900 - val_loss: 0.2975
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8576 - loss: 0.3333 - val_accuracy: 0.8938 - val_loss: 0.2797
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8844 - loss: 0.2682 - val_accuracy: 0.8885 - val_loss: 0.2918
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8957 - loss: 0.2390 - val_accuracy: 0.8827 - val_loss: 0.3021
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9074 - loss: 0.2193 - val_accuracy: 0.8933 - val_loss: 0.3130
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9019 - loss: 0.2114 - val_accuracy: 0.8817 - val_loss: 0.3493
Epoch 7/10
[1m400/400[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7f06f1961a60>

In [12]:
model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8936 - loss: 0.2880
Test acc: 0.890


In [9]:
max_length = 600
max_tokens = 20000

text_vectorization = layers.TextVectorization(max_tokens = max_tokens, output_mode="int",output_sequence_length=max_length)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


In [10]:
import tensorflow as tf

class MyLayer(keras.Layer):
    def call(self, x):
        return tf.one_hot(x, depth=max_tokens)

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = MyLayer()(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x =  layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])


In [11]:
callbacks = [
    keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras", save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10


2024-12-13 12:46:14.983064: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1536000000 exceeds 10% of free system memory.
2024-12-13 12:46:15.173429: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1536000000 exceeds 10% of free system memory.
2024-12-13 12:46:15.421076: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1536000000 exceeds 10% of free system memory.


[1m  1/320[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m29:35[0m 6s/step - accuracy: 0.4375 - loss: 0.6934

2024-12-13 12:46:19.168068: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1536000000 exceeds 10% of free system memory.
2024-12-13 12:46:19.353714: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1536000000 exceeds 10% of free system memory.


[1m  5/320[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m20:36[0m 4s/step - accuracy: 0.4603 - loss: 0.6930

: 