In [None]:
#Unzip downloaded file
!tar -xf aclImdb_v1.tar.gz

In [None]:
import os, pathlib, shutil, random

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    #Create seed
    random.Random(1337).shuffle(files)
    #Use 20% of training files for validation
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    #Move files to validation - neg/pos
    for fname in val_files:
        shutil.move(train_dir / category / fname, 
                    val_dir / category / fname)

In [1]:
#tensorflow as a utility similar to their image dataset from directory for text
from tensorflow import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size = batch_size
)

val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size = batch_size
)

test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size = batch_size
)

Found 75000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [2]:
#show shape and dtype of dataset
for inputs, targets in train_ds:
    print("input shape: ", inputs.shape)
    print("input dtype: ", inputs.dtype)
    print("targets shape: ", targets.shape)
    print("targets dtype: ", targets.dtype)
    print("inputs[0]: ", inputs[0])
    print("targets[0]: ", targets[0])
    break

input shape:  (32,)
input dtype:  <dtype: 'string'>
targets shape:  (32,)
targets dtype:  <dtype: 'int32'>
inputs[0]:  tf.Tensor(b'I watched this film recently on DVD and I have to say I wasn\'t impressed. I know it\'s taboo to knock independent films, but this one felt devoid of entertainment.<br /><br />The premise was interesting, but the execution of it fell short. I found myself thinking "okay, they\'re just getting into it, the story will pick up soon". Before I knew it, the film was over and the story never picked up. I can\'t say I found the acting all that impressive either. It was pretty bad. Not Star Wars prequel trilogy bad, but bad nonetheless.<br /><br />I\'m not sure what the running time was, I\'ll assume two hours (because it\'s a safe estimate). Anyway, when the film was finished, I felt as though I deserved some kind of recognition for the will power I exerted in not stopping the film and walking away halfway through.<br /><br />Again, I was thoroughly unimpressed, a

In [3]:
from tensorflow.keras.layers import TextVectorization
#Test out multi-hot encoded binary word vector unigram
#Use 20k most frequent words and encode as multi-hot binary vectors
text_vectorization = TextVectorization(
    max_tokens = 20000,
    output_mode = "multi_hot"
)

#dataset prep w/ only raw text inputs (no label)
text_only_train_ds = train_ds.map(lambda x, y:x)
#use dataset to index the dataset vocab via adapt() method
text_vectorization.adapt(text_only_train_ds)

#prep processed versions of data. specifiy number of cpu cores
binary_1gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls = 4)
binary_1gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls = 4)
binary_1gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls = 4)

In [4]:
for inputs, targets in binary_1gram_train_ds:
    #Inputs are batchs of 20k vectors
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(2, shape=(), dtype=int32)


In [5]:
from tensorflow import keras 
from tensorflow.keras import layers
  
def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model

In [6]:
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras",
                                    save_best_only=True)
]
#Cache the datasets in memory so that preprocessing is only done once during firt epoch
#Preprocessed data is reused
model.fit(binary_1gram_train_ds.cache(),                   
          validation_data=binary_1gram_val_ds.cache(),     
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("binary_1gram.keras") 
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 20000)]           0         
_________________________________________________________________
dense (Dense)                (None, 16)                320016    
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.500


### Reconfigure For Bigrams

In [7]:
text_vectorization = TextVectorization(ngrams = 2,
                                      max_tokens = 20000,
                                      output_mode = "multi_hot")

In [8]:
#Train/test bigram model
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
 
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras",
                                    save_best_only=True)
]
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 20000)]           0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                320016    
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.500


In [9]:
#Count word occurrences in text
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="count"
)

### TF-IDF Example

In [10]:
#Math example
def tfidf(term, document, dataset):
    term_freq = document.count(term)
    doc_freq = math.log(sum(doc.count(term) for doc in dataset) + 1)
    return term_freq / doc_freq

In [14]:
#Built into textvectorization
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf",
)

In [None]:
#Example Model - Not working right now!
#.adapt() learns TF-IDF weights in addition to the vocab
text_vectorization.adapt(text_only_train_ds)    
 
tfidf_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
 
model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("tfidf_2gram.keras",
                                    save_best_only=True)
]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

### Exporting a Model that Processes Raw Strings

In [None]:
#For production models can reuse the TextVectorization layer and add to trained model
inputs = keras.Input(shape=(1,), dtype="string")   
processed_inputs = text_vectorization(inputs)      
outputs = model(processed_inputs)                  
inference_model = keras.Model(inputs, outputs)

In [None]:
import tensorflow as tf
raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])
predictions = inference_model(raw_text_data) 
print(f"{float(predictions[0] * 100):.2f} percent positive")

## Sequence Model Approach

In [17]:
from tensorflow.keras import layers
  
max_length = 600 
max_tokens = 20000 
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,     
)
text_vectorization.adapt(text_only_train_ds)
 
int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [18]:
#Sequence LSTM Model built on one-hot encoded vector sequences
import tensorflow as tf
#one input is a sequence of ints
inputs = keras.Input(shape=(None,), dtype="int64")   
#Encode the int into binary 20k dimensional vectors
embedded = tf.one_hot(inputs, depth=max_tokens)
#Add bidirectional LSTM
x = layers.Bidirectional(layers.LSTM(32))(embedded)   
x = layers.Dropout(0.5)(x) 
#Classification layer
outputs = layers.Dense(1, activation="sigmoid")(x)    
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
tf.one_hot (TFOpLambda)      (None, None, 20000)       0         
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                5128448   
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Too large to run. each input matrix is of size (600,20000) i.e.
#600 words per sample w/ 20,000 possible words -- better to use word embeddings
callbacks = [
    keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras") 
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

### Embeddings

In [20]:
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [21]:
#Deep learning model with embedding layer trained from scratch
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
  
callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_gru.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_gru.keras") 
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 256)         5120000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                73984     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test

### Masking Embedded Layers

#### Our input contains words that are padded at the end and our RNN learns bidirectionally so when it is learning one of these words backwards it has alot of zeros. Can use masking which attaches metadata to the word telling the model to ignore these zeros.

In [None]:
#Masking enabled
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(
    #Set mask_zero to true!
    input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_gru_with_masking.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
          callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_gru_with_masking.keras") 
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

### Attention Scores

In [None]:
#Example layer that computes attention scores. 
num_heads = 4 
embed_dim = 256 
mha_layer = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
outputs = mha_layer(inputs, inputs, inputs)

### Transformer Encoder

In [None]:
vocab_size = 20000 
embed_dim = 256 
num_heads = 2 
dense_dim = 32 
  
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
#Returns full sequences neeed to reduce to a single vector for classification via pooling.
x = layers.GlobalMaxPooling1D()(x)                          
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()