In [1]:
import os
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the library versions and execution mode this notebook runs with.
for caption, info in (
    ("Version: ", tf.__version__),
    ("Eager mode: ", tf.executing_eagerly()),
    ("HUb version: ", hub.__version__),
):
    print(caption, info)

# An empty device list is falsy, so this reports whether any GPU is visible.
gpu_devices = tf.config.list_physical_devices("GPU")
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

Version:  2.3.3
Eager mode:  True
HUb version:  0.12.0
GPU is available


In [2]:
batch_size = 32
seed = 42

# Training subset: the part of 'stack_overflow/train' left after holding out
# 20% for validation. The validation cell must use the same `seed` and
# `validation_split` so the two subsets do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='stack_overflow/train',
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [3]:
# Validation subset: the 20% hold-out of 'stack_overflow/train'. It reuses the
# `seed` and `validation_split` of the training subset so the two do not overlap.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='stack_overflow/train',
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [4]:
# Held-out test set, read from its own directory (no validation split here).
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='stack_overflow/test',
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.


In [5]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Input pipelines.
#
# Order matters for the training set: cache() replays exactly the elements it
# saw on the first pass, so shuffling *before* caching would freeze one batch
# order and repeat it every epoch. Caching first and shuffling afterwards keeps
# the file reads cached while still reshuffling on every epoch.
# NOTE: the raw datasets are already batched, so this shuffles whole batches.
train_ds = raw_train_ds.cache().shuffle(10000).prefetch(buffer_size=AUTOTUNE)

# Validation and test data need no shuffling; cache them and overlap loading
# with model execution via prefetch.
val_ds = raw_val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [6]:
# TF-Hub text-embedding module, wrapped as a Keras layer. It takes scalar
# strings (input_shape=[], dtype=tf.string) and is fine-tuned with the rest of
# the model because trainable=True.
embedding = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
hub_layer = hub.KerasLayer(
    handle=embedding,
    trainable=True,
    dtype=tf.string,
    input_shape=[],
)

In [7]:
# Sanity check: embed a single training batch and show the resulting tensor.
for text_batch, label_batch in train_ds.take(1):
    embedded_batch = hub_layer(text_batch)
    print(embedded_batch)

tf.Tensor(
[[ 0.49353632 -0.09106945  0.24746539 ... -0.18103167  0.16304702
   0.04224155]
 [ 1.1388122   0.22926368  0.15456666 ... -0.3098295   0.33862117
  -0.16470738]
 [ 0.7710682   0.00624186  0.32207936 ... -0.1814717   0.31932423
  -0.29602945]
 ...
 [ 0.9618884   0.23284051  0.43258935 ... -0.18743123  0.5764672
  -0.27091295]
 [ 0.09871595  0.02998851  0.0455527  ...  0.05348487  0.5648976
  -0.5657317 ]
 [ 0.637       0.02195277  0.09196584 ... -0.11805531  0.16524407
  -0.04510384]], shape=(32, 128), dtype=float32)


In [8]:
# Classifier: the text-embedding layer, a 16-unit ReLU hidden layer, and a
# 4-unit output layer with no activation (it emits raw logits).
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(4),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 68        
Total params: 124,644,820
Trainable params: 124,644,820
Non-trainable params: 0
_________________________________________________________________


In [9]:
# from_logits=True: the loss applies the softmax itself, so it expects the
# model's raw, un-normalised outputs together with integer class labels.
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    loss=sparse_ce_loss,
    optimizer='adam',
    metrics=['accuracy'],
)

In [10]:
# Train for 10 epochs, evaluating on the validation set after each epoch.
# `history` holds the object returned by fit().
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# Final evaluation on the held-out test set.
results = model.evaluate(test_ds, verbose=2)

# Pair each metric name with its value and print one line per metric,
# rounded to three decimals.
metric_line = "%s: %.3f"
for name, value in zip(model.metrics_names, results):
    print(metric_line % (name, value))

250/250 - 1s - loss: 0.6016 - accuracy: 0.8357
loss: 0.602
accuracy: 0.836
