# Tutorial on TensorFlow Datasets

## Tensorflow Datasets

### Import dependencies

In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Create a dataset for a NLP task
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

In [14]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='/home/james/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>

### What is a tf.data.Dataset?

In [23]:
print(dir(dataset)), print(type(dataset))

['_GeneratorState', '__abstractmethods__', '__annotations__', '__bool__', '__class__', '__class_getitem__', '__debug_string__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__tf_tracing_type__', '__weakref__', '_abc_impl', '_add_trackable_child', '_add_variable_with_custom_getter', '_apply_debug_options', '_as_serialized_graph', '_checkpoint_dependencies', '_common_args', '_consumers', '_convert_variables_to_tensors', '_copy_trackable_to_cpu', '_deferred_dependencies', '_deserialization_dependencies', '_deserialize_from_proto', '_export_to_saved_model_graph', '_flat_shapes', '_flat_structure', '_flat_types', '_functions', '_gather_saveables_for_checkpoint', '_graph', '

(None, None)

#### Dictionary features

In [7]:
print(dataset.keys())

dict_keys(['train', 'test', 'unsupervised'])


In [9]:
print(dataset['test'], dataset['train'], dataset['unsupervised'])

<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))> <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))> <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>


In [11]:
## look at some examples
for example in dataset['train'].take(2):
  text, label = example
  print(f'{text=}')
  print(f'{label=}')

text=<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">
label=<tf.Tensor: shape=(), dtype=int64, numpy=0>
text=<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and co

2024-04-30 14:59:31.144356: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-04-30 14:59:31.144612: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [22]:
print(f'{info.features=}')

info.features=FeaturesDict({
    'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
    'text': Text(shape=(), dtype=string),
})


### How to build a custom dataset

In [20]:
# create a custom dataset from scratch
sentences = [
    'This is a positive sentence',
    'This is a negative sentence',
    'This is a neutral sentence'
]

labels = [1, 0, 2]

assert len(sentences) == len(labels), 'The dimensions of the sentences and labels do not match'

dataset = tf.data.Dataset.from_tensor_slices((sentences, labels))

In [21]:
for example in dataset.take(3):
    text, label = example
    print(f'{text=}')
    print(f'{label=}')

text=<tf.Tensor: shape=(), dtype=string, numpy=b'This is a positive sentence'>
label=<tf.Tensor: shape=(), dtype=int32, numpy=1>
text=<tf.Tensor: shape=(), dtype=string, numpy=b'This is a negative sentence'>
label=<tf.Tensor: shape=(), dtype=int32, numpy=0>
text=<tf.Tensor: shape=(), dtype=string, numpy=b'This is a neutral sentence'>
label=<tf.Tensor: shape=(), dtype=int32, numpy=2>


2024-04-30 15:05:39.067616: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [24]:
## Tokenize the sentences
from tensorflow.keras.layers import TextVectorization

text_vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=5)

text_vectorizer.adapt(sentences)

In [25]:
import random
example_sentence = random.choice(sentences)
print(f'{example_sentence=}')
print(f'{text_vectorizer([example_sentence])=}')
print(f'{text_vectorizer.get_vocabulary()[:5]=}')

example_sentence='This is a negative sentence'
text_vectorizer([example_sentence])=<tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[2, 4, 5, 8, 3]])>
text_vectorizer.get_vocabulary()[:5]=['', '[UNK]', 'this', 'sentence', 'is']


In [26]:
text_vectorizer.get_config()

{'name': 'text_vectorization',
 'trainable': True,
 'dtype': 'float32',
 'max_tokens': 10000,
 'standardize': 'lower_and_strip_punctuation',
 'split': 'whitespace',
 'ngrams': None,
 'output_mode': 'int',
 'output_sequence_length': 5,
 'pad_to_max_tokens': False,
 'sparse': False,
 'ragged': False,
 'vocabulary': None,
 'idf_weights': None,
 'encoding': 'utf-8',
 'vocabulary_size': 9}

In [28]:
embedding_layer = tf.keras.layers.Embedding(input_dim=10000,
                                            output_dim=28,
                                            mask_zero=True,
                                            name='embedding_layer'
                                            )

print(f'Sentence before vectorization: {example_sentence}, length: {len(example_sentence)}')
vectorized_sentence = text_vectorizer([example_sentence])
print(f'Sentence after vectorization: {vectorized_sentence}, shape: {vectorized_sentence.shape}')
embedded_sentence = embedding_layer(vectorized_sentence)
print(f'Sentence after embedding: {embedded_sentence}, shape: {embedded_sentence.shape}')

Sentence before vectorization: This is a negative sentence, length: 27
Sentence after vectorization: [[2 4 5 8 3]], shape: (1, 5)
Sentence after embedding: [[[ 0.04026623 -0.03034171  0.01767038  0.00791168  0.02143748
   -0.03262838  0.00373763 -0.03083758  0.00420485  0.02143753
    0.0067477   0.04109366 -0.02439394 -0.03621243  0.0085892
    0.02006624 -0.01461221  0.03531739  0.02747994  0.00864632
   -0.03965163 -0.02802268 -0.04038435  0.02284969 -0.00252801
    0.00727002  0.03268513  0.04425709]
  [-0.02892669 -0.04278717  0.00684432 -0.0057906  -0.00945543
    0.02323644 -0.00010985 -0.02145585 -0.03023958  0.00703824
   -0.04264001  0.00870859  0.03945338 -0.04450097 -0.0049087
    0.01590634 -0.01702728  0.02580483  0.01759857  0.03590829
   -0.02435064 -0.04636157 -0.04112358 -0.03153938  0.04775974
    0.03534048 -0.03626079  0.03865022]
  [ 0.01169432  0.00275372 -0.02713203 -0.04588832  0.00971036
    0.03649792  0.0355723  -0.04311568 -0.00829047  0.03936441
   -0.0228

In [29]:
train_dataset = tf.data.Dataset.from_tensor_slices((sentences, labels))

train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)