# In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) logging. It has to be in the
# environment before TensorFlow is imported, otherwise the messages emitted while the library
# loads are not filtered. Level '2' hides INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf  # noqa: E402  (must come after the environment variable is set)
import tensorflow_datasets as tfds  # noqa: E402

if __name__ == '__main__':
    # Note: TensorFlow Datasets (tfds) provides many commonly used datasets that are consumed
    # through the tf.data API. For a custom dataset you build the pipeline with tf.data yourself;
    # that is covered later.

    # Load MNIST. Arguments passed to tfds.load:
    #   'mnist'                  - name of the dataset (see the TFDS documentation).
    #   split=['train', 'test']  - return the train split first, then the test split. Some
    #                              datasets also offer a validation split; MNIST does not, so the
    #                              available splits have to be checked per dataset.
    #   shuffle_files=True       - TFDS stores the data in several TFRecord files; shuffle them so
    #                              the examples are not always seen in the same fixed sequence
    #                              (gradient descent wants the data to be IID).
    #   as_supervised=True       - yield (X, Y) tuples instead of a dictionary.
    #   with_info=True           - also return the dataset metadata.
    mnist_splits, ds_info = tfds.load(
        'mnist',
        split=['train', 'test'],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    ds_train, ds_test = mnist_splits
    # fig = tfds.show_examples(ds_train, ds_info, rows=4, cols=4)  # Requires as_supervised=False,
    # because it expects dictionary examples.
    print(ds_info)

    # Inspect the shapes of the first training example only.
    for X, Y in ds_train:
        print(X.shape)
        print()
        print(Y.shape)
        break

    print(ds_info.splits)

    def normalize(image, label):
        """Cast `image` to float32 and divide it by 255; `label` is returned unchanged."""
        scaled_image = tf.cast(image, tf.float32) / 255.0
        return scaled_image, label


    # Training pipeline, step by step:
    #   map      - ds_train holds (X, Y) tuples, so normalize receives image = X and label = Y.
    #              No tf.py_function is needed because normalize uses only TensorFlow ops.
    #   cache    - the first iteration over the dataset caches its elements (in memory here, since
    #              no file is given); later iterations reuse the cached data.
    #   shuffle  - fills a buffer with buffer_size elements and samples randomly from it, replacing
    #              each sampled element with a new one. Perfect shuffling needs a buffer at least
    #              as large as the dataset, hence the full number of training examples.
    #   batch    - group the examples into batches of 64.
    #   prefetch - prepare later elements while the current one is being processed. Most input
    #              pipelines should end with a call to prefetch.
    ds_train = (
        ds_train
        .map(normalize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .cache()
        .shuffle(buffer_size=ds_info.splits["train"].num_examples)
        .batch(batch_size=64)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    # Test pipeline: same normalization and batching, without caching or shuffling.
    ds_test = (
        ds_test
        .map(normalize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(batch_size=64)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

    # Build a small fully connected classifier as a Sequential model.
    mnist_layers = [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        # No softmax on the output layer, so the loss below is created with from_logits=True.
        tf.keras.layers.Dense(10),
    ]
    model = tf.keras.models.Sequential(mnist_layers)

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(),
        metrics=["accuracy"],
    )

    # Train for five epochs, then report the metrics on the test split.
    history = model.fit(ds_train, epochs=5, verbose=2)
    test_metrics = model.evaluate(ds_test)
    print(test_metrics)

    # Second example: sentiment analysis on text data.
    #
    # This movie was terrible --> 0
    # This movie was wonderful --> 1
    imdb_splits, ds_info = tfds.load(
        "imdb_reviews",
        split=["train", "test"],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    ds_train, ds_test = imdb_splits
    print(ds_info)

    # Show the first (text, label) pair only.
    for text, label in ds_train:
        print(text)
        print(label)
        break

    # Below is to create Tensorflow dataset for text. One of the standard ways using tensorflow dataset.
    # We will see later how to use it for custom dataset as well.

    """Prepare the dataset for training:
    Next, you will standardize, tokenize, and vectorize the data using the
     tf.keras.layers.TextVectorization layer.

    Standardization refers to preprocessing the text, typically to remove
     punctuation or HTML elements to simplify the dataset.

    Tokenization refers to splitting strings into tokens
     (for example, splitting a sentence into individual
      words by splitting on whitespace).

    Vectorization refers to converting tokens into numbers
     so they can be fed into a neural network.

    Note that:

    The default standardization converts text to lowercase and removes punctuation 
    (standardize='lower_and_strip_punctuation').
    The default tokenizer splits on whitespace (split='whitespace').
    The default vectorization mode is 'int' (output_mode='int').
    This outputs integer indices (one per token).
    This mode can be used to build models that take word order into account.
    You can also use other modes—like 'binary'—to build bag-of-words models.

    Bag of Words:  
    A representation of the words in a phrase or passage, irrespective of order. 
    For example, bag of words represents the following three phrases identically:

    the dog jumps
    jumps the dog
    dog jumps the
    Each word is mapped to an index in a sparse vector, where the vector has an 
    index for every word in the vocabulary. For example,
    the phrase the dog jumps is mapped into a feature vector with non-zero values at
    the three indices corresponding to the words the, dog, and jumps.
    The non-zero value can be any of the following:

    A 1 to indicate the presence of a word.

    A count of the number of times a word appears in the bag.
    For example, if the phrase were the maroon dog is a dog with maroon fur,
    then both maroon and dog would be represented as 2,
    while the other words would be represented as 1.

    Some other value, such as the logarithm of the count of the number of times a 
    word appears in the bag.

    Example: "John","likes","to","watch","movies","Mary","likes","movies","too"

    BOW_FREQ =  {"John":1,"likes":2,"to":1,"watch":1,"movies":2,"Mary":1,"too":1};

    """
    VOCAB_SIZE, MAX_SEQUENCE_LENGTH = 10000, 250

    # In 'int' mode the layer needs, besides the maximum vocabulary size, an explicit maximum
    # sequence length: every sequence is padded or truncated to exactly output_sequence_length
    # values.
    vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=MAX_SEQUENCE_LENGTH,
    )

    # Drop the labels to get a text-only dataset, then let `TextVectorization.adapt` build the
    # vocabulary from it.
    train_text = ds_train.map(lambda review, _sentiment: review)
    vectorize_layer.adapt(train_text)

    # During adapt(), the layer will build a vocabulary of all string tokens seen in
    # the dataset, sorted by occurrence count, with ties broken by sort order of the
    # tokens (high to low). At the end of adapt(), if max_tokens is set, the
    # vocabulary will be truncated to max_tokens size. For example, adapting a layer
    # with max_tokens=1000 will compute the 1000 most frequent tokens occurring
    # in the input dataset. If output_mode='tf-idf', adapt() will also learn the
    # document frequencies of each token in the input dataset.

    def vectorize_text(text, label):
        """Turn raw text into integer token ids using `vectorize_layer`.

        Works on a single example as well as on a batch of examples, so it can be mapped over a
        dataset either before or after `batch()`.

        Args:
            text: String tensor; a scalar (one example) or rank-1 (a batch of examples).
            label: Label(s) belonging to `text`; passed through untouched.

        Returns:
            A `(token_ids, label)` tuple. For a scalar `text` the ids carry no leading batch
            axis; for a batch of texts there is one row of ids per text.
        """
        # The layer is fed a trailing axis of size 1, i.e. one string per row.
        text = tf.expand_dims(text, -1)
        token_ids = vectorize_layer(text)
        if text.shape.rank == 1:
            # The input was a scalar, so expand_dims produced a batch of one and the layer
            # returned a leading axis of size 1. Without dropping it, a later `batch()` would
            # yield (batch, 1, sequence) tensors instead of (batch, sequence).
            token_ids = token_ids[0]
        return token_ids, label


    # Vectorize both splits. The map runs in parallel, like the MNIST pipeline earlier in this
    # file does for its normalization step.
    ds_train = ds_train.map(vectorize_text, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(vectorize_text, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Cache the vectorized examples so the text is tokenized only during the first pass.
    ds_train = ds_train.cache()
    # Use a buffer covering the whole training split: a smaller buffer only shuffles within a
    # sliding window, whereas perfect shuffling needs buffer_size >= dataset size.
    ds_train = ds_train.shuffle(buffer_size=ds_info.splits["train"].num_examples)
    ds_train = ds_train.batch(batch_size=64)
    # End with prefetch so the next batch is prepared while the current one is being processed.
    ds_train = ds_train.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # The test split is cached and batched but not shuffled.
    ds_test = ds_test.cache()
    ds_test = ds_test.batch(batch_size=64)
    ds_test = ds_test.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Print the shapes of the first training batch.
    for text, label in ds_train:
        print(text.shape, label.shape, sep="\n")
        break

    # Embedding -> global max pooling over the sequence -> dense head with a sigmoid output.
    sentiment_layers = [
        tf.keras.layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=64,
            input_length=MAX_SEQUENCE_LENGTH,
        ),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
    model = tf.keras.models.Sequential(sentiment_layers)

    # clipnorm clips the gradient norm to guard against exploding gradients, should they appear.
    model.compile(
        loss=tf.keras.losses.binary_crossentropy,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1),
        metrics=['accuracy'],
    )

    model.fit(ds_train, epochs=10)
    model.evaluate(ds_test)

