# Train - Dev - Test 

In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from zipfile import ZipFile

In [None]:
#GOOGLE COLAB VERSION
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)

In [2]:
# Resolve `comb_path`, the directory holding one sub-folder of images per class,
# for whichever environment the notebook is running in. Previously both variants
# ran unconditionally, so the cell raised ImportError outside Colab and the
# Jupyter path was always overwritten.
try:
    # google.colab is only importable on a Colab runtime.
    from google.colab import drive
except ImportError:
    #JUPYTER VERSION
    comb_path = "../Data/Combined_Images"
else:
    #GOOGLE COLAB VERSION
    drive.mount('/content/drive')

    # Unzipping the dataset file Combined_Images.zip
    combined_images_path = "/content/drive/MyDrive/Combined_Images.zip"

    with ZipFile(combined_images_path, 'r') as myzip:
        # extractall() without a path extracts into the current working directory.
        myzip.extractall()
        print('Done unzipping Combined_Images.zip')

    # NOTE(review): this relative path assumes the working directory is /content
    # (so that it resolves to /content/Combined_Images) — confirm on the target runtime.
    comb_path = '../content/Combined_Images'

In short, there is no good way to get the size/length of a dataset: tf.data.Dataset is built for data pipelines and therefore has an iterator structure (in my understanding, and according to my reading of the Dataset ops code). From the programmer's guide:

A tf.data.Iterator provides the main way to extract elements from a dataset. The operation returned by Iterator.get_next() yields the next element of a Dataset when executed, and typically acts as the interface between input pipeline code and your model.

And, by their nature, iterators do not have a convenient notion of size/length; see the discussion "Getting number of elements in an iterator in Python".

In [3]:
batch_size = 32

# Arguments shared by both loader calls. The training and validation subsets are
# carved out of the same directory listing, so the split fraction, seed and
# shuffle flag have to be identical in both calls; keeping them in one place
# prevents the two calls from drifting apart.
split_kwargs = dict(
    validation_split=0.2,  # 80% of the files for training, 20% held out
    seed=41,               # same seed in both calls so train_ds and val_ds share no images
    shuffle=True,
    image_size=(200, 200),
    batch_size=batch_size,
)

# subset selects which side of the split is returned.
train_ds = tf.keras.utils.image_dataset_from_directory(
    comb_path, subset="training", **split_kwargs)

val_ds = tf.keras.utils.image_dataset_from_directory(
    comb_path, subset="validation", **split_kwargs)

# Split the held-out batches in half: the first half becomes the test set and the
# remainder stays as the validation set. The size is derived from the dataset
# instead of being hard-coded (it evaluates to 106 for the current data).
# NOTE(review): with shuffle=True the loader appears to reshuffle on every
# iteration, so take()/skip() may not select exactly the same images on each
# pass and the validation/test boundary could drift between epochs — confirm,
# and consider a deterministic hold-out if strict separation is required.
test_batches = int(val_ds.cardinality()) // 2
test_dataset = val_ds.take(test_batches)
val_ds = val_ds.skip(test_batches)

print('Batches for training -->', train_ds.cardinality())
print('Batches for validating -->', val_ds.cardinality())
print('Batches for testing -->', test_dataset.cardinality())

Found 33884 files belonging to 8 classes.
Using 27108 files for training.
Found 33884 files belonging to 8 classes.
Using 6776 files for validation.
Batches for training --> tf.Tensor(848, shape=(), dtype=int64)
Batches for validating --> tf.Tensor(106, shape=(), dtype=int64)
Batches for testing --> tf.Tensor(106, shape=(), dtype=int64)


In [None]:
# Small convolutional classifier for 200x200 RGB images: pixel rescaling, three
# Conv2D + MaxPooling2D stages with a growing number of filters, then a dense head.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Rescaling(1./255, input_shape=(200, 200, 3)))  # [0, 255] -> [0, 1]
for n_filters in (16, 32, 64):
    model.add(tf.keras.layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
# The final layer has no activation: it emits 8 raw logits, which is why the loss
# below is configured with from_logits=True.
model.add(tf.keras.layers.Dense(8))

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# Train on train_ds and evaluate on val_ds after every epoch.
epochs = 3
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
def _collect_labels(dataset):
    """Concatenate the label batches of an (images, labels) dataset into one array."""
    return np.concatenate([batch_labels for _, batch_labels in dataset], axis=0)


def _class_counts(label_array):
    """Return the number of samples per label value, ordered by label value."""
    return pd.Series(label_array).value_counts().sort_index()


# Gather every label of each split (this iterates each dataset once in full).
train_label = _collect_labels(train_ds)
val_label = _collect_labels(val_ds)
test_label = _collect_labels(test_dataset)

# Per-class sample counts for each split.
train_counts = _class_counts(train_label)
val_counts = _class_counts(val_label)
test_counts = _class_counts(test_label)

# Split sizes first, then the per-class breakdown of every split.
for _split_title, _split_labels in (
    ('Training Set: ', train_label),
    ('Validation Set: ', val_label),
    ('Test Set: ', test_label),
):
    print(_split_title, len(_split_labels))

for _split_counts in (train_counts, val_counts, test_counts):
    print(_split_counts, '\n-------------------')