# A Simple TensorFlow 2 notebook

This is intended as a simple, short introduction to the operations competitors will need to perform with TPUs.

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
from kaggle_datasets import KaggleDatasets
import numpy as np

print("Tensorflow version " + tf.__version__)

2022-12-12 06:45:25.061766: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-12-12 06:45:25.061893: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Tensorflow version 2.4.1


# Detect my accelerator

In [2]:
# Pick the distribution strategy that matches the attached accelerator.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable is set,
    # which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # TPU detection failed; fall back to the default strategy below.
    tpu = None

if not tpu:
    # Default TensorFlow strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470


2022-12-12 06:45:30.977417: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-12-12 06:45:30.980476: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-12-12 06:45:30.980522: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-12 06:45:30.980556: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c2d72b5537a9): /proc/driver/nvidia/version does not exist
2022-12-12 06:45:30.983786: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

REPLICAS:  8


# Get my data path

In [3]:
# GCS path of the dataset as returned by KaggleDatasets; the TFRecord globs in the
# data-loading functions are built from it.
GCS_DS_PATH = KaggleDatasets().get_gcs_path() # you can list the bucket with "!gsutil ls $GCS_DS_PATH"

# Set some parameters

In [4]:
IMAGE_SIZE = [512, 512] # at this size, a GPU will run out of memory. Use the TPU
EPOCHS = 25 # NOTE: the training cells reassign EPOCHS before each model.fit call
BATCH_SIZE = 16 * strategy.num_replicas_in_sync # global batch size: 16 images per replica

NUM_TRAINING_IMAGES = 12753*2 # presumably 12753 originals counted twice (plain + augmented) -- TODO confirm
NUM_TEST_IMAGES = 7382 # used to gather every test id in one batch when writing the submission
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE # the training dataset repeats, so fit() needs an explicit step count

# Load my data

This data is loaded from Kaggle and automatically sharded to maximize parallelization.

In [5]:
# with augmentation

def decode_image(image_data, augmentation = False):
    """Decode JPEG bytes into a float32 image tensor, optionally augmented.

    Args:
        image_data: scalar string tensor holding the JPEG-encoded image.
        augmentation: Python bool. When True, apply a random horizontal flip,
            random contrast and brightness jitter, and a random rotation by
            one, two or three quarter turns.

    Returns:
        A float32 tensor reshaped to [*IMAGE_SIZE, 3]. Values are in [0, 1]
        before augmentation; the contrast/brightness jitter is not clipped.
    """
    image = tf.image.decode_jpeg(image_data, channels=3)
    image = tf.cast(image, tf.float32) / 255.0  # convert image to floats in [0, 1] range
    #image = tf.cast(image, tf.float32) / 127.5 - 1  # convert image to floats in [-1, 1] range (for Xception)
    image = tf.reshape(image, [*IMAGE_SIZE, 3]) # explicit size needed for TPU
    if augmentation:
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_contrast(image, 0.7, 1.5)
        image = tf.image.random_brightness(image, 0.3)
        # The rotation count must come from a TF op. A NumPy draw would run only
        # once, when Dataset.map traces this function, and every image would then
        # receive the same rotation. maxval is exclusive, so k is 1, 2 or 3.
        quarter_turns = tf.random.uniform([], minval=1, maxval=4, dtype=tf.int32)
        image = tf.image.rot90(image, k=quarter_turns)
        # A tensor-valued k can drop the static height/width, so restore it.
        # This assumes IMAGE_SIZE is square, as a quarter turn swaps height and width.
        image = tf.reshape(image, [*IMAGE_SIZE, 3])

    return image

def read_labeled_tfrecord(example, augmentation=False):
    """Parse one serialized labeled example into an (image, label) pair.

    Args:
        example: scalar string tensor holding one serialized record.
        augmentation: Python bool forwarded to decode_image; when True the
            decoded image is randomly augmented.

    Returns:
        A tuple (image, label): the decoded image from decode_image and the
        "class" feature cast to int32.
    """
    LABELED_TFREC_FORMAT = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    example = tf.io.parse_single_example(example, LABELED_TFREC_FORMAT)
    image = decode_image(example['image'], augmentation=augmentation)
    label = tf.cast(example['class'], tf.int32)
    return image, label # a single (image, label) pair; Dataset.map builds the dataset

def read_labeled_tfrecord_wa(example):
    """Augmented variant of read_labeled_tfrecord.

    Kept as a separate one-argument function so it can be passed directly to
    Dataset.map. Returns the same (image, label) pair with augmentation on.
    """
    return read_labeled_tfrecord(example, augmentation=True)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled example into an (image, id) pair.

    The record carries no "class" feature: predicting the class for these
    examples is the competition task.

    Args:
        example: scalar string tensor holding one serialized record.

    Returns:
        A tuple (image, idnum): the decoded, un-augmented image from
        decode_image and the "id" feature as a string tensor.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # JPEG bytes
        "id": tf.io.FixedLenFeature([], tf.string),     # single string id
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False, augmentation=False):
    """Build a dataset of parsed examples from TFRecord files.

    Files are read in parallel and records are decoded in parallel. When
    `ordered` is False the pipeline may also emit elements as soon as they are
    ready rather than in a deterministic order. When `ordered` is True the
    deterministic default is kept, so repeated iterations yield the same order.

    Args:
        filenames: TFRecord file paths to read.
        labeled: when True, parse (image, label) pairs; otherwise (image, id).
        ordered: when False, allow non-deterministic element order for speed.
        augmentation: when True and `labeled` is True, use the augmenting parser.

    Returns:
        An unbatched tf.data.Dataset of (image, label) pairs if labeled=True,
        or (image, id) pairs if labeled=False.
    """
    # tf.data.experimental.AUTOTUNE lets tf.data choose the parallelism level.
    autotune = tf.data.experimental.AUTOTUNE

    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False # disable order, increase speed

    if not labeled:
        parse_fn = read_unlabeled_tfrecord
    elif augmentation:
        parse_fn = read_labeled_tfrecord_wa
    else:
        parse_fn = read_labeled_tfrecord

    # num_parallel_reads is what actually interleaves reads from multiple files;
    # without it the files are read one after another.
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
    dataset = dataset.with_options(options)
    # Decode and augment several records concurrently so the accelerator is not
    # left waiting on a single-threaded JPEG decode.
    dataset = dataset.map(parse_fn, num_parallel_calls=autotune)
    return dataset

def get_training_dataset(augmentation=False):
    """Build the repeating, shuffled, batched training dataset.

    This is deliberately not wrapped in tf.function. It only assembles a
    tf.data pipeline and lists files in Python, so tracing it into a graph
    brings no benefit and would retrace for each distinct argument value.

    Args:
        augmentation: when True, images are randomly augmented while decoding.

    Returns:
        A tf.data.Dataset of (image, label) batches of size BATCH_SIZE that
        repeats indefinitely; callers must bound it (e.g. steps_per_epoch).
    """
    train_glob = GCS_DS_PATH + '/tfrecords-jpeg-{}x{}/train/*.tfrec'.format(*IMAGE_SIZE)
    dataset = load_dataset(tf.io.gfile.glob(train_glob),
                           labeled=True, augmentation=augmentation)
    dataset = dataset.repeat() # the training dataset must repeat for several epochs
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare the next batch while the current one is being consumed.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

def get_validation_dataset():
    """Build the batched, cached validation dataset.

    Returns:
        A finite tf.data.Dataset of (image, label) batches of size BATCH_SIZE,
        read without ordering guarantees and cached after batching.
    """
    val_glob = GCS_DS_PATH + '/tfrecords-jpeg-{}x{}/val/*.tfrec'.format(*IMAGE_SIZE)
    val_files = tf.io.gfile.glob(val_glob)
    examples = load_dataset(val_files, labeled=True, ordered=False)
    return examples.batch(BATCH_SIZE).cache()

def get_test_dataset(ordered=False):
    """Build the batched test dataset of (image, id) pairs.

    Args:
        ordered: forwarded to load_dataset; pass True when the dataset will be
            iterated more than once and the iterations must line up.

    Returns:
        A finite tf.data.Dataset of (image, id) batches of size BATCH_SIZE.
    """
    test_glob = GCS_DS_PATH + '/tfrecords-jpeg-{}x{}/test/*.tfrec'.format(*IMAGE_SIZE)
    test_files = tf.io.gfile.glob(test_glob)
    examples = load_dataset(test_files, labeled=False, ordered=ordered)
    return examples.batch(BATCH_SIZE)

training_dataset = get_training_dataset()
augmented_dataset = get_training_dataset(augmentation=True)
validation_dataset = get_validation_dataset()

# Both training streams repeat forever, so concatenating them would never get
# past the plain stream and the augmented batches would go unused. Instead,
# draw each batch from one of the two streams with equal probability, so that
# training sees plain and augmented images in roughly equal share.
train_aug_ds = tf.data.experimental.sample_from_datasets(
    [training_dataset, augmented_dataset], weights=[0.5, 0.5])

2022-12-12 06:45:38.181974: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.
2022-12-12 06:45:38.564688: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.
2022-12-12 06:45:38.838302: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.


In [6]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 10))
# for i, (image, label) in enumerate(augmented_dataset.take(9)):
#     ax = plt.subplot(3, 3, i + 1)
#     plt.imshow(image[0])
#     plt.title(int(label[0]))
#     plt.axis("off")

In [7]:
# for tf version above 2.5
#data_augmentation = tf.keras.Sequential(
#    [tf.keras.layers.RandomFlip(), 
#     tf.keras.layers.RandomRotation(0.3),
#     tf.keras.layers.RandomContrast(0.3)]
#)



In [8]:
#for images, labels in training_dataset.take(1):
#    plt.figure(figsize=(10, 10))
#    first_image = images[0]
#    for i in range(9):
#        ax = plt.subplot(3, 3, i + 1)
#        augmented_image = data_augmentation(
#            first_image, training=True
#        )
#        plt.imshow((augmented_image.numpy()*255).astype('int32'))
#        plt.title(int(labels[0]))
#        plt.axis("off")

# Build a model on TPU (or GPU, or CPU...) with TensorFlow 2!

In [9]:
# Stage 1: train only the classification head on top of a frozen backbone.
EPOCHS = 15

# Scores noted next to the backbones tried earlier (the metric is not stated):
#   InceptionV3 0.68024 | Xception 0.72673 | DenseNet201 0.82843
with strategy.scope():
    pretrained_model = tf.keras.applications.DenseNet201(
        weights='imagenet',
        include_top=False,
        input_shape=[*IMAGE_SIZE, 3],
    )
    # Transfer learning: the backbone weights stay fixed during this stage.
    pretrained_model.trainable = False

    model = tf.keras.Sequential([
        pretrained_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(104, activation='softmax'),  # 104 output classes
    ])

model.compile(
    optimizer=tf.keras.optimizers.Nadam(),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# The training dataset repeats, so each epoch is bounded by steps_per_epoch.
historical = model.fit(
    train_aug_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=validation_dataset,
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/15


2022-12-12 06:49:55.627189: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 62520, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1670827795.623832118","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 62520, Output num: 0","grpc_status":3}


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [10]:
# Stage 2: unfreeze the backbone and fine-tune the whole network briefly.
EPOCHS = 3

with strategy.scope():
    pretrained_model.trainable = True

# Re-compile after changing `trainable`; the optimizer is a fresh Nadam with
# its default settings.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

historical = model.fit(
    train_aug_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=validation_dataset,
)

Epoch 1/3


2022-12-12 07:25:46.468022: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 110046, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1670829946.467932091","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 110046, Output num: 0","grpc_status":3}


Epoch 2/3
Epoch 3/3


In [11]:
# Stage 3: continue fine-tuning the unfrozen network with Nadam at 1e-5.
EPOCHS = 15

with strategy.scope():
    pretrained_model.trainable = True

# Re-compile so training uses a fresh Nadam optimizer with learning rate 1e-5.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(1e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

historical = model.fit(
    train_aug_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=validation_dataset,
)

Epoch 1/15


2022-12-12 07:36:19.132052: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 147868, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1670830579.131667899","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 147868, Output num: 0","grpc_status":3}


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [12]:
# def display_images(digits, predictions, labels, title):
#     n = 10

#     indexes = np.random.choice(len(predictions), size=n)
#     n_digits = digits[indexes]
#     n_predictions = predictions[indexes]
#     n_predictions = n_predictions.reshape((n,))
#     n_labels = labels[indexes]
 
#     fig = plt.figure(figsize=(20, 4))
#     plt.title(title)
#     plt.yticks([])
#     plt.xticks([])

#     for i in range(10):
#         ax = fig.add_subplot(1, 10, i+1)

#         plt.xlabel(n_predictions[i] if n_labels[i]==n_predictions[i] else str(n_predictions[i]) + '/' + str(n_labels[i]),
#                    color='black' if n_labels[i]==n_predictions[i] else 'red')
#         plt.xticks([])
#         plt.yticks([])
#         plt.imshow(n_digits[i])

In [13]:
# small_ds = list(validation_dataset.take(1).as_numpy_iterator())
# plt.figure(figsize=(10, 10))
# for i, (image, label) in enumerate(zip(small_ds[0][0], small_ds[0][1])):
#     ax = plt.subplot(3, 3, i + 1)
#     plt.imshow(image)
#     plt.title(int(label))
#     plt.axis("off")
#     if i == 8:
#         break

# probabilities = model.predict(small_ds[0][0], batch_size=8)
# probabilities = np.argmax(probabilities, axis = 1)

# print(probabilities)
# print(small_ds[0][1])
# print(probabilities==small_ds[0][1])
# display_images(small_ds[0][0], probabilities, small_ds[0][1], "Bad predictions indicated in red.")

# Compute your predictions on the test set!

This will create a file that can be submitted to the competition.

In [14]:
# The test dataset is iterated twice below, once for the images and once for
# the ids, so it has to be ordered for the two passes to line up.
test_ds = get_test_dataset(ordered=True)

print('Computing predictions...')
test_images_ds = test_ds.map(lambda image, idnum: image)
probabilities = model.predict(test_images_ds)
predictions = np.argmax(probabilities, axis=-1)  # most probable class per image
print(predictions)

print('Generating submission.csv file...')
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
# Re-batch with NUM_TEST_IMAGES so a single element holds every id.
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')
np.savetxt(
    'submission.csv',
    np.rec.fromarrays([test_ids, predictions]),
    fmt=['%s', '%d'],
    delimiter=',',
    header='id,label',
    comments='',
)

2022-12-12 08:05:32.099652: I tensorflow/core/platform/cloud/google_auth_provider.cc:180] Attempting an empty bearer token since no token was retrieved from files, and GCE metadata check was skipped.


Computing predictions...


2022-12-12 08:06:45.385020: W ./tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h:57] Ignoring an error encountered when deleting remote tensors handles: Invalid argument: Unable to find the relevant tensor remote_handle: Op ID: 170812, Output num: 0
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1670832405.384504108","description":"Error received from peer ipv4:10.0.0.2:8470","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":"Unable to find the relevant tensor remote_handle: Op ID: 170812, Output num: 0","grpc_status":3}


[67 52 81 ... 14 40 79]
Generating submission.csv file...
