# A Simple TF 2.2 notebook

This is intended as a simple, short introduction to the operations competitors will need to perform with TPUs.

Current Version: 13

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

from kaggle_datasets import KaggleDatasets

print(f"Tensorflow version: {tf.__version__}")

# Detect my accelerator

In [None]:
# Pick the distribution strategy that matches whatever accelerator is attached.
try:
    # With no arguments the resolver relies on the TPU_NAME environment
    # variable (always set on Kaggle). A ValueError here is treated as
    # "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default TensorFlow strategy: works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print(f'Strategy: {strategy}')
print("REPLICAS: ", strategy.num_replicas_in_sync)

# Get my data path

In [None]:
# Resolve the GCS path of the dataset attached to this notebook; the TFRecord
# file globs in the dataset builders below are built from this prefix.
GCS_DS_PATH = KaggleDatasets().get_gcs_path() # you can list the bucket with "!gsutil ls $GCS_DS_PATH"

# Set some parameters

In [None]:
# Tunable settings for the run. Commented-out assignments record the values
# used by earlier versions of this notebook.

## Version: 1
# IMAGE_SIZE = [192, 192] # at this size, a GPU will run out of memory. Use the TPU

## Version: 8
# IMAGE_SIZE also selects which TFRecord folder is read (see get_dataset_name).
IMAGE_SIZE = [224, 224] # at this size, a GPU will run out of memory. Use the TPU

## Version: 7
# INPUT_IMAGE_SIZE = [224, 224] # Model Input Shape

## Version: 1
# EPOCHS = 5

## Version: 4
# EPOCHS = 12

## Version: 5
EPOCHS = 25

# Global batch size: 16 examples for every replica in the strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

# Hard-coded image counts -- presumably the sizes of the competition's training
# and test splits; TODO confirm against the dataset.
NUM_TRAINING_IMAGES = 12753
NUM_TEST_IMAGES = 7382
# The training dataset repeats indefinitely, so fit() needs an explicit step
# count; floor division ignores a final partial batch.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

# Load my data

This data is loaded from Kaggle and automatically sharded to maximize parallelization.

In [None]:
def decode_image(image_data):
    """Decode one JPEG byte string into a fixed-shape 3-channel image tensor.

    This helper had been commented out even though read_labeled_tfrecord and
    read_unlabeled_tfrecord both call it, which made dataset construction fail
    with a NameError on a fresh kernel.

    Pixel values are deliberately left unscaled here: load_dataset applies
    rescale_01 or rescale_11 afterwards (the in-function scaling used by
    earlier notebook versions 1 and 8 was moved there).

    Args:
        image_data: scalar string tensor holding the encoded JPEG bytes.

    Returns:
        Image tensor reshaped to [*IMAGE_SIZE, 3].
    """
    image = tf.image.decode_jpeg(image_data, channels=3)
    image = tf.reshape(image, [*IMAGE_SIZE, 3]) # explicit size needed for TPU
    return image

def rescale_01(image, _):
    """Cast the image to float32 and scale it by 1/255; pass `_` through untouched.

    The second element (label or id) is returned unchanged so the function can
    be used directly with `dataset.map` on (image, other) pairs.
    """
    scaled = tf.cast(image, tf.float32) / 255.0  # floats in the [0, 1] range
    return scaled, _

def rescale_11(image, _):
    """Cast the image to float32 and map it via (x - 127.5) / 127.5; pass `_` through.

    The second element (label or id) is returned unchanged so the function can
    be used directly with `dataset.map` on (image, other) pairs.
    """
    # Centre on 127.5 and divide by the same value: 0 -> -1, 255 -> 1.
    return (tf.cast(image, tf.float32) - 127.5) / 127.5, _

def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into a single (image, label) pair.

    Args:
        example: one serialized example, as yielded by a TFRecordDataset.

    Returns:
        (image, label): the image produced by decode_image and the "class"
        feature cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),   # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into a single (image, id) pair.

    The record carries no "class" feature; only the image bytes and a string id
    are read.

    Args:
        example: one serialized example, as yielded by a TFRecordDataset.

    Returns:
        (image, idnum): the image produced by decode_image and the raw "id"
        string feature.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "id": tf.io.FixedLenFeature([], tf.string),     # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False, rescale='01'):
    """Build an unbatched dataset of parsed, rescaled examples from TFRecord files.

    Files are read and records are parsed in parallel. When `ordered` is False
    the pipeline is additionally allowed to emit elements as soon as they are
    ready instead of in a reproducible order.

    Args:
        filenames: TFRecord file paths to read.
        labeled: True to parse with read_labeled_tfrecord, yielding
            (image, label); False to parse with read_unlabeled_tfrecord,
            yielding (image, id).
        ordered: keep the pipeline deterministic so repeated iterations yield
            elements in the same order.
        rescale: '01' to apply rescale_01, '11' to apply rescale_11.

    Returns:
        A tf.data.Dataset of (image, label) or (image, id) pairs.

    Raises:
        ValueError: if `rescale` is neither '01' nor '11'. Previously any
            unknown value silently selected the [-1, 1] scaling.
    """
    rescalers = {'01': rescale_01, '11': rescale_11}
    if rescale not in rescalers:
        raise ValueError(f"rescale must be '01' or '11', got {rescale!r}")

    autotune = tf.data.experimental.AUTOTUNE

    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False # disable order, increase speed

    # num_parallel_reads is what actually makes the reader pull from several
    # files at once; without it the files are read one after another.
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
    dataset = dataset.with_options(options)
    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    dataset = dataset.map(parse_fn, num_parallel_calls=autotune)
    dataset = dataset.map(rescalers[rescale], num_parallel_calls=autotune)
    return dataset

def get_dataset_name(image_size):
    """Return the TFRecord folder suffix for a square image size.

    Only the first element of `image_size` is used, for both dimensions, e.g.
    [224, 224] -> '/tfrecords-jpeg-224x224'.
    """
    return '/tfrecords-jpeg-{0}x{0}'.format(image_size[0])

def get_training_dataset(rescale='01'):
    """Build the infinite, shuffled, batched training dataset.

    Args:
        rescale: pixel scaling mode forwarded to load_dataset ('01' or '11').

    Returns:
        A repeating tf.data.Dataset of (image, label) batches of BATCH_SIZE.
        Because it never ends, fit() must be given steps_per_epoch.
    """
    ds_name = get_dataset_name(IMAGE_SIZE)
    train_files = tf.io.gfile.glob(f'{GCS_DS_PATH}{ds_name}/train/*.tfrec')
    dataset = load_dataset(train_files, labeled=True, rescale=rescale)
    dataset = dataset.repeat() # the training dataset must repeat for several epochs
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare the next batch while the accelerator is busy with the current one.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

def get_validation_dataset(rescale='01'):
    """Build the batched, cached validation dataset.

    Args:
        rescale: pixel scaling mode forwarded to load_dataset ('01' or '11').

    Returns:
        A finite tf.data.Dataset of (image, label) batches of BATCH_SIZE.
    """
    ds_name = get_dataset_name(IMAGE_SIZE)
    val_files = tf.io.gfile.glob(f'{GCS_DS_PATH}{ds_name}/val/*.tfrec')
    dataset = load_dataset(val_files, labeled=True, ordered=False, rescale=rescale)
    dataset = dataset.batch(BATCH_SIZE)
    # Cache the batched data so later passes skip reading and decoding.
    dataset = dataset.cache()
    # Prepare the next batch while the accelerator is busy with the current one.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

def get_test_dataset(rescale='01', ordered=False):
    """Build the batched test dataset of (image, id) pairs.

    Args:
        rescale: pixel scaling mode forwarded to load_dataset ('01' or '11').
        ordered: pass True when the dataset will be iterated more than once and
            the passes must line up (e.g. images in one pass, ids in another).

    Returns:
        A finite tf.data.Dataset of (image, id) batches of BATCH_SIZE.
    """
    ds_name = get_dataset_name(IMAGE_SIZE)
    test_files = tf.io.gfile.glob(f'{GCS_DS_PATH}{ds_name}/test/*.tfrec')
    dataset = load_dataset(test_files, labeled=False, ordered=ordered, rescale=rescale)
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare the next batch while the accelerator is busy with the current one.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

# Training/validation pipelines with pixels scaled by 1/255 (default rescale='01').
training_dataset_01 = get_training_dataset()
validation_dataset_01 = get_validation_dataset()

# The same pipelines with pixels mapped via (x - 127.5) / 127.5 (rescale='11').
training_dataset_11 = get_training_dataset(rescale='11')
validation_dataset_11 = get_validation_dataset(rescale='11')

# Build a model on TPU (or GPU, or CPU...) with TensorFlow 2.2!

In [None]:
## Version: 1
# with strategy.scope():    
#     pretrained_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False ,input_shape=[*IMAGE_SIZE, 3])
#     pretrained_model.trainable = False # transfer learning
    
#     model = tf.keras.Sequential([
#         pretrained_model,
#         tf.keras.layers.GlobalAveragePooling2D(),
#         tf.keras.layers.Dense(104, activation='softmax')
#     ])

## Version: 2
# try:
#     import autokeras as ak
#     print(f'[autokeras] Not Available')
# except:
#     ! pip install autokeras

# import autokeras as ak

# with strategy.scope():   
#     clf = ak.ImageClassifier()
#     clf.fit(training_dataset, validation_data=validation_dataset)

## Version: 3
# try:
#     import tfimm
# except:
#     ! pip install tfimm timm

# import tfimm

# with strategy.scope():   
#     model = tfimm.create_model("vit_tiny_patch16_224", pretrained="timm", nb_classes=104)

# preprocess = tfimm.create_preprocessing("vit_tiny_patch16_224", dtype="float32")
# model.compile(
#     optimizer='adam',
#     loss = 'sparse_categorical_crossentropy',
#     metrics=['sparse_categorical_accuracy']
# )

# historical = model.fit(
#           training_dataset.map(lambda x,y: (preprocess(x), y)),
#           steps_per_epoch=STEPS_PER_EPOCH, 
#           epochs=EPOCHS, 
#           validation_data=validation_dataset)

## Version: 4
# model_url = 'https://tfhub.dev/google/imagenet/mobilenet_v1_100_192/feature_vector/5'

## Version: 6
# model_url = 'https://tfhub.dev/google/imagenet/mobilenet_v2_100_192/feature_vector/5'

## Version: 7
# model_url = 'https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_b0/feature_vector/2'

## Version: 8
# model_url = 'https://tfhub.dev/sayakpaul/vit_b16_fe/1'

## Version: 9
# model_url = 'https://tfhub.dev/sayakpaul/vit_s16_fe/1'

## Version: 10
# model_url = 'https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/feature_vector/5'

## Version: 13
model_url = 'https://tfhub.dev/sayakpaul/vit_b8_fe/1'

## Version: 7
# TF-Hub feature extractor topped with a 104-unit softmax layer. The model is
# built inside the strategy scope so its variables are created under the
# distribution strategy selected earlier.
with strategy.scope():
    model = tf.keras.Sequential()
    # Version: 12 keeps the hub weights frozen (earlier runs tried trainable=True).
    model.add(hub.KerasLayer(model_url, trainable=False))
    model.add(tf.keras.layers.Dense(104, activation='softmax'))

## Version: 4
# with strategy.scope():
#     model = tf.keras.Sequential(
#         [
#             hub.KerasLayer(
#                 model_url,
# #                 trainable=True,
#                 trainable=False, # Version: 11
#                 arguments=dict(batch_norm_momentum=0.997)
#             ),
#             tf.keras.layers.Dense(104, activation='softmax')
#         ]
#     )

## Version: 1
# Labels are integer class ids, hence the sparse loss and metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# `training_dataset` / `validation_dataset` are never defined in this notebook
# (only the *_01 and *_11 variants are), so the original call raised NameError
# on a fresh kernel.
# NOTE(review): the *_01 variants are used because the prediction cell builds
# its test dataset with the default rescale='01'. If the hub model expects
# inputs in [-1, 1], switch BOTH this cell and the test dataset to '11'
# together -- confirm against the model card.
historical = model.fit(
    training_dataset_01,
    steps_per_epoch=STEPS_PER_EPOCH,  # required: the training dataset repeats forever
    epochs=EPOCHS,
    validation_data=validation_dataset_01,
)

# Compute your predictions on the test set!

This will create a file that can be submitted to the competition.

In [None]:
# The test dataset is iterated twice below (once for images, once for ids), so
# it must be ordered for the two passes to line up.
test_ds = get_test_dataset(ordered=True)

print('Computing predictions...')
test_images_ds = test_ds.map(lambda image, idnum: image)
probabilities = model.predict(test_images_ds) # Version: 1
# probabilities = clf.predict(test_images_ds) # Version: 2

print(f'probabilities.shape: {probabilities.shape}')
predictions = np.argmax(probabilities, axis=-1)

## Version: 13
# The original code always dropped the first two predictions. That is only
# correct when predict() returns exactly two rows more than there are test
# images; otherwise it discards real predictions and shifts every label
# against its id. Trim only the actual surplus, if any.
# NOTE(review): trimming from the front mirrors the original slice -- confirm
# that the extra rows really are at the start and not padding at the end.
surplus = predictions.shape[0] - NUM_TEST_IMAGES
if surplus > 0:
    predictions = predictions[surplus:]

print(predictions, predictions.shape)
np.savetxt('predictions.txt', predictions, fmt='%d')

print('Generating submission.csv file...')
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
# Re-batch so that a single batch holds one id per prediction.
test_ids = next(iter(test_ids_ds.batch(predictions.shape[0]))).numpy().astype('U') # all in one batch

print(test_ids, test_ids.shape)
np.savetxt('test_ids.txt', test_ids, fmt='%s')

np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')