## Installation of packages

In [None]:
# NOTE(review): none of these installs pins a version, so a fresh environment
# may resolve to different releases than the ones this notebook was run with.

# Numerical core and the Keras API.
%pip install numpy -q
%pip install keras -q

# TensorFlow and the TensorBoard profiler plugin.
%pip install tensorflow -q
%pip install tensorboard_plugin_profile -q

%pip install networkx -q

# Weights & Biases experiment tracking, and the kaggle package.
%pip install wandb -q
%pip install kaggle -q

# Plotting; ipympl is the backend selected by the `%matplotlib ipympl` switch.
%pip install matplotlib -q
%pip install ipympl -q


## Imports and TensorBoard setup

In [4]:
from os.path    \
    import      \
    isdir,      \
    join

from os         \
    import      \
    walk,       \
    remove

from random                     \
    import                      \
    SystemRandom

import wandb

from wandb.keras                \
    import                      \
    WandbMetricsLogger,         \
    WandbCallback

from keras.utils                \
    import                      \
    image_dataset_from_directory

from keras.backend              \
    import                      \
    clear_session

from keras.models               \
    import                      \
    load_model

from keras.optimizers           \
    import SGD

from keras.callbacks            \
    import TensorBoard

from keras                      \
    import mixed_precision

import tensorflow

from tensorflow.python.ops      \
  import summary_ops_v2

from keras.losses \
    import SparseCategoricalCrossentropy




In [5]:
# TensorBoard switch and the directory its log files live in.
is_using_tensorboard: bool = True
location_to_tensorboard: str = '/tmp/tensorboard'

# Remove every file left in the log directory by earlier sessions.
# Only files are deleted; the directories themselves are kept.
if is_using_tensorboard and isdir(location_to_tensorboard):
    for root, directories, files in walk(location_to_tensorboard):
        for file in files:
            full_path_to_file: str = join(root, file)
            remove(full_path_to_file)


## Setup Matplotlib

In [None]:
# Opt-in switch for the ipympl matplotlib backend (off by default).
use_ipympl: bool = False

if use_ipympl:
    # IPython line magic: only valid inside a notebook / IPython session.
    %matplotlib ipympl

## Precision policy and GPU setup

In [6]:
# Use plain float32 everywhere. The policy is registered through both
# `keras.mixed_precision` and `tensorflow.keras.mixed_precision`.
global_policy = mixed_precision.Policy('float32')

mixed_precision.set_global_policy(global_policy)
tensorflow.keras.mixed_precision.set_global_policy(global_policy)

# Whether the selected GPU should allocate memory on demand.
gpu_memory_growth: bool = True


def zero() -> int:
    """Return 0, the index used to pick the first physical device."""
    return 0


physical_devices = tensorflow.config.list_physical_devices(
    str('gpu').upper()
)

if not physical_devices:
    # With no visible GPU the lookup below used to fail with a bare
    # IndexError. Fall back to the CPU so the notebook stays runnable and
    # make the downgrade visible in the cell output.
    print('No GPU detected; falling back to the CPU.')
    physical_devices = tensorflow.config.list_physical_devices('CPU')

selected_physical_device = physical_devices[zero()]

# Memory growth is a GPU setting; TensorFlow rejects it for CPU devices.
if selected_physical_device.device_type == 'GPU':
    tensorflow.config.experimental.set_memory_growth(
        selected_physical_device,
        gpu_memory_growth
    )

In [7]:
# Filesystem location the model is loaded from and saved back to.
location_of_model: str = '/opt/models/O2RM'


def get_location_of_model() -> str:
    """Return the configured model location."""
    return location_of_model


In [8]:
def load_old_model_weights():
    """Load weights from ``location_of_model`` into the global ``model``.

    Does nothing when ``location_of_model`` is not an existing directory.
    """
    if not isdir(location_of_model):
        return

    model.load_weights(location_of_model)

In [9]:
# Reset Keras' global state before the model is loaded.
clear_session()

model = load_model(location_of_model)

# Compile with the optimizer, loss and metric used by the training cells.
# NOTE(review): from_logits=True assumes the model's final layer outputs raw
# logits (no softmax) - confirm against the saved architecture.
model.compile(
    optimizer=SGD(learning_rate=0.00024),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [10]:
def generate_seed():
    """Return a random integer seed in the inclusive range 1..32767.

    The value comes from ``SystemRandom``, so it is not reproducible.
    """
    entropy_source = SystemRandom()
    return entropy_source.randint(1, 32767)

def refresh_seed():
    """Replace the global ``dataset_seed`` with a newly generated seed."""
    global dataset_seed

    new_seed = generate_seed()
    dataset_seed = new_seed

In [13]:
# One `History.history` dict is appended here per completed training session.
training_history: list = []

# Both datasets stay None until load_dataset() assigns them.
training_dataset = None
validation_dataset = None

# Seed handed to image_dataset_from_directory for the train/validation split.
dataset_seed: int = generate_seed()

location_of_dataset: str = '/opt/dataset/numbers'

# Image geometry.
width: int = 512
height: int = 512
channels: int = 3

number_of_labels: int = 10

# `batches` is passed as `batch_size` when the datasets are built.
batches: int = 60
epochs: int = 12

validation_split: float = 0.15

use_multiprocessing: bool = True
process_workers: int = 8

tensorflow_verbose: int = 1

# Class names recorded by load_dataset().
training_labels: list = []
validation_labels: list = []

In [14]:
# Weights & Biases run identity.
wandb_entity: str = 'designermadsen'
wandb_project: str = 'O2RM'
wandb_jobtype: str = 'Training'

# Tags attached to every run started by setup().
wandb_tags: list = [
    'Nvidia', 'Linux', 'Ubuntu', 'Development',
    'Random', 'test-driven', 'Bare-Metal', 'TensorFlow',
]

# Switches read by callbacks() and setup_wandb_callback().
wandb_use_callback: bool = False
wandb_measure_metrics: bool = True
wandb_compute_flops: bool = True

In [15]:
# Snapshot of the notebook settings, passed to wandb.init() as the run config.
# Values are copied at the time this cell runs; later changes to the globals
# (for example a refreshed seed) are not reflected here.
configuration: dict = {
    'vision': dict(width=width, height=height, channels=channels),
    'dataset': {
        'number of labels': number_of_labels,
        'batches': batches,
        'epochs': epochs,
        'seed': dataset_seed,
        'using multiprocessing': use_multiprocessing,
        'using tensorboard': is_using_tensorboard,
        'workers': process_workers,
    },
    'wandb': {
        'job type': wandb_jobtype,
        'tags': wandb_tags,
        'measure metrics': wandb_measure_metrics,
        'use callbacks': wandb_use_callback,
        'compute flops': wandb_compute_flops,
    },
    'gpu': {'memory growth': gpu_memory_growth},
}

In [16]:
def get_training_dataset():
    """Return the current value of the global ``training_dataset``."""
    return training_dataset

def get_validation_dataset():
    """Return the current value of the global ``validation_dataset``."""
    return validation_dataset

In [17]:
def setup_wandb_callback():
    """Build the ``WandbCallback`` used when ``wandb_use_callback`` is on.

    The callback monitors ``val_loss``, logs weights and gradients, saves the
    graph and the model, and is handed the current training and validation
    datasets.

    Class labels are taken from the global ``training_labels`` list, which
    ``load_dataset`` fills in before augmentation. Reading
    ``get_training_dataset().class_names`` here fails: ``data_augmentation``
    replaces the dataset with the result of ``Dataset.map``, and that object
    has no ``class_names`` attribute, so the lookup raises ``AttributeError``.

    Returns:
        A configured ``WandbCallback`` instance.
    """
    return WandbCallback(
        monitor='val_loss',
        verbose=0,

        save_weights_only=True,
        log_weights=True,
        log_gradients=True,

        save_graph=True,
        save_model=True,

        training_data=get_training_dataset(),
        validation_data=get_validation_dataset(),
        log_evaluation=True,

        compute_flops=wandb_compute_flops,

        input_type='image',
        output_type='label',

        # Copy so the callback never shares the mutable global list.
        labels=list(training_labels),
        predictions=15
    )
    

In [18]:
def setup_tensorboard():
    """Create the Keras ``TensorBoard`` callback for this notebook.

    Logs go to ``location_to_tensorboard``. Histograms, embeddings and
    profiling are disabled; the graph, images and steps-per-second are
    written.

    Returns:
        A configured ``TensorBoard`` callback.
    """
    return TensorBoard(
        location_to_tensorboard,
        histogram_freq=0,
        write_graph=True,
        write_images=True,
        write_steps_per_second=True,
        # Documented values are 'batch', 'epoch' or an integer. The boolean
        # True passed before only worked because bool is a subclass of int;
        # 1 is the same setting spelled with the documented type.
        update_freq=1,
        # 0 (rather than False) is the documented way to disable profiling.
        profile_batch=0,
        embeddings_freq=0
    )

In [19]:
def setup_wandb_metrics():
    """Create a ``WandbMetricsLogger`` callback with its default arguments."""
    metrics_logger = WandbMetricsLogger()
    return metrics_logger

In [20]:
def callbacks() -> list:
    """Assemble the Keras callbacks enabled by the global switches.

    Returns:
        A new list holding, in this order and only when the matching switch
        is truthy: the TensorBoard callback (``is_using_tensorboard``), the
        W&B metrics logger (``wandb_measure_metrics``) and the W&B callback
        (``wandb_use_callback``).
    """
    switch_and_factory = (
        (is_using_tensorboard, setup_tensorboard),
        (wandb_measure_metrics, setup_wandb_metrics),
        (wandb_use_callback, setup_wandb_callback),
    )

    # Only the factories of enabled features are invoked.
    return [
        factory()
        for enabled, factory in switch_and_factory
        if enabled
    ]

In [21]:
def append_training_session(
    history
) -> None:
    """Store the metrics of one training session.

    Args:
        history: Object with a ``history`` attribute (``training`` passes the
            return value of ``model.fit``). That attribute is appended to the
            global ``training_history`` list.
    """
    session_metrics = history.history
    training_history.append(session_metrics)

In [22]:
# Build the device string for tensorflow.device(): drop the leading
# '/physical_device:' from the selected device's name and put '/' in front of
# what is left.
device_name = '/' + selected_physical_device.name[len('/physical_device:'):]

# Let tf.data pick parallelism and prefetch buffer sizes.
autotune = tensorflow.data.AUTOTUNE
training_rotations: int = 1

In [23]:
def is_training_dataset_none() -> bool:
    """Return True while the global ``training_dataset`` is still ``None``."""
    return training_dataset is None

def is_validation_dataset_none() -> bool:
    """Return True while the global ``validation_dataset`` is still ``None``."""
    return validation_dataset is None


In [25]:
from keras.models import Sequential

from keras.layers               \
    import                      \
    RandomFlip,                 \
    RandomZoom,                 \
    RandomContrast,             \
    RandomBrightness,           \
    RandomRotation,             \
    RandomTranslation,          \
    RandomHeight,               \
    RandomWidth,                \
    RandomCrop                  

def augmentation_layers() -> Sequential:
    """Build a new stack of random image-augmentation layers.

    The stack applies, in order: flip (both axes), rotation, zoom, contrast
    and brightness. Each layer gets its own seed from ``generate_seed``, so
    two calls give differently seeded pipelines.

    Returns:
        A ``Sequential`` model wrapping the five layers.
    """
    # Range shared by the rotation factor and both zoom factors.
    full_range = (-1.0, 1.0)

    return Sequential(
        [
            RandomFlip(
                'horizontal_and_vertical',
                seed=generate_seed()
            ),
            RandomRotation(
                factor=full_range,
                fill_mode='nearest',
                seed=generate_seed()
            ),
            RandomZoom(
                height_factor=full_range,
                width_factor=full_range,
                fill_mode='nearest',
                seed=generate_seed()
            ),
            RandomContrast(
                factor=1.0,
                seed=generate_seed()
            ),
            RandomBrightness(
                factor=1.0,
                seed=generate_seed()
            ),
        ]
    )

def data_augmentation(augment_validation: bool = True):
    """Wrap the global datasets with random augmentation.

    ``training_dataset`` is always replaced by a mapped dataset that runs
    each image batch through a new ``augmentation_layers()`` pipeline in
    training mode. Labels pass through unchanged.

    Args:
        augment_validation: When True (the default, matching the previous
            hard-coded behaviour) ``validation_dataset`` gets its own,
            separately seeded pipeline. Pass False to leave the validation
            data untouched.

    NOTE(review): with random augmentation applied to the validation data,
    validation metrics vary between evaluations of the same model - confirm
    that this is intended before relying on ``val_loss``.
    """
    global training_dataset, validation_dataset

    training_augmentation = augmentation_layers()
    training_dataset = training_dataset.map(
        lambda x, y: (
            training_augmentation(x, training=True), y
        ),
        num_parallel_calls=autotune
    )

    if not augment_validation:
        return

    validation_augmentation = augmentation_layers()
    validation_dataset = validation_dataset.map(
        lambda x, y: (
            validation_augmentation(x, training=True), y
        ),
        num_parallel_calls=autotune
    )


In [26]:
def load_dataset():
    """(Re)build the global training and validation datasets.

    Steps:
      1. If either dataset was loaded before, draw a new ``dataset_seed``.
      2. Read both subsets from ``location_of_dataset`` with the configured
         split, seed, image size and batch size.
      3. Record the class names of both subsets in ``training_labels`` and
         ``validation_labels``.
      4. Apply ``data_augmentation()`` to the global datasets.
    """
    global                      \
        training_dataset,       \
        validation_dataset,     \
        training_labels,        \
        validation_labels

    # A reload (anything but the very first load) uses a fresh seed.
    already_loaded = not (
        is_training_dataset_none()
        and is_validation_dataset_none()
    )
    if already_loaded:
        refresh_seed()

    both_subsets = image_dataset_from_directory(
        location_of_dataset,
        validation_split=validation_split,
        subset='both',
        seed=dataset_seed,
        image_size=(height, width),
        batch_size=batches
    )
    training_dataset, validation_dataset = both_subsets

    # Class names are read here, before data_augmentation() swaps the
    # globals for mapped datasets.
    training_labels = training_dataset.class_names
    validation_labels = validation_dataset.class_names

    data_augmentation()


In [27]:
def training():
    """Run one training session and publish the resulting model.

    Reloads weights and datasets, logs the seed and class labels to W&B,
    fits the global ``model`` for ``epochs`` epochs with the configured
    callbacks, stores the fit history, saves the model to
    ``location_of_model`` and uploads that directory as the W&B artifact
    ``o2rm_model``.
    """
    load_old_model_weights()
    load_dataset()

    session_info = {
        'seed': dataset_seed,
        'training': {
            'labels': training_labels
        },
        'validation': {
            'labels': validation_labels
        }
    }
    wandb.log(session_info)

    training_input = training_dataset.prefetch(buffer_size=autotune)
    validation_input = validation_dataset.prefetch(buffer_size=autotune)

    fit_history = model.fit(
        training_input,
        validation_data=validation_input,
        epochs=epochs,
        callbacks=callbacks(),
        use_multiprocessing=use_multiprocessing,
        workers=process_workers
    )
    append_training_session(fit_history)

    # Persist the trained model over the previous copy.
    model.save(location_of_model, save_format='tf', overwrite=True)

    # Publish the saved directory as a W&B model artifact.
    model_artifact = wandb.Artifact("o2rm_model", type="model")
    model_artifact.add_dir(location_of_model)
    wandb.log_artifact(model_artifact)


In [28]:
def evaluation():
    """Evaluate the global ``model`` on the training and validation datasets.

    The datasets are rebuilt with ``load_dataset()`` first, so the model is
    evaluated on the augmented datasets that call produces. Each evaluation
    gets its own ``WandbMetricsLogger`` callback. Return values of
    ``model.evaluate`` are discarded.
    """
    load_dataset()

    # Training set first, then validation set.
    for dataset in (training_dataset, validation_dataset):
        model.evaluate(
            dataset.prefetch(buffer_size=autotune),
            callbacks=[WandbMetricsLogger()]
        )

In [29]:
def setup():
    """Start a Weights & Biases run from the notebook's global settings.

    Entity, project, config, tags and job type come from the matching
    ``wandb_*`` / ``configuration`` globals. TensorBoard syncing follows
    ``is_using_tensorboard``.
    """
    run_options = dict(
        entity=wandb_entity,
        project=wandb_project,
        config=configuration,
        tags=wandb_tags,
        job_type=wandb_jobtype,
        reinit=True,
        tensorboard=is_using_tensorboard,
        save_code=True,
        notes="",
        magic=True,
    )
    wandb.init(**run_options)

def shutdown():
    """Flush pending TensorFlow summaries, then finish the W&B run."""
    # Flush first so buffered summaries are written before the run closes.
    summary_ops_v2.flush()

    wandb.finish()

In [30]:
# Run the whole pipeline on the selected device: open the W&B run, evaluate
# the loaded model once, then train it.
with tensorflow.device(device_name):
    setup()
    evaluation()
    training()

# Reached only when the block above finished without raising.
shutdown()

[34m[1mwandb[0m: Currently logged in as: [33mdesignermadsen[0m. Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='0.030 MB of 0.066 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.452287…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666836570000214, max=1.0)…



Found 20000 files belonging to 10 classes.
Using 17000 files for training.
Using 3000 files for validation.


2023-08-14 15:00:38.637406: W tensorflow/core/util/tensor_slice_reader.cc:97] Could not open /opt/models/O2RM: FAILED_PRECONDITION: /opt/models/O2RM; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?


Found 20000 files belonging to 10 classes.
Using 17000 files for training.
Using 3000 files for validation.




Epoch 1/12


2023-08-14 15:00:52.551202: W tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:543] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  /usr/local/cuda-11.8
  /usr/local/cuda
  .
You can choose the search directory by setting xla_gpu_cuda_data_dir in HloModule's DebugOptions.  For most apps, setting the environment variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.
2023-08-14 15:01:05.621507: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 736.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


  1/284 [..............................] - ETA: 2:38:08 - loss: 2.3026 - accuracy: 0.1167

2023-08-14 15:01:17.051347: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 736.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.




  saving_api.save_model(


INFO:tensorflow:Assets written to: /tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best/assets


INFO:tensorflow:Assets written to: /tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best)... Done. 0.1s


Epoch 2/12

  saving_api.save_model(


INFO:tensorflow:Assets written to: /tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best/assets


INFO:tensorflow:Assets written to: /tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best)... Done. 0.1s


Epoch 3/12



Epoch 4/12

  saving_api.save_model(


INFO:tensorflow:Assets written to: /tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best/assets


INFO:tensorflow:Assets written to: /tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best/assets
[34m[1mwandb[0m: Adding directory to artifact (/tmp/wandb/run-20230814_145510-5zbrwrxb/files/model-best)... Done. 0.1s


Epoch 5/12
 26/284 [=>............................] - ETA: 6:01 - loss: 2.3035 - accuracy: 0.0846