# Training

In [1]:
# Install the packages this notebook imports into the kernel's environment.
# NOTE(review): no versions are pinned here — consider pinning them so the
# notebook can be re-run against the same library versions.
%pip -q install wheel
%pip -q install pillow

%pip -q install ipympl 
%pip -q install matplotlib 

%pip -q install wandb 
%pip -q install numpy
%pip -q install keras
%pip -q install tensorflow 


/bin/bash: /home/vulgrim/miniconda3/envs/tensorflow/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.
/bin/bash: /home/vulgrim/miniconda3/envs/tensorflow/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.
/bin/bash: /home/vulgrim/miniconda3/envs/tensorflow/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.
/bin/bash: /home/vulgrim/miniconda3/envs/tensorflow/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.
/bin/bash: /home/vulgrim/miniconda3/envs/tensorflow/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.
/bin/bash: /home/vulgrim/minic

In [2]:
# Select the ipympl backend for matplotlib figures in this notebook.
%matplotlib ipympl

# Load the TensorBoard notebook extension and open it on /tmp/tensorboard,
# the same directory the TensorBoard callback in this notebook writes to.
%load_ext tensorboard
%tensorboard --logdir /tmp/tensorboard

## Packages

In [3]:
from os.path                \
    import                  \
    isdir

from random                 \
    import                  \
    SystemRandom

In [4]:
import wandb

from wandb.keras            \
    import                  \
    WandbMetricsLogger,     \
    WandbCallback



In [5]:
from keras.utils                \
    import                      \
    image_dataset_from_directory

from keras.backend              \
    import                      \
    clear_session

from keras.models               \
    import                      \
    Sequential

from keras.layers               \
    import                      \
    RandomFlip,                 \
    RandomZoom,                 \
    RandomContrast,             \
    RandomBrightness,           \
    RandomRotation

from keras.callbacks            \
    import TensorBoard

In [6]:
import tensorflow

from tensorflow.python.ops  \
  import summary_ops_v2


In [7]:
from O2RMModel.RecognitionModel.model   \
    import Model                        \
    as RecognitionModel

In [8]:
def zero() -> int:
    """Return the integer 0 (the notebook uses it to index the first device)."""
    first_index: int = 0
    return first_index

In [9]:
# Flag handed to tensorflow.config.experimental.set_memory_growth for the
# selected device; it is also recorded in the W&B run configuration.
gpu_memory_growth: bool = False

In [10]:
# Ask TensorFlow which GPUs are visible to this process.
physical_devices = tensorflow.config.list_physical_devices(
    'GPU'
)

if physical_devices:
    # Train on the first visible GPU and apply the configured
    # memory-growth policy to it.
    selected_physical_device = physical_devices[
        zero()
    ]

    tensorflow.config.experimental.set_memory_growth(
        selected_physical_device,
        gpu_memory_growth
    )
else:
    # Without a visible GPU, indexing the empty list would raise a bare
    # IndexError. Fall back to the first CPU device so the notebook still
    # runs; memory growth is a GPU setting and is skipped on this path.
    print(
        'No GPU is visible to TensorFlow; falling back to the CPU.'
    )

    selected_physical_device = tensorflow.config.list_physical_devices(
        'CPU'
    )[
        zero()
    ]


In [11]:
def generate_seed():
    """Draw a random integer seed between 1 and 32767 (both inclusive).

    A fresh ``SystemRandom`` instance is used for every call, so the value
    differs between runs.
    """
    entropy_source = SystemRandom()
    lowest, highest = 1, 32767
    return entropy_source.randint(lowest, highest)


In [12]:
# Build the string handed to tensorflow.device(): drop the first
# len('/physical_device:') characters of the selected device's name and put
# a single '/' in front of the remainder (a name of the form
# '/physical_device:GPU:0' would therefore become '/GPU:0').
device_name = f"/{selected_physical_device.name[len('/physical_device:'):]}"

In [13]:
# Seed passed to image_dataset_from_directory; a new value is drawn every
# time this cell runs, and it is recorded in the W&B run configuration.
dataset_seed: int = generate_seed()

# Directory the images are read from, and the path the model weights are
# loaded from / the model is saved to.
location_of_dataset: str = '/opt/dataset/numbers'
location_of_model: str = '/opt/models/O2RM'

# Image dimensions: passed to the model constructor; height and width are
# also used as the image_size of the loaded dataset.
width: int = 512
height: int = 512
channels: int = 3

# Passed to the model constructor as `categories`.
number_of_labels: int = 10

# `batches` is passed as batch_size to image_dataset_from_directory;
# `epochs` is passed to model.fit.
batches: int = 32
epochs: int = 6

# Passed as validation_split to image_dataset_from_directory.
validation_split: float = 0.5

# Forwarded to model.fit as use_multiprocessing / workers.
use_multiprocessing: bool = True
process_workers: int = 6
# Enables the TensorBoard callback and is passed to wandb.init(tensorboard=...).
is_using_tensorboard: bool = True

In [14]:
# Entity and project passed to wandb.init.
wandb_entity: str = 'designermadsen'
wandb_project: str = 'O2RM'

# Passed to wandb.init as job_type.
wandb_jobtype: str = 'Training'

# Tags passed to wandb.init and recorded in the run configuration.
wandb_tags: list = [
    'Nvidia',
    'Linux',
    'Ubuntu',
    'Development',
    'Random',
    'test-driven',
    'Bare-Metal',
    'TensorFlow'   
]

# Switches read by callbacks(): whether to add the WandbCallback and the
# WandbMetricsLogger; wandb_compute_flops is forwarded to WandbCallback.
wandb_use_callback: bool = False
wandb_measure_metrics: bool = True
wandb_compute_flops: bool = True

In [15]:
# Run configuration handed to wandb.init(config=...). It mirrors the
# notebook-level settings so a run can be understood from its logged config.
configuration: dict = {
    # Input image geometry.
    'vision':
    {
        'width': width,
        'height': height,
        'channels': channels
    },

    # Dataset loading and training-loop settings.
    'dataset':
    {
        'number of labels': number_of_labels,
        'batches': batches,
        'epochs': epochs,
        'seed': dataset_seed,
        # The split fraction changes which files end up in training and
        # validation, so it is logged next to the seed.
        'validation split': validation_split,
        'using multiprocessing': use_multiprocessing,
        'using tensorboard': is_using_tensorboard,
        'workers': process_workers,
    },

    # Weights & Biases switches used by this notebook.
    'wandb':
    {
        'job type': wandb_jobtype,
        'tags': wandb_tags,
        'measure metrics': wandb_measure_metrics,
        'use callbacks': wandb_use_callback,
        'compute flops': wandb_compute_flops
    },

    # GPU settings.
    'gpu':
    {
        'memory growth': gpu_memory_growth
    }
}

In [16]:
# Module-level dataset handles. Both start as None; the training cell
# rebinds them when it loads the image dataset.
training_dataset = validation_dataset = None


def get_training_dataset():
    """Return the current value of the module-level ``training_dataset``."""
    return training_dataset


def get_validation_dataset():
    """Return the current value of the module-level ``validation_dataset``."""
    return validation_dataset

In [17]:
def setup_wandb_callback():
    """Build the ``WandbCallback`` used when ``wandb_use_callback`` is enabled.

    The callback monitors ``val_loss`` and is given the current training and
    validation datasets, the training dataset's ``class_names`` as labels and
    the ``wandb_compute_flops`` switch.

    The training dataset must already be loaded: reading ``class_names``
    from the initial ``None`` value raises ``AttributeError``.
    """
    training = get_training_dataset()

    options = dict(
        monitor='val_loss',
        verbose=0,

        # Weight / gradient / graph logging.
        save_weights_only=True,
        log_weights=True,
        log_gradients=True,
        save_graph=True,

        save_model=True,

        # Data handed to the callback.
        training_data=training,
        validation_data=get_validation_dataset(),
        log_evaluation=True,

        compute_flops=wandb_compute_flops,

        input_type='image',
        output_type='label',

        labels=training.class_names,
        predictions=15,
    )

    return WandbCallback(**options)
    

In [18]:
def setup_wandb_metrics():
    """Create a ``WandbMetricsLogger`` callback with its default arguments."""
    metrics_logger = WandbMetricsLogger()
    return metrics_logger

In [19]:
def setup_tensorboard(
    log_dir: str = '/tmp/tensorboard'
):
    """Create the Keras ``TensorBoard`` callback used during training.

    Args:
        log_dir: Directory the callback writes its event files to. The
            default matches the directory opened by the
            ``%tensorboard --logdir`` magic at the top of the notebook.

    Returns:
        A configured ``TensorBoard`` callback instance.
    """
    return TensorBoard(
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=True,
        write_images=True,
        write_steps_per_second=True,
        # Previously `True`, which only worked because a bool behaves like
        # the integer 1; 'batch' is the documented spelling of "every batch".
        update_freq='batch',
        # Previously `False`; 0 is the documented value that disables
        # profiling.
        profile_batch=0,
        embeddings_freq=0
    )

In [20]:
def callbacks() -> list:
    """Assemble the Keras callbacks enabled by the notebook's switches.

    Returns:
        A new list holding, in this order, the TensorBoard callback (when
        ``is_using_tensorboard``), the W&B metrics logger (when
        ``wandb_measure_metrics``) and the W&B callback (when
        ``wandb_use_callback``). Disabled entries are left out.
    """
    # (switch, factory) pairs; a factory is only called when its switch is on.
    candidates = (
        (is_using_tensorboard, setup_tensorboard),
        (wandb_measure_metrics, setup_wandb_metrics),
        (wandb_use_callback, setup_wandb_callback),
    )

    return [
        factory()
        for enabled, factory in candidates
        if enabled
    ]

In [21]:
# Collects the ``history`` attribute of every training session recorded in
# this kernel, oldest first.
training_history: list = []


def append_training_session(
    history
) -> None:
    """Append ``history.history`` to the module-level ``training_history``.

    Args:
        history: Object exposing a ``history`` attribute (here, the value
            returned by ``model.fit``).
    """
    session_metrics = history.history
    training_history.append(session_metrics)

## Launch training session

In [22]:
# Training cell: starts a W&B run, builds (and optionally restores) the
# model, loads the image dataset, trains, saves the model and uploads it as
# a W&B artifact. The run is not finished here; wandb.finish() is called in
# a later cell.

# Buffer size used for both dataset prefetch calls below.
autotune = tensorflow.data.AUTOTUNE

# Start the W&B run with the notebook's configuration, tags and job type.
wandb.init(
    entity=wandb_entity,
    project=wandb_project,
    config=configuration,
    tags=wandb_tags,
    job_type=wandb_jobtype,
    reinit=True,
    tensorboard=is_using_tensorboard,
    save_code=False,
    notes=""
)

# Everything below runs inside the device scope of the selected device.
with tensorflow.device(
    device_name
):
    # Build the recognition model for the configured image geometry and
    # number of labels.
    model = RecognitionModel(
        width=width,
        height=height,
        channels=channels,
        categories=number_of_labels
    )

    # Resume from a previous run when the model directory already exists.
    # NOTE(review): the recorded output of this cell shows a
    # tensor_slice_reader warning ("Is a directory") for this path —
    # confirm that load_weights restores what is intended.
    if isdir(
        location_of_model
    ):
        model.load_weights(
            location_of_model
        )

    # Load both subsets in one call; this rebinds the module-level dataset
    # names that get_training_dataset() / get_validation_dataset() return.
    training_dataset, validation_dataset = image_dataset_from_directory(
        location_of_dataset,
        validation_split=validation_split,
        subset='both',
        seed=dataset_seed,
        image_size=(
            height,
            width
        ),
        batch_size=batches
    )

    # Record the class names of both subsets in the W&B run.
    wandb.log(
        {
            'training': get_training_dataset().class_names,
            'validation': get_validation_dataset().class_names
        }
    )

    # Train on prefetched datasets with the callbacks enabled in the
    # configuration cells.
    history = model.fit(
        training_dataset.prefetch(
            buffer_size=autotune
        ),

        validation_data=validation_dataset.prefetch(
            buffer_size=autotune
        ),

        epochs=epochs,

        callbacks=callbacks(),

        use_multiprocessing=use_multiprocessing,
        workers=process_workers
    )

    # Keep this session's metrics in training_history.
    append_training_session(
        history
    )

    # Save the model to the same path it was restored from, overwriting any
    # previous save.
    model.save(
        location_of_model,
        save_format='tf',
        overwrite=True
    )

    # Upload the saved model directory to W&B as a "model" artifact.
    saved_model = wandb.Artifact(
        "o2rm_model",
        type="model"
    )

    saved_model.add_dir(
        location_of_model
    )

    wandb.log_artifact(
        saved_model
    )

# Reset the Keras backend state once the device scope has been left.
clear_session()

[34m[1mwandb[0m: Currently logged in as: [33mdesignermadsen[0m. Use [1m`wandb login --relogin`[0m to force relogin


2023-08-13 17:39:03.732411: W tensorflow/core/util/tensor_slice_reader.cc:97] Could not open /opt/models/O2RM: FAILED_PRECONDITION: /opt/models/O2RM; Is a directory: perhaps your file is in a different file format and you need to use a different restore operator?


Found 20000 files belonging to 10 classes.




Using 10000 files for training.
Using 10000 files for validation.
Epoch 1/6


2023-08-13 17:39:13.702068: W tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc:543] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  /usr/local/cuda-11.8
  /usr/local/cuda
  .
You can choose the search directory by setting xla_gpu_cuda_data_dir in HloModule's DebugOptions.  For most apps, setting the environment variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.






Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
INFO:tensorflow:Assets written to: /opt/models/O2RM/assets


INFO:tensorflow:Assets written to: /opt/models/O2RM/assets
[34m[1mwandb[0m: Adding directory to artifact (/opt/models/O2RM)... Done. 0.2s


In [1]:
# Close the W&B run opened by the training cell, then flush TensorFlow's
# summary writer. The public tensorflow.summary.flush() is used instead of
# the private tensorflow.python.ops.summary_ops_v2 module.
# NOTE: this cell relies on the import cells having run in the same kernel;
# its recorded output is a NameError from running it on a fresh kernel.
wandb.finish()
tensorflow.summary.flush()

NameError: name 'wandb' is not defined