# Initial Setup

In [6]:
!pip install "tf-models-official"

[0m

In [1]:
import os
import io
import pprint
import tempfile
import matplotlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from PIL import Image
from six import BytesIO
from IPython import display
from urllib.request import urlopen

2024-05-11 08:42:12.342268: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 08:42:12.343381: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 08:42:12.393974: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 08:42:12.601837: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import orbit
import tensorflow_models as tfm

from official.core import exp_factory
from official.core import config_definitions as cfg
from official.vision.serving import export_saved_model_lib
from official.vision.ops.preprocess_ops import normalize_image
from official.vision.ops.preprocess_ops import resize_and_crop_image
from official.vision.utils.object_detection import visualization_utils
from official.vision.dataloaders.tf_example_decoder import TfExampleDecoder

pp = pprint.PrettyPrinter(indent=4) # Set Pretty Print Indentation
print(tf.__version__) # Check the version of tensorflow used

%matplotlib inline

2.16.1


# LabelMaps

## Creacion de Labelmaps

In [58]:
!python ./csv_to_labelmap.py

2024-05-11 08:10:02.406431: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 08:10:02.407835: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 08:10:02.410717: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 08:10:02.434814: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Labelmaps generated!


# TFRecords

## Train Model

In [None]:
!python ./generate_tfRecord.py "Playing Cards.v4-yolov8n.tensorflow/train/_annotations.csv" label_map.pbtxt "Playing Cards.v4-yolov8n.tensorflow/train" train.record

## Test Model

In [None]:
!python ./generate_tfRecord.py "Playing Cards.v4-yolov8n.tensorflow/test/_annotations.csv" label_map.pbtxt "Playing Cards.v4-yolov8n.tensorflow/test" test.record

## Valid Model

In [None]:
!python ./generate_tfRecord.py "Playing Cards.v4-yolov8n.tensorflow/valid/_annotations.csv" label_map.pbtxt "Playing Cards.v4-yolov8n.tensorflow/valid" valid.record

# MLModel

## Libraries

In [3]:
import wget
import tarfile
import os
import re

## Enviroment Settings

In [3]:
#Can be configured
DATASET_DIRECTORY = "/tf/dataset/workspaceCards/"
OBJECT_DETECTION_DIRECTORY = "/tf/tensorflow/models/research/object_detection/"

MODEL_NAME = "ssd_resnet152_v1_fpn_1024x1024_coco17_tpu-8"
MODEL_LINK = "http://download.tensorflow.org/models/object_detection/tf2/20200711/ssd_resnet152_v1_fpn_1024x1024_coco17_tpu-8.tar.gz"
MODEL_CONFIG = "ssd_resnet101_v1_fpn_1024x1024_coco17_tpu-8.config"
MODELS_FOLDER = OBJECT_DETECTION_DIRECTORY+"configs/tf2/"
MODEL_FILEPATH = os.path.join(MODELS_FOLDER, MODEL_CONFIG)

MODEL_TRAIN = DATASET_DIRECTORY + "trainModel/"
MODEL_EXPORT = DATASET_DIRECTORY + "exportModel/"

In [4]:
FINE_CHECKPOINT = DATASET_DIRECTORY + "models/" + MODEL_NAME + "/checkpoint/cpkt-0"
TRAIN_TFRECORD = DATASET_DIRECTORY + "train.record"
TEST_TFRECORD = DATASET_DIRECTORY + "test.record"
VALID_TFRECORD = DATASET_DIRECTORY + "valid.record"
LABEL_MAP = DATASET_DIRECTORY + "label_map.pbtxt"

In [5]:
BATCH_SIZE = 12
NUM_STEPS = 10000
NUM_CLASSES = 52
NUM_EVAL_STEPS = 50

In [6]:
exp_config = exp_factory.get_exp_config('retinanet_resnetfpn_coco')

HEIGHT, WIDTH = 640, 640
IMG_SIZE = [HEIGHT, WIDTH, 3]

# Backbone config.
exp_config.task.freeze_backbone = False
exp_config.task.annotation_file = ''

# Model config.
exp_config.task.model.input_size = IMG_SIZE
exp_config.task.model.num_classes = NUM_CLASSES
exp_config.task.model.detection_generator.tflite_post_processing.max_classes_per_detection = exp_config.task.model.num_classes

# Training data config.
exp_config.task.train_data.input_path = TRAIN_TFRECORD
exp_config.task.train_data.dtype = 'float32'
exp_config.task.train_data.global_batch_size = BATCH_SIZE
exp_config.task.train_data.parser.aug_scale_max = 1.0
exp_config.task.train_data.parser.aug_scale_min = 1.0

# Validation data config.
exp_config.task.validation_data.input_path = VALID_TFRECORD
exp_config.task.validation_data.dtype = 'float32'
exp_config.task.validation_data.global_batch_size = BATCH_SIZE


In [7]:
logical_device_names = [logical_device.name for logical_device in tf.config.list_logical_devices()]

if 'GPU' in ''.join(logical_device_names):
  print('This may be broken in Colab.')
  device = 'GPU'
elif 'TPU' in ''.join(logical_device_names):
  print('This may be broken in Colab.')
  device = 'TPU'
else:
  print('Running on CPU is slow, so only train for a few steps.')
  device = 'CPU'


train_steps = 1000
exp_config.trainer.steps_per_loop = 100 # steps_per_loop = num_of_training_examples // train_batch_size

exp_config.trainer.summary_interval = 100
exp_config.trainer.checkpoint_interval = 100
exp_config.trainer.validation_interval = 100
exp_config.trainer.validation_steps =  100 # validation_steps = num_of_validation_examples // eval_batch_size
exp_config.trainer.train_steps = train_steps
exp_config.trainer.optimizer_config.warmup.linear.warmup_steps = 100
exp_config.trainer.optimizer_config.learning_rate.type = 'cosine'
exp_config.trainer.optimizer_config.learning_rate.cosine.decay_steps = train_steps
exp_config.trainer.optimizer_config.learning_rate.cosine.initial_learning_rate = 0.1
exp_config.trainer.optimizer_config.warmup.linear.warmup_learning_rate = 0.05


Running on CPU is slow, so only train for a few steps.


In [8]:
if exp_config.runtime.mixed_precision_dtype == tf.float16:
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

if 'GPU' in ''.join(logical_device_names):
  distribution_strategy = tf.distribute.MirroredStrategy()
elif 'TPU' in ''.join(logical_device_names):
  tf.tpu.experimental.initialize_tpu_system()
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='/device:TPU_SYSTEM:0')
  distribution_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
  print('Warning: this will be really slow.')
  distribution_strategy = tf.distribute.OneDeviceStrategy(logical_device_names[0])

print('Done')

Done


In [9]:
with distribution_strategy.scope():
  task = tfm.core.task_factory.get_task(exp_config.task, logging_dir=MODEL_TRAIN)


In [10]:
for images, labels in task.build_inputs(exp_config.task.train_data).take(1):
  print()
  print(f'images.shape: {str(images.shape):16}  images.dtype: {images.dtype!r}')
  print(f'labels.keys: {labels.keys()}')


images.shape: (12, 640, 640, 3)  images.dtype: tf.float32
labels.keys: dict_keys(['cls_targets', 'box_targets', 'anchor_boxes', 'cls_weights', 'box_weights', 'image_info'])


2024-05-11 08:42:32.305818: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
buffer_size = 20
num_of_examples = 3

raw_records = tf.data.TFRecordDataset(
    exp_config.task.train_data.input_path).shuffle(
        buffer_size=buffer_size).take(num_of_examples)
show_batch(raw_records, num_of_examples)

NameError: name 'show_batch' is not defined

In [12]:
model, eval_logs = tfm.core.train_lib.run_experiment(
    distribution_strategy=distribution_strategy,
    task=task,
    mode='train_and_eval',
    params=exp_config,
    model_dir=MODEL_TRAIN,
    run_post_eval=True)

restoring or initializing model...


2024-05-11 08:42:59.277389: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


INFO:tensorflow:Customized initialization is done through the passed `init_fn`.


INFO:tensorflow:Customized initialization is done through the passed `init_fn`.


train | step:      0 | training until step 100...


2024-05-11 08:43:22.888628: W tensorflow/core/framework/dataset.cc:959] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.
2024-05-11 08:45:04.720406: W external/local_tsl/tsl/framework/bfc_allocator.cc:487] Allocator (mklcpu) ran out of memory trying to allocate 182.63MiB (rounded to 191505664)requested by op while/body/_1/gradient_tape/FocalLoss/focal_loss/logistic_loss/Select
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-05-11 08:45:04.723038: I external/local_tsl/tsl/framework/bfc_allocator.cc:1044] BFCAllocator dump for mklcpu
2024-05-11 08:45:04.723056: I external/local_tsl/tsl/framework/bfc_allocator.cc:1051] Bin (256): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-r

ResourceExhaustedError: Graph execution error:

Detected at node gradient_tape/FocalLoss/focal_loss/logistic_loss/Select defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 604, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_25/24471904.py", line 1, in <module>

  File "/usr/local/lib/python3.11/dist-packages/official/core/train_lib.py", line 372, in run_experiment

  File "/usr/local/lib/python3.11/dist-packages/official/core/train_lib.py", line 271, in run

  File "/usr/local/lib/python3.11/dist-packages/orbit/controller.py", line 393, in train_and_evaluate

  File "/usr/local/lib/python3.11/dist-packages/orbit/controller.py", line 282, in train

  File "/usr/local/lib/python3.11/dist-packages/orbit/controller.py", line 517, in _train_n_steps

  File "/usr/local/lib/python3.11/dist-packages/orbit/standard_runner.py", line 146, in train

  File "/usr/local/lib/python3.11/dist-packages/orbit/utils/loop_fns.py", line 116, in loop_fn

  File "/usr/local/lib/python3.11/dist-packages/orbit/utils/loop_fns.py", line 120, in loop_fn

  File "/usr/local/lib/python3.11/dist-packages/official/core/base_trainer.py", line 400, in train_step

  File "/usr/local/lib/python3.11/dist-packages/official/core/base_trainer.py", line 391, in step_fn

  File "/usr/local/lib/python3.11/dist-packages/official/vision/tasks/retinanet.py", line 342, in train_step

OOM when allocating tensor with shape[12,76725,52] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node gradient_tape/FocalLoss/focal_loss/logistic_loss/Select}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_loop_fn_27548]

In [15]:
export_saved_model_lib.export_inference_graph(
    input_type='image_tensor',
    batch_size=1,
    input_image_size=[HEIGHT, WIDTH],
    params=exp_config,
    checkpoint_path=tf.train.latest_checkpoint(MODEL_TRAIN),
    export_dir=MODEL_EXPORT)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f7b9f022110>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 