In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import tensorflow.keras as keras
import segmentation_models as sm
import datetime
import time

from Core import create_backbone_efficient, create_backbone_mobile, create_ds, FCN, Test

# Fix TensorFlow's global random seed for repeatable runs.
# NOTE(review): only the TensorFlow seed is set here; NumPy / Python `random`
# are not seeded in this cell -- confirm whether `Core` relies on them.
tf.random.set_seed(1024)

Segmentation Models: using `keras` framework.


In [3]:
# Run configuration: number of epochs, number of output classes, batch size.
n_epoch, n_classes, batch_size = 20, 2, 4

# Build the train/test datasets for the chosen batch size, then the network.
trainds, testds = create_ds(batch_size)
model = FCN(n_classes)

# Freeze the backbone so the optimizer does not update its weights.
model.fpn.backbone.trainable = False

19373 17243
IMG - LBL NUM: 10687, Intersection: 8686


2022-02-11 18:06:01.457817: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-11 18:06:01.465419: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-11 18:06:01.465770: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-11 18:06:01.466333: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the app

14476 14443
IMG - LBL NUM: 38, Intersection: 14438


In [4]:
# Adam with the library's default hyper-parameters.
optimizer = keras.optimizers.Adam()

# Two loss objects that the training loop adds together.
# The Dice class weights put almost all weight on class index 1.
dice_loss = sm.losses.DiceLoss(
    class_weights=[0.001, 0.999],
)
focal_loss = sm.losses.CategoricalFocalLoss()

In [5]:
# Checkpoint both the model and the optimizer, keeping at most five
# checkpoints in the output directory.
ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
ckptmg = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory="trained_model/seagull_mobile_fpn",
    max_to_keep=5,
)
# Resume from the latest checkpoint in that directory when one is present.
ckptmg.restore_or_initialize()

In [6]:
# One timestamped TensorBoard run directory with separate train/test writers.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = f"logs/mobile/{current_time}/train"
test_log_dir = f"logs/mobile/{current_time}/test"

train_summary_writer = tf.summary.create_file_writer(train_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

In [7]:
# Real training
#
# For each epoch: iterate over `trainds`, optimise the model on the sum of
# the Dice loss and ALPHA * focal loss, log the per-step loss and IoU to the
# train summary writer, then save a checkpoint once the epoch is finished.
train_iteration = 0  # global step used for the TensorBoard scalars
# Accumulators used only by the (currently disabled) evaluation pass below.
iteration = 0
sum_iou = 0
sum_loss = 0
ALPHA = 1.0  # weight of the focal term relative to the Dice term

for epoch in range(n_epoch):
    initial_time = time.time()
    for bs_images, bs_labels in trainds:
        bs_images = keras.applications.mobilenet_v2.preprocess_input(bs_images)

        # Only the forward pass and loss need to be recorded on the tape.
        with tf.GradientTape() as t:
            output = model(bs_images, training=True)
            c_loss = dice_loss(bs_labels, output)
            c_loss += ALPHA * focal_loss(bs_labels, output)

        grad = t.gradient(c_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grad, model.trainable_variables))
        train_iteration += 1

        with train_summary_writer.as_default():
            tf.summary.scalar("loss", c_loss, step=train_iteration)
            tf.summary.scalar(
                "iou", sm.metrics.iou_score(bs_labels, output), step=train_iteration
            )
    # time.time() is already in seconds, so the difference is reported as-is.
    print(f"Time Taken for 1 Epoch: {time.time() - initial_time}s")

    # Disabled evaluation pass over `testds`.
    # NOTE(review): unlike the training loop, this feeds `bs_images` to the
    # model without `mobilenet_v2.preprocess_input` -- confirm and add the
    # same preprocessing before re-enabling.
    # for bs_images, bs_labels in testds:
    #     output = model(bs_images, training=False)
    #     sum_loss += (
    #         dice_loss(bs_labels, output) + ALPHA * focal_loss(bs_labels, output)
    #     ) * batch_size
    #     sum_iou += sm.metrics.iou_score(bs_labels, output) * batch_size
    #     iteration += batch_size
    
    # with test_summary_writer.as_default():
    #     tf.summary.scalar("loss", sum_loss / iteration, step=train_iteration)
    #     tf.summary.scalar("iou", sum_iou / iteration, step=train_iteration)

    # Reset the evaluation accumulators for the next epoch and checkpoint.
    iteration = 0
    sum_iou = 0
    sum_loss = 0
    ckptmg.save()

2022-02-11 18:06:04.100193: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
2022-02-11 18:06:05.997909: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.11GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-02-11 18:06:06.635005: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.11GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


KeyboardInterrupt: 

# start testing