In [1]:
import numpy as np
import os
import warnings
from datetime import datetime
import mlflow
from dotenv import load_dotenv
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from tensorflow import keras 

warnings.filterwarnings('ignore')

load_dotenv('../.env')

RSEED = 123
MODELS_DIR=os.path.join('..','models')
MODEL_CHECKPOINTS_DIR=os.path.join('..','model_checkpoints')

start_time = datetime.now().strftime('-%Y-%m-%d-%T')

2022-12-01 14:58:27.403283: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-01 14:58:27.583482: I tensorflow/core/tpu/tpu_initializer_helper.cc:262] Libtpu path is: libtpu.so
I1201 14:58:27.677741380 3012016 ev_epoll1_linux.cc:121]     grpc epoll fd: 71
D1201 14:58:27.677764310 3012016 ev_posix.cc:141]            Using polling engine: epoll1
D1201 14:58:27.677815385 3012016 lb_policy_registry.cc:43]   registering LB policy factory for "grpclb"
D1201 14:58:27.677825793 3012016 lb_policy_registry.cc:43]   registering LB policy factory for "rls_experimental"
D1201 14:58:27.677835420 3012016 lb_policy_registry.cc:43]   registering LB policy factory for "priority_experimental"
D1201 14:58:27

In [2]:
for dir in MODELS_DIR, MODEL_CHECKPOINTS_DIR:
    if not os.path.exists(dir):
        os.mkdir(dir)

In [3]:
# Set parameters
batch_size=64
patience=10 
min_delta=0.001
dropout_rate=0.25
initial_learning_rate=0.0005


run_name_params = (
    f'bs{batch_size}'
    f'_pat{patience}'
    f'_del{min_delta}'
    f'_dr{dropout_rate}'
    f'_lr{initial_learning_rate}'
)

parent_run_name = f'mobilenetv2_{run_name_params}_save'

In [4]:
# Set up gcloud TPUs
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
strategy = tf.distribute.TPUStrategy(cluster_resolver)


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local


2022-12-01 14:58:29.957218: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-01 14:58:34.050184: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7c5d940 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
2022-12-01 14:58:34.050224: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): TPU, 2a886c8
2022-12-01 14:58:34.050232: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (1): TPU, 2a886c8
2022-12-01 14:58:34.050238: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (2): TPU, 2a886c8
2022-12-01 14:58:34.050244: I tensorflow/compiler/xla/service/service.cc:181]   

INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)
I

In [5]:
# Set information for mlflow
run_description = """
Fully trained models 
    - classify each slice by tumour/tissue regions in the segmentation
    - Uses MobileNetV2
    - Saves model at end of run
"""
dataset = 'full_data_stratified'
mlflow_tracking_uri = os.getenv('MLFLOW_URI')
if mlflow_tracking_uri:
    mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow_expt = os.getenv('CLASSIFICATION_EXPT')
if mlflow_expt:
    mlflow.set_experiment(mlflow_expt)    


print(f'Logging to \n URI:{mlflow_tracking_uri}\n Expt:{mlflow_expt}')


Logging to 
 URI:https://hudsju377cddpoevnjdkfnvpwovniewnipcdsnkvn.mlflow.neuefische.de
 Expt:braintumour_mri_slice_classification


In [6]:


with mlflow.start_run(
    run_name=parent_run_name,
    tags={
        'dataset': dataset,
    },
    description=run_description,
):

    img_height = 240
    img_width = 240
    data_dir = os.path.join('..','data','UPENN-GBM','slice_classification_common_stratify','train')

    train_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir,
        validation_split=0.2,
        subset="training",
        color_mode="rgba",
        seed=RSEED,
        image_size=(img_height, img_width),
        batch_size=batch_size)

    val_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir,
        validation_split=0.2,
        subset="validation",
        color_mode="rgba",
        seed=RSEED,
        image_size=(img_height, img_width),
        batch_size=batch_size)

    
    class_names = train_ds.class_names

    # Calculate class weights for weighting accuracy
    ds_classes = []
    for _, batch_classes in train_ds:
        ds_classes.append(batch_classes.numpy())

    ds_classes = np.concatenate(ds_classes)

    class_weight = compute_class_weight(
        class_weight = 'balanced',
        classes = np.unique(ds_classes),
        y=ds_classes
    )

    class_weight = dict(zip(np.unique(ds_classes), class_weight))

    AUTOTUNE = tf.data.AUTOTUNE

    train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

    num_classes = len(class_names)
    
    margin = 8
    scaled_height = img_height - 2*margin
    scaled_width = img_width - 2*margin

    # Build layers for model with fixed base
    with strategy.scope():
        crop_layer = tf.keras.layers.Cropping2D(margin)
        rescale_initial = tf.keras.layers.Rescaling(1./127.5, offset=-1)
        conv_4to3_channel = tf.keras.layers.Conv2D(3,1,padding='same',activation='tanh')
        base_model = tf.keras.applications.MobileNetV2(
            input_shape=(scaled_width,scaled_height,3),
            include_top=False,
            weights='imagenet'
        )
        global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
        prediction_layer = tf.keras.layers.Dense(num_classes)

        base_model.trainable = False
        
        inputs = tf.keras.Input(shape=(img_width, img_height, 4))
        x = crop_layer(inputs)
        x = rescale_initial(x)
        x = conv_4to3_channel(x)
        x = base_model(x, training=False)
        x = global_average_layer(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        outputs = prediction_layer(x)
    
        earlystopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=patience,
            min_delta=min_delta,
            )
        
        model = tf.keras.Model(inputs, outputs)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=initial_learning_rate,),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'],
        )

        
    # Initial fit of classification and 4 to 3 channel layers
    with mlflow.start_run(
        run_name=f'fixed_{run_name_params}',
        tags={'dataset': dataset},
        nested=True
    ):
        mlflow.tensorflow.autolog()
        mlflow.log_param('ds_batch_size', batch_size)
        mlflow.log_param('ds_validation_batch_size', batch_size)


        fixed_base_epochs=80
        history_fixed_base = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=fixed_base_epochs,
            class_weight=class_weight,
            callbacks=[earlystopping],
        )

    # Relax top layers of base model
    base_model.trainable = True
    fix_below_layer = 100
    for layer in base_model.layers[:fix_below_layer]:
        layer.trainable = False
    with strategy.scope():
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=initial_learning_rate/10.0),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'],
        )

    with mlflow.start_run(
        run_name=f'partial_{run_name_params}',
        tags={'dataset': dataset},
        nested=True
    ):
        mlflow.tensorflow.autolog()
        mlflow.log_param('ds_batch_size', batch_size)
        mlflow.log_param('ds_validation_batch_size', batch_size)

        partial_relax_epochs=history_fixed_base.epoch[-1] + 100 
        history_partial_relax = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=partial_relax_epochs,
            initial_epoch=history_fixed_base.epoch[-1]+1,
            class_weight=class_weight,
            callbacks=[earlystopping],
        )

    # Fully relax model
    model.trainable = True

    with strategy.scope():
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=initial_learning_rate/10.0),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'],
        )

    with mlflow.start_run(
        run_name=f'relax_{run_name_params}',
        tags={'dataset': dataset},
        nested=True
    ):
        mlflow.tensorflow.autolog()
        mlflow.log_param('ds_batch_size', batch_size)
        mlflow.log_param('ds_validation_batch_size', batch_size)

        # create checkpoint
        checkpoint_path = os.path.join(
            MODEL_CHECKPOINTS_DIR,
            parent_run_name + start_time + "-{epoch:03d}-{val_loss:.4f}.ckpt"
        )
        ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path, 
            verbose=1, 
            save_weights_only=False,
            save_freq='epoch',
            monitor='val_loss',
            mode='min',
            save_best_only=True,
        ) 

        full_relax_epochs=history_partial_relax.epoch[-1] + 100
        history_full_relax = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=full_relax_epochs,
            initial_epoch=history_partial_relax.epoch[-1]+1,
            class_weight=class_weight,
            callbacks=[earlystopping, ckpt_callback],
        )


Found 49725 files belonging to 5 classes.
Using 39780 files for training.
Found 49725 files belonging to 5 classes.
Using 9945 files for validation.


2022-12-01 14:58:56.497091: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-12-01 14:58:56.560125: I tensorflow/compiler/jit/xla_compilation_cache.cc:476] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2022-12-01 14:59:14.691897: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 39780
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:0"
  }
}
attr {
  key: "output_shape

Epoch 1/80


2022-12-01 14:59:21.273645: I tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.cc:237] Subgraph fingerprint:10405401112127002185
2022-12-01 14:59:21.565325: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/Adam/AssignAddVariableOp.
2022-12-01 14:59:21.866002: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/Adam/AssignAddVariableOp.
2022-12-01 14:59:25.896294: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(6276048110352915742), session_name()
2022-12-01 14:59:31.843313: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 6276048110352915742 with session name  took 5.946923584s and succeeded
2022-12-01 14:59:31.867308: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host c



2022-12-01 14:59:42.459849: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(14759355675721243362), session_name()




2022-12-01 14:59:48.625944: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 14759355675721243362 with session name  took 6.165977192s and succeeded
2022-12-01 14:59:48.648131: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(14759355675721243362), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_10405401112127002185", property.function_library_fingerprint = 5401249623246154427, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "5,240,240,4,;5,;5,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 14:59:48.648189: I tensorfl



2022-12-01 14:59:58.172999: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 9945
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:7"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}

2022-12-01 

Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80


2022-12-01 15:11:55.491450: I tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.cc:237] Subgraph fingerprint:1223823399348306813
2022-12-01 15:11:55.613899: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2022-12-01 15:11:55.799917: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2022-12-01 15:11:56.074955: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(13537918162263737921), session_name()




2022-12-01 15:11:59.278100: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 13537918162263737921 with session name  took 3.203038711s and succeeded
2022-12-01 15:11:59.290324: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(13537918162263737921), session_name(), subgraph_key(std::string(property.function_name) = "cluster_predict_function_1223823399348306813", property.function_library_fingerprint = 5452589718846498119, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "2,240,240,4,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 15:11:59.290384: I tensorflow/co

INFO:tensorflow:Assets written to: /tmp/tmptivl8ca7/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmptivl8ca7/model/data/model/assets
2022-12-01 15:12:36.153941: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 39780
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:0"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        ar

Epoch 32/130


2022-12-01 15:12:46.674300: I tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.cc:237] Subgraph fingerprint:8398881911490627436
2022-12-01 15:12:47.262515: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/Adam/AssignAddVariableOp.
2022-12-01 15:12:47.699151: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/Adam/AssignAddVariableOp.
2022-12-01 15:12:48.602267: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(18224724656348430986), session_name()
2022-12-01 15:12:58.498786: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 18224724656348430986 with session name  took 9.896431936s and succeeded
2022-12-01 15:12:58.525800: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host 

 49/622 [=>............................] - ETA: 28s - loss: 1.5108 - accuracy: 0.4490

2022-12-01 15:13:01.328072: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(13640172023995873860), session_name()


 53/622 [=>............................] - ETA: 2:00 - loss: 1.5345 - accuracy: 0.4444

2022-12-01 15:13:09.671628: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 13640172023995873860 with session name  took 8.343468443s and succeeded
2022-12-01 15:13:09.706493: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(13640172023995873860), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_8398881911490627436", property.function_library_fingerprint = 12272595353200119753, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "5,240,240,4,;5,;5,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 15:13:09.706556: I tensorfl



2022-12-01 15:13:29.518000: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 9945
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:7"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}

2022-12-01 

Epoch 33/130
Epoch 34/130
Epoch 35/130
Epoch 36/130
Epoch 37/130
Epoch 38/130
Epoch 39/130
Epoch 40/130
Epoch 41/130
Epoch 42/130
Epoch 43/130
Epoch 44/130
Epoch 45/130
Epoch 46/130
Epoch 47/130
Epoch 48/130
Epoch 49/130
Epoch 50/130
Epoch 51/130
Epoch 52/130
Epoch 53/130
Epoch 54/130
Epoch 55/130
Epoch 56/130
Epoch 57/130
Epoch 58/130
Epoch 59/130
Epoch 60/130
Epoch 61/130
Epoch 62/130
Epoch 63/130
Epoch 64/130
Epoch 65/130
Epoch 66/130


2022-12-01 15:27:48.151941: I tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.cc:237] Subgraph fingerprint:1223823399348306813
2022-12-01 15:27:48.274884: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2022-12-01 15:27:48.453579: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.






INFO:tensorflow:Assets written to: /tmp/tmp2wbvfc3a/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp2wbvfc3a/model/data/model/assets
2022-12-01 15:28:26.448452: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 39780
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:0"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        ar

Epoch 67/165


2022-12-01 15:28:43.065571: I tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.cc:237] Subgraph fingerprint:4437417191903188822
2022-12-01 15:28:43.957930: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/Adam/AssignAddVariableOp.
2022-12-01 15:28:44.535271: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/Adam/AssignAddVariableOp.
2022-12-01 15:28:45.782103: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(5571990506842326672), session_name()
2022-12-01 15:28:58.335426: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 5571990506842326672 with session name  took 12.553222374s and succeeded
2022-12-01 15:28:58.382328: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host c







2022-12-01 15:29:21.129754: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(16503135754554191190), session_name()




2022-12-01 15:29:34.851085: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 16503135754554191190 with session name  took 13.721218781s and succeeded
2022-12-01 15:29:34.893284: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(16503135754554191190), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_4437417191903188822", property.function_library_fingerprint = 2254077375692676269, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "5,240,240,4,;5,;5,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 15:29:34.893342: I tensorfl



2022-12-01 15:29:41.317379: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 9945
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:7"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}

2022-12-01 


Epoch 67: val_loss improved from inf to 0.38845, saving model to ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-067-0.3884.ckpt




INFO:tensorflow:Assets written to: ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-067-0.3884.ckpt/assets


INFO:tensorflow:Assets written to: ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-067-0.3884.ckpt/assets


Epoch 68/165
Epoch 68: val_loss did not improve from 0.38845
Epoch 69/165
Epoch 69: val_loss did not improve from 0.38845
Epoch 70/165
Epoch 70: val_loss did not improve from 0.38845
Epoch 71/165
Epoch 71: val_loss did not improve from 0.38845
Epoch 72/165
Epoch 72: val_loss improved from 0.38845 to 0.35885, saving model to ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-072-0.3589.ckpt




INFO:tensorflow:Assets written to: ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-072-0.3589.ckpt/assets


INFO:tensorflow:Assets written to: ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-072-0.3589.ckpt/assets


Epoch 73/165
Epoch 73: val_loss did not improve from 0.35885
Epoch 74/165
Epoch 74: val_loss did not improve from 0.35885
Epoch 75/165
Epoch 75: val_loss did not improve from 0.35885
Epoch 76/165
Epoch 76: val_loss did not improve from 0.35885
Epoch 77/165
Epoch 77: val_loss improved from 0.35885 to 0.33132, saving model to ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-077-0.3313.ckpt




INFO:tensorflow:Assets written to: ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-077-0.3313.ckpt/assets


INFO:tensorflow:Assets written to: ../model_checkpoints/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29-077-0.3313.ckpt/assets


Epoch 78/165
Epoch 78: val_loss did not improve from 0.33132
Epoch 79/165
Epoch 79: val_loss did not improve from 0.33132
Epoch 80/165
Epoch 80: val_loss did not improve from 0.33132
Epoch 81/165
Epoch 81: val_loss did not improve from 0.33132
Epoch 82/165
Epoch 82: val_loss did not improve from 0.33132
Epoch 83/165
Epoch 83: val_loss did not improve from 0.33132
Epoch 84/165
Epoch 84: val_loss did not improve from 0.33132
Epoch 85/165
Epoch 85: val_loss did not improve from 0.33132
Epoch 86/165
Epoch 86: val_loss did not improve from 0.33132
Epoch 87/165
Epoch 87: val_loss did not improve from 0.33132


2022-12-01 15:41:56.776857: I tensorflow/core/tpu/graph_rewrite/encapsulate_tpu_computations_pass.cc:237] Subgraph fingerprint:1223823399348306813
2022-12-01 15:41:56.945315: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2022-12-01 15:41:57.173248: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.






INFO:tensorflow:Assets written to: /tmp/tmp5oaye9l0/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp5oaye9l0/model/data/model/assets


In [7]:
test_data_dir = os.path.join('..','data','UPENN-GBM','slice_classification_common_stratify','test')

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_data_dir,
    color_mode="rgba",
    seed=RSEED,
    shuffle=False,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

Found 9605 files belonging to 5 classes.


In [8]:
val_pred = model.predict(val_ds)
val_prob = tf.nn.softmax(val_pred)
val_class_pred = [np.argmax(x) for x in val_prob]
val_base = [ 0 for x in val_class_pred ]

val_true_class = []
for _, classes in val_ds:
    val_true_class += list(classes)




2022-12-01 15:42:36.696768: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 9945
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\024TensorSliceDataset:7"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}

2022-12-01 

  3/156 [..............................] - ETA: 12s

2022-12-01 15:42:43.046112: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 13856037510993769846 with session name  took 3.456192188s and succeeded
2022-12-01 15:42:43.058229: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(13856037510993769846), session_name(), subgraph_key(std::string(property.function_name) = "cluster_predict_function_6174737307138202066", property.function_library_fingerprint = 3681382967468109196, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "8,240,240,4,;8,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 15:42:43.058293: I tensorflow



2022-12-01 15:42:47.752907: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(4278150416744106827), session_name()




2022-12-01 15:42:51.232946: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 4278150416744106827 with session name  took 3.479903897s and succeeded
2022-12-01 15:42:51.246690: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(4278150416744106827), session_name(), subgraph_key(std::string(property.function_name) = "cluster_predict_function_6174737307138202066", property.function_library_fingerprint = 3681382967468109196, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "4,240,240,4,;4,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 15:42:51.246741: I tensorflow/c

In [9]:
test_pred = model.predict(test_ds)
test_prob = tf.nn.softmax(test_pred)
test_class_pred = [np.argmax(x) for x in test_prob]
test_base = [ 0 for x in test_class_pred ]

test_true_class = []
for _, classes in test_ds:
    test_true_class += list(classes)


2022-12-01 15:42:55.980996: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_STRING
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 9605
  }
}
attr {
  key: "is_files"
  value {
    b: false
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\027TensorSliceDataset:7331"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
attr {
  key: "replicate_on_split"
  value {
    b: false
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_STRING
        }
      }
    }
  }
}





2022-12-01 15:43:01.844077: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:437] TPU host compilation cache miss: cache_key(9312899852912686474), session_name()




2022-12-01 15:43:05.217598: I tensorflow/core/tpu/kernels/tpu_compile_op_common.cc:210] Compilation of 9312899852912686474 with session name  took 3.37341895s and succeeded
2022-12-01 15:43:05.231411: I tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.cc:470] TPU host compilation cache: compilation complete for cache_key(9312899852912686474), session_name(), subgraph_key(std::string(property.function_name) = "cluster_predict_function_6174737307138202066", property.function_library_fingerprint = 3681382967468109196, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, topology.MissingChipCount() = 0, std::string(property.shapes_prefix) = "2,240,240,4,;2,;", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
2022-12-01 15:43:05.231462: I tensorflow/co

In [10]:
print(classification_report(val_true_class, val_class_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      3082
           1       0.90      0.80      0.85      2440
           2       0.37      0.64      0.47       303
           3       0.65      0.69      0.67        59
           4       0.98      0.92      0.95      4061

    accuracy                           0.90      9945
   macro avg       0.76      0.81      0.77      9945
weighted avg       0.91      0.90      0.90      9945



In [11]:
print(classification_report(test_true_class, test_class_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      5708
           1       0.71      0.67      0.69      1361
           2       0.20      0.36      0.25       166
           3       0.00      0.00      0.00        69
           4       0.95      0.91      0.93      2301

    accuracy                           0.88      9605
   macro avg       0.56      0.58      0.56      9605
weighted avg       0.89      0.88      0.88      9605



In [12]:
model_file_name = os.path.join(MODELS_DIR, parent_run_name + start_time)
model.save(model_file_name)




INFO:tensorflow:Assets written to: ../models/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29/assets


INFO:tensorflow:Assets written to: ../models/mobilenetv2_bs64_pat10_del0.001_dr0.25_lr0.0005_save-2022-12-01-14:58:29/assets
