In [2]:
import numpy as np
import tensorflow as tf
from PIL import ImageFile
from PIL import Image
import PIL
from os import listdir, remove, environ
from os.path import join, isdir

In [3]:
## Global variables
# Image pipeline settings shared by the dataset builders below.
IMG_SIZE = 224
BATCH_SIZE = 64
AUTOTUNE = tf.data.AUTOTUNE

# Text log that clean_directory appends to whenever it removes a file.
ERR_FILE = '../outputs/CK7/test/corrupt_images.txt'

# NOTE(review): TensorFlow is already imported by the time this runs; the
# variable usually has to be set before `import tensorflow` to silence the
# C++ logs -- confirm, and move it above the import if so.
environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Let Pillow load images whose data ends early instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [8]:
def clean_directory(clean_dir):
    """Delete files in ``clean_dir`` that Pillow cannot identify as images.

    Every regular entry of ``clean_dir`` is opened with ``Image.open``. When
    that raises ``PIL.UnidentifiedImageError`` a line is appended to
    ``ERR_FILE`` and the file is removed. Sub-directories are skipped.

    Args:
        clean_dir: Path of the directory to scan (not recursive).
    """
    for file in listdir(clean_dir):
        file_path = join(clean_dir, file)

        # Image.open on a directory raises an OS error rather than
        # UnidentifiedImageError, which would abort the whole scan.
        if isdir(file_path):
            continue

        try:
            # Image.open keeps the file handle open until the image is
            # closed; the context manager releases it right away so large
            # directories do not exhaust file descriptors.
            with Image.open(file_path):
                pass
        except PIL.UnidentifiedImageError:
            with open(ERR_FILE, 'a') as err:
                err.write(f'{file_path} is corrupt and has been removed' + "\n")
            remove(file_path)

def make_dataset(dir_path, img_size=(IMG_SIZE, IMG_SIZE), shuffle=True,
                 batch_size=BATCH_SIZE):
    """Build an unlabeled image dataset from ``dir_path``.

    Thin wrapper around ``tf.keras.utils.image_dataset_from_directory`` with
    labels disabled (``labels=None``, ``label_mode=None``).

    Args:
        dir_path: Directory containing the images.
        img_size: ``(height, width)`` the images are resized to.
        shuffle: Forwarded to Keras as ``shuffle``.
        batch_size: Forwarded to Keras as ``batch_size``. Defaults to
            ``BATCH_SIZE``, the value that was previously hard-coded, so
            existing calls behave as before. Keras documents ``None`` as
            "do not batch" -- confirm for the installed version before
            relying on it.

    Returns:
        The dataset object returned by ``image_dataset_from_directory``.
    """
    return tf.keras.utils.image_dataset_from_directory(
        dir_path,
        labels=None,
        label_mode=None,
        batch_size=batch_size,
        image_size=img_size,
        shuffle=shuffle
    )

def make_dataset_old(dir_path, img_size=(IMG_SIZE, IMG_SIZE), shuffle=True,
                     batch_size=BATCH_SIZE):
    """Legacy loader based on ``ImageDataGenerator.flow_from_directory``.

    Args:
        dir_path: Directory passed to ``flow_from_directory``.
            NOTE(review): with ``class_mode=None`` Keras still expects the
            images to sit in sub-directories of ``dir_path`` -- confirm.
        img_size: ``(height, width)`` passed as ``target_size``.
        shuffle: Forwarded to Keras as ``shuffle``.
        batch_size: Number of images per batch; must be an integer.
            Defaults to ``BATCH_SIZE``.

    Returns:
        The iterator returned by ``flow_from_directory``; with
        ``class_mode=None`` it yields image batches without labels.
    """
    datagen = tf.keras.preprocessing.image.ImageDataGenerator()
    return datagen.flow_from_directory(
        dir_path,
        target_size=img_size,
        classes=None,
        class_mode=None,
        shuffle=shuffle,
        # The iterator does integer arithmetic on batch_size; the previous
        # value of None made len() and iteration raise TypeError.
        batch_size=batch_size
    )

def make_dataset_from_root(root_dir):
    """Zip together one dataset per second-level directory under ``root_dir``.

    For every directory ``root_dir/<sub>/<mag>`` the folder is first passed to
    ``clean_directory`` and then loaded with ``make_dataset``. Entries that
    are not directories are skipped at both levels. Directories are visited
    in sorted order so the position of each component in the zipped result is
    reproducible.

    Args:
        root_dir: Top-level directory to walk (exactly two levels deep).

    Returns:
        ``tf.data.Dataset.zip`` of all per-directory datasets; each element
        is a tuple with one item per directory.

    Raises:
        ValueError: If no second-level directory is found.
    """
    datasets = []
    for file in sorted(listdir(root_dir)):
        sub_path = join(root_dir, file)
        if not isdir(sub_path):
            continue
        for mag in sorted(listdir(sub_path)):
            mag_path = join(sub_path, mag)
            # A stray file at this level would make listdir() inside
            # clean_directory fail.
            if not isdir(mag_path):
                continue
            clean_directory(mag_path)
            datasets.append(make_dataset(mag_path))

    if not datasets:
        raise ValueError(f'No image directories found under {root_dir!r}')

    # tf.data does not accept a Python list as a structure of datasets;
    # Dataset.zip needs a tuple.
    return tf.data.Dataset.zip(tuple(datasets))
        
def normalize_image(image):
    """Cast ``image`` to float32 and divide every value by 255.0."""
    as_float = tf.cast(image, dtype=tf.float32)
    return as_float / 255.0

In [16]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
dir_path = '/scratch/jhowe4/outputs/GDC/paad_example2/test/0'

# Build the unlabeled image dataset for that directory.
dataset = make_dataset(dir_path)

Found 291 files belonging to 1 classes.


In [14]:
# Number of elements `dataset` yields per pass.
# NOTE(review): the TypeError recorded for this cell looks like `dataset` was
# bound to the make_dataset_old iterator (batch_size=None) when it ran --
# confirm by re-running on a fresh kernel.
len(dataset)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [11]:
# Pull a single element and report its shape as a quick sanity check.
for img in dataset.take(1):
    print(img.numpy().shape)

2023-07-13 11:24:50.703708: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1072]
	 [[{{node Placeholder/_0}}]]
2023-07-13 11:24:50.706582: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1072]
	 [[{{node Placeholder/_0}}]]
2023-07-13 11:25:01.929862: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 44 of 512
2023-07-13 11:25:11.735241: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 83 of 512
2023-07-13 11:25:2

KeyboardInterrupt: 

In [7]:
# Apply normalize_image to every element, with the parallelism level left to
# tf.data (AUTOTUNE).
# NOTE: `dataset` is rebound in place, so re-running this cell stacks another
# division by 255 on top of the previous one.
dataset = dataset.map(normalize_image, num_parallel_calls=AUTOTUNE)
dataset

<_ParallelMapDataset element_spec=TensorSpec(shape=(224, 224, 3), dtype=tf.float32, name=None)>

In [8]:
# Shape check after normalization: look at one element only.
for img in dataset.take(1):
    print(img.numpy().shape)

2023-07-12 12:05:41.121748: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1040]
	 [[{{node Placeholder/_0}}]]
2023-07-12 12:05:41.123530: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1040]
	 [[{{node Placeholder/_0}}]]


(224, 224, 3)


In [9]:
# Cache the elements produced so far so later passes can reuse them.
# NOTE(review): any shuffling done upstream of this point (make_dataset
# defaults to shuffle=True) presumably ends up frozen in the cache -- confirm
# this ordering is intended.
dataset = dataset.cache()
dataset

<CacheDataset element_spec=TensorSpec(shape=(224, 224, 3), dtype=tf.float32, name=None)>

In [10]:
# Shuffle with a buffer as large as the dataset's reported length.
dataset = dataset.shuffle(buffer_size=len(dataset))

# Confirm the element shape is unchanged by shuffling.
for img in dataset.take(1):
    print(img.numpy().shape)

2023-07-12 12:05:46.495321: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1040]
	 [[{{node Placeholder/_0}}]]
2023-07-12 12:05:46.497027: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1040]
	 [[{{node Placeholder/_0}}]]


(224, 224, 3)


In [11]:
# Group elements into batches of BATCH_SIZE.
# NOTE(review): make_dataset already passes batch_size=BATCH_SIZE to Keras,
# so on a fresh run this may batch a second time -- confirm against the
# recorded (64, 224, 224, 3) shape.
dataset = dataset.batch(batch_size=BATCH_SIZE)

# Inspect the shape of the first batch.
for img in dataset.take(1):
    print(img.numpy().shape)

(64, 224, 224, 3)


2023-07-12 12:05:49.190739: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1040]
	 [[{{node Placeholder/_0}}]]
2023-07-12 12:05:49.192203: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1040]
	 [[{{node Placeholder/_0}}]]


In [12]:
# Prefetch upcoming elements while the current one is being consumed; the
# buffer size is left to tf.data (AUTOTUNE).
dataset = dataset.prefetch(AUTOTUNE)
dataset

<_PrefetchDataset element_spec=TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None)>