In [None]:
import datetime
import logging as log
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from time import sleep

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from botocore.config import Config
from PIL import Image, ImageOps

# Send log records at INFO level and above to stdout so they show up in the cell output.
log.basicConfig(stream=sys.stdout, level=log.INFO)

# Create a boto3 session from the local "dev" profile and an S3 client on top of it.
# max_pool_connections sets the size of botocore's HTTP connection pool.
# NOTE(review): 1280 is presumably sized for heavily parallel downloads - confirm it is intentional.
session = boto3.Session(profile_name="dev")
client_config = Config(max_pool_connections=1280)
s3_client = session.client("s3", config=client_config)
# NOTE(review): "S3_BUCKET" looks like a placeholder - confirm the real bucket name before running.
BUCKET = "S3_BUCKET"
# Key prefix that every entry of TRAIN_DIRECTORIES is joined onto.
PREFIX = "finder/ai_or_not/ai_or_not_datasets"

# MAX_ITEMS: passed as the paginator's "MaxItems" when listing each prefix.
# BATCH_SIZE: passed as the paginator's "PageSize" and used as the download thread-pool size.
# IM_SIZE: side length (pixels) of the square that every image is resized and padded to.
# CHANNELS: not referenced by the code visible in this notebook; presumably the channel
#           count of the decoded RGB images - TODO confirm.
MAX_ITEMS = 8
BATCH_SIZE = 4
IM_SIZE = 128
CHANNELS = 3

# Each entry is (directory relative to PREFIX, integer class label, glob pattern).
# Only the directory and the label are read by the code visible in this notebook;
# the glob pattern is presumably meant for a local-filesystem loader - TODO confirm.
TRAIN_DIRECTORIES = [
    # Label 0: real (non-generated) images
    ("DIRE/train/imagenet/real", 0, "*/*"),
    ("DIRE/train/celebahq/real", 0, "*"),
    ("DIRE/train/lsun_bedroom/real", 0, "*"),
    ("cifake/train/REAL", 0, "*"),
    # Label 1: "adm" directories (presumably images generated by the ADM diffusion model - TODO confirm)
    ("DIRE/train/imagenet/adm", 1, "*/*"),
    ("DIRE/train/lsun_bedroom/adm", 1, "*"),
    # Label 2: images generated by Stable Diffusion variants
    ("cifake/train/FAKE", 2, "*"),  # Generated by SD 1.4
    (
        "FakeImageDataset/ImageData/train/SDv15R-CC1M/SDv15R-dpmsolver-25-1M/SDv15R-CC1M",
        2,
        "*",
    ),  # Generated by SD 1.5
    ("DIRE/train/celebahq/sdv2", 2, "*/*"),
    # Label 3: Midjourney v5 images (per the directory name)
    ("FakeImageDataset/ImageData/val/Midjourneyv5-5K/Midjourneyv5-5K_train", 3, "*"),
]

# File extensions to filter out when listing S3 keys (archives and macOS metadata files).
UNWANTED_FILE_EXTENSIONS = ["tar", "gz", "zip", "DS_Store"]

In [None]:
def download_images(keys, labels):
    """Download a batch of S3 objects and decode them into normalised image arrays.

    The function calls ``.numpy()`` on both arguments, so it must receive eager
    tensors (for example when wrapped in ``tf.py_function``).

    Args:
        keys: 1-D string tensor of S3 object keys (UTF-8 encoded bytes) in ``BUCKET``.
        labels: tensor holding one label per key; passed through unchanged.

    Returns:
        A tuple ``(images, labels)`` of numpy arrays. ``images`` is float32 with
        shape ``(len(keys), IM_SIZE, IM_SIZE, 3)`` and values in ``[0, 1]``;
        ``labels`` is the numpy view of the input labels. An empty ``keys``
        yields an empty ``(0, IM_SIZE, IM_SIZE, 3)`` image array instead of raising.

    Raises:
        Whatever ``s3_client.get_object`` or ``PIL.Image.open`` raise for a
        missing object or an undecodable image; errors are not swallowed.
    """

    def _download_image(key):
        """Fetch one object and return it as an (IM_SIZE, IM_SIZE, 3) float32 array."""
        key = key.decode("utf-8")
        log.debug(f"Key to download: {key}")
        response = s3_client.get_object(Bucket=BUCKET, Key=key)
        body = response["Body"]
        try:
            payload = body.read()
        finally:
            # Release the underlying HTTP connection even if the read fails.
            body.close()

        # Decode and force exactly 3 channels; convert() returns an independent
        # image, so the source handle can be closed right away.
        with Image.open(BytesIO(payload)) as source_image:
            rgb_image = source_image.convert("RGB")
        # Expand or shrink the image to fit inside the target square ...
        contained_image = ImageOps.contain(rgb_image, (IM_SIZE, IM_SIZE))
        # ... then pad the remaining area with black to reach IM_SIZE x IM_SIZE.
        padded_image = ImageOps.pad(contained_image, (IM_SIZE, IM_SIZE), color=0)
        # Scale to [0, 1] in float32: matches the float32 dtype the tf.data
        # pipeline declares and avoids a float64 copy of every image.
        normalized_image = np.asarray(padded_image).astype(np.float32) / 255.0
        log.debug(f"Normalized Image Shape: {normalized_image.shape}")
        return normalized_image

    log.debug(f"Keys: {keys.shape} Labels: {labels.shape}")
    keys = keys.numpy()
    labels = labels.numpy()

    # Nothing to download: return a correctly shaped empty batch.
    if len(keys) == 0:
        return np.empty((0, IM_SIZE, IM_SIZE, 3), dtype=np.float32), labels

    # Downloads are I/O bound, so fetch them in parallel; executor.map keeps
    # the results in the same order as ``keys`` (and therefore ``labels``).
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        images = np.stack(list(executor.map(_download_image, keys)))
    log.debug(f"Images Shape: {images.shape}")

    return images, labels


def _is_wanted_key(key):
    """Return True when ``key`` names a real object whose extension is not excluded.

    Keys ending in "/" (folder placeholders) are rejected. Otherwise the last
    two dot-separated suffixes of the file name are checked against
    ``UNWANTED_FILE_EXTENSIONS`` so that both "x.zip" and "x.tar.gz" are skipped.
    """
    file_name = key.rsplit("/", 1)[-1]
    if not file_name:
        return False
    # Drop the stem: only real suffixes count, so a file called "tar.png" is kept
    # and a file with no extension at all no longer raises.
    suffixes = file_name.split(".")[1:]
    return not any(suffix in UNWANTED_FILE_EXTENSIONS for suffix in suffixes[-2:])


def read_s3_image(prefix, label):
    """Yield batches of image keys found under an S3 prefix, paired with their label.

    Lists ``BUCKET`` under ``prefix`` page by page (at most ``MAX_ITEMS`` keys in
    total, ``BATCH_SIZE`` keys per page) and drops folder placeholders and
    unwanted file types from every page.

    Args:
        prefix: S3 key prefix, either ``str`` or UTF-8 ``bytes`` (the form
            ``tf.data.Dataset.from_generator`` passes generator ``args`` in).
        label: class label attached to every key found under ``prefix``.

    Yields:
        ``(keys, labels)`` per non-empty page: ``keys`` is a 1-D string tensor
        and ``labels`` a tensor of the same length filled with ``label``.
    """
    if isinstance(prefix, bytes):
        prefix = prefix.decode("utf-8")

    s3_paginator = s3_client.get_paginator("list_objects_v2")
    s3_iterator = s3_paginator.paginate(
        Bucket=BUCKET, Prefix=prefix, PaginationConfig={"MaxItems": MAX_ITEMS, "PageSize": BATCH_SIZE}
    )
    for page in s3_iterator:
        # "Contents" is absent from the response when no object matches the prefix.
        keys = [item["Key"] for item in page.get("Contents", []) if _is_wanted_key(item["Key"])]

        # A page can be filtered down to nothing; never yield an empty batch.
        if not keys:
            continue

        converted_keys = tf.convert_to_tensor(keys)
        log.debug(f"{prefix} : Converted Keys {len(keys)}")
        log.debug(f"Keys Shape: {converted_keys.shape} {keys}")
        yield converted_keys, tf.constant(label, shape=(len(keys),))

In [None]:
# Number of images pulled from the pipeline and shown per preview figure.
PREVIEW_BATCH_SIZE = 4

# Resolve every configured directory to its full S3 prefix, paired with its label.
current_directories = [f"{PREFIX}/{directory[0]}" for directory in TRAIN_DIRECTORIES]
current_labels = [directory[1] for directory in TRAIN_DIRECTORIES]

print(current_directories)
print(current_labels)

# One dataset element per (prefix, label) pair.
directories = tf.data.Dataset.from_tensor_slices((current_directories, current_labels))

# List the keys under every prefix concurrently; interleaving two key batches
# at a time from each prefix mixes the classes in the resulting stream.
files_dataset = directories.interleave(
    lambda directory_path, label: tf.data.Dataset.from_generator(
        generator=read_s3_image,
        args=[directory_path, label],
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.string),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
        ),
    ),
    cycle_length=len(TRAIN_DIRECTORIES),
    block_length=2,
    num_parallel_calls=tf.data.AUTOTUNE,
)


def _load_image_batch(image_keys, label):
    """Download one batch of keys and re-attach the static shapes py_function drops."""
    images, labels = tf.py_function(
        func=download_images,
        inp=[image_keys, label],
        Tout=[tf.float32, tf.int32],
    )
    # tf.py_function outputs have unknown static shape; download_images always
    # produces (n, IM_SIZE, IM_SIZE, CHANNELS) RGB images and one label per image.
    images.set_shape([None, IM_SIZE, IM_SIZE, CHANNELS])
    labels.set_shape([None])
    return images, labels


# Map each batch of keys to its downloaded images.
image_dataset = files_dataset.map(
    map_func=_load_image_batch,
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Re-batch to the preview size; prefetch goes last so that whole preview
# batches are prepared in the background while the previous one is drawn.
batched_dataset = image_dataset.unbatch().batch(PREVIEW_BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

for images, labels in batched_dataset:
    log.info(f"Keys Count: {images.shape} {labels.shape}")

    # Skip the trailing partial batch so every figure shows a full row.
    if images.shape[0] < PREVIEW_BATCH_SIZE:
        continue

    fig, axes = plt.subplots(1, PREVIEW_BATCH_SIZE, figsize=(10, 3), squeeze=False)
    for n in range(PREVIEW_BATCH_SIZE):
        axes[0][n].imshow(images[n])
        axes[0][n].axis("off")
    plt.show()
    # Figures accumulate in memory when created in a loop; release each one.
    plt.close(fig)

In [None]:
# # # create a tensorflow dataset from the generator
# # dataset = tf.data.Dataset.from_generator(
# #     generator=s3_generator,
# #     # change the output types accordingly
# #     output_signature=(
# #         tf.TensorSpec(shape=(None,1), dtype=tf.string)
# #     )

# # )

# dataset = get_composite_dataset()

# # unbatched_dataset = dataset.flat_map(tf.data.Dataset.from_tensor_slices)
# unbatched_dataset = dataset.unbatch()

# # map keys to download function
# image_dataset = unbatched_dataset.map(
#     map_func=lambda image_prefix: tf.py_function(
#         func=download_image2,
#         inp=[image_prefix],
#         Tout=tf.float32,
#     ),
#     num_parallel_calls=tf.data.AUTOTUNE
# )


# batched_dataset = image_dataset.prefetch(tf.data.AUTOTUNE).batch(4)
# # for images in batched_dataset:
# #     log.info(f"Keys Count: {images.shape}")

# #     if images.shape[0] < 4:
# #         continue
# #     plt.figure(figsize=(10, 10))
# #     for n in range(4):
# #         ax = plt.subplot(4, 4, n + 1)
# #         plt.imshow(images[n])
# #         plt.axis("off")
# #     plt.show()

In [None]:
# data_path = "../../cache/data/FakeImageDataset/ImageData/train/SDv15R-CC1M/SDv15R-dpmsolver-25-1M/SDv15R-CC1M"

# # write function to read the images from the local directory
# def read_local_image(prefix):
#     for root, dirs, files in os.walk(prefix):
#         for file in files:
#             # print reading path
#             log.info(os.path.join(root, file))
#             # read the image
#             img = Image.open(os.path.join(root, file))

# # read_local_image(data_path)

In [None]:
# # dataset = tf.data.Dataset.range(1, 6)  # ==> [ 1, 2, 3, 4, 5 ]
# # dataset2 = tf.data.Dataset.range(7, 12)
# # # NOTE: New lines indicate "block" boundaries.
# # dataset = dataset.interleave(
# #     lambda x: dataset2)
# # list(dataset.as_numpy_iterator())

# # write a generator function that yields the keys and the file objects
# def  s3_generator():

#     # composite dataset
#     for directory in TRAIN_DIRECTORIES:
#         directory_path = directory[0]
#         aws_prefix = f"{PREFIX}/{directory_path}"
#         log.debug(f"Searching for images in {aws_prefix}")
#         # yield the generator
#         yield from read_s3_image(aws_prefix)


# def  get_composite_dataset():
#     # composite dataset
#     composite_dataset = None
#     datasets = []
#     for directory in TRAIN_DIRECTORIES:
#         directory_path = directory[0]
#         aws_prefix = f"{PREFIX}/{directory_path}"
#         log.debug(f"Searching for images in {aws_prefix}")

#         # create a dataset from the generator
#         # datasets.append(
#         current_dataset = tf.data.Dataset.from_generator(
#                 read_s3_image,
#                 args=[aws_prefix],
#                 output_signature=tf.TensorSpec(shape=(None, 1), dtype=tf.string),
#             )
#         datasets.append(current_dataset)
#         # )
#         # if composite_dataset is None:
#         #     log.info(f"Interleaving {aws_prefix}")
#         #     composite_dataset = current_dataset
#         # else:
#         #     log.info(f"Interleaving {aws_prefix}")
#         #     composite_dataset = composite_dataset.interleave(
#         #         lambda _: current_dataset,
#         #         cycle_length=len(TRAIN_DIRECTORIES),
#         #         block_length=1
#         #     )

#     # interleave the datasets in the list of datasets to create a composite dataset
#     # composite_dataset = tf.data.Dataset.zip(tuple(datasets)).interleave(
#     #     lambda *args: tf.data.Dataset.from_tensor_slices(args).flat_map(
#     #         lambda *images: tf.data.Dataset.from_tensor_slices(images)
#     #     ),
#     #     cycle_length=len(datasets),
#     #     block_length=1,
#     #     num_parallel_calls=tf.data.experimental.AUTOTUNE,
#     # )
#     first_dataset = datasets[0]
#     second_dataset = datasets[1]

#     composite_dataset = second_dataset.interleave(
#         lambda _: first_dataset,
#         cycle_length=1,
#         block_length=1,
#         )
#     return composite_dataset