In [None]:
# ! pip install wandb
# ! pip install pydot
# ! pip install graphviz
# ! pip install datasets
# ! pip install scikit-learn
# ! pip install sagemaker_tensorflow # uses Linux FIFOs so does not work on Mac
# ! pip install tensorflow-datasets

# Importing necessary Libraries

In [None]:
import logging as log
import os
import shutil
from datetime import datetime
from pathlib import Path

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from boto3.s3.transfer import TransferConfig
from datasets import load_from_disk
from IPython.display import Image
from wandb.keras import WandbCallback

import wandb

In [None]:
# Run / project identifiers
PROJECT_SUFFIX = "frequency_classifier_multi"
ENTITY = "makersplace"
PROJECT = "ai-or-not-" + PROJECT_SUFFIX
SEED = 7
# strftime pattern (month, day, hour, minute) used to stamp the run name
RUNTIME_DATE_SUFFIX = "%m%d_%H%M"

JOB_TYPE_SUFFIX = PROJECT_SUFFIX + "_M"
# Current time, captured once when this cell runs
RUN_NAME_SUFFIX = datetime.now().strftime(RUNTIME_DATE_SUFFIX)

# Local dataset paths (relative to the notebook's working directory)
CACHE_DIRECTORY = "../cache/tf_datasets"
MISC_DIRECTORY = CACHE_DIRECTORY + "/../misc"
training_dataset_path = CACHE_DIRECTORY + "/training_dataset"
testing_dataset_path = CACHE_DIRECTORY + "/testing_datasets"
download_path = MISC_DIRECTORY + "/downloaded_datasets"
# Passed to Dataset.cache(filename=...) further down in the notebook
dataset_cache_path = MISC_DIRECTORY + "/s3_dataset.cache"


# S3 Paths
S3_BUCKET = "mp-ml-data-dev"
# Key prefix of the dataset version to read. Keep the trailing slash: the
# training path is built by appending "training_dataset" directly to PREFIX.
# Earlier dataset versions, kept for reference:
# PREFIX = f"finder/ai_or_not/tf_datasets/1002_1858/" # 100 Shards
# PREFIX = f"finder/ai_or_not/tf_datasets/1003_1912/" # 1000 Shards
# PREFIX = f"finder/ai_or_not/tf_datasets/1003_0955/" # 100 Shards
# PREFIX = f"finder/ai_or_not/tf_datasets/1003_1812/" # 10 Shards
# PREFIX = f"finder/ai_or_not/tf_datasets/1004_1258/" # 10 *.tfrecord files
# PREFIX = f"finder/ai_or_not/tf_datasets/1004_1338/" # 10 GZIP *.tfrecord files
PREFIX = "finder/ai_or_not/tf_datasets/1004_1419/"  # 20 GZIP *.tfrecord files


# Run switches
# When True, leftovers from earlier runs are deleted and recreated before this
# run (original note: the training and validation dataset folders).
CLEAN_RUN = True
# Process all of the data (True) or only a subset (False)
FULL_RUN = True
# Compression codec handed to the TFRecord reader
COMPRESSION_TYPE = "GZIP"

# Root logger setup. No stream is passed, so basicConfig attaches its default
# StreamHandler, which writes to stderr (not stdout).
log.basicConfig(
    level=log.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)-8s %(message)s",
)

# Seed the global NumPy and TensorFlow random generators
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# WANDB Login
# SECURITY: an API key used to be hard-coded in this cell. Treat that key as
# leaked and rotate it. Credentials must live outside the notebook:
# wandb.login() picks up WANDB_API_KEY from the environment or previously
# stored credentials, and otherwise prompts for the key.
wandb.login()

# S3 client
boto3_session = boto3.Session(profile_name="dev")
# Transfer settings for multi-threaded S3 transfers (up to 100 worker threads).
# They only take effect when passed explicitly to a transfer call, e.g.
# s3_client.download_file(..., Config=transfer_config).
transfer_config = TransferConfig(
    use_threads=True,
    max_concurrency=100,
)
# The client uses whatever region the "dev" profile is configured with.
s3_client = boto3_session.client("s3", region_name=boto3_session.region_name)
# TODO: the original notes also called for max retries = 10 and a connection
# pool size of 100; neither is configured on the client yet.

In [None]:
# Hyperparameters and dataset settings shared by the notebook.
# CLASS_NAMES is indexed by the integer label of a sample.
CONFIGURATION = dict(
    BATCH_SIZE=64,
    IM_SIZE=128,
    DROPOUT_RATE=0.1,
    N_EPOCHS=15,
    REGULARIZATION_RATE=0.01,
    N_FILTERS=6,
    KERNEL_SIZE=3,
    N_STRIDES=1,
    POOL_SIZE=2,
    N_DENSE_1=2048,
    N_DENSE_2=1024,
    N_DENSE_3=256,
    LEARNING_RATE=0.001,
    CHANNELS=3,
    CLASS_NAMES=["REAL", "GAN", "DM", "SD", "MD"],
)

In [None]:
def visualize_dataset(samples, n_rows=4, n_cols=4):
    """Plot (image, label) pairs on a grid, titling each image with its class name.

    Args:
        samples: Iterable of (image, label) pairs. Each image must be something
            plt.imshow accepts; each label must be convertible with int() to an
            index into CONFIGURATION["CLASS_NAMES"].
        n_rows: Number of grid rows. Defaults to 4 (the original fixed layout).
        n_cols: Number of grid columns. Defaults to 4 (the original fixed layout).

    Only the first n_rows * n_cols samples are drawn. Previously a 17th sample
    made plt.subplot raise ValueError because the 4x4 grid was already full.
    """
    plt.figure(figsize=(12, 12))
    grid_slots = range(1, n_rows * n_cols + 1)
    # zip() stops once the grid slots run out, so no sample beyond the grid
    # capacity is pulled from `samples`.
    for index, (image, label) in zip(grid_slots, samples):
        plt.subplot(n_rows, n_cols, index)
        plt.imshow(image)
        plt.title(CONFIGURATION["CLASS_NAMES"][int(label)])
        plt.axis("off")

    plt.show()

In [None]:
if CLEAN_RUN:
    # dataset_cache_path is handed to Dataset.cache(filename=...), which uses it
    # as a file-name prefix: the cache artifacts are sibling files whose names
    # start with it (index, data and lock files). Globbing *inside* the path, as
    # the previous code did, matched nothing, so stale cache and lock files
    # survived a clean run.
    cache_prefix = Path(dataset_cache_path)
    stale_files = list(cache_prefix.parent.glob(f"{cache_prefix.name}*"))
    if cache_prefix.is_dir():
        # Keep the old behaviour too, in case the path is a real directory.
        stale_files.extend(cache_prefix.glob("*"))
    for file in stale_files:
        if file.is_file():
            file.unlink()


# # read training dataset from disk
# training_dataset = tf.data.Dataset.load(training_dataset_path)
# training_dataset = training_dataset.take(100_000)

# training_dataset_sharded_path = f"{training_dataset_path}_sharded"
# # delete the folder if it exists
# if CLEAN_RUN:
#     shutil.rmtree(training_dataset_sharded_path, ignore_errors=True)

# Path(training_dataset_sharded_path).mkdir(parents=True, exist_ok=True)

# # save the dataset as sharded TFRecord files
# def get_shard_id(image, label):
#     # generate a random number between 0 and 99 using the TensorFlow random generator
#     random_number = tf.random.uniform(shape=[], minval=0, maxval=100, dtype=tf.int64)
#     return random_number

# training_dataset_sharded = training_dataset.save(path=training_dataset_sharded_path, shard_func=get_shard_id)

In [None]:
# # read sharded dataset from disk
# loaded_training_dataset_sharded = tf.data.Dataset.load(training_dataset_sharded_path)
# # visualize the dataset
# visualize_dataset(loaded_training_dataset_sharded.take(4))

In [None]:
# model = tf.keras.models.load_model('/Users/skoneru/workspace/discovery/playground/ai_or_not/cache/models/model_ev2s_99_acc_rgb/saved_model')

In [None]:
# S3 location that holds the training TFRecord files
training_path = "s3://" + S3_BUCKET + "/" + PREFIX + "training_dataset"
log.info(f"Reading training dataset from s3: {training_path}")
# s3_training_dataset = tf.data.TFRecordDataset(
#     training_path,
#     num_parallel_reads=10,
#     buffer_size=10_000_000
# )


def custom_reader_func(datasets):
    """Flatten a dataset of datasets by interleaving their elements.

    Args:
        datasets: Object exposing ``interleave`` - presumably the dataset of
            per-shard datasets that a ``reader_func`` receives (the function is
            only referenced from commented-out code in this section).

    Returns:
        The result of ``datasets.interleave`` with an identity mapping: 10 inputs
        are cycled concurrently, 1024 consecutive elements are taken from each
        before moving on, and element order is not deterministic.
    """
    interleave_options = dict(
        cycle_length=10,
        block_length=1024,
        num_parallel_calls=10,
        deterministic=False,
    )
    return datasets.interleave(lambda shard: shard, **interleave_options)


# images_dataset = tf.data.TFRecordDataset.load(
#     path=training_path,
#     reader_func=custom_reader_func
# )

# Gather all the *.tfrecord files directly under training_path
tf_record_files = tf.io.gfile.glob(f"{training_path}/*.tfrecord")
log.info(f"Found {len(tf_record_files)} tfrecord files")
# Fail early with a clear message instead of continuing with no input files.
if not tf_record_files:
    raise FileNotFoundError(f"No *.tfrecord files found under {training_path}")
s3_training_dataset = tf.data.TFRecordDataset(
    filenames=tf_record_files,
    compression_type=COMPRESSION_TYPE,
    num_parallel_reads=8,  # number of files read in parallel
    buffer_size=1_000_000,  # read buffer size in bytes
)


def parse_tfrecord_fn(example):
    """Decode one serialized tf.train.Example into an (image, label) pair.

    Args:
        example: Scalar string tensor holding a serialized Example with two
            scalar features: "image" (bytes) and "label" (int64).

    Returns:
        Tuple ``(image, label)``: the "image" bytes decoded with
        ``tf.io.parse_tensor`` as float32, and the int64 label.

    NOTE(review): parse_tensor leaves the static shape of the image unknown;
    consider tf.ensure_shape once the stored image shape is confirmed.
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)
    # The image was stored as a serialized tensor; restore it as float32.
    image = tf.io.parse_tensor(parsed["image"], out_type=tf.float32)
    return image, parsed["label"]


# Decode records in parallel (deterministic=False trades element order for
# throughput), then group them into batches of the configured size rather than
# a separately hard-coded 64.
images_dataset = s3_training_dataset.map(
    map_func=parse_tfrecord_fn, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False
).batch(CONFIGURATION["BATCH_SIZE"])

# write a function to process a batch of examples read from the tfrecord files and return a batch of image and label tensors
# def process_batch_examples(examples):
#     images, labels = [], []
#     # for example in examples:
#     feature_description = {
#         'image': tf.io.FixedLenFeature([], tf.string),
#         'label': tf.io.FixedLenFeature([], tf.int64),
#     }
#     parsed_examples = tf.io.parse_example(examples, feature_description)
#     # log.info(f"parsed_examples: {parsed_examples}")

#     images = tf.io.parse_tensor(parsed_examples['image'], out_type=tf.float32)
#     labels = parsed_examples['label']


#     return images, labels


# images_dataset = s3_training_dataset.batch(64).map(
#     map_func=process_batch_examples,
#     num_parallel_calls=tf.data.AUTOTUNE,
#     deterministic=False
# )

In [None]:
# response = images_dataset.take(1)
# print(response)
# visualize_dataset(response)

In [None]:
# Cache the parsed batches on disk, then prefetch as the final step so that
# batches served from the cache are also prepared ahead of the consumer
# (prefetch placed before cache only overlaps the initial cache fill).
# NOTE: this cell reassigns images_dataset; re-run the cell that builds it
# before re-running this one, otherwise the cache is stacked on itself.
images_dataset = images_dataset.cache(filename=dataset_cache_path).prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Benchmark the input pipeline: iterate up to 1024 batches and report throughput.
# batch_size tells the benchmark how many examples each iteration carries, so it
# is taken from the shared configuration instead of a separate literal.
tfds.benchmark(images_dataset, batch_size=CONFIGURATION["BATCH_SIZE"], num_iter=1024)

In [None]:
# model.fit(images_dataset, epochs=1)