# Cats VS Dogs

In [None]:
import os
import urllib.request
import zipfile

def download_dataset(url: str, dest_dir: str) -> str:
    """
    Downloads a dataset from a given URL and saves it to the specified destination directory.

    The file name is taken from the last path segment of the URL. The data is first
    written to a temporary ``<name>.part`` file and only moved to its final name once
    the transfer has finished, so an interrupted download is never mistaken for a
    complete one on a later run. If the final file already exists it is reused.

    Args:
        url (str): The URL to download the dataset from.
        dest_dir (str): The directory path to save the downloaded dataset to.

    Returns:
        str: The path to the downloaded file, or None if the download failed
        (the error is printed instead of being raised).
    """
    try:
        os.makedirs(dest_dir, exist_ok=True)
        file_name = url.split("/")[-1]
        if not file_name:
            # A URL ending in "/" would otherwise resolve to dest_dir itself,
            # which always exists and would be reported as "already downloaded".
            raise ValueError(f"cannot derive a file name from URL {url!r}")

        file_path = os.path.join(dest_dir, file_name)
        if os.path.exists(file_path):
            print("Dataset already downloaded.")
            return file_path

        print("Downloading dataset...")
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # Only present if the transfer or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        return file_path
    except Exception as e:
        print("An error occurred while downloading the dataset: ", e)
        return None


def extract_dataset(zip_file_path: str, dest_dir: str) -> None:
    """
    Unpacks every member of a zip archive into the given directory.

    Any exception raised while opening or extracting the archive is caught and
    printed; nothing is raised to the caller.

    Args:
        zip_file_path (str): Location of the zip archive to unpack.
        dest_dir (str): Directory that receives the archive contents.
    """
    try:
        with zipfile.ZipFile(zip_file_path) as archive:
            archive.extractall(path=dest_dir)
    except Exception as error:
        print("An error occurred while extracting the dataset: ", error)


def organize_images(src_dir: str, dest_dir: str, class_sample_count: int = 1000) -> None:
    """
    Moves a per-class sample of images from a source directory into a flat destination directory.

    Every sub-directory of ``src_dir`` is treated as one class. Its files are visited
    in sorted order and moved (not copied) to ``dest_dir`` as
    ``<class_name>_<index><extension>``, where ``index`` is the position of the file
    in the sorted listing. Entries of ``src_dir`` that are not directories are skipped.

    Args:
        src_dir (str): The directory path of the source images, one sub-directory per class.
        dest_dir (str): The directory path to organize the images to.
        class_sample_count (int, optional): The number of images to move per class. Defaults to 1000.
    """
    os.makedirs(dest_dir, exist_ok=True)

    # Sorted listings make the selected sample reproducible across runs/platforms.
    for class_name in sorted(os.listdir(src_dir)):
        class_src_dir = os.path.join(src_dir, class_name)
        if not os.path.isdir(class_src_dir):
            # Stray files next to the class folders are not classes.
            continue

        # Count successes per class so a failed move neither shrinks this class's
        # sample nor inflates the quota of the classes processed afterwards.
        moved = 0
        for i, file_name in enumerate(sorted(os.listdir(class_src_dir))):
            if moved >= class_sample_count:
                break
            file_ext = os.path.splitext(file_name)[1]
            file_src_path = os.path.join(class_src_dir, file_name)
            file_dest_path = os.path.join(dest_dir, f"{class_name}_{i}{file_ext}")
            try:
                os.rename(file_src_path, file_dest_path)
            except OSError as e:
                print(f"An error occurred while moving {file_src_path}: ", e)
                continue
            moved += 1


def main() -> None:
    """
    The main function to download, extract, and organize the dataset.

    Downloads the cats-and-dogs archive into ``data``, extracts it there, and moves
    ``sample_count`` images per class from the ``train`` and ``validation`` folders
    into ``data/cats_and_dogs_filtered_<sample_count>/train`` and ``.../test``.
    Any error is caught and printed rather than raised.
    """
    try:
        sample_count = 50
        original_data_path = 'data/cats_and_dogs_filtered'
        sampled_data_path = f'{original_data_path}_{sample_count}'
        url_train = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'

        file_path_train = download_dataset(url_train, 'data')
        if file_path_train is None:
            # download_dataset has already reported the failure.
            return
        extract_dataset(file_path_train, 'data')

        # Pass sample_count explicitly: the output directory is named after it, so
        # relying on organize_images' own default would mislabel the sample size.
        organize_images(
            os.path.join(original_data_path, 'train'),
            os.path.join(sampled_data_path, 'train'),
            sample_count,
        )
        organize_images(
            os.path.join(original_data_path, 'validation'),
            os.path.join(sampled_data_path, 'test'),
            sample_count,
        )
    except Exception as e:
        print("An error occurred in the main function: ", e)


# Entry point: run main() only when __name__ is '__main__'.
if __name__ == '__main__':
    main()


# MNIST

In [None]:
import urllib.request
import os
import gzip
import numpy as np
import matplotlib.pyplot as plt


def load_mnist_images(filename):
    """Read a gzipped MNIST image file into an array of 28x28 images.

    The first 16 bytes of the decompressed stream are skipped and the rest is
    interpreted as unsigned bytes, grouped into 28x28 images.

    Args:
        filename (str): Path to the gzipped file.

    Returns:
        numpy.ndarray: A uint8 array of shape (num_images, 28, 28), or None if
        an IOError occurred while opening or reading the file (the error is printed).

    """
    try:
        with gzip.open(filename, 'rb') as archive:
            payload = archive.read()
    except IOError as e:
        print(f"Error: {e}")
        return None

    pixels = np.frombuffer(payload, dtype=np.uint8, offset=16)
    return pixels.reshape(-1, 28, 28)


def load_mnist_labels(filename):
    """Read a gzipped MNIST label file into a flat array.

    The first 8 bytes of the decompressed stream are skipped and the rest is
    interpreted as one unsigned byte per label.

    Args:
        filename (str): Path to the gzipped file.

    Returns:
        numpy.ndarray: A 1D uint8 array of labels, or None if an IOError
        occurred while opening or reading the file (the error is printed).

    """
    try:
        with gzip.open(filename, 'rb') as archive:
            payload = archive.read()
    except IOError as e:
        print(f"Error: {e}")
        return None

    return np.frombuffer(payload, dtype=np.uint8, offset=8)

def download_dataset():
    """Download the four MNIST archives (if missing) and load them into numpy arrays.

    The archives are stored in the current working directory as ``train-images.gz``,
    ``train-labels.gz``, ``test-images.gz`` and ``test-labels.gz``. A file that is
    already present is not downloaded again. Each download goes to a temporary
    ``.part`` file first so an interrupted transfer is never picked up as complete.

    Returns:
        tuple: ``(train_images, train_labels, test_images, test_labels)`` as returned
        by ``load_mnist_images`` / ``load_mnist_labels``.
    """
    # Download the dataset from Yann LeCun's website
    base_url = 'http://yann.lecun.com/exdb/mnist/'
    archives = (
        ('train-images-idx3-ubyte.gz', 'train-images.gz'),
        ('train-labels-idx1-ubyte.gz', 'train-labels.gz'),
        ('t10k-images-idx3-ubyte.gz', 'test-images.gz'),
        ('t10k-labels-idx1-ubyte.gz', 'test-labels.gz'),
    )
    for remote_name, local_name in archives:
        if os.path.exists(local_name):
            continue
        partial_name = local_name + '.part'
        try:
            urllib.request.urlretrieve(base_url + remote_name, partial_name)
            os.replace(partial_name, local_name)
        finally:
            # Only present if the transfer or the rename failed.
            if os.path.exists(partial_name):
                os.remove(partial_name)

    # Load the dataset into numpy arrays
    train_images = load_mnist_images('train-images.gz')
    train_labels = load_mnist_labels('train-labels.gz')
    test_images = load_mnist_images('test-images.gz')
    test_labels = load_mnist_labels('test-labels.gz')
    return train_images, train_labels, test_images, test_labels

def main(createdirs=False):
    """
    Downloads the MNIST dataset, loads it into numpy arrays, and saves a per-class sample of the images as PNG files.

    Up to 50 images per label are written for each split under
    ``data/mnist_images_50/train`` and ``data/mnist_images_50/test``.

    Args:
        createdirs (bool): If True, images are saved into one sub-directory per label
            (``<split>/<label>/<index>.png``); otherwise they are saved flat as
            ``<split>/<label>_<index>.png``. Default is False.

    Returns:
        None. Any error while downloading, loading or saving is caught and printed
        rather than raised.
    """
    try:
        train_images, train_labels, test_images, test_labels = download_dataset()

        sample_count = 50
        original_data_path = 'data/mnist_images'
        sampled_data_path = f'{original_data_path}_{sample_count}'
        selected_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        separator = "/" if createdirs else "_"

        splits = (
            ('train', train_images, train_labels),
            ('test', test_images, test_labels),
        )
        for split_name, images, labels in splits:
            # The loaders signal a read failure by returning None.
            if images is None or labels is None:
                raise IOError(f"MNIST {split_name} data could not be loaded")

        for split_name, images, labels in splits:
            split_dir = os.path.join(sampled_data_path, split_name)
            os.makedirs(split_dir, exist_ok=True)
            if createdirs:
                # The per-label folders must live where the images are written,
                # otherwise saving to "<split>/<label>/<index>.png" fails.
                for label in selected_labels:
                    os.makedirs(os.path.join(split_dir, str(label)), exist_ok=True)

            # Save images
            counter = {label: 0 for label in selected_labels}
            remaining = sample_count * len(selected_labels)
            for image, label in zip(images, labels):
                if remaining <= 0:
                    # Every selected label has its full sample; stop scanning.
                    break
                label = int(label)
                if label not in counter or counter[label] >= sample_count:
                    continue
                plt.imsave(f'{split_dir}/{label}{separator}{counter[label]}.png', image, cmap='gray')
                counter[label] += 1
                remaining -= 1

    except Exception as e:
        print(f"An error occurred: {e}")

# Entry point: run main() with its defaults only when __name__ is '__main__'.
if __name__ == '__main__':
    main()


# STL-10

In [None]:
import numpy as np
import os
import urllib.request
import tarfile
from PIL import Image
import matplotlib.pyplot as plt

def read_all_images(path_to_data):
    """
    Loads a raw binary image file as an array of 96x96 three-channel images.

    The file is read as a flat stream of unsigned bytes, grouped into blocks of
    shape (3, 96, 96), and then the axes of each block are reversed so that the
    channel axis comes last.

    Args:
        path_to_data (str): The path to the binary data file.

    Returns:
        numpy.ndarray: A C-contiguous uint8 array of shape (num_images, 96, 96, 3).
    """
    with open(path_to_data, 'rb') as binary_file:
        raw_bytes = np.fromfile(binary_file, dtype=np.uint8)

    # One block of 3 x 96 x 96 bytes per image
    channel_first = raw_bytes.reshape(-1, 3, 96, 96)

    # Move channels last and swap the two spatial axes in a single step
    channel_last = channel_first.transpose(0, 3, 2, 1)
    return np.ascontiguousarray(channel_last)


def read_labels_from_file(path_to_labels):
    """
    Loads a raw binary label file, one unsigned byte per label.

    Args:
        path_to_labels (str): The path to the binary labels file.

    Returns:
        numpy.ndarray: A 1D uint8 array holding every byte of the file.
    """
    with open(path_to_labels, 'rb') as label_file:
        return np.fromfile(label_file, dtype=np.uint8)


def download_and_extract():
    """
    Downloads and extracts the STL-10 dataset, and returns the train and test sets.

    The archive is stored as ``./data/stl10_binary.tar.gz`` and extracted into
    ``./data``; both steps are skipped when their result already exists. The
    download is written to a temporary ``.part`` file first so an interrupted
    transfer is not reused as if it were complete.

    Returns:
        Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]: A tuple of the train_images,
        train_labels, test_images, and test_labels numpy arrays, or
        ``(None, None, None, None)`` if any step failed (the error is printed).
    """
    # Define the URL to download the dataset
    url = 'http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz'
    data_dir = './data'
    archive_path = './data/stl10_binary.tar.gz'
    extracted_dir = './data/stl10_binary'

    try:
        # urlretrieve does not create missing parent directories.
        os.makedirs(data_dir, exist_ok=True)

        # Download the file if it doesn't exist
        if not os.path.exists(archive_path):
            print("Downloading stl10_binary.tar.gz...")
            partial_path = archive_path + '.part'
            try:
                urllib.request.urlretrieve(url, partial_path)
                os.replace(partial_path, archive_path)
            finally:
                # Only present if the transfer or the rename failed.
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        # Extract the file if it hasn't been extracted
        if not os.path.exists(extracted_dir):
            print("Extracting stl10_binary.tar.gz...")
            with tarfile.open(archive_path, 'r:gz') as tar:
                # The archive comes over plain HTTP; where the interpreter supports
                # extraction filters, refuse members that would escape data_dir.
                if hasattr(tarfile, 'data_filter'):
                    tar.extractall(data_dir, filter='data')
                else:
                    tar.extractall(data_dir)

        # Load the dataset into numpy arrays
        train_images = read_all_images('./data/stl10_binary/train_X.bin')
        train_labels = read_labels_from_file('./data/stl10_binary/train_y.bin')
        test_images = read_all_images('./data/stl10_binary/test_X.bin')
        test_labels = read_labels_from_file('./data/stl10_binary/test_y.bin')

        return train_images, train_labels, test_images, test_labels

    except Exception as e:
        print(f"An error occurred while downloading or extracting the dataset: {str(e)}")
        return None, None, None, None


def main():
    """
    Downloads the STL-10 dataset and saves a per-class sample of the training and test images as PNG files.

    Up to 50 images per label are written to ``data/stl10_data_50/train`` and
    ``data/stl10_data_50/test`` as ``<label>_<index>.png``. Each image is
    contrast-stretched to the full 0-255 range before saving.

    Errors are printed rather than raised; the function returns early on the
    first failure.
    """

    try:
        # Download and extract the dataset
        train_images, train_labels, test_images, test_labels = download_and_extract()
    except Exception as e:
        print(f"An error occurred while downloading and extracting the dataset: {e}")
        return

    # download_and_extract reports its own errors and returns None placeholders.
    if any(part is None for part in (train_images, train_labels, test_images, test_labels)):
        return

    # Define the path to save the images
    sample_count = 50
    original_data_path = 'data/stl10_data'
    sampled_data_path = f'{original_data_path}_{sample_count}'
    train_dir = os.path.join(sampled_data_path, 'train')
    test_dir = os.path.join(sampled_data_path, 'test')

    # Create the directories to save the images, if they don't already exist
    try:
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(test_dir, exist_ok=True)
    except Exception as e:
        print(f"An error occurred while creating the image directories: {e}")
        return

    selected_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    splits = (
        ('training', train_dir, train_images, train_labels),
        ('test', test_dir, test_images, test_labels),
    )
    for split_name, split_dir, images, labels in splits:
        # Integer counters: a float counter would leak into the file name ("3_0.0.png").
        counter = np.zeros(11, dtype=int)
        for image, label in zip(images, labels):
            if label not in selected_labels:
                continue
            if counter[label] >= sample_count:
                continue

            filename = os.path.join(split_dir, f'{label}_{counter[label]}.png')

            # Normalize the image to [0, 1]; a constant image has no range to
            # stretch, so it is saved unchanged instead of dividing by zero.
            low = float(np.min(image))
            high = float(np.max(image))
            if high > low:
                image = (image.astype(np.float64) - low) / (high - low)
                # Convert to uint8
                image = (image * 255).astype(np.uint8)
            else:
                image = image.astype(np.uint8)

            try:
                # Save the image
                Image.fromarray(image).save(filename)
            except Exception as e:
                print(f"An error occurred while saving the {split_name} image: {e}")
                return
            counter[label] += 1



# Entry point: run main() only when __name__ is '__main__'.
if __name__ == '__main__':
    main()


# CIFAR-10

In [None]:
import numpy as np
import os
import urllib.request
import tarfile
import pickle
from PIL import Image
from tensorflow import keras

def read_all_images(path_to_data):
    """
    Loads the images stored in a pickled batch file.

    The file is unpickled (with byte-string keys), the ``b'data'`` entry is
    grouped into blocks of shape (3, 32, 32), and the channel axis is moved last.

    Args:
        path_to_data (str): The path to the pickled batch file.

    Returns:
        numpy.ndarray: An array of shape (num_images, 32, 32, 3).
    """
    with open(path_to_data, 'rb') as batch_file:
        batch = pickle.load(batch_file, encoding='bytes')

    # One block of 3 x 32 x 32 values per image, channels first
    channel_first = np.reshape(batch[b'data'], (-1, 3, 32, 32))
    # Reorder to (num_images, height, width, channels)
    return channel_first.transpose(0, 2, 3, 1)


def read_labels_from_file(path_to_labels):
    """
    Loads the labels stored in a pickled batch file.

    The file is unpickled (with byte-string keys) and its ``b'labels'`` entry is
    converted to a numpy array.

    Args:
        path_to_labels (str): The path to the pickled batch file.

    Returns:
        numpy.ndarray: A numpy array of the labels.
    """
    with open(path_to_labels, 'rb') as batch_file:
        batch = pickle.load(batch_file, encoding='bytes')
    return np.array(batch[b'labels'])


def download_and_extract():
    """
    Downloads and extracts the CIFAR-10 dataset, and returns the train and test sets.

    The archive is stored as ``./data/cifar-10-python.tar.gz`` and extracted into
    ``./data``; both steps are skipped when their result already exists. The five
    training batches are concatenated in order.

    Returns:
        Tuple[Tuple[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]]:
        ``(train_images, train_labels), (test_images, test_labels)``.
    """
    # Define the URL to download the dataset
    url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    data_dir = './data'
    archive_path = './data/cifar-10-python.tar.gz'
    batches_dir = './data/cifar-10-batches-py'

    # urlretrieve does not create missing parent directories.
    os.makedirs(data_dir, exist_ok=True)

    # Download the file if it doesn't exist. The download target must be the same
    # path that is checked above and opened below.
    if not os.path.exists(archive_path):
        print("Downloading cifar-10-python.tar.gz...")
        partial_path = archive_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, archive_path)
        finally:
            # Only present if the transfer or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Extract the file if it hasn't been extracted
    if not os.path.exists(batches_dir):
        print("Extracting cifar-10-python.tar.gz...")
        with tarfile.open(archive_path, 'r:gz') as tar:
            # Where the interpreter supports extraction filters, refuse members
            # that would be written outside data_dir.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(data_dir, filter='data')
            else:
                tar.extractall(data_dir)

    # Load the dataset into numpy arrays. NOTE: the batch files are pickles;
    # only load archives obtained from a trusted source.
    batch_paths = [f'./data/cifar-10-batches-py/data_batch_{i}' for i in range(1, 6)]
    # Collect first and concatenate once instead of re-copying inside the loop.
    train_images = np.concatenate([read_all_images(path) for path in batch_paths], axis=0)
    train_labels = np.concatenate([read_labels_from_file(path) for path in batch_paths], axis=0)
    test_images = read_all_images('./data/cifar-10-batches-py/test_batch')
    test_labels = read_labels_from_file('./data/cifar-10-batches-py/test_batch')

    return (train_images, train_labels), (test_images, test_labels)


def main():
    """
    Downloads the CIFAR-10 dataset and saves a per-class sample of the training and test images as PNG files.

    The data is loaded through ``keras.datasets.cifar10.load_data()``. Up to 50
    images per label are written to ``data/cifar10_data_50/train`` and
    ``data/cifar10_data_50/test`` as ``<label>_<index>.png``.

    Errors are printed rather than raised; the function returns early on the
    first failure.
    """

    try:
        # Download and extract the dataset
        (train_images, train_labels), (test_images, test_labels) = keras.datasets.cifar10.load_data()
    except Exception as e:
        print(f"An error occurred while downloading the dataset: {e}")
        return

    # Define the path to save the images
    sample_count = 50
    original_data_path = 'data/cifar10_data'
    sampled_data_path = f'{original_data_path}_{sample_count}'
    train_dir = os.path.join(sampled_data_path, 'train')
    test_dir = os.path.join(sampled_data_path, 'test')

    # Create the directories to save the images, if they don't already exist
    try:
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(test_dir, exist_ok=True)
    except Exception as e:
        print(f"An error occurred while creating the image directories: {e}")
        return

    # Labels must stay within the bounds of the 10-slot counter below.
    selected_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    splits = (
        ('training', train_dir, train_images, train_labels),
        ('test', test_dir, test_images, test_labels),
    )
    for split_name, split_dir, images, labels in splits:
        # Integer counters: a float counter would leak into the file name ("3_0.0.png").
        counter = np.zeros(10, dtype=int)
        for image, label_row in zip(images, labels):
            # Each label is stored as a one-element row.
            label = int(label_row[0])
            if label not in selected_labels:
                continue
            if counter[label] >= sample_count:
                continue

            filename = os.path.join(split_dir, f'{label}_{counter[label]}.png')
            try:
                # Convert to uint8 and save the image
                Image.fromarray(image.astype(np.uint8)).save(filename)
            except Exception as e:
                print(f"An error occurred while saving the {split_name} image: {e}")
                return
            counter[label] += 1

            if counter.min() >= sample_count:
                # Every label has its full sample; no need to scan the rest.
                break


# Entry point: run main() only when __name__ is '__main__'.
if __name__ == '__main__':
    main()


# Fashion-MNIST

In [None]:
import urllib.request
import os
import gzip
import numpy as np
import matplotlib.pyplot as plt
from torchvision.datasets import FashionMNIST

def download_dataset():
    """Obtain both Fashion-MNIST splits via torchvision and convert them to numpy.

    Both splits are requested with ``root='./data'`` and ``download=True``.

    Returns:
        tuple: ``(train_images, train_labels, test_images, test_labels)``, each
        taken from the split's ``data`` / ``targets`` attribute via ``.numpy()``.
    """
    training_split = FashionMNIST(root='./data', train=True, download=True)
    held_out_split = FashionMNIST(root='./data', train=False, download=True)

    # Convert in the same order the values are returned
    return (
        training_split.data.numpy(),
        training_split.targets.numpy(),
        held_out_split.data.numpy(),
        held_out_split.targets.numpy(),
    )


def main(createdirs=False):
    """
    Downloads the Fashion-MNIST dataset, loads it into numpy arrays, and saves a per-class sample of the images as PNG files.

    Up to 50 images per label are written for each split under
    ``data/fashion_mnist_images_50/train`` and ``data/fashion_mnist_images_50/test``.

    Args:
        createdirs (bool): If True, images are saved into one sub-directory per label
            (``<split>/<label>/<index>.png``); otherwise they are saved flat as
            ``<split>/<label>_<index>.png``. Default is False.

    Returns:
        None. Any error while downloading, loading or saving is caught and printed
        rather than raised.
    """
    try:
        train_images, train_labels, test_images, test_labels = download_dataset()

        sample_count = 50
        original_data_path = 'data/fashion_mnist_images'
        sampled_data_path = f'{original_data_path}_{sample_count}'
        selected_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        separator = "/" if createdirs else "_"

        splits = (
            ('train', train_images, train_labels),
            ('test', test_images, test_labels),
        )
        for split_name, images, labels in splits:
            split_dir = os.path.join(sampled_data_path, split_name)
            os.makedirs(split_dir, exist_ok=True)
            if createdirs:
                # The per-label folders must live where the images are written,
                # otherwise saving to "<split>/<label>/<index>.png" fails.
                for label in selected_labels:
                    os.makedirs(os.path.join(split_dir, str(label)), exist_ok=True)

            # Save images
            counter = {label: 0 for label in selected_labels}
            remaining = sample_count * len(selected_labels)
            for image, label in zip(images, labels):
                if remaining <= 0:
                    # Every selected label has its full sample; stop scanning.
                    break
                label = int(label)
                if label not in counter or counter[label] >= sample_count:
                    continue
                plt.imsave(f'{split_dir}/{label}{separator}{counter[label]}.png', image,
                           cmap='gray')
                counter[label] += 1
                remaining -= 1

    except Exception as e:
        print(f"An error occurred: {e}")


# Entry point: run main() with its defaults only when __name__ is '__main__'.
if __name__ == '__main__':
    main()
