In [5]:
import filecmp
import io
import os
import random
import shutil
import tempfile
import zipfile

import cv2
import matplotlib.pyplot as plt
import numpy as np
import requests
import scipy.io
import splitfolders

# Loading dataset

If you downloaded only the notebook without the dataset, uncomment and run the following two cells:

In [None]:

# url = "https://github.com/HenriqueDSousa/pix2pix/raw/main/dataset/ut-zap50k-data.zip?download="

# response = requests.get(url)
# response.raise_for_status()

# # Unzip the file
# with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
#     zip_ref.extractall("./extracted_files")


In [None]:
# url = "https://github.com/HenriqueDSousa/pix2pix/raw/main/dataset/ut-zap50k-images.zip?download="

# response = requests.get(url)
# response.raise_for_status()

# # Unzip the file
# with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
#     zip_ref.extractall("./extracted_files")


### Extracting files

In [None]:
# Unpack both dataset archives into one folder. The whole step is skipped
# when the target folder is already present.
extract_path = './extracted_files'

if not os.path.isdir(extract_path):
    for zip_path in ('dataset/ut-zap50k-data.zip', 'dataset/ut-zap50k-images.zip'):
        # Make sure the target exists before each archive is opened
        os.makedirs(extract_path, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

Select all images and split them into train, validation and test sets. The output is written to the *image_data* directory.

In [None]:
def flatten_directory(source_dir, flat_dir):
    """Copy every image found under ``source_dir`` into the single folder ``flat_dir``.

    The tree below ``source_dir`` is walked recursively and every file whose
    name ends in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) is copied
    with ``shutil.copy2``. When two different files share a name, the later
    one is stored as ``<name>_1<ext>``, ``<name>_2<ext>``, ...

    The function is safe to call repeatedly: a source file is skipped when a
    byte-identical file already sits under one of its candidate destination
    names, so a second run does not duplicate the dataset. As a consequence,
    same-named files with identical content are stored only once.

    Args:
        source_dir: Root of the directory tree to scan.
        flat_dir: Destination folder; created if it does not exist.
    """
    os.makedirs(flat_dir, exist_ok=True)
    flat_abs = os.path.abspath(flat_dir)

    for root, dirs, files in os.walk(source_dir):
        # Pruning in place steers os.walk: visit sub-folders in a fixed order
        # (stable "_N" suffixes) and never descend into the output folder if
        # it happens to live inside source_dir.
        dirs[:] = sorted(
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != flat_abs
        )

        for file in sorted(files):
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            source_path = os.path.join(root, file)
            base, ext = os.path.splitext(file)
            destination_path = os.path.join(flat_dir, file)

            # Probe name, name_1, name_2, ... until a free slot is found or an
            # identical copy shows that this file was handled by an earlier run.
            count = 1
            already_copied = False
            while os.path.exists(destination_path):
                # shallow=False forces a content comparison; matching size and
                # mtime alone is not proof that two images are the same.
                if filecmp.cmp(source_path, destination_path, shallow=False):
                    already_copied = True
                    break
                destination_path = os.path.join(flat_dir, f"{base}_{count}{ext}")
                count += 1

            if not already_copied:
                shutil.copy2(source_path, destination_path)

# Collapse the extracted image tree into one working folder
source_dir, flat_dir = 'extracted_files/ut-zap50k-images', 'flattened_dataset'

flatten_directory(source_dir=source_dir, flat_dir=flat_dir)

In [7]:
def split_data(flat_dir, train_dir, val_dir, test_dir, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=None):
    """Randomly split the files of ``flat_dir`` into train / validation / test folders.

    Two layouts are handled:

    * every sub-directory of ``flat_dir`` is treated as a class and is split on
      its own into ``<split_dir>/<class>/``;
    * files lying directly in ``flat_dir`` (the layout a flattened dataset has)
      are split as one group straight into ``train_dir``, ``val_dir`` and
      ``test_dir``.

    Files are copied with ``shutil.copy2``; the sources are left untouched.
    Only regular files are split, nested directories inside a group are ignored.

    Args:
        flat_dir: Folder holding the files and/or class sub-directories.
        train_dir: Output folder for the training subset.
        val_dir: Output folder for the validation subset.
        test_dir: Output folder for the test subset.
        train_ratio: Fraction of each group copied to ``train_dir``.
        val_ratio: Fraction of each group copied to ``val_dir``.
        test_ratio: Kept for interface compatibility. The test subset always
            receives whatever remains after the train and validation subsets,
            so no file is dropped by rounding.
        seed: When given, a private ``random.Random(seed)`` drives the shuffle
            and the split is reproducible. When ``None`` the global ``random``
            module state is used.

    Raises:
        ValueError: If a ratio is negative or ``train_ratio + val_ratio``
            exceeds 1.
    """
    if min(train_ratio, val_ratio, test_ratio) < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "ratios must be non-negative and train_ratio + val_ratio must not exceed 1"
        )

    # Fall back to the module-level RNG so callers that seed it globally
    # still get the result they expect.
    rng = random if seed is None else random.Random(seed)

    entries = sorted(os.listdir(flat_dir))

    # Each group is (folder to read from, sub-folder name below each split dir)
    groups = [
        (os.path.join(flat_dir, entry), entry)
        for entry in entries
        if os.path.isdir(os.path.join(flat_dir, entry))
    ]
    if any(os.path.isfile(os.path.join(flat_dir, entry)) for entry in entries):
        groups.append((flat_dir, ''))

    for group_dir, cls in groups:
        # Sort before shuffling: os.listdir order is arbitrary, and a seeded
        # shuffle is only repeatable when it starts from a fixed order.
        images = sorted(
            name for name in os.listdir(group_dir)
            if os.path.isfile(os.path.join(group_dir, name))
        )
        rng.shuffle(images)

        train_split = int(train_ratio * len(images))
        val_split = int(val_ratio * len(images))

        subsets = (
            (train_dir, images[:train_split]),
            (val_dir, images[train_split:train_split + val_split]),
            (test_dir, images[train_split + val_split:]),
        )

        for split_root, names in subsets:
            out_dir = os.path.join(split_root, cls) if cls else split_root
            os.makedirs(out_dir, exist_ok=True)
            for img in names:
                shutil.copy2(os.path.join(group_dir, img), os.path.join(out_dir, img))

flat_dir = 'flattened_dataset'
train_dir = 'image_data/train'
val_dir = 'image_data/val'
test_dir = 'image_data/test'

# flat_dir is already populated by the flatten_directory(...) call in the
# previous cell, so the flattening is not repeated here: a second pass over
# the whole dataset is redundant.

# Seed the global RNG so the shuffle inside split_data is repeatable
random.seed(42)

split_data(flat_dir, train_dir, val_dir, test_dir)

KeyboardInterrupt: 

## Images

In [None]:
image_path = "extracted_files/ut-zap50k-images/Boots/Over the Knee/Annie/8016025.7294.jpg"
image = cv2.imread(image_path)

# cv2.imread reports a missing or unreadable file by returning None, not by raising
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# OpenCV loads colour images in BGR channel order, matplotlib draws RGB;
# without the conversion red and blue are swapped in the preview.
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

plt.imshow(image_rgb)