In [None]:
import os
import shutil
import random
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from collections import defaultdict

# ================================
# CONFIGURATION - EDIT HERE
# ================================

In [None]:
# Paths to the original (source) datasets
CARTOON_PATH = "VCPI_individual/cartoonset100k_jpg"
CELEBA_PATH = "VCPI_individual/celebahq256_imgs"

# Destination folder for the organized dataset
OUTPUT_PATH = "dataset/real2cartoon"

# Total number of images per domain (real/cartoon)
TOTAL_IMAGES = 30000  # Change here: 1000 for quick tests, 30000 for full training

# Train/test split (0.8 = 80% train, 20% test)
TRAIN_SPLIT = 0.8

# ================================
# HELPER FUNCTIONS
# ================================

In [None]:
def get_all_images(path, extensions=('.jpg', '.jpeg', '.png')):
    """Return every image file found under ``path`` (searched recursively).

    A file is selected when its name ends with one of ``extensions``; the
    comparison ignores case, so ``.jpg``, ``.JPG`` and ``.Jpg`` all match.

    Args:
        path: Root folder to search. A missing folder yields an empty list.
        extensions: Iterable of file-name endings (including the dot).

    Returns:
        A sorted list of path strings with no duplicates.
    """
    root = Path(path)
    if not root.is_dir():
        return []

    endings = tuple(ext.lower() for ext in extensions)

    # A single directory walk with a case-insensitive name test replaces the
    # separate lower-case / upper-case globs: those report every file twice
    # when pattern matching is case-insensitive (e.g. on Windows) and never
    # match mixed-case endings. Sorting makes the result order deterministic.
    return sorted(
        str(candidate)
        for candidate in root.rglob("*")
        if candidate.is_file() and candidate.name.lower().endswith(endings)
    )

def create_directory_structure(base_path):
    """Create the CycleGAN folder layout under ``base_path``.

    Ensures that the four sub-folders ``trainA``, ``trainB``, ``testA`` and
    ``testB`` exist (already existing folders are left untouched) and prints
    a confirmation line.
    """
    for split_name in ('trainA', 'trainB', 'testA', 'testB'):
        split_dir = os.path.join(base_path, split_name)
        os.makedirs(split_dir, exist_ok=True)

    print(f"‚úÖ Estrutura criada em: {base_path}")

def copy_images_with_progress(source_list, dest_folder, prefix="img"):
    """Copy files into ``dest_folder`` under sequential names, reporting progress.

    Each source file is copied with ``shutil.copy2`` to
    ``<prefix>_<index:06d><original extension>``, where the index starts at 0
    and follows the order of ``source_list``. A progress line is printed
    after every 1000 copies and after the last one.

    Args:
        source_list: Sequence of source file paths.
        dest_folder: Destination folder (created if missing).
        prefix: Prefix for the generated file names.

    Returns:
        The number of entries in ``source_list``.
    """
    os.makedirs(dest_folder, exist_ok=True)
    total = len(source_list)

    for done, origin in enumerate(source_list, start=1):
        # Keep the original extension; only the base name is regenerated.
        _, suffix = os.path.splitext(origin)
        target = os.path.join(dest_folder, f"{prefix}_{done - 1:06d}{suffix}")
        shutil.copy2(origin, target)

        # Report on every 1000th file and on the final file.
        if done == total or done % 1000 == 0:
            print(f"  Copiadas: {done}/{total} imagens")

    return total

def display_sample_images(dataset_path, n_samples=4):
    """Show a grid of random sample images, one row per dataset folder.

    The rows are ``trainA``, ``trainB``, ``testA`` and ``testB`` (looked up
    under ``dataset_path``); each row shows up to ``n_samples`` randomly
    chosen images. A folder with fewer images than ``n_samples`` shows the
    images it has; a folder with none shows a "Sem imagens" placeholder.

    Args:
        dataset_path: Root folder containing the four split folders.
        n_samples: Number of columns (images per row); must be at least 1.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be a positive integer")

    dirs = ['trainA', 'trainB', 'testA', 'testB']
    titles = ['Real (Train)', 'Cartoon (Train)', 'Real (Test)', 'Cartoon (Test)']

    # squeeze=False keeps `axes` two-dimensional even when n_samples == 1,
    # so axes[row, col] indexing is always valid.
    fig, axes = plt.subplots(len(dirs), n_samples, figsize=(15, 12), squeeze=False)
    fig.suptitle('Amostras do Dataset Organizado', fontsize=16)

    for row, (dir_name, title) in enumerate(zip(dirs, titles)):
        folder_path = os.path.join(dataset_path, dir_name)
        images = get_all_images(folder_path)
        sample_images = random.sample(images, min(n_samples, len(images)))

        for col in range(n_samples):
            ax = axes[row, col]
            if col < len(sample_images):
                # Close the file handle as soon as the image has been drawn.
                with Image.open(sample_images[col]) as img:
                    ax.imshow(img)
            elif not sample_images:
                ax.text(0.5, 0.5, 'Sem imagens', ha='center', va='center')

            if col == 0:
                # Row label as a title: axis('off') hides axis labels, so a
                # ylabel would never be visible.
                ax.set_title(title, fontsize=12, loc='left')
            ax.axis('off')

    plt.tight_layout()
    plt.show()

# ================================
# MAIN FUNCTION
# ================================

In [None]:
def organize_datasets(seed=None):
    """Organize the source datasets into the CycleGAN folder structure.

    Reads the module-level settings ``CARTOON_PATH``, ``CELEBA_PATH``,
    ``OUTPUT_PATH``, ``TOTAL_IMAGES`` and ``TRAIN_SPLIT``; none of them is
    modified. Real faces are copied to ``trainA``/``testA`` and cartoons to
    ``trainB``/``testB`` under ``OUTPUT_PATH``. If either source holds fewer
    than ``TOTAL_IMAGES`` images, the smaller count is used for both domains
    for this run only.

    Files already present in the output folders are not removed, so the final
    count check can report a mismatch when re-running with fewer images.

    Args:
        seed: Optional seed for the shuffle. With ``None`` (default) the
            global ``random`` state is used; with a value, the same source
            folders always produce the same train/test selection.

    Returns:
        ``False`` when a source dataset folder is missing, ``True`` otherwise.
    """
    # Work on a local copy so that clamping below does not overwrite the
    # configured TOTAL_IMAGES for later cells or re-runs.
    total_images = TOTAL_IMAGES

    # round() instead of int(): (1 - 0.8) * 100 evaluates to 19.99... in
    # floating point, which int() would truncate to 19.
    train_pct = round(TRAIN_SPLIT * 100)
    test_pct = 100 - train_pct

    print("=" * 60)
    print("üé® ORGANIZADOR DE DATASETS - REAL2CARTOON")
    print("=" * 60)
    print(f"üìä Configura√ß√£o atual:")
    print(f"   ‚Ä¢ Total de imagens por dom√≠nio: {total_images:,}")
    print(f"   ‚Ä¢ Divis√£o treino/teste: {train_pct}% / {test_pct}%")
    print(f"   ‚Ä¢ Pasta de sa√≠da: {OUTPUT_PATH}")
    print()

    # 1. Check that the source datasets exist
    print("üîç 1. Verificando datasets originais...")

    if not os.path.exists(CARTOON_PATH):
        print(f"‚ùå ERRO: N√£o encontrado {CARTOON_PATH}")
        return False

    if not os.path.exists(CELEBA_PATH):
        print(f"‚ùå ERRO: N√£o encontrado {CELEBA_PATH}")
        return False

    print(f"‚úÖ Cartoon dataset: {CARTOON_PATH}")
    print(f"‚úÖ CelebA dataset: {CELEBA_PATH}")

    # 2. Count the available images
    print("\nüìà 2. Contando imagens dispon√≠veis...")

    # Sorted so that a seeded shuffle starts from a deterministic order.
    cartoon_images = sorted(get_all_images(CARTOON_PATH))
    celeba_images = sorted(get_all_images(CELEBA_PATH))

    print(f"   ‚Ä¢ Cartoons encontrados: {len(cartoon_images):,}")
    print(f"   ‚Ä¢ Rostos reais encontrados: {len(celeba_images):,}")

    # Clamp the request to what both domains can actually provide
    if len(cartoon_images) < total_images:
        print(f"‚ö†Ô∏è  AVISO: S√≥ h√° {len(cartoon_images)} cartoons (pedido: {total_images})")
        total_images = len(cartoon_images)

    if len(celeba_images) < total_images:
        print(f"‚ö†Ô∏è  AVISO: S√≥ h√° {len(celeba_images)} rostos reais (pedido: {total_images})")
        total_images = min(total_images, len(celeba_images))

    print(f"‚úÖ Usando {total_images:,} imagens de cada tipo")

    # 3. Prepare the splits
    print(f"\nüéØ 3. Preparando divis√µes...")

    train_size = int(total_images * TRAIN_SPLIT)
    test_size = total_images - train_size

    print(f"   ‚Ä¢ Treino: {train_size:,} imagens por dom√≠nio")
    print(f"   ‚Ä¢ Teste: {test_size:,} imagens por dom√≠nio")

    # Shuffle, then slice disjoint train/test selections
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(cartoon_images)
    shuffler.shuffle(celeba_images)

    cartoon_train = cartoon_images[:train_size]
    cartoon_test = cartoon_images[train_size:train_size + test_size]

    celeba_train = celeba_images[:train_size]
    celeba_test = celeba_images[train_size:train_size + test_size]

    # 4. Create the folder structure
    print(f"\nüìÅ 4. Criando estrutura de pastas...")
    create_directory_structure(OUTPUT_PATH)

    # 5. Copy the images
    print(f"\nüìã 5. Copiando imagens...")

    print("üé® Copiando cartoons para treino...")
    copy_images_with_progress(cartoon_train, os.path.join(OUTPUT_PATH, "trainB"), "cartoon")

    print("üé® Copiando cartoons para teste...")
    copy_images_with_progress(cartoon_test, os.path.join(OUTPUT_PATH, "testB"), "cartoon")

    print("üë§ Copiando rostos reais para treino...")
    copy_images_with_progress(celeba_train, os.path.join(OUTPUT_PATH, "trainA"), "real")

    print("üë§ Copiando rostos reais para teste...")
    copy_images_with_progress(celeba_test, os.path.join(OUTPUT_PATH, "testA"), "real")

    # 6. Final verification: count what is actually on disk
    print(f"\n‚úÖ 6. Verifica√ß√£o final...")

    structure = {}
    for folder in ['trainA', 'trainB', 'testA', 'testB']:
        path = os.path.join(OUTPUT_PATH, folder)
        count = len(get_all_images(path))
        structure[folder] = count
        print(f"   ‚Ä¢ {folder}: {count:,} imagens")

    total_copied = sum(structure.values())
    expected = total_images * 2  # real + cartoon

    print(f"\nüéâ Dataset organizado com sucesso!")
    print(f"   ‚Ä¢ Total copiado: {total_copied:,} imagens")
    print(f"   ‚Ä¢ Esperado: {expected:,} imagens")
    print(f"   ‚Ä¢ Status: {'‚úÖ OK' if total_copied == expected else '‚ö†Ô∏è VERIFICAR'}")
    print(f"   ‚Ä¢ Localiza√ß√£o: {os.path.abspath(OUTPUT_PATH)}")

    return True

In [None]:
def show_dataset_stats():
    """Print per-folder image counts for the organized dataset and show samples.

    Does nothing except print a hint when ``OUTPUT_PATH`` does not exist.
    Otherwise prints one line per existing split folder with its image count
    and then calls ``display_sample_images`` on ``OUTPUT_PATH``.
    """
    if not os.path.exists(OUTPUT_PATH):
        print("‚ùå Dataset ainda n√£o foi organizado. Execute organize_datasets() primeiro.")
        return

    separator = "=" * 40
    print("\n" + separator)
    print("üìä ESTAT√çSTICAS DO DATASET")
    print(separator)

    # Human-readable label for each split folder, in display order.
    labels_by_folder = {
        'trainA': 'Rostos Reais (Treino)',
        'trainB': 'Cartoons (Treino)',
        'testA': 'Rostos Reais (Teste)',
        'testB': 'Cartoons (Teste)',
    }

    for folder, label in labels_by_folder.items():
        folder_path = os.path.join(OUTPUT_PATH, folder)
        if not os.path.exists(folder_path):
            continue  # missing split folders are skipped silently
        n_images = len(get_all_images(folder_path))
        print(f"{label:.<25} {n_images:>6,} imagens")

    print("\nüñºÔ∏è  Mostrando amostras...")
    display_sample_images(OUTPUT_PATH)

# ================================
# EXECUTION
# ================================

In [None]:
# Run the full pipeline. The call returns True on completion and False when
# one of the source dataset folders is missing.
organize_datasets()