In [None]:
from google.colab import drive
import os
import shutil
import pandas as pd
from tqdm import tqdm

# Mount Google Drive so the dataset folders are reachable from the Colab VM.
drive.mount('/content/drive')

# Input (raw download) and output (per-class folders) locations on Drive.
RAW_PATH = "/content/drive/MyDrive/pdi/skin-cancer-mnist-ham10000"
OUT_PATH = "/content/drive/MyDrive/pdi/dataset/ham10000/all"

# Sub-folders of RAW_PATH that hold the source images.
IMG_DIRS = ["HAM10000_images_part_1", "HAM10000_images_part_2"]

# Load the metadata table; its 'dx' column is used as the class label.
META = os.path.join(RAW_PATH, "HAM10000_metadata.csv")
df = pd.read_csv(META)

# Keep at most the seven most frequent labels (value_counts sorts by
# descending frequency by default).
label_counts = df['dx'].value_counts()
classes = label_counts.index[:7]
print("Classes usadas:", classes.tolist())

# Start from a clean slate: delete any previously generated output tree.
if os.path.exists(OUT_PATH):
    shutil.rmtree(OUT_PATH)

# Create one output folder per selected class.
class_dirs = [f"{OUT_PATH}/{label}" for label in classes]
for class_dir in class_dirs:
    os.makedirs(class_dir, exist_ok=True)

# =====================================================
# CRITICAL STEP: index ALL images ONCE
# =====================================================
# Build an {image_id: full_path} lookup with a single directory listing per
# source folder, so later lookups by image ID are plain dictionary accesses.
print("Indexando imagens...")
image_path = {}

for d in IMG_DIRS:
    dir_path = os.path.join(RAW_PATH, d)
    for fname in os.listdir(dir_path):
        # splitext strips only the trailing extension; str.replace would also
        # remove a ".jpg" occurring in the middle of a file name.
        image_id, ext = os.path.splitext(fname)
        # Case-insensitive match so ".JPG" files are indexed as well.
        if ext.lower() == ".jpg":
            image_path[image_id] = os.path.join(dir_path, fname)

print(f"{len(image_path)} imagens indexadas.")

# =====================================================
# Fast copy
# =====================================================
MAX_IMAGES_PER_CLASS = 200  # Cap on the number of images copied per class

print("Copiando imagens...")
for c in classes:
    # Take the first N image IDs of this class, in metadata (CSV row) order.
    # NOTE(review): this is a deterministic head-of-file slice, not a random
    # sample — confirm that this is the intended selection.
    imgs = df.loc[df['dx'] == c, 'image_id'].head(MAX_IMAGES_PER_CLASS).tolist()

    # Fail before copying anything for this class if the metadata references
    # images that are absent from the index, and report which ones.
    missing = [img for img in imgs if img not in image_path]
    if missing:
        raise KeyError(
            f"{len(missing)} image(s) of class {c!r} listed in the metadata were "
            f"not found in the indexed folders, e.g. {missing[:5]}"
        )

    # Make sure the destination folder exists before copying into it.
    class_dir = f"{OUT_PATH}/{c}"
    os.makedirs(class_dir, exist_ok=True)

    for img in tqdm(imgs, desc=f"Classe {c}"):
        # The destination is a directory, so each file keeps its base name.
        shutil.copy(image_path[img], f"{class_dir}/")

print("Dataset ALL criado com sucesso!")


Mounted at /content/drive
Classes usadas: ['nv', 'mel', 'bkl', 'bcc', 'akiec']
Indexando imagens...
10015 imagens indexadas.
Copiando imagens...


Classe nv:  22%|‚ñà‚ñà‚ñè       | 43/200 [00:34<01:57,  1.33it/s]

In [None]:
import os

# Report how many entries each class folder of the generated dataset contains.
# The dataset path is spelled out once here so this cell does not depend on
# variables defined in other cells.
DATASET_DIR = "/content/drive/MyDrive/pdi/dataset/ham10000/all"

# Sorted so the report order is stable across runs and filesystems.
for classe in sorted(os.listdir(DATASET_DIR)):
    path = os.path.join(DATASET_DIR, classe)
    # Skip stray files at the top level; os.listdir would raise on them.
    if not os.path.isdir(path):
        continue
    n_imgs = len(os.listdir(path))
    print(f"{classe}: {n_imgs} imagens")