In [12]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
import glob

# ============ CONFIG ============
base_dir = '/tf/keras_neural_network/Mis_Tests/plant-pathology-2020-fgvc7/images'     # folder that holds the original images
csv_path = '/tf/keras_neural_network/Mis_Tests/plant-pathology-2020-fgvc7/train.csv' # CSV with image_id (no extension) + one-hot class columns
extensions_to_try = [".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"]
seed = 42
train_frac = 0.6
test_frac = 0.3
val_frac = 0.1
# ================================

# The split further down uses train_frac directly and only the test/val *ratio*,
# so fractions that do not add up to 1 would silently produce a different
# partition than the one configured here. Fail early instead.
_frac_total = train_frac + test_frac + val_frac
if min(train_frac, test_frac, val_frac) < 0 or abs(_frac_total - 1.0) > 1e-9:
    raise ValueError(
        f"train_frac + test_frac + val_frac debe sumar 1 y ninguna fracción puede ser negativa "
        f"(suma actual: {_frac_total})"
    )

# Load the label table.
df = pd.read_csv(csv_path)

# Class columns: every column except image_id.
class_columns = [c for c in df.columns if c != "image_id"]

# Resolve the class name of a row: the first class column holding a positive flag.
def get_class_name(row, columns=None):
    """Return the name of the first class column whose value counts as positive.

    A value is positive when it converts to a float greater than 0.5, which
    covers exact one-hot 1s as well as soft labels. Values that cannot be
    converted to float (strings, None, ...) are skipped.

    Args:
        row: Mapping-like object indexable by column name (e.g. a DataFrame row).
        columns: Iterable of candidate column names, checked in order.
            Defaults to the module-level ``class_columns``.

    Returns:
        The matching column name, or None when no column is positive.
    """
    if columns is None:
        columns = class_columns
    for col in columns:
        try:
            value = float(row[col])
        except (TypeError, ValueError):
            # Non-numeric cell: treat it as "not this class" and keep looking.
            continue
        # `> 0.5` already includes the exact value 1; NaN compares False.
        if value > 0.5:
            return col
    return None

# Label every row with its class name, computed row by row.
df["class_name"] = df.apply(get_class_name, axis=1)

# Rows for which no class could be determined stop the run: show a few and abort.
unlabeled_mask = df["class_name"].isna()
no_class = df.loc[unlabeled_mask]
if not no_class.empty:
    print(f"[ERROR] {len(no_class)} imágenes sin clase asignada. Ejemplos:")
    print(no_class.head())
    raise SystemExit("Corrige el CSV antes de continuar.")

# String form of the image id; the file extension is resolved later.
df["image_id_str"] = df["image_id"].astype(str)

# Helper that locates the file on disk for an image id.
def find_image_path(base_dir, image_id, extensions=None):
    """Return the path of the image file for ``image_id`` inside ``base_dir``.

    The known extensions are tried first, in order. If none matches, any file
    named ``image_id.<something>`` is accepted as a fallback.

    Args:
        base_dir: Directory that is searched (not recursively).
        image_id: File name without extension.
        extensions: Extensions (with leading dot) tried in order.
            Defaults to the module-level ``extensions_to_try``.

    Returns:
        The path of the matching file, or None when nothing is found.
    """
    if extensions is None:
        extensions = extensions_to_try

    # Strict matches: image_id + known extension. Only regular files count,
    # so a directory that happens to carry the same name is not returned.
    for ext in extensions:
        candidate = os.path.join(base_dir, image_id + ext)
        if os.path.isfile(candidate):
            return candidate

    # Fallback: any other extension. The literal part is escaped so that ids
    # containing glob metacharacters ([, *, ?) are matched verbatim, and the
    # candidates are sorted so the choice does not depend on listing order.
    pattern = glob.escape(os.path.join(base_dir, image_id)) + ".*"
    matches = sorted(p for p in glob.glob(pattern) if os.path.isfile(p))
    return matches[0] if matches else None

# Resolve the on-disk path of every image; rows without a file get None (no failure yet).
df["file_path"] = df["image_id_str"].apply(lambda iid: find_image_path(base_dir, iid))
missing_files = df[df["file_path"].isnull()]
if len(missing_files) > 0:
    print(f"[WARN] {len(missing_files)} imágenes no encontradas en {base_dir}. Ejemplos:")
    print(missing_files.head())
    # Carry on, but leave those entries out so they are never moved.
    df = df[~df["file_path"].isnull()].reset_index(drop=True)

# Nothing left to split, for instance when a previous run already moved the
# files into the split folders. Stop with a clear message instead of letting
# train_test_split fail on an empty frame.
if df.empty:
    raise ValueError(f"No se encontró ninguna imagen en {base_dir}; no hay nada que dividir.")

# Labels used for stratification.
labels = df["class_name"].values

# Per-class sample counts.
class_counts = df["class_name"].value_counts()
min_count = class_counts.min()
print("Recuento por clase (ejemplo):")
print(class_counts.head())

use_stratify = True
if min_count < 2:
    use_stratify = False
    print("[WARN] Al menos una clase tiene <2 muestras. No se podrá hacer 'stratify' correctamente. Se usará partición aleatoria sin estratificar.")

# First cut: train vs. the remainder (train_frac vs. 1 - train_frac).
train_df, temp_df = train_test_split(
    df,
    train_size=train_frac,
    stratify=labels if use_stratify else None,
    random_state=seed,
    shuffle=True,
)

# Second cut: split the remainder into test and val using the relative proportion,
# e.g. test_frac / (test_frac + val_frac) = 0.3 / 0.4 = 0.75.
temp_total = test_frac + val_frac
if temp_total <= 0:
    raise ValueError("test_frac + val_frac debe ser > 0")

test_relative = test_frac / temp_total

# A class with >= 2 samples overall can still end up with a single sample in
# temp_df, and a stratified split would then raise. Re-check on the remainder
# rather than reusing the decision taken for the full frame.
temp_labels = temp_df["class_name"]
stratify_temp = bool(use_stratify and temp_labels.value_counts().min() >= 2)
if use_stratify and not stratify_temp:
    print("[WARN] Tras la primera partición alguna clase tiene <2 muestras; test/val se dividirán sin estratificar.")

test_df, val_df = train_test_split(
    temp_df,
    train_size=test_relative,
    stratify=temp_labels.values if stratify_temp else None,
    random_state=seed,
    shuffle=True,
)

print(f"Split sizes -> train: {len(train_df)}, test: {len(test_df)}, val: {len(val_df)}")

# Move the files of one split into {dest_root}/{split}/{class}/file.
def move_to_split(df_subset, split_name, dest_root=None):
    """Move every file listed in ``df_subset`` into its split/class folder.

    A file that cannot be moved is reported on stdout and skipped, so one bad
    entry does not abort the whole split.

    Args:
        df_subset: DataFrame with the columns ``file_path`` (source path) and
            ``class_name`` (name of the class sub-folder).
        split_name: Name of the split folder, e.g. "train".
        dest_root: Directory under which the split folders are created.
            Defaults to the module-level ``base_dir``.

    Returns:
        The number of files that were moved successfully. Compare it with
        ``len(df_subset)`` to detect failures.
    """
    if dest_root is None:
        dest_root = base_dir

    moved = 0
    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for src, cls in zip(df_subset["file_path"], df_subset["class_name"]):
        dest_dir = os.path.join(dest_root, split_name, cls)
        os.makedirs(dest_dir, exist_ok=True)
        dest = os.path.join(dest_dir, os.path.basename(src))
        try:
            shutil.move(src, dest)
        except OSError as e:
            # shutil.Error is an OSError subclass, so this covers move failures.
            print(f"[ERROR] al mover {src} → {dest}: {e}")
        else:
            moved += 1
    return moved

from collections import Counter

# Relocate the files split by split: train first, then test, then val.
splits = (("train", train_df), ("test", test_df), ("val", val_df))
for split_name, d in splits:
    move_to_split(d, split_name)

print("✅ Movimiento completado.")

# Report per split: the total row count followed by one line per class.
for split_name, d in splits:
    print(f"--- {split_name} --- total {len(d)}")
    for k, v in Counter(d["class_name"].values).items():
        print(f"  {k}: {v}")

Recuento por clase (ejemplo):
class_name
rust                 622
scab                 592
healthy              516
multiple_diseases     91
Name: count, dtype: int64
Split sizes -> train: 1092, test: 546, val: 183
✅ Movimiento completado.
--- train --- total 1092
  scab: 355
  rust: 373
  healthy: 309
  multiple_diseases: 55
--- test --- total 546
  scab: 178
  rust: 186
  healthy: 155
  multiple_diseases: 27
--- val --- total 183
  scab: 59
  healthy: 52
  rust: 63
  multiple_diseases: 9


In [11]:
from PIL import Image
import os
import numpy as np

# Root of the split tree: the image folder defined in the configuration section.
path = base_dir
# Each fragment keeps its leading slash because the processing loop builds
# the directory names by plain string concatenation.
folder = ['/' + split for split in ('train', 'test', 'val')]
subfolder = ['/' + label for label in ('healthy', 'multiple_diseases', 'rust', 'scab')]

def resize_with_color_reduction(image_path, target_size=(384, 256), bits_per_channel=6):
    """Resize every image in a directory, reduce its colour depth and save it as JPEG.

    Each image file directly inside ``image_path`` is resized to ``target_size``,
    quantised to ``bits_per_channel`` bits per channel and written next to the
    source as ``<name>.jpg``. A source that already ends in ``.jpg`` is
    overwritten. A file that fails to process is reported on stdout and skipped.

    NOTE(review): a source with a different extension (.png, .bmp, ...) or a
    different letter case (.JPG) is left in place next to the new .jpg, so the
    folder then holds both versions. Confirm whether the original should be removed.

    Args:
        image_path: Directory whose image files are processed (not recursive).
        target_size: Output size as (width, height).
        bits_per_channel: Bits kept per channel; 8 or more disables the reduction.

    Raises:
        ValueError: If ``bits_per_channel`` is smaller than 1.
    """
    if bits_per_channel < 1:
        raise ValueError("bits_per_channel must be >= 1")

    # Quantisation step: channel values are floored to a multiple of `factor`.
    factor = 256 // (2 ** bits_per_channel) if bits_per_channel < 8 else 1
    # '.tif' is included so the same extensions are handled as when locating the files.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

    for item in os.listdir(image_path):
        full_path = os.path.join(image_path, item)
        if not (os.path.isfile(full_path) and item.lower().endswith(image_suffixes)):
            continue
        try:
            with Image.open(full_path) as im:
                # JPEG cannot store alpha or palette images; convert those to RGB
                # up front so they are saved instead of failing at save time.
                source = im if im.mode in ("L", "RGB", "CMYK") else im.convert("RGB")
                # Resize to target size (width, height).
                resized = source.resize(target_size, Image.Resampling.LANCZOS)

            # The resized image is a separate in-memory copy, so the source file
            # is already closed when it gets overwritten below.
            if factor > 1:
                reduced = resized.point(lambda x: (x // factor) * factor)
            else:
                reduced = resized

            # Save with optimization, always under a .jpg name.
            stem, _ = os.path.splitext(full_path)
            reduced.save(stem + '.jpg', 'JPEG', quality=85, optimize=True)
        except Exception as exc:
            # Best effort per file: report it and continue with the next image.
            print(f"Error processing {item}: {exc}")

# Visit every split/class directory in order and process the ones that exist.
for current_path in (path + i + j for i in folder for j in subfolder):
    if not os.path.exists(current_path):
        print(f"Path does not exist: {current_path}")
        continue
    resize_with_color_reduction(current_path, target_size=(384, 256), bits_per_channel=6)

Processed: Train_3.jpg -> 6 bits/channel
Processed: Train_281.jpg -> 6 bits/channel
Processed: Train_125.jpg -> 6 bits/channel
Processed: Train_1486.jpg -> 6 bits/channel
Processed: Train_520.jpg -> 6 bits/channel
Processed: Train_362.jpg -> 6 bits/channel
Processed: Train_548.jpg -> 6 bits/channel
Processed: Train_606.jpg -> 6 bits/channel
Processed: Train_279.jpg -> 6 bits/channel
Processed: Train_1162.jpg -> 6 bits/channel
Processed: Train_347.jpg -> 6 bits/channel
Processed: Train_378.jpg -> 6 bits/channel
Processed: Train_1316.jpg -> 6 bits/channel
Processed: Train_1641.jpg -> 6 bits/channel
Processed: Train_588.jpg -> 6 bits/channel
Processed: Train_294.jpg -> 6 bits/channel
Processed: Train_108.jpg -> 6 bits/channel
Processed: Train_1177.jpg -> 6 bits/channel
Processed: Train_518.jpg -> 6 bits/channel
Processed: Train_641.jpg -> 6 bits/channel
Processed: Train_1082.jpg -> 6 bits/channel
Processed: Train_615.jpg -> 6 bits/channel
Processed: Train_1716.jpg -> 6 bits/channel
Proces