In [1]:
import os
import torch
import random
import shutil
from torch import nn
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import models, transforms, datasets

In [2]:
def remove_zone_identifier_files(path, recursive=True):
    """Delete every file under ``path`` whose name contains "Zone.Identifier".

    Args:
        path: Directory to scan.
        recursive: When False, only the top-level directory is scanned;
            otherwise the whole tree is walked.

    Returns:
        List with the full path of each file that was removed.

    A failure while handling one file is reported on stdout and does not
    interrupt the scan.
    """
    deleted_paths = []

    for current_dir, _subdirs, filenames in os.walk(path):
        targets = [
            os.path.join(current_dir, name)
            for name in filenames
            if "Zone.Identifier" in name
        ]
        for target in targets:
            try:
                os.remove(target)
                deleted_paths.append(target)
                print(f"🗑️  Eliminado: {target}")
            except Exception as err:
                print(f"⚠️  Error al eliminar {target}: {err}")

        # os.walk yields the top directory first, so stopping here limits
        # the scan to that level only.
        if not recursive:
            break

    print(f"\n✅ Total eliminados: {len(deleted_paths)}")
    return deleted_paths

# Clean both classification folders; the second call stays the cell's last
# expression so its return value is still displayed as the cell output.
remove_zone_identifier_files(
    path="/home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/b_classification/asistencia",
    recursive=True,
)
remove_zone_identifier_files(
    path="/home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/b_classification/votacion",
    recursive=True,
)


✅ Total eliminados: 0

✅ Total eliminados: 0


[]

In [3]:
def list_images_in_path(path, recursive=True, extensions=('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.tiff')):
    """Collect the paths of image files found under ``path``.

    Args:
        path: Directory to scan.
        recursive: When False, only files directly inside ``path`` are
            considered; otherwise sub-directories are walked too.
        extensions: Suffix (or tuple of suffixes) a file name must end with,
            compared against the lower-cased file name.

    Returns:
        List of matching file paths, in the order ``os.walk`` yields them.
    """
    found = []
    for current_dir, _subdirs, filenames in os.walk(path):
        found.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if name.lower().endswith(extensions)
        )
        # The first directory yielded is ``path`` itself; stop there when
        # sub-directories must not be visited.
        if not recursive:
            break
    return found


In [4]:
def get_input_size(model_name: str):
    """Return the square input resolution used for ``model_name``.

    The lookup is case-insensitive. Names that are not listed fall back to
    ``(224, 224)``.
    """
    known_sizes = {
        'vgg16': (224, 224),
        'resnet50': (224, 224),
        'densenet121': (224, 224),
        'mobilenet_v2': (224, 224),
        'googlenet': (224, 224),
        'efficientnet_b0': (224, 224),
        'inception_v3': (299, 299),
    }
    default_size = (224, 224)
    return known_sizes.get(model_name.lower(), default_size)

def get_model(model_name, num_classes):
    """Build an untrained torchvision network with a ``num_classes`` output layer.

    Args:
        model_name: Either ``'mobilenet_v2'`` or ``'resnet50'`` (exact match).
        num_classes: Number of output units for the replaced final layer.

    Returns:
        The model, created with ``weights=None`` and its last linear layer
        replaced.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'mobilenet_v2':
        net = models.mobilenet_v2(weights=None)
        head_inputs = net.classifier[1].in_features
        net.classifier[1] = nn.Linear(head_inputs, num_classes)
        return net

    if model_name == 'resnet50':
        net = models.resnet50(weights=None)
        head_inputs = net.fc.in_features
        net.fc = nn.Linear(head_inputs, num_classes)
        return net

    raise ValueError(f"Modelo '{model_name}' no soportado en este script.")

# Most recently loaded model, keyed by (weights path, weights mtime,
# architecture name, number of classes, device). It holds at most one entry so
# memory use stays bounded; the mtime in the key forces a reload when the
# weights file is overwritten.
_PREDICT_MODEL_CACHE = {}


def _list_class_names(data_dir):
    """Return the sorted names of the sub-directories of ``data_dir``."""
    with os.scandir(data_dir) as entries:
        class_names = sorted(entry.name for entry in entries if entry.is_dir())
    if not class_names:
        raise FileNotFoundError(f"Couldn't find any class folder in {data_dir}.")
    return class_names


def _load_eval_model(model_path, model_name, num_classes, device):
    """Return the network with its weights loaded, in eval mode, reusing the cached copy when valid."""
    try:
        cache_key = (
            os.path.abspath(model_path),
            os.path.getmtime(model_path),
            model_name,
            num_classes,
            str(device),
        )
    except (OSError, TypeError):
        # Missing file or a non-path object: skip caching and let the regular
        # loading code below raise (or handle it) exactly as it always did.
        cache_key = None

    if cache_key is not None and cache_key in _PREDICT_MODEL_CACHE:
        return _PREDICT_MODEL_CACHE[cache_key]

    model = get_model(model_name, num_classes)
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    if cache_key is not None:
        _PREDICT_MODEL_CACHE.clear()
        _PREDICT_MODEL_CACHE[cache_key] = model
    return model


def predict(image_path, model_path, model_name, data_dir, device=None, show=False):
    """Classify a single image and return the predicted class name.

    Args:
        image_path: Path of the image to classify.
        model_path: Path of the saved ``state_dict`` for the network.
        model_name: Architecture name, forwarded to ``get_model`` and
            ``get_input_size``.
        data_dir: Directory with one sub-directory per class. Class names are
            the sorted sub-directory names, which is how torchvision's
            ``ImageFolder`` derives ``classes``; listing the directories
            directly avoids indexing every image file on each call.
        device: Device to run on; defaults to ``'cuda'`` when available,
            otherwise ``'cpu'``.
        show: When True, print the prediction and display the image with
            matplotlib.

    Returns:
        The name of the class with the highest score.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no sub-directories.
        ValueError: If ``model_name`` is not supported by ``get_model``.

    The loaded model is kept in a one-entry cache so consecutive calls with
    the same weights file do not rebuild the network or re-read the weights.
    """
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

    # Class names
    class_names = _list_class_names(data_dir)
    num_classes = len(class_names)

    # Pre-processing
    input_size = get_input_size(model_name)
    transform = transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    # Image.open is lazy and keeps the file open; convert() returns an
    # independent in-memory copy, so the file can be closed right away.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    input_tensor = transform(image).unsqueeze(0).to(device)

    # Model and weights (cached across calls)
    model = _load_eval_model(model_path, model_name, num_classes, device)

    # Inference
    with torch.no_grad():
        output = model(input_tensor)
        _, pred = torch.max(output, 1)
        predicted_class = class_names[pred.item()]

    if show:
        print(f"✅ Predicción: {predicted_class}")
        plt.imshow(image)
        plt.title(f"Predicción: {predicted_class}")
        plt.axis("off")
        plt.show()

    return predicted_class


In [5]:
# Base directories
data_org = "/home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/a_org"
data_for_label_studio = "/home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/c_split_for_labelstudio"

# Destination directory for each class
asistencia_dir = os.path.join(data_for_label_studio, "asistencia")
votacion_dir = os.path.join(data_for_label_studio, "votacion")

# Make sure the destination directories exist
os.makedirs(asistencia_dir, exist_ok=True)
os.makedirs(votacion_dir, exist_ok=True)

# Gather every image under data_org (sub-directories included) in random order
all_images = list_images_in_path(data_org, recursive=True)
random.shuffle(all_images)
total_images = len(all_images)

# Upper bound of images copied per class
max_images_per_class = 200

# Model and weights location
model_path = "../b_classification/resnet50_best_parallel.pth"
model_name = "resnet50"
# Directory used to obtain the class names.
# NOTE(review): presumably the same directory as data_org — confirm.
data_dir_for_classes = "../../data/a_org"

# Per-class destination and running counter; the counter also provides the
# sequence number used in the output file name.
target_dirs = {"asistencia": asistencia_dir, "votacion": votacion_dir}
copied_counts = {class_name: 0 for class_name in target_dirs}

# The device does not change between iterations, so resolve it once.
inference_device = 'cuda' if torch.cuda.is_available() else 'cpu'

for i, img_path in enumerate(all_images, start=1):
    # Stop as soon as every class has reached its quota
    if all(count >= max_images_per_class for count in copied_counts.values()):
        print(f"\n🚨 Se alcanzaron las {max_images_per_class} imágenes para ambas clases. Finalizando el proceso.")
        break

    pred_class = predict(
        image_path=img_path,
        model_path=model_path,
        model_name=model_name,
        data_dir=data_dir_for_classes,
        device=inference_device,
        show=False
    )

    print(f"[{i}/{total_images}] Procesando: {img_path} → Clase: {pred_class}")

    # Any class without a destination directory is skipped
    if pred_class not in target_dirs:
        print("   ⛔ Etiqueta 'otros' detectada. Imagen ignorada.")
        continue

    # Copy only while the class is still under its quota
    if copied_counts[pred_class] >= max_images_per_class:
        print(f"   ⛔ Ya se alcanzaron las {max_images_per_class} imágenes de '{pred_class}'. Ignorando...")
        continue

    copied_counts[pred_class] += 1
    new_filename = f"{str(copied_counts[pred_class]).zfill(3)}_{pred_class}.jpg"
    new_path = os.path.join(target_dirs[pred_class], new_filename)
    # Close the source file explicitly once the RGB copy has been made
    with Image.open(img_path) as source_image:
        image = source_image.convert("RGB")
    image.save(new_path, format="JPEG")
    print(f"   📁 Convertida y copiada a: {new_path}")

# Expose the final counters under their original names
count_asistencia = copied_counts["asistencia"]
count_votacion = copied_counts["votacion"]

print("\n✅ Proceso de clasificación y copiado finalizado.")

[1/13933] Procesando: /home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/a_org/votacion/008_Asis-vot-26-05-2011-CF_page_22.png → Clase: votacion
   📁 Convertida y copiada a: /home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/c_split_for_labelstudio/votacion/001_votacion.jpg
[2/13933] Procesando: /home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/a_org/asistencia/012_00-1-Ast-Votc-11-05-17-OFICIAL-2_page_1.png → Clase: asistencia
   📁 Convertida y copiada a: /home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/c_split_for_labelstudio/asistencia/001_asistencia.jpg
[3/13933] Procesando: /home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/a_org/votacion/002_Asis-vot-OFICIAL-17-06-2010-CF_page_9.png → Clase: votacion
   📁 Convertida y copiada a: /home/nahumfg/Projects/GithubProjects/TesisTransparenciaDataset/data/c_split_for_labelstudio/votacion/002_votacion.jpg
[4/13933] Procesando: /home/nahumfg/Project