In [1]:
import os
import pandas as pd
from PIL import Image
from torchvision import transforms
from torchvision.models import resnet34
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

In [2]:
img_dir = "./SoundAnimals_Train/train_redimensionadas"

csv_path = "./SoundAnimals_Train/train_spectograms.csv"


In [3]:
class AudioDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.data = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.img_dir, row["filename"])
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        labels = torch.tensor(row.iloc[1:].astype(float).values, dtype=torch.float32)
        return img, labels

In [4]:
def compute_metrics(y_true, y_pred, threshold=0.5):
    y_true = y_true.cpu().numpy()
    y_pred = y_pred.cpu().numpy()

    y_pred_binary = (y_pred >= threshold).astype(int)
    metrics = {
        "F1-Score (micro)": f1_score(y_true, y_pred_binary, average="micro", zero_division=1),
        "F1-Score (macro)": f1_score(y_true, y_pred_binary, average="macro", zero_division=1),
        "Hamming Loss": hamming_loss(y_true, y_pred_binary),
        "Exact Match Ratio": accuracy_score(y_true, y_pred_binary),
    }
    return metrics

In [5]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device="cuda"):
    model.to(device)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)

        # Fase de entrenamiento
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Training Loss: {epoch_loss:.4f}")

    # Evaluación final en el conjunto de validación
    model.eval()
    val_labels, val_preds = [], []
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc="Validating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_labels.append(labels)
            val_preds.append(outputs)

    val_labels = torch.cat(val_labels, dim=0)
    val_preds = torch.cat(val_preds, dim=0)
    metrics = compute_metrics(val_labels, val_preds)
    print("Validation Metrics:")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    return model

In [6]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [7]:
dataset = AudioDataset(csv_path, img_dir, transform=transform)
n_splits = 10
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
splits = list(kf.split(X=range(len(dataset)), y=dataset.data.iloc[:, 1:].sum(axis=1) > 0))
train_indices, val_indices = splits[0]
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=0)
num_classes = 42

In [8]:
model = resnet34(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, num_classes)


Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /Users/jleandrojm/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [01:24<00:00, 1.04MB/s]


In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)


In [10]:
criterion = nn.BCEWithLogitsLoss()
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Usando el dispositivo: {device}")
model.to(device)

Usando el dispositivo: mps


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [12]:
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=5,
    device=device,
)

Epoch 1/5
----------


Training: 100%|██████████| 3499/3499 [27:18<00:00,  2.14it/s]  


Training Loss: 0.0337
Epoch 2/5
----------


Training: 100%|██████████| 3499/3499 [33:19<00:00,  1.75it/s]  


Training Loss: 0.0194
Epoch 3/5
----------


Training: 100%|██████████| 3499/3499 [27:51<00:00,  2.09it/s]


Training Loss: 0.0148
Epoch 4/5
----------


Training: 100%|██████████| 3499/3499 [28:35<00:00,  2.04it/s]


Training Loss: 0.0122
Epoch 5/5
----------


Training: 100%|██████████| 3499/3499 [39:28<00:00,  1.48it/s]  


Training Loss: 0.0102


Validating: 100%|██████████| 389/389 [01:05<00:00,  5.93it/s]

Validation Metrics:
F1-Score (micro): 0.9448
F1-Score (macro): 0.8840
Hamming Loss: 0.0039
Exact Match Ratio: 0.8645





In [34]:
# Ruta a la carpeta de imágenes de test
test_img_dir = "./SoundAnimals_Train/test_redimensionados"  # Reemplaza con la ruta a tu carpeta de test

# Ruta para guardar el archivo CSV de predicciones
output_csv_path = "./salida_resnet34_4.csv"

In [35]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [36]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
model.eval()


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [37]:
predictions = []

In [38]:
for img_name in tqdm(os.listdir(test_img_dir), desc="Procesando imágenes de test"):
    img_path = os.path.join(test_img_dir, img_name)
    
    # Cargar la imagen
    img = Image.open(img_path).convert("RGB")
    img = transform(img).unsqueeze(0).to(device)  # Agregar dimensión batch

    # Generar predicciones
    with torch.no_grad():
        output = model(img)  # Salida del modelo
        preds = (torch.sigmoid(output) >= 0.25).int().cpu().numpy().flatten()

    # Convertir predicciones a formato requerido
    active_labels = " ".join([str(i + 1) for i, pred in enumerate(preds) if pred == 1])
    if not active_labels:  # Si no hay etiquetas activas
        active_labels = "0"

    # Agregar al DataFrame
    predictions.append({"Id": img_name, "Predicted": active_labels})

Procesando imágenes de test: 100%|██████████| 31187/31187 [08:27<00:00, 61.44it/s]


In [39]:
# Crear y guardar el DataFrame en formato CSV
df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv(output_csv_path, index=False)

print(f"Predicciones guardadas en: {output_csv_path}")

Predicciones guardadas en: ./salida_resnet34_4.csv


In [40]:
import pandas as pd

# Ruta al CSV existente
# Ruta al CSV existente
input_csv_path = "./salida_resnet34_4.csv"  # Cambia a la ruta de tu CSV actual
output_csv_path = "./predicciones_ultimo_resnet_4.csv"  # Ruta del nuevo CSV

# Cargar el CSV
df = pd.read_csv(input_csv_path)

# Eliminar la extensión de los nombres en la columna "Id"
df["Id"] = df["Id"].str.replace(r"\.[a-zA-Z0-9]+$", "", regex=True)

# Guardar el nuevo CSV
df.to_csv(output_csv_path, index=False)

print(f"CSV con nombres sin extensión guardado en: {output_csv_path}")

CSV con nombres sin extensión guardado en: ./predicciones_ultimo_resnet_4.csv
