In [2]:
import os
import pandas as pd
from PIL import Image
from torchvision import transforms
from torchvision.models import efficientnet_b0
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

In [3]:
img_dir = "./SoundAnimals_Train/train_redimensionadas"

csv_path = "./SoundAnimals_Train/train_spectograms.csv"

In [4]:
class AudioDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform=None):
        self.data = pd.read_csv(csv_path)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.img_dir, row["filename"])
        img = Image.open(img_path).convert("RGB")

        if self.transform:
            img = self.transform(img)

        labels = torch.tensor(row.iloc[1:].astype(float).values, dtype=torch.float32)
        return img, labels

In [5]:
def compute_metrics(y_true, y_pred, threshold=0.5):
    y_true = y_true.cpu().numpy()
    y_pred = y_pred.cpu().numpy()

    y_pred_binary = (y_pred >= threshold).astype(int)
    metrics = {
        "F1-Score (micro)": f1_score(y_true, y_pred_binary, average="micro", zero_division=1),
        "F1-Score (macro)": f1_score(y_true, y_pred_binary, average="macro", zero_division=1),
        "Hamming Loss": hamming_loss(y_true, y_pred_binary),
        "Exact Match Ratio": accuracy_score(y_true, y_pred_binary),
    }
    return metrics

In [6]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device="cuda"):
    model.to(device)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)

        # Fase de entrenamiento
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"Training Loss: {epoch_loss:.4f}")

    # Evaluación final en el conjunto de validación
    model.eval()
    val_labels, val_preds = [], []
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc="Validating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_labels.append(labels)
            val_preds.append(outputs)

    val_labels = torch.cat(val_labels, dim=0)
    val_preds = torch.cat(val_preds, dim=0)
    metrics = compute_metrics(val_labels, val_preds)
    print("Validation Metrics:")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    return model

In [7]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [8]:
dataset = AudioDataset(csv_path, img_dir, transform=transform)


In [9]:
n_splits = 10
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
splits = list(kf.split(X=range(len(dataset)), y=dataset.data.iloc[:, 1:].sum(axis=1) > 0))


In [10]:
train_indices, val_indices = splits[0]
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)


In [11]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=0)


In [12]:
num_classes = 42
model = efficientnet_b0(pretrained=True)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /Users/jleandrojm/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:18<00:00, 1.17MB/s]


In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss()


In [1]:
import torch
print("¿MPS está disponible?:", torch.backends.mps.is_available())

¿MPS está disponible?: True


In [14]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Usando el dispositivo: {device}")
model.to(device)
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
    device=device,
)


Usando el dispositivo: mps
Epoch 1/10
----------


Training: 100%|██████████| 3499/3499 [32:20<00:00,  1.80it/s]   


Training Loss: 0.0511
Epoch 2/10
----------


Training: 100%|██████████| 3499/3499 [24:51<00:00,  2.35it/s]


Training Loss: 0.0417
Epoch 3/10
----------


Training: 100%|██████████| 3499/3499 [22:45<00:00,  2.56it/s]


Training Loss: 0.0387
Epoch 4/10
----------


Training: 100%|██████████| 3499/3499 [22:42<00:00,  2.57it/s]


Training Loss: 0.0367
Epoch 5/10
----------


Training: 100%|██████████| 3499/3499 [24:34<00:00,  2.37it/s]


Training Loss: 0.0358
Epoch 6/10
----------


Training: 100%|██████████| 3499/3499 [27:43<00:00,  2.10it/s]


Training Loss: 0.0351
Epoch 7/10
----------


Training: 100%|██████████| 3499/3499 [23:57<00:00,  2.43it/s]


Training Loss: 0.0348
Epoch 8/10
----------


Training: 100%|██████████| 3499/3499 [22:46<00:00,  2.56it/s]


Training Loss: 0.0345
Epoch 9/10
----------


Training: 100%|██████████| 3499/3499 [29:53<00:00,  1.95it/s] 


Training Loss: 0.0341
Epoch 10/10
----------


Training: 100%|██████████| 3499/3499 [24:00<00:00,  2.43it/s]


Training Loss: 0.0337


Validating: 100%|██████████| 389/389 [00:44<00:00,  8.68it/s]

Validation Metrics:
F1-Score (micro): 0.8380
F1-Score (macro): 0.5134
Hamming Loss: 0.0106
Exact Match Ratio: 0.6794





In [51]:
test_img_dir ="./SoundAnimals_Train/test_redimensionados" 


output_csv_path = "./salida_5.csv" 

In [52]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

In [53]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
model.eval()

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [54]:
predictions = []

In [55]:
for img_name in tqdm(os.listdir(test_img_dir), desc="Procesando imágenes de test"):
    img_path = os.path.join(test_img_dir, img_name)
    
    
    img = Image.open(img_path).convert("RGB")
    img = transform(img).unsqueeze(0).to(device) 
    with torch.no_grad():
        output = model(img) 
        preds = (torch.sigmoid(output) >= 0.1).int().cpu().numpy().flatten()


    active_labels = " ".join([str(i + 1) for i, pred in enumerate(preds) if pred == 1])
    if not active_labels: 
        active_labels = "0"

   
    predictions.append({"Id": img_name, "Predicted": active_labels})

Procesando imágenes de test: 100%|██████████| 31187/31187 [09:44<00:00, 53.38it/s]


In [56]:
df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv(output_csv_path, index=False)

print(f"Predicciones guardadas en: {output_csv_path}")

Predicciones guardadas en: ./salida_5.csv


In [21]:
import pandas as pd


input_csv_path = "./salida.csv"  
output_csv_path = "./final_predicciones.csv"  

df = pd.read_csv(input_csv_path)

df["Id"] = df["Id"].str.replace(".png", ".wav", regex=False)


df.to_csv(output_csv_path, index=False)

print(f"CSV con nombres actualizados guardado en: {output_csv_path}")

CSV con nombres actualizados guardado en: ./final_predicciones.csv


In [57]:
import pandas as pd


input_csv_path = "./salida_5.csv"  
output_csv_path = "./predicciones_ultimo_5.csv"  


df = pd.read_csv(input_csv_path)


df["Id"] = df["Id"].str.replace(r"\.[a-zA-Z0-9]+$", "", regex=True)
df.to_csv(output_csv_path, index=False)

print(f"CSV con nombres sin extensión guardado en: {output_csv_path}")

CSV con nombres sin extensión guardado en: ./predicciones_ultimo_5.csv
