In [12]:
# Requires torchinfo; to install it into this kernel's environment, run: %pip install torchinfo==1.8.0

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
from preprocessing import Preprocessing, SplitAudio
import torchaudio



In [6]:
ruta_carpeta = "./MusicCaps"  # Replace with the actual path to your audio folder

# os.listdir returns entries in arbitrary order; sorting makes the 200-file
# subset selected below (and therefore X / metadata) reproducible across runs.
archivos = sorted(os.listdir(ruta_carpeta))

X = []         # one spectrogram per audio part, in the same order as `metadata`
metadata = []  # one dict per entry of X: source file name, which part, and the min/max from Preprocessing
sample_rate_red = 16000  # sample rate handed to both SplitAudio and Preprocessing
for archivo in archivos[:200]:
    # Build the path from ruta_carpeta so that changing the folder above is sufficient.
    waveform, samp_rt = torchaudio.load(os.path.join(ruta_carpeta, archivo))

    # SplitAudio yields two pieces, recorded below as the "first" and "second" part.
    f, s = SplitAudio(waveform, sample_rate=samp_rt, new_sample_rate=sample_rate_red)

    # NOTE(review): Preprocessing presumably returns (spectrogram, max, min) with the
    # extrema kept to undo a normalisation later; confirm in preprocessing.py.
    f_spec, f_maxi, f_mini = Preprocessing(f, sample_rate_red, resampler_f=False)
    s_spec, s_maxi, s_mini = Preprocessing(s, sample_rate_red, resampler_f=False)

    X += [f_spec, s_spec]
    metadata += [{"nombre": archivo, "parte": "first", "minimum": f_mini, "maximum": f_maxi},
                 {"nombre": archivo, "parte": "second", "minimum": s_mini, "maximum": s_maxi}]



In [7]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stack the list of spectrograms into a single tensor
# (assuming X is a list of N tensors, each of shape (256, 256)).
X_tensor = torch.stack(X, dim=0)
print(X_tensor.shape)

# Insert a singleton channel axis at position 1 (mono input): (N, 1, 256, 256).
X_tensor = X_tensor.unsqueeze(dim=1)
print(X_tensor.shape)

batch_size = 64
dataset = TensorDataset(X_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Fetch only the first batch as a sanity check. TensorDataset wraps each item in a
# one-element sequence, so index 0 is the actual tensor.
batch = next(iter(dataloader))[0]
print("Batch shape:", batch.shape)  # expected (64, 1, 256, 256) when N >= 64

torch.Size([400, 256, 256])
torch.Size([400, 1, 256, 256])
Batch shape: torch.Size([64, 1, 256, 256])


In [62]:
from torch import nn
import torch.optim as optim

class Autoencoder(nn.Module):
    """Convolutional autoencoder for single-channel 2-D inputs.

    The encoder applies four stride-2 3x3 convolutions (128 channels each), so a
    256x256 input is reduced 128 -> 64 -> 32 -> 16 and the bottleneck has shape
    (batch, 128, 16, 16). The last encoder stage ends in Tanh; the others use
    LeakyReLU(0.2).

    The decoder mirrors this with four stride-2 transposed convolutions, each
    doubling height and width, followed by a 3x3 convolution down to one channel
    and a Sigmoid, so the output has the same shape as the input with values in
    [0, 1].
    """

    def __init__(self):
        super().__init__()

        width = 128   # channel count used by every hidden layer
        slope = 0.2   # negative slope of the LeakyReLU activations
        stages = 4    # number of down-sampling (and up-sampling) stages

        # Encoder: Conv -> BatchNorm -> activation, repeated `stages` times.
        # Only the first convolution reads the single input channel.
        encoder_layers = []
        in_channels = 1
        for stage in range(stages):
            encoder_layers.append(
                nn.Conv2d(in_channels, width, kernel_size=3, stride=2, padding=1)
            )
            encoder_layers.append(nn.BatchNorm2d(width))
            # The final encoder stage squashes the bottleneck with Tanh.
            is_last = stage == stages - 1
            encoder_layers.append(nn.Tanh() if is_last else nn.LeakyReLU(slope))
            in_channels = width
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: ConvTranspose -> BatchNorm -> LeakyReLU, repeated `stages` times.
        # With kernel 3, stride 2, padding 1 and output_padding 1 each stage exactly
        # doubles the spatial size.
        decoder_layers = []
        for _ in range(stages):
            decoder_layers.append(
                nn.ConvTranspose2d(
                    width, width, kernel_size=3, stride=2, padding=1, output_padding=1
                )
            )
            decoder_layers.append(nn.BatchNorm2d(width))
            decoder_layers.append(nn.LeakyReLU(slope))
        # Project back to a single channel and bound the output to [0, 1].
        decoder_layers.append(nn.Conv2d(width, 1, kernel_size=3, stride=1, padding=1))
        decoder_layers.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode `x` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Model instance to be trained.
autoencoder = Autoencoder()

# Reconstruction loss: mean squared error.
criterion = nn.MSELoss()

# AdamW optimiser over every parameter of the model.
optimizer = optim.AdamW(
    autoencoder.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

In [63]:
from torchinfo import summary
# Per-layer output shapes and parameter counts for a batch of 64 single-channel 256x256 inputs.
summary(
    autoencoder,
    input_size=(64, 1, 256, 256),  # (batch_size, channels, height, width)
)


Layer (type:depth-idx)                   Output Shape              Param #
Autoencoder                              [64, 1, 256, 256]         --
├─Sequential: 1-1                        [64, 128, 16, 16]         --
│    └─Conv2d: 2-1                       [64, 128, 128, 128]       1,280
│    └─BatchNorm2d: 2-2                  [64, 128, 128, 128]       256
│    └─LeakyReLU: 2-3                    [64, 128, 128, 128]       --
│    └─Conv2d: 2-4                       [64, 128, 64, 64]         147,584
│    └─BatchNorm2d: 2-5                  [64, 128, 64, 64]         256
│    └─LeakyReLU: 2-6                    [64, 128, 64, 64]         --
│    └─Conv2d: 2-7                       [64, 128, 32, 32]         147,584
│    └─BatchNorm2d: 2-8                  [64, 128, 32, 32]         256
│    └─LeakyReLU: 2-9                    [64, 128, 32, 32]         --
│    └─Conv2d: 2-10                      [64, 128, 16, 16]         147,584
│    └─BatchNorm2d: 2-11                 [64, 128, 16, 16]      

In [46]:
import torch
import torch.nn as nn

class Autoencoder(nn.Module):
    """Cut-down autoencoder used to inspect intermediate tensor shapes.

    Only two stride-2 3x3 convolutions of the encoder are present, and the
    decoder is reduced to a bare LeakyReLU because its transposed convolution
    and batch norm are disabled. The output therefore keeps the encoder's 128
    channels at a quarter of the input height and width instead of
    reconstructing the input.

    NOTE(review): this reuses the name `Autoencoder`, so running this cell
    replaces any earlier class of that name in the kernel.
    """

    def __init__(self):
        super().__init__()

        # Encoder: each stride-2 convolution halves height and width
        # (256 -> 128 -> 64 for a 256x256 input).
        encoder_layers = [
            nn.Conv2d(1, 128, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2),
            nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: the up-sampling layers are currently disabled, leaving only
        # the activation:
        #   nn.ConvTranspose2d(in_channels=128, out_channels=1, kernel_size=3,
        #                      stride=2, padding=1, output_padding=1)
        #   nn.BatchNorm2d(1)
        self.decoder = nn.Sequential(nn.LeakyReLU(0.2))

    def forward(self, x):
        """Run `x` through the encoder and then the (pass-through) decoder."""
        return self.decoder(self.encoder(x))

# Build the model and push one random sample through it to inspect the output shape.
autoencoder = Autoencoder()
x = torch.randn((1, 1, 256, 256))  # example input: batch=1, channels=1, height=256, width=256
output = autoencoder(x)
# A complete autoencoder should give (1, 1, 256, 256); with the decoder's
# up-sampling layers disabled the recorded run printed (1, 128, 64, 64).
print("Shape de la salida:", output.shape)
summary(
    autoencoder,
    input_size=(64, 1, 256, 256),  # (batch_size, channels, height, width)
)

Shape de la salida: torch.Size([1, 128, 64, 64])


Layer (type:depth-idx)                   Output Shape              Param #
Autoencoder                              [64, 128, 64, 64]         --
├─Sequential: 1-1                        [64, 128, 64, 64]         --
│    └─Conv2d: 2-1                       [64, 128, 128, 128]       1,280
│    └─BatchNorm2d: 2-2                  [64, 128, 128, 128]       256
│    └─LeakyReLU: 2-3                    [64, 128, 128, 128]       --
│    └─Conv2d: 2-4                       [64, 128, 64, 64]         147,584
├─Sequential: 1-2                        [64, 128, 64, 64]         --
│    └─LeakyReLU: 2-5                    [64, 128, 64, 64]         --
Total params: 149,120
Trainable params: 149,120
Non-trainable params: 0
Total mult-adds (Units.GIGABYTES): 40.03
Input size (MB): 16.78
Forward/backward pass size (MB): 2415.92
Params size (MB): 0.60
Estimated Total Size (MB): 2433.29