### ***포자랩스 과제***
---

In [1]:
!pip install torchaudio librosa numpy torch



In [2]:
### library import
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount("/content/drive")

# pytorch
import os
import torch
from torch import nn, optim
import torch.nn.functional as F
import torchaudio
from torchaudio.transforms import GriffinLim
import torchaudio.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader
import librosa

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
### data import
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

# audio data transform
def audio_to_spectrogram(waveform,n_fft=400,hop_length=160, n_mels=128):
    spectrogram_transform = transforms.MelSpectrogram(
        sample_rate=16000,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )
    spectrogram=spectrogram_transform(waveform)
    return spectrogram

# combine data
def load_dataset(audio_paths, label_paths):
    dataset = []
    for audio_path, label_path in zip(audio_paths, label_paths):
        waveform, _=torchaudio.load(audio_path)
        label=pd.read_csv(label_path)
        spectrogram=audio_to_spectrogram(waveform)
        dataset.append((spectrogram, label))
    return dataset

data_files=["/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2202.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2203.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2204.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2241.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2242.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2243.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2244.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2288.wav",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/X_train/2289.wav"
            ]
label_files=["/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2202.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2203.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2204.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2241.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2242.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2243.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2244.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2288.csv",
            "/content/drive/MyDrive/Self Study/deep learning/dataset/y_train/2289.csv"]

df=load_dataset(data_files,label_files)
print(len(df))



9


In [4]:
### dataloader
spectrograms=[item[0] for item in df]
labels=[item[1] for item in df]
labels=[df['instrument'].values for df in labels if 'instrument' in df.columns]

# resize spectrogram
def resize_spec(spectrogram,max_len):
  if spectrogram.shape[-1]<max_len:
    padding_size=max_len - spectrogram.shape[-1]
    padding=torch.zeros((spectrogram.shape[0], spectrogram.shape[1], padding_size))
    spectrogram=torch.cat((spectrogram, padding), dim=-1)
  elif spectrogram.shape[-1]>max_len:
    spectrogram=spectrogram[:, :, :max_len]
  return spectrogram

max_length=max([s.shape[-1] for s in spectrograms])
spectrograms=[resize_spec(s, max_length) for s in spectrograms]

# resize label
max_len_label=max(len(l) for l in labels)
padded_labels=[np.pad(l, (0, max_len_label - len(l)), 'constant', constant_values=0) for l in labels]

# to tensor
spectrograms_tensor=torch.stack(spectrograms).float()
labels_tensor=torch.tensor(padded_labels).float()

# to dataloader
batch_size=1
dataset=TensorDataset(spectrograms_tensor, labels_tensor)
dataloader=DataLoader(dataset,batch_size=batch_size,shuffle=True)

  labels_tensor=torch.tensor(padded_labels).float()


In [5]:
### get tensor size
data_iter = iter(dataloader)
spectrograms, labels = next(data_iter)

# size check
print("Spectrogram size:", spectrograms.size())

Spectrogram size: torch.Size([1, 1, 128, 86199])


In [None]:
### model consturction
# hyperparameters
  # (batch_size, channels, height, width)
input_dim = 1 * 128 * 86199
latent_dim=64
channels=1
epochs=50

# model
class AudioCVAE(nn.Module):
    def __init__(self, channels, latent_size):
        super(AudioCVAE, self).__init__()
        # encoder
        self.fc1=nn.Linear(input_dim,128)
        self.fc_mu=nn.Linear(128,latent_dim)
        self.fc_log_var=nn.Linear(128,latent_dim)

        # decoder
        self.fc2=nn.Linear(latent_dim,128)
        self.fc3=nn.Linear(128,input_dim)

    # encode
    def encode(self, x):
      h1=F.relu(self.fc1(x))
      return self.fc_mu(h1), self.fc_log_var(h1)

    # reparameterization
    def reparameterize(self, mu, log_var):
      std=torch.exp(0.5 * log_var)
      eps=torch.randn_like(std)
      return mu + eps * std

    # decode
    def decode(self, z):
      h2=F.relu(self.fc2(z))
      return torch.sigmoid(self.fc3(h2))

    def forward(self, x):
      mu,log_var=self.encode(x_flat)
      z=self.reparameterize(mu, log_var)
      return self.decode(z), mu, log_var

# print model
model=AudioCVAE(channels, latent_dim).to(device)
optimizer=optim.Adam(model.parameters(), lr=1e-3)
print(model)

In [None]:
def train_cvae(model, train_loader, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for i, (spectrograms, conditions) in enumerate(train_loader):
            spectrograms = spectrograms.to(device)
            conditions = conditions.to(device)
            optimizer.zero_grad()
            reconstruction, mu, logvar = model(spectrograms, conditions)

            # build loss function
            recon_loss = F.mse_loss(reconstruction, spectrograms)
            kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
            loss = recon_loss + kl_loss

            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {train_loss/len(train_loader):.4f}")

    print("Training complete")

train_cvae(model, dataloader, optimizer, epochs=50)

In [None]:
### print and save audio file
def generate_audio(model,latent_dim,device="cuda",path):
  z=torch.randn(1,latent_dim).to(device)
  model.eval()

  with torch.no_grad():
    generated_spectrogram, _, _ = model(z)

    waveform = torchaudio.transforms.GriffinLim(n_fft=1024, hop_length=256)(generated_spectrogram.squeeze(0))

    torchaudio.save('generated_audio.wav',path , waveform, sample_rate=16000)

path="/content/drive/MyDrive/Self Study/deep learning/dataset/result.wav"
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
generate_audio(model, latent_dim, device)