In [27]:
import os
import librosa
import numpy as np
import pretty_midi
from librosa.display import specshow
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.functional import pad
import tensorflow as tf
import sys


# Add the path to the cloned models/research/audioset directory
# sys.path.append(os.path.join('/home/lordvirg/university/GANmaster/Generate_Your_Own_Music/models/research/audioset/vggish'))

In [2]:
# Where the raw stems live and where derived features are written
VOCALS_DIR = "dataset/vocals"
INSTRUMENTALS_DIR = "dataset/instrumentals"
PROCESSED_DIR = "processed_data"

# One output sub-folder per stem type; existing folders are left untouched
for _stem in ("vocals", "instrumentals"):
    os.makedirs(os.path.join(PROCESSED_DIR, _stem), exist_ok=True)

In [3]:
def mp3_to_melspectrogram(file_path, n_fft=2048, hop_length=512, n_mels=128, target_duration=30, sr=22050):
    """Load an audio file and return a fixed-duration mel-spectrogram in dB.

    The waveform is loaded at ``sr``, forced to exactly ``target_duration``
    seconds (zero-padded at the end or truncated), turned into a mel
    spectrogram and converted to decibels relative to its own maximum.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_fft: FFT window size passed to ``librosa.feature.melspectrogram``.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of mel bands.
        target_duration: Clip length in seconds that every file is forced to.
        sr: Sampling rate requested from ``librosa.load``.

    Returns:
        The dB-scaled mel-spectrogram produced by ``librosa.power_to_db``.
    """
    samples, sr = librosa.load(file_path, sr=sr)
    wanted = int(target_duration * sr)  # clip length expressed in samples

    # Positive -> clip is too short, negative -> too long, zero -> already exact
    missing = wanted - len(samples)
    if missing > 0:
        samples = np.pad(samples, (0, missing), mode='constant')
    elif missing < 0:
        samples = samples[:wanted]

    power = librosa.feature.melspectrogram(
        y=samples, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    # ref=np.max: decibels are expressed relative to the loudest bin of this clip
    return librosa.power_to_db(power, ref=np.max)


def normalize_spectrogram(spec):
    """Rescale a spectrogram to the range [0, 1] using its global min and max.

    A single minimum/maximum pair is taken over the whole array, so relative
    level differences between time frames and between frequency bands are
    preserved. (A column-wise scaler would stretch every time frame to [0, 1]
    independently and erase the loudness envelope.)

    Args:
        spec: Array-like spectrogram, e.g. shape ``(n_mels, n_frames)``.

    Returns:
        Floating-point array of the same shape with values in [0, 1]. A
        constant input maps to all zeros.

    Raises:
        ValueError: If ``spec`` is empty.
    """
    spec = np.asarray(spec)
    if spec.size == 0:
        raise ValueError("cannot normalize an empty spectrogram")
    if not np.issubdtype(spec.dtype, np.floating):
        spec = spec.astype(np.float64)

    lowest = spec.min()
    span = spec.max() - lowest
    if span == 0:
        # No dynamic range at all (e.g. pure silence): avoid dividing by zero
        return np.zeros_like(spec)
    return (spec - lowest) / span

def segment_spectrogram(spec, frame_length=128):
    """Cut a spectrogram into 50%-overlapping windows along the time axis.

    Windows start every ``frame_length // 2`` frames (at least 1) and only
    complete windows are kept, so trailing frames that do not fill a whole
    window are dropped.

    Args:
        spec: 2-D array shaped ``(n_bins, n_frames)``.
        frame_length: Number of time frames per window; must be >= 1.

    Returns:
        Array shaped ``(n_segments, n_bins, frame_length)``. When ``spec`` is
        shorter than one window the result has shape
        ``(0, n_bins, frame_length)`` so callers always receive a 3-D array.

    Raises:
        ValueError: If ``frame_length`` is smaller than 1.
    """
    if frame_length < 1:
        raise ValueError("frame_length must be a positive integer")

    spec = np.asarray(spec)
    # frame_length // 2 would be 0 for frame_length == 1, which range() rejects
    hop = max(1, frame_length // 2)
    last_start = spec.shape[1] - frame_length

    segments = [spec[:, start:start + frame_length] for start in range(0, last_start + 1, hop)]
    if not segments:
        return np.empty((0, spec.shape[0], frame_length), dtype=spec.dtype)
    return np.stack(segments)


In [4]:
def _process_stems(source_dir, stem_name):
    """Convert every MP3 in ``source_dir`` into saved mel-spectrogram segments.

    Each file is loaded with ``mp3_to_melspectrogram``, normalised with
    ``normalize_spectrogram``, windowed with ``segment_spectrogram`` and saved
    as ``<name>_mel_segments.npy`` under ``PROCESSED_DIR/<stem_name>``.

    Args:
        source_dir: Folder containing the ``.mp3`` files.
        stem_name: Sub-folder of ``PROCESSED_DIR`` that receives the output.

    Returns:
        Sorted list of the MP3 file names that were processed.
    """
    out_dir = os.path.join(PROCESSED_DIR, stem_name)
    # Sorted so the processing order does not depend on the filesystem
    mp3_files = sorted(f for f in os.listdir(source_dir) if f.lower().endswith(".mp3"))
    for file in mp3_files:
        mel_spec = mp3_to_melspectrogram(os.path.join(source_dir, file))
        mel_segments = segment_spectrogram(normalize_spectrogram(mel_spec))
        # splitext strips only the final extension; str.replace would also
        # rewrite any ".mp3" that happens to appear inside the base name
        base_name = os.path.splitext(file)[0]
        np.save(os.path.join(out_dir, base_name + "_mel_segments.npy"), mel_segments)
    return mp3_files


# Process Vocals
vocal_files = _process_stems(VOCALS_DIR, "vocals")

# Process Instrumentals
instrumental_files = _process_stems(INSTRUMENTALS_DIR, "instrumentals")


In [5]:
class VocalInstrumentalDataset(Dataset):
    """Paired vocal / instrumental spectrogram segments stored as ``.npy`` files.

    Files are paired by their position in the alphabetically sorted listing of
    each directory, so both directories are expected to hold the same tracks
    under names that sort in the same order.

    Each item is a ``(vocal, instrumental)`` tuple of float32 tensors with a
    leading channel dimension. Only the first segment of a multi-segment file
    is used.
    """

    def __init__(self, vocal_dir, instrumental_dir):
        """Index the ``.npy`` files of both directories.

        Args:
            vocal_dir: Folder holding the vocal segment files.
            instrumental_dir: Folder holding the instrumental segment files.
        """
        self.vocal_files = self._list_npy(vocal_dir)
        self.instrumental_files = self._list_npy(instrumental_dir)

        n_vocal, n_instr = len(self.vocal_files), len(self.instrumental_files)
        if n_vocal != n_instr:
            # Indexing past the shorter list would raise IndexError mid-epoch,
            # so restrict the dataset to the pairs that actually exist.
            import warnings

            n_pairs = min(n_vocal, n_instr)
            warnings.warn(
                f"Found {n_vocal} vocal files but {n_instr} instrumental files; "
                f"using the first {n_pairs} of each. Pairing is by sorted position, "
                "so check that the remaining files really correspond."
            )
            self.vocal_files = self.vocal_files[:n_pairs]
            self.instrumental_files = self.instrumental_files[:n_pairs]

    @staticmethod
    def _list_npy(directory):
        """Return the sorted full paths of all ``.npy`` files in ``directory``."""
        return sorted(os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".npy"))

    @staticmethod
    def _load_single_segment(path):
        """Load ``path`` and return one 2-D segment with a channel axis prepended."""
        data = np.load(path)
        if data.ndim == 3:  # [n_segments, n_mels, n_frames] -> keep the first segment
            data = data[0]
        return np.expand_dims(data, axis=0)  # [1, n_mels, n_frames]

    def __len__(self):
        """Number of (vocal, instrumental) pairs."""
        return len(self.vocal_files)

    def __getitem__(self, idx):
        """Return pair ``idx`` as two float32 tensors."""
        vocal = self._load_single_segment(self.vocal_files[idx])
        instrumental = self._load_single_segment(self.instrumental_files[idx])
        return torch.tensor(vocal, dtype=torch.float32), torch.tensor(instrumental, dtype=torch.float32)


In [6]:
class Generator(nn.Module):
    """Four-level convolutional encoder-decoder with additive skip connections.

    Every encoder layer is a 4x4 convolution with stride 2 and padding 1; every
    decoder layer is the matching transposed convolution. The skip connections
    add an encoder activation to the decoder activation of the same level, so
    those two tensors must have identical shapes (this holds when both input
    spatial sizes are multiples of 16).

    The final activation is tanh, so outputs lie in [-1, 1].
    NOTE(review): targets produced by ``normalize_spectrogram`` lie in [0, 1];
    confirm that the tanh output range is intended.
    """

    def __init__(self, input_channels, output_channels):
        """Build the layers.

        Args:
            input_channels: Channel count of the input tensor.
            output_channels: Channel count of the generated tensor.
        """
        super().__init__()
        sampling = dict(kernel_size=4, stride=2, padding=1)

        # Encoder: channel width grows 64 -> 128 -> 256 -> 512
        self.enc1 = nn.Conv2d(input_channels, 64, **sampling)
        self.enc2 = nn.Conv2d(64, 128, **sampling)
        self.enc3 = nn.Conv2d(128, 256, **sampling)
        self.enc4 = nn.Conv2d(256, 512, **sampling)

        # Decoder: mirror image of the encoder
        self.dec4 = nn.ConvTranspose2d(512, 256, **sampling)
        self.dec3 = nn.ConvTranspose2d(256, 128, **sampling)
        self.dec2 = nn.ConvTranspose2d(128, 64, **sampling)
        self.dec1 = nn.ConvTranspose2d(64, output_channels, **sampling)

        # Activations
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Map ``x`` through the encoder and back up through the decoder."""
        act = self.relu

        # Downsampling path; the first three activations are reused as skips
        skip1 = act(self.enc1(x))
        skip2 = act(self.enc2(skip1))
        skip3 = act(self.enc3(skip2))
        bottleneck = act(self.enc4(skip3))

        # Upsampling path; each level adds the encoder activation of equal shape
        up = act(self.dec4(bottleneck))
        up = act(self.dec3(up + skip3))
        up = act(self.dec2(up + skip2))
        return self.tanh(self.dec1(up + skip1))

In [7]:
class Discriminator(nn.Module):
    """Convolutional critic that outputs a single-channel score map.

    Three stride-2 4x4 convolutions (the last two followed by batch norm) feed
    a final stride-1 4x4 convolution with one output channel. No activation is
    applied to the output.
    """

    def __init__(self, input_channels):
        """Build the layer stack.

        Args:
            input_channels: Channel count of the tensor being judged.
        """
        super().__init__()

        # First stage has no normalisation
        layers = [
            nn.Conv2d(input_channels, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),
        ]
        # Remaining downsampling stages: conv -> batch norm -> leaky ReLU
        for in_ch, out_ch in ((64, 128), (128, 256)):
            layers.extend([
                nn.Conv2d(in_ch, out_ch, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.2),
            ])
        # Scoring head
        layers.append(nn.Conv2d(256, 1, kernel_size=4, stride=1, padding=1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw score map for ``x``."""
        return self.model(x)

In [92]:
class PatchDiscriminator(nn.Module):
    """Patch-level critic: scores local regions instead of the whole input.

    The layer stack is three stride-2 4x4 convolutions (batch norm after the
    second and third) followed by a stride-1 4x4 convolution that produces a
    single-channel feature map of scores.
    """

    def __init__(self, input_channels):
        """Build the layer stack.

        Args:
            input_channels: Channel count of the tensor being judged.
        """
        super().__init__()

        down1 = nn.Conv2d(input_channels, 64, kernel_size=4, stride=2, padding=1)
        down2 = nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1)
        norm2 = nn.BatchNorm2d(128)
        down3 = nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1)
        norm3 = nn.BatchNorm2d(256)
        # Output single-channel feature map
        head = nn.Conv2d(256, 1, kernel_size=4, stride=1, padding=1)

        self.model = nn.Sequential(
            down1, nn.LeakyReLU(0.2),
            down2, norm2, nn.LeakyReLU(0.2),
            down3, norm3, nn.LeakyReLU(0.2),
            head,
        )

    def forward(self, x):
        """Return the raw patch score map for ``x``."""
        return self.model(x)


In [87]:
# Loss Functions
def adversarial_loss(predictions, targets):
    """Least-squares adversarial loss: mean squared error between scores and labels."""
    return nn.functional.mse_loss(predictions, targets)

def reconstruction_loss(predicted, target):
    """Mean absolute (L1) difference between the prediction and the target."""
    return nn.functional.l1_loss(predicted, target)

In [86]:
# Training Function
def train(generator, discriminator, dataloader, g_optimizer, d_optimizer, epochs, device):
    """Run conditional-GAN training for ``epochs`` passes over ``dataloader``.

    For every batch the discriminator is updated first, on (vocal, real
    instrumental) versus (vocal, generated instrumental) pairs, and then the
    generator is updated with the adversarial loss plus the L1 reconstruction
    loss against the real instrumental.

    Args:
        generator: Module mapping a vocal tensor to a generated instrumental.
        discriminator: Module scoring a channel-wise concatenated pair.
        dataloader: Iterable yielding ``(vocal, instrumental)`` batches.
        g_optimizer: Optimizer holding the generator parameters.
        d_optimizer: Optimizer holding the discriminator parameters.
        epochs: Number of passes over ``dataloader``.
        device: Device the batches are moved to.

    Returns:
        ``(d_loss, g_loss)`` of the last processed batch as Python floats, or
        ``(0, 0)`` if no batch was processed.
    """
    generator.train()
    discriminator.train()
    g_loss_item = 0
    d_loss_item = 0

    for epoch in range(epochs):
        for i, (vocal, instrumental) in enumerate(dataloader):
            vocal, instrumental = vocal.to(device), instrumental.to(device)

            # One generator forward pass serves both update steps; the
            # generator weights do not change until g_optimizer.step().
            fake_instrumental = generator(vocal)

            # Train Discriminator
            d_optimizer.zero_grad()
            real_pair = torch.cat((vocal, instrumental), dim=1)
            # detach(): the discriminator update must not backpropagate into
            # the generator (saves a full backward pass through it)
            fake_pair = torch.cat((vocal, fake_instrumental.detach()), dim=1)

            # Each pair goes through the discriminator exactly once, so its
            # BatchNorm running statistics are updated once per pair
            real_pred = discriminator(real_pair)
            fake_pred = discriminator(fake_pair)
            real_loss = adversarial_loss(real_pred, torch.ones_like(real_pred))
            fake_loss = adversarial_loss(fake_pred, torch.zeros_like(fake_pred))
            d_loss = (real_loss + fake_loss) / 2

            d_loss.backward()
            d_optimizer.step()

            # Train Generator
            g_optimizer.zero_grad()
            # Not detached here: gradients must reach the generator
            g_pred = discriminator(torch.cat((vocal, fake_instrumental), dim=1))
            g_loss_adv = adversarial_loss(g_pred, torch.ones_like(g_pred))
            g_loss_rec = reconstruction_loss(fake_instrumental, instrumental)
            g_loss = g_loss_adv + g_loss_rec

            g_loss.backward()
            g_optimizer.step()

            g_loss_item = g_loss.item()
            d_loss_item = d_loss.item()

            if i % 10 == 0:
                print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(dataloader)}], D Loss: {d_loss_item:.4f}, G Loss: {g_loss_item:.4f}")

    return d_loss_item, g_loss_item

In [41]:
def validate(generator, dataloader, device):
    """Evaluate ``generator`` on ``dataloader`` and report mean SNR and LSD.

    Both metrics are computed once per batch and then averaged over batches
    (unweighted, so a smaller final batch counts as much as a full one).

    Args:
        generator: Module mapping a vocal tensor to a generated instrumental.
            It is switched to eval mode and left in that mode.
        dataloader: Iterable yielding ``(vocal, instrumental)`` batches.
        device: Device the batches are moved to.

    Returns:
        ``(avg_snr, avg_lsd)``. Both are NaN when ``dataloader`` yields no batch.
    """
    generator.eval()
    snr_list, lsd_list = [], []
    eps = 1e-10  # floor for the power terms so the log ratio stays finite

    with torch.no_grad():
        for vocal, instrumental in dataloader:
            vocal, instrumental = vocal.to(device), instrumental.to(device)
            predicted = generator(vocal)

            # Calculate SNR (dB); clamping avoids +/-inf when the prediction is
            # exact or the target batch is all zeros
            signal_power = torch.sum(instrumental**2).clamp_min(eps)
            noise_power = torch.sum((instrumental - predicted)**2).clamp_min(eps)
            snr = 10 * torch.log10(signal_power / noise_power)
            snr_list.append(snr.item())

            # Calculate LSD
            # NOTE(review): the inputs here look like already-normalised
            # dB-scale spectrograms; passing them through amplitude_to_db
            # applies a second log. Confirm this is the intended metric.
            instrumental_spec = librosa.amplitude_to_db(torch.squeeze(instrumental.cpu()).numpy(), ref=np.max)
            predicted_spec = librosa.amplitude_to_db(torch.squeeze(predicted.cpu()).numpy(), ref=np.max)
            lsd = np.mean(np.sqrt(np.mean((instrumental_spec - predicted_spec)**2, axis=-1)))
            lsd_list.append(lsd)

    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead
    avg_snr = np.mean(snr_list) if snr_list else np.nan
    avg_lsd = np.mean(lsd_list) if lsd_list else np.nan

    print(f"Validation Results - SNR: {avg_snr:.4f}, LSD: {avg_lsd:.4f}")

    return avg_snr, avg_lsd



In [99]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

# Run configuration
SEED = 42                    # makes the split, weight init and shuffling repeatable
BATCH_SIZE = 16
TRAIN_FRACTION = 0.8
TRAIN_PASSES_PER_EPOCH = 5   # train() makes this many passes over train_loader per outer epoch

torch.manual_seed(SEED)

# Initialize dataset and dataloader
dataset = VocalInstrumentalDataset(
    vocal_dir=os.path.join(PROCESSED_DIR, "vocals"),
    instrumental_dir=os.path.join(PROCESSED_DIR, "instrumentals")
)

train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size
if train_size == 0:
    raise ValueError(
        f"Need at least 2 paired files to build a train/validation split, found {len(dataset)}"
    )
# Explicit generator: the split stays the same across kernel restarts
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize models
generator = Generator(input_channels=1, output_channels=1).to(device)
discriminator = Discriminator(input_channels=2).to(device)
# discriminator = PatchDiscriminator(input_channels=2).to(device)

# Optimizer / scheduler configurations tried so far (only the last one is active):
#   baseline      Adam lr=0.0002, betas=(0.5, 0.999); StepLR(step_size=20, gamma=0.5)
#   fast + steep  Adam lr=0.001,  betas=(0.5, 0.999); StepLR(step_size=10, gamma=0.3)
#   slow + stable Adam lr=0.0001, betas=(0.7, 0.999); StepLR(step_size=30, gamma=0.7)
#   cosine        Adam lr=0.0002, betas=(0.5, 0.999); CosineAnnealingLR(T_max=50, eta_min=1e-6)
#   RMSprop       RMSprop lr=0.0002, alpha=0.99;      ExponentialLR(gamma=0.97)
#   plateau       Adam lr=0.0001, betas=(0.5, 0.999); ReduceLROnPlateau(mode='min', factor=0.5, patience=5)
#                 (step with the loss: g_scheduler.step(g_loss) / d_scheduler.step(d_loss))
#   milestones    Adam lr=0.0004, betas=(0.5, 0.999); MultiStepLR(milestones=[10, 20, 30], gamma=0.5)
#   cyclic        Adam lr=0.0001, betas=(0.5, 0.999); CyclicLR(base_lr=0.0001, max_lr=0.001, step_size_up=2000)

# Active: different learning rates for generator and discriminator
g_optimizer = optim.Adam(generator.parameters(), lr=0.0003, betas=(0.5, 0.999))
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.999))
g_scheduler = optim.lr_scheduler.StepLR(g_optimizer, step_size=15, gamma=0.5)
d_scheduler = optim.lr_scheduler.StepLR(d_optimizer, step_size=15, gamma=0.5)

best_snr = float('-inf')  # Track the best SNR value
best_lsd = float('inf')   # Track the best LSD value
best_epoch = 0            # 1-based epoch of the best result; 0 means none recorded yet

# Training and validation
epochs = 50
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    d_loss, g_loss = train(
        generator, discriminator, train_loader, g_optimizer, d_optimizer,
        epochs=TRAIN_PASSES_PER_EPOCH, device=device,
    )
    avg_snr, avg_lsd = validate(generator, val_loader, device)

    # A new best is recorded only when BOTH metrics improve together
    if avg_snr > best_snr and avg_lsd < best_lsd:
        best_snr = avg_snr
        best_lsd = avg_lsd
        best_epoch = epoch + 1  # same 1-based numbering as the progress log above

    print(f"Best Results - Epoch: {best_epoch}, Best SNR: {best_snr:.4f}, Best LSD: {best_lsd:.4f}")

    # Step schedulers once per outer epoch
    g_scheduler.step()
    d_scheduler.step()

print("Training Complete!")



cuda
Epoch 1/50


ValueError: LSTM: Expected input to be 2D or 3D, got 4D instead