In [1]:
# import zipfile

# zip_file_path = "/teamspace/studios/this_studio/SpeechAssign2_Q2/Libri2Mix-20240418T033112Z-001.zip"
# output_directory = "/teamspace/studios/this_studio/SpeechAssign2_Q2"

# # Open the zip file
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     # Extract all the contents to the output directory
#     zip_ref.extractall(output_directory)


In [None]:
! pip install speechbrain
! pip install torchmetrics
! pip install pysndfx sox speechbrain mir_eval
! sudo apt-get install sox -y

In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torchaudio
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import speechbrain as sb
from speechbrain.dataio.dataset import DynamicItemDataset
from speechbrain.utils.checkpoints import Checkpointer
from speechbrain.dataio.dataio import read_audio
from speechbrain.inference.separation import SepformerSeparation as separator
from mir_eval.separation import bss_eval_sources

In [2]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        mixture_path = row['mixture_path']
        source_1_path = row['source_1_path']
        source_2_path = row['source_2_path']

        mixture_waveform = read_audio(mixture_path)
        source_1_waveform = read_audio(source_1_path)
        source_2_waveform = read_audio(source_2_path)

        return (mixture_waveform, source_1_waveform, source_2_waveform)

In [3]:
csv_file = '/teamspace/studios/this_studio/SpeechAssign2_Q2/Libri2Mix/wav8k/min/metadata/mixture_test_mix_clean.csv'

df = pd.read_csv(csv_file)

# Split the data into train and test with 70:30 ratio
train_df = df.sample(frac=0.7, random_state=25, replace=False)
test_df = df.drop(train_df.index)

# Reload the split data
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers = 6)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False,num_workers = 6)
for i in test_loader:
    print(len(i))
    
    print(len(i[0]))
    print(i[2])

    break
    


3
1
tensor([[ 0.0010,  0.0005,  0.0006,  ...,  0.0067, -0.0238, -0.1091]])


In [6]:
model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr',run_opts = {'device':'cuda'})

learning_rate = 1e-4
num_epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

sisnr_evaluation = sb.nnet.losses.get_si_snr_with_pitwrapper

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

cuda


In [8]:
running_sisnri = 0
running_sdri = 0

for audio1,audio2,audio3 in tqdm(test_loader):
    mix = audio1.to(device)
    # print(mix)
    audio2 = audio2.to(device)
    audio3 = audio3.to(device)

    with torch.no_grad():
        # Forward pass
        est_sources = model(mix)
        source1, source2  = audio2, audio3
        
        # Compute loss
        sources = torch.stack([source1, source2 ], dim=-1)
        sisnr = sisnr_evaluation(est_sources, sources)
        
        # Compute SI-SNR improvement
        mixture_signal = torch.stack([mix] * 2, dim=-1)
        sisnr_baseline = sisnr_evaluation(mixture_signal, sources)
        sisnr_i = sisnr - sisnr_baseline
    
        # Compute SDR
        sdr, _, _, _ = bss_eval_sources(sources.squeeze(0).t().cpu().numpy(),est_sources.squeeze(0).t().detach().cpu().numpy())
        sdr_baseline, _, _, _ = bss_eval_sources(sources.squeeze(0).t().cpu().numpy(),mixture_signal.squeeze(0).t().detach().cpu().numpy())
        sdr_i = sdr.mean() - sdr_baseline.mean()
    
        running_sisnri+=-sisnr_i.item()
        running_sdri+=sdr_i.item()
    
        torch.cuda.empty_cache()

100%|██████████| 900/900 [17:38<00:00,  1.18s/it]


In [10]:
# Print loss
print(f'isnri: {running_sisnri/len(test_loader)} , sdri: {running_sdri/len(test_loader)}')

isnri: 8.591031035366985 , sdri: 11.656670690982217


In [4]:
!pip install wandb




In [4]:
import wandb
import torch
import torch.nn.functional as F

wandb.init(project = 'Speech_Assignment2_Q2')
model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr',run_opts = {'device':'cuda'})

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

sisnr_evaluation = sb.nnet.losses.get_si_snr_with_pitwrapper

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

[34m[1mwandb[0m: Currently logged in as: [33mm23csa014[0m. Use [1m`wandb login --relogin`[0m to force relogin


cuda


In [5]:
torch.cuda.empty_cache()

In [6]:
num_epochs = 2

model.requires_grad_(True)
model.train()

for epoch in range(num_epochs):
    running_loss = 0
    for mix,source1, source2 in tqdm(train_loader):
        mix = mix[:4000].to(device)
        source1 = source1[:4000].to(device)
        source2 = source2[:4000].to(device)
        
        est_sources = model(mix)
          # = audio2, audio3
        
        # Compute loss
        sources = torch.stack([source1, source2 ], dim=-1)
        del source1
        del source2
        sisnr = sisnr_evaluation(est_sources, sources).mean()
        # sisnr_var = torch.autograd.Variable(sisnr, requires_grad=True)
        
        # Backward pass
        optimizer.zero_grad()
        sisnr.backward()
        optimizer.step()
        running_loss += sisnr.item()

        torch.cuda.empty_cache()

    # Print loss
    running_loss = running_loss/len(train_loader)
    wandb.log({'sisnr_loss':running_loss})
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss}')


  0%|          | 1/2100 [00:02<1:41:22,  2.90s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 9.06 MiB is free. Process 59635 has 14.74 GiB memory in use. Of the allocated memory 13.69 GiB is allocated by PyTorch, and 913.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [20]:
model.eval()  # Set the model to evaluation mode
total_sisnr_loss = 0

with torch.no_grad():
    for audio1, audio2, audio3 in tqdm(test_loader):
        mix = audio1.to(device)
        audio2 = audio2.to(device)
        audio3 = audio3.to(device)

        est_sources = model(mix)
        source1, source2 = audio2, audio3

        # Compute loss
        sources = torch.stack([source1, source2], dim=-1)
        sisnr = sisnr_evaluation(est_sources, sources).mean()
        
        total_sisnr_loss += sisnr.item()

        torch.cuda.empty_cache()

# Calculate average loss
avg_sisnr_loss = total_sisnr_loss / len(test_loader)

print(f'Testing complete. Average Si-SNR Loss: {avg_sisnr_loss}')
wandb.log({'sisnr_loss': avg_sisnr_loss})  # Log the average loss to wandb


100%|██████████| 900/900 [04:13<00:00,  3.55it/s]

Testing complete. Average Si-SNR Loss: -8.590278609345356



