In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import speechbrain as sb
from speechbrain.dataio.dataio import read_audio
import pandas as pd
import matplotlib.pyplot as plt

from speechbrain.inference.separation import SepformerSeparation as separator
import torchaudio
from speechbrain.dataio.batch import PaddedBatch
from mir_eval.separation import bss_eval_sources
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# df = pd.read_csv('D:/programming/datasets/Libri2Mix/wav8k/min/metadata/mixture_test_mix_clean.csv')
# df.head()

#randomly split the df into train and test data with 70:30 ratio
# train_df = df.sample(frac=0.7, random_state=25,replace=False)
# test_df = df.drop(train_df.index)

# train_df.to_csv('mixture_train_split_mix_clean.csv',index=False)
# test_df.to_csv('mixture_test_split_mix_clean.csv',index=False)

In [3]:
train_df = pd.read_csv('mixture_train_split_mix_clean.csv')
test_df = pd.read_csv('mixture_test_split_mix_clean.csv')

In [4]:
train_df.head()

Unnamed: 0,mixture_ID,mixture_path,source_1_path,source_2_path,length
0,3570-5695-0006_3575-170457-0053,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,59760
1,1284-1180-0010_4507-16021-0024,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,41120
2,2830-3979-0000_5142-36377-0004,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,43880
3,4077-13754-0000_6930-76324-0025,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,32960
4,7176-92135-0045_3729-6852-0039,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,D:\programming\datasets\Libri2Mix\wav8k\min\te...,41840


In [5]:
class LibriMix(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        mix = read_audio(row['mixture_path'])
        s1 = read_audio(row['source_1_path'])
        s2 = read_audio(row['source_2_path'])
        return {'s1':s1, 's2': s2, 'mix':mix}

In [6]:
train_dataset = LibriMix(train_df)
test_dataset = LibriMix(test_df)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True,collate_fn=PaddedBatch)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False,collate_fn=PaddedBatch)

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr',run_opts={"device":device})
model.requires_grad_(True)
class SeparationBrain(sb.Brain):
    def __init__(self, train_loss, modules, opt_class,run_opts):
        super(SeparationBrain, self).__init__(modules=modules, opt_class=opt_class,run_opts=run_opts)
        self.train_loss = train_loss

    def compute_forward(self, batch):
        """Forward computations from the mixture to the separated signals."""
        # print(batch['mix'][0])
        # import pdb; pdb.set_trace()
        # mix, _ = batch['mix']
        # Get the estimates for the sources
        est_sources = self.modules.mdl.forward(batch)
        return est_sources

    def compute_objectives(self, targets, est_sources):
        """Computes the loss functions between estimated and ground truth sources"""
        if self.train_loss == 'l1':
          return (est_sources - targets).abs().mean()
        elif self.train_loss == 'si-snr':
          return sb.nnet.losses.get_si_snr_with_pitwrapper(targets, est_sources).mean()


    def fit_batch(self, batch):
        """Trains one batch"""
        # Unpacking batch list
        batch.to(self.device)
        source1, source2, mix =  batch['s1'][0] , batch['s2'][0], batch['mix'][0]
        targets = torch.stack([source1, source2], dim=-1)

        est_sources = self.compute_forward(mix)
        loss = self.compute_objectives(targets, est_sources)
        
        # print(loss)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
  
        return loss.detach().cpu()

    def evaluate_batch(self, batch, stage):
        """Computations needed for test batches"""

        source1, source2, mix = batch['s1'][0] , batch['s2'][0], batch['mix'][0]
        targets = torch.stack([source1, source2], dim=-1)

        est_sources = self.compute_forward(mix)

        # Compute SI-SNR
        sisnr = sb.nnet.losses.get_si_snr_with_pitwrapper(est_sources, targets)

        # Compute SI-SNR improvement
        mixture_signal = torch.stack(
            [mix] * 2, dim=-1
        )
        mixture_signal = mixture_signal.to(targets.device)
        sisnr_baseline = sb.nnet.losses.get_si_snr_with_pitwrapper(
            mixture_signal, targets
        )
        sisnr_i = sisnr - sisnr_baseline

        
        # Compute SDR
        sdr, _, _, _ = bss_eval_sources(
            targets.t().cpu().numpy(),
            est_sources.t().detach().cpu().numpy(),
        )

        sdr_baseline, _, _, _ = bss_eval_sources(
            targets.t().cpu().numpy(),
            mixture_signal.t().detach().cpu().numpy(),
        )

        sdr_i = sdr.mean() - sdr_baseline.mean()

        # print(f'SI-SNRi = {-sisnr_i.item()}, SDRi = {sdr_i}')
        return{"si-snr_i": -sisnr_i.item(),"sdr_i": sdr_i} 
    
from functools import partial

optimizer = lambda x: torch.optim.Adam(x, lr=0.0001)
N_epochs = 1
epoch_counter = sb.utils.epoch_loop.EpochCounter(limit=N_epochs)
run_opts = {"device": device}
sepformer = SeparationBrain(
        train_loss='si-snr',
        modules={'mdl': model},
        opt_class=optimizer,
        run_opts=run_opts,

    )
sepformer.modules.mdl.training = True
# sepformer.compute_forward(next(iter(train_loader)))

sepformer.fit(
            epoch_counter,
            train_loader,
            test_loader)

  0%|          | 0/2100 [00:00<?, ?it/s]

tensor(-9.8298, device='cuda:0', grad_fn=<MeanBackward0>)


  0%|          | 1/2100 [00:10<5:50:28, 10.02s/it, train_loss=-9.83]


KeyboardInterrupt: 

In [155]:
epochs = 10
sepformer.modules.mdl.train()
for epoch in range(epochs):
    for batch in tqdm(train_loader):
        sepformer.fit_batch(batch)
        # a,b = batch['mix']
        # print(a.shape,b.shape)
        break
    break


  0%|          | 0/525 [00:13<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 9.85 GiB is allocated by PyTorch, and 721.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
sepformer.evaluate(test_loader, epoch_counter)

In [143]:
model.training = False

In [144]:
model.training

False

In [85]:
sepformer.compute_forward(next(iter(test_loader)).to(device)).shape

torch.Size([4, 50880, 2])

In [88]:
test_dataset

<__main__.LibriMix at 0x19e6e6f5ab0>

In [96]:
sepformer.evaluate_batch(next(iter(test_loader)).to(device), 'test')

  0%|          | 0/225 [00:00<?, ?it/s]


TypeError: expected Tensor as element 0 in argument 0, but got PaddedData