In [1]:
import sys
from librosa.core import resample
import pandas as pd
import numpy as np
from IPython.display import Audio
import torch
import pathlib
def create_dir(filename):
    pathlib.Path('/'.join(filename.split('/')[:-1])).mkdir(parents=True, exist_ok=True)
from tqdm.notebook import tqdm
sys.path.append('Conv-TasNet/src/')
sys.path.append('SincNet/')
from conv_tasnet import *
from pit_criterion import cal_loss
from dnn_models import *
from data_io import ReadList,read_conf_inp,str_to_bool
from collections import Counter
import os
device = 1
device_ids = [1,2,3]
root = '../'
root_csv = '../'
old_sr = 16000
new_sr = 8000

In [2]:
def load8hz(filename):
    samples = np.load(filename)/(2**15)
    samples = resample(samples, old_sr, new_sr)
    # pad the samples
    if len(samples)>new_sr*2:
        samples = samples[:new_sr*2]
    if len(samples)<new_sr*2:
        padding = np.zeros(new_sr*2-len(samples))
        samples = np.concatenate([samples, padding])
    
    return samples

class E2ESet(torch.utils.data.Dataset):
    def __init__(self, root, csv):
        super().__init__()
        self.root = root
        self.csv = pd.read_csv(root+csv)
        self.speakers = list(set(self.csv['first_speaker']))
        self.speakers.sort()
        self.spkr2idx = {spkr:i for i, spkr in enumerate(self.speakers)}
    def __len__(self):
        return len(self.csv)
    def __getitem__(self, idx):
        row = self.csv.iloc[idx]
        sig1, sig2 = load8hz(root_csv+root+row['first_file']), load8hz(root_csv+root+row['second_file']) # original files
        spkr1, spkr2 = row['first_speaker'], row['second_speaker']
        target_vec = np.zeros(len(self.speakers))
        target_vec[self.spkr2idx[spkr1]] = 1
        target_vec[self.spkr2idx[spkr2]] = 1
        return sig1+sig2, target_vec
e2eset_train = E2ESet(root, 'overlay-train.csv')
e2eset_val = E2ESet(root, 'overlay-val.csv')
e2eset_test = E2ESet(root, 'overlay-test.csv')

In [3]:
tasnet = ConvTasNet(N = 256, L = 20, B = 256, H = 512, P = 3, X = 8, R = 4, C = 3, norm_type="gLN", causal=0,
             mask_nonlinear='relu').cuda(device)
if os.path.exists('models/tasnet.pth'):
    print('load tasnet model')
    checkpoint = torch.load('models/tasnet.pth')
    tasnet.load_state_dict(checkpoint['model_state_dict'])
    
    
    
    
fs=new_sr
cw_len=200
#cw_shift=10
wlen=int(fs*cw_len/1000.00)

class MixedClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        cnn_arch = {
                'input_dim':wlen,
                'fs':fs,
                'cnn_N_filt':[80,60,60],
                'cnn_len_filt':[251,5,5],
                'cnn_max_pool_len':[3,3,3],
                'cnn_use_laynorm_inp':True,
                'cnn_use_batchnorm_inp':False,
                'cnn_use_laynorm':[True,True,True],
                'cnn_use_batchnorm':[False,False,False],
                'cnn_act':['leaky_relu','leaky_relu','leaky_relu'],
                'cnn_drop':[0.0,0.0,0.0]
                }
        self.cnn_net = SincNet(cnn_arch)

        dnn1_arch = {'input_dim': self.cnn_net.out_dim,
                  'fc_lay': [2048,2048,2048],
                  'fc_drop': [0.0,0.0,0.0], 
                  'fc_use_batchnorm': [True,True,True],
                  'fc_use_laynorm': [False,False,False],
                  'fc_use_laynorm_inp': False,
                  'fc_use_batchnorm_inp': False,
                  'fc_act': ['leaky_relu','leaky_relu','leaky_relu']
                  }
        self.dnn1 = MLP(dnn1_arch)


        dnn2_arch = {'input_dim':2048 ,
                  'fc_lay': [20],
                  'fc_drop': [0.0], 
                  'fc_use_batchnorm': [False],
                  'fc_use_laynorm': [False],
                  'fc_use_laynorm_inp': False,
                  'fc_use_batchnorm_inp': False,
                  'fc_act': ['linear'] # leakyrelu(1) is just identity mapping
                  }
        self.dnn2 = MLP(dnn2_arch)
        
        self.softmax = nn.Softmax(dim = 1)
    def forward(self, X):
        out = self.cnn_net(X)
        out = self.dnn1(out)
        out = self.dnn2(out)
        out = self.softmax(out)
        return out

cls = MixedClassifier().cuda(device)
if os.path.exists('models/sincnet.pth'):
    print('load sincnet model')
    checkpoint = torch.load('models/sincnet.pth')
    cls.load_state_dict(checkpoint['model_state_dict'])
cls.train()


class E2Enet(nn.Module):
    def __init__(self, tasnet, cls):
        super().__init__()
        self.tasnet = tasnet
        self.cls = cls
    def chop_chunk(self, signal):
        batch_size, signal_len = signal.shape
        N_fr=signal_len//wlen
        chunks = []
        for i in range(N_fr):
            chunks.append(signal[..., i*wlen:(i+1)*wlen]) # list of N_fr elements, each (batch_size*wlen)
        return chunks
    def estimate(self, chunks):
        out_vecs = []
        for chunk in chunks:
            out_vecs.append(self.cls(chunk)) # list of N_fr elements, each (batch_size*N_spkr), softmaxed
        out_tensor = torch.stack(out_vecs, dim = 1) # batch_size*N_fr*N_spkr
        out_tensor = out_tensor.mean(dim = 1) # batch_size*N_spkr
        return out_tensor
    def forward(self, sig_mixed):
        sig123 = self.tasnet(sig_mixed) # batch_size*wlen, batch_size*wlen
        sig1, sig2, sig3 = sig123[:, 0], sig123[:, 1], sig123[:, 2]
        chunks1, chunks2, chunks3 = self.chop_chunk(sig1), self.chop_chunk(sig2), self.chop_chunk(sig3)
        pred1, pred2, pred3 = self.estimate(chunks1), self.estimate(chunks2), self.estimate(chunks3)
        pred_combined = torch.stack([pred1, pred2, pred3], dim = 0) # 2*batch_size*N_spkr
        pred_combined , _ = torch.max(pred_combined, dim = 0) # batch_size*N_spkr
        return pred_combined

e2enet = E2Enet(tasnet, cls).cuda(device)
e2enet.train()
e2enet = nn.DataParallel(e2enet, device_ids = device_ids)

optimizer = torch.optim.Adam(e2enet.parameters(), lr = 0.001)
if False and os.path.exists('models/e2enet.pth'):
    print('load model')
    checkpoint = torch.load('models/e2enet.pth')
    e2enet.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    loss = checkpoint['loss']
    if 'bestacc' in checkpoint:
        bestacc = checkpoint['bestacc']
    else:
        bestacc = 0.0
else:
    print('using loaded tasnet+sincnet')
    bestacc = 0.0



load model
load sincnet model
using loaded tasnet+sincnet


In [22]:
def find_max3(tensor):
    array = tensor.cpu().detach().numpy()
    max3 = []
    for row in array:
        max3.append(np.argsort(row)[::-1][:3])
    return np.array(max3)

def compute_corrects(tensor1, tensor2):
    max_1, max_2 = find_max2(tensor1), find_max2(tensor2)
    batch_size = max_1.shape[0]
    batch_corrects = 0
    for i in range(batch_size):
        if Counter(max_1[i])==Counter(max_2[i]):
            batch_corrects+=1
    return batch_corrects

In [5]:
batch_size = 16
e2eloader_train  = torch.utils.data.DataLoader(e2eset_train, batch_size=batch_size, shuffle=True, pin_memory = True, num_workers = 16)
e2eloader_val  = torch.utils.data.DataLoader(e2eset_val, batch_size=batch_size, shuffle=True, pin_memory = True, num_workers = 16)
criterion = torch.nn.BCELoss()

for epoch in range(64):
    running_loss = 0.0
    running_accuracy = 0.0
    for batch_idx, (mixed_sig, target) in enumerate(tqdm(e2eloader_train)):
        optimizer.zero_grad()
        mixed_sig, target = mixed_sig.float().cuda(device), target.float().cuda(device)
        out = e2enet(mixed_sig)
        loss = criterion(out, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(e2enet.parameters(), 0.5)
        optimizer.step()

        running_loss += loss.item()
        running_accuracy += compute_corrects(out, target)/batch_size

        if batch_idx % 200 == 199:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f accuracy: %.3f' % 
                  (epoch + 1, batch_idx + 1, running_loss / 200, running_accuracy / 200))
            running_loss = 0.0
            running_accuracy = 0.0
            torch.save({
            'model_state_dict': e2enet.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
            }, 'models/e2enet.pth')

    with torch.no_grad():    
        corrects = 0
        for batch_idx, (mixed_sig, target) in enumerate(tqdm(e2eloader_val)):
            mixed_sig, target = mixed_sig.float().cuda(device), target.float().cuda(device)
            out = e2enet(mixed_sig)
            corrects += compute_corrects(out, target)
        print('val acc:', corrects/len(e2eset_val))
        if corrects/len(e2eset_val) > bestacc:
            bestacc = corrects/len(e2eset_val)
            torch.save({
            'model_state_dict': e2enet.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            'bestacc': bestacc
            }, 'models/best-e2enet.pth')
        e2enet.train()
    pass

HBox(children=(FloatProgress(value=0.0, max=5510.0), HTML(value='')))

  frame = signal.new_tensor(frame).long()  # signal may in GPU or CPU


[1,   200] loss: 0.112 accuracy: 0.792
[1,   400] loss: 0.108 accuracy: 0.816
[1,   600] loss: 0.109 accuracy: 0.802
[1,   800] loss: 0.107 accuracy: 0.821
[1,  1000] loss: 0.106 accuracy: 0.806


KeyboardInterrupt: 

In [10]:
e2enet.eval()
batch_size = 16
e2eloader_test  = torch.utils.data.DataLoader(e2eset_test, batch_size=batch_size, shuffle=True, pin_memory = True, num_workers = 16)
with torch.no_grad():    
    corrects = 0
    for batch_idx, (mixed_sig, target) in enumerate(tqdm(e2eloader_test)):
        mixed_sig, target = mixed_sig.float().cuda(device), target.float().cuda(device)
        out = e2enet(mixed_sig)
        corrects += compute_corrects(out, target)
    print('test acc:', corrects/len(e2eset_test))
e2enet.train()
pass

HBox(children=(FloatProgress(value=0.0, max=12398.0), HTML(value='')))

  frame = signal.new_tensor(frame).long()  # signal may in GPU or CPU


NameError: Caught NameError in replica 0 on device 1.
Original Traceback (most recent call last):
  File "/home/junzhez2/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/junzhez2/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "<ipython-input-5-001d503c6886>", line 96, in forward
    pred1, pred2, pred3 = self.estimate(chunks1), self.estimate(chunks2), self.estimate(chunks3)
NameError: name 'chunks3' is not defined


In [88]:
tasnet

ConvTasNet(
  (encoder): Encoder(
    (conv1d_U): Conv1d(1, 256, kernel_size=(20,), stride=(10,), bias=False)
  )
  (separator): TemporalConvNet(
    (network): Sequential(
      (0): ChannelwiseLayerNorm()
      (1): Conv1d(256, 256, kernel_size=(1,), stride=(1,), bias=False)
      (2): Sequential(
        (0): Sequential(
          (0): TemporalBlock(
            (net): Sequential(
              (0): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
              (1): PReLU(num_parameters=1)
              (2): GlobalLayerNorm()
              (3): DepthwiseSeparableConv(
                (net): Sequential(
                  (0): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512, bias=False)
                  (1): PReLU(num_parameters=1)
                  (2): GlobalLayerNorm()
                  (3): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
                )
              )
            )
          )
          (1): TemporalBlock(
      

In [34]:
tasnet.eval()
test_csv = pd.read_csv(root_csv+root+'testfiles/start.csv')
sig1 = load8hz(root_csv+root+test_csv[test_csv['speaker']=='martin_savage'].iloc[6]['segfile'])
sig2 = load8hz(root_csv+root+test_csv[test_csv['speaker']=='andrea_arsenault'].iloc[0]['segfile'])
sig3 = load8hz(root_csv+root+test_csv[test_csv['speaker']=='lou_waters'].iloc[6]['segfile'])
mixed_sig = torch.Tensor(sig1+sig2+sig3)[None, ...].float().cuda(device)
out = tasnet(mixed_sig).cpu().detach().numpy()[0]
new_sig1, new_sig2, new_sig3 = out[0], out[1], out[2]

IndexError: single positional indexer is out-of-bounds

In [27]:
Audio(new_sig1, rate = new_sr)

In [24]:
Audio(new_sig2, rate = new_sr)

In [25]:
Audio(new_sig3, rate = new_sr)

In [28]:
print(e2eset_train.speakers)
test_csv

['andrea_arsenault', 'brian_lamb', 'csp_waj_susan', 'david_brancaccio', 'eddie_mair', 'joie_chen', 'kathleen_kennedy', 'leon_harris', 'linda_wertheimer', 'linden_soles', 'lisa_mullins', 'lou_waters', 'lynn_vaughan', 'mark_mullen', 'natalie_allen', 'noah_adams', 'peter_jennings', 'robert_siegel', 'ted_koppel', 'thalia_assuras']


Unnamed: 0,end,segfile,segment_idx,silence_ratio,soundfile,speaker,start
0,16.639,testfiles/start_segments/e960510b_seg0.npy,0.0,0.00,testfiles/files/e960510b.sph,martin_savage,1.071
1,46.576,testfiles/start_segments/e960510b_seg1.npy,1.0,0.05,testfiles/files/e960510b.sph,lou_waters,16.639
2,62.153,testfiles/start_segments/e960510b_seg2.npy,2.0,0.05,testfiles/files/e960510b.sph,lou_waters,47.227
3,75.420,testfiles/start_segments/e960510b_seg3.npy,3.0,0.00,testfiles/files/e960510b.sph,martin_savage,63.025
4,98.866,testfiles/start_segments/e960510b_seg4.npy,4.0,0.08,testfiles/files/e960510b.sph,jamie_mcintyre,77.279
...,...,...,...,...,...,...,...
745,1553.800,testfiles/start_segments/e960513b_seg90.npy,90.0,0.18,testfiles/files/e960513b.sph,mark_bernheimer,1547.015
746,1563.705,testfiles/start_segments/e960513b_seg91.npy,91.0,0.00,testfiles/files/e960513b.sph,e960513b_f_us_016,1553.823
747,1577.195,testfiles/start_segments/e960513b_seg92.npy,92.0,0.10,testfiles/files/e960513b.sph,mark_bernheimer,1563.705
748,1580.689,testfiles/start_segments/e960513b_seg94.npy,94.0,0.00,testfiles/files/e960513b.sph,lou_waters,1578.420
