# Initialize environment
Install and import necessary packages

In [1]:
!pip install torchaudio
!pip install pystoi
!pip install https://github.com/ludlows/python-pesq/archive/master.zip
import numpy as np
import glob
import librosa
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Audio
import torch
import time
import torch.nn.functional as F
import torch.nn as nn
#import torch.optim as optim
import torchaudio as ta
if torch.cuda.is_available():
  cuda=True
else:
  cuda=False
#import datetime as dt
import os
from torch.autograd import Variable
from IPython.display import Image, display, clear_output
import datetime
from pystoi.stoi import stoi
from pesq import pesq

from google.colab import drive
drive.mount('/content/drive')

Collecting https://github.com/ludlows/python-pesq/archive/master.zip
[?25l  Downloading https://github.com/ludlows/python-pesq/archive/master.zip
[K     / 491kB 2.8MB/s
Building wheels for collected packages: pesq
  Building wheel for pesq (setup.py) ... [?25l[?25hdone
  Created wheel for pesq: filename=pesq-0.0.1-cp36-cp36m-linux_x86_64.whl size=162010 sha256=deba2538ef8ad2145d3b89ddb716081e60d42670d5360c18175b0eb721f93fc4
  Stored in directory: /tmp/pip-ephem-wheel-cache-qy5l3s7i/wheels/85/91/09/5ae7677a054a05d49111dc8f3b282e886b3852348384893a32
Successfully built pesq
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DataLoader class:

In [0]:
def get_filepaths(data_directory, dataset):
    """
    Returns a sorted list of file paths for the specified dataset.
    """
    assert dataset in {'clean-speech', 'noise', 'impulse-responses'}
    
    regex = '/'.join((data_directory, dataset, '*.wav'))
    filepaths = glob.glob(regex)

    if len(filepaths) == 0:
        raise Exception('No files were found in the specified dataset!')
    
    filepaths.sort()
    return filepaths

class DataLoader():
  def __init__(self, data_directory='/content/drive/My Drive/reverberation'
               , sample_rate=16000
               ,GT_range=[[0,3121]], IR_range=[[0,27]], N_range=[[0,15]]
               ,sequence_length_GT=32000, sequence_length_IR=16000):
    
    self.sample_rate=sample_rate
    self.seq_len_GT=sequence_length_GT
    self.seq_len_IR=sequence_length_IR
    self.marc=None
    self.GT_is_padded = False

    if GT_range:
      self.paths_GT=[]
      filepaths=get_filepaths(data_directory, 'clean-speech')
      for i in range(len(GT_range)):
        self.paths_GT.extend(filepaths[GT_range[i][0]:GT_range[i][1]])
    else:
      self.paths_GT=None

    if IR_range:
      self.paths_IR=[]
      filepaths=get_filepaths(data_directory, 'impulse-responses')
      for i in range(len(IR_range)):
        self.paths_IR.extend(filepaths[IR_range[i][0]:IR_range[i][1]])
    else:
      self.paths_IR=None

    if N_range:
      self.paths_N=[]
      filepaths=get_filepaths(data_directory, 'noise')
      for i in range(len(N_range)):
        self.paths_N.extend(filepaths[N_range[i][0]:N_range[i][1]])
    else:
      self.paths_N=None
    
    """
    if IR_range:
      self.paths_IR=get_filepaths(data_directory, 'impulse-responses')[IR_range[0]:IR_range[1]]
    else:
      self.paths_IR=None
    if N_range:
      self.paths_N=get_filepaths(data_directory, 'noise')[N_range[0]:N_range[1]]
    else:
      self.paths_N=None
    """
    print(f"Found {len(self.paths_GT)} ground truth files in data set")
    print(f"Found {len(self.paths_IR)} impulse response files in data set")
    print(f"Found {len(self.paths_N)} noise files in data set\n")


    self.use_cuda = torch.cuda.is_available()
    print("Running GPU.") if self.use_cuda else print("No GPU available.")

    ###Load and normalize
    self.GT=list()
    self.IR=list()
    self.N=list()
    
    self.GT_lengths=list()
    self.IR_lengths=list()
    self.N_lengths=list()

    self.GT_filenames=list()
    self.IR_filenames=list()
    self.N_filenames=list()

    if GT_range:
      print("GT")
      count=0
      maxcount=len(self.paths_GT)
      for file in self.paths_GT:
        count+=1
        if count % 100 == 1:
          print(f"Processing file {count} of {maxcount}...")
        
        ##Load the file into memory
        d = self.load(file)

        #Add to class memory:
        self.GT.append(d)
        self.GT_lengths.append(d.shape[1])
        self.GT_filenames.append(file)

    if IR_range:
      print("IR")
      count=0
      maxcount=len(self.paths_IR)
      for file in self.paths_IR:
        count+=1
        if count % 100 == 1:
          print(f"Processing file {count} of {maxcount}...")
        
        ##Load the file into memory
        d = self.load(file)

        #Add to class memory:
        self.IR.append(d)
        self.IR_lengths.append(d.shape[1])
        self.IR_filenames.append(file)


    if N_range:
      print("N")
      count=0
      maxcount=len(self.paths_N)
      for file in self.paths_N:
        count+=1
        if count % 100 == 1:
          print(f"Processing file {count} of {maxcount}...")
        
        ##Load the file into memory
        d = self.load(file)

        #Add to class memory:
        self.N.append(d)
        self.N_lengths.append(d.shape[1])
        self.N_filenames.append(file)
  

  def printFilenames(self,type):
    if type=='GT':
      print("GT:")
      for n in range(len(self.GT_filenames)):
        print(f"\t{n}:\t{self.GT_filenames[n]}")
    elif type=='IR':
      print("IR:")
      for n in range(len(self.IR_filenames)):
        print(f"\t{n}:\t{self.IR_filenames[n]}")
    else:
      print("N:")
      for n in range(len(self.N_filenames)):
        print(f"\t{n}:\t{self.N_filenames[n]}")

  def load(self, file):
    ##Load the file into memory
    if self.use_cuda:
      d, sr = ta.load(file)
      #sr = sr.cuda()
      if d.shape[0]>1:
        d=d[0,:].unsqueeze(0)
      
      #print(d.shape)
      #print()
      if sr!=self.sample_rate:
        #print("Sample rates not equal")
        d = ta.transforms.Resample(sr, self.sample_rate)(d)
      #d = d.squeeze(0)
      d  = d.cuda()

    else:    
      d, sr = librosa.load(file, sr=self.sample_rate)
      #normalize
      d_minus_mean=d-np.mean(d)
      d = d_minus_mean/np.max(np.abs(d_minus_mean))
    return d
  
  def cropAndPadIR(self,matrix_ops=True):
    #IR=self.IR
    for i in range(len(self.IR)):
      idx = torch.argmax(torch.abs(self.IR[i]))
      self.IR[i]=self.IR[i][:,idx:]
      self.IR[i]=torch.cat((self.IR[i][:,:self.seq_len_IR],
                            torch.zeros(1,max(0,self.seq_len_IR-self.IR[i].shape[1])).cuda()),dim=1)
    if matrix_ops:
      self.IR=torch.stack(self.IR)
      
  
  def cropAndPadGT(self,matrix_ops=True):
    #for gt in self.GT:
    #  gt=torch.cat((gt[:,:self.seq_len_GT],torch.zeros(1,max(0,self.seq_len_GT-gt.shape[1])).cuda()),dim=1) 
    for i in range(len(self.GT)):
      self.GT[i]=torch.cat((self.GT[i][:,:self.seq_len_GT],
                            torch.zeros(1,max(0,self.seq_len_GT-self.GT[i].shape[1])).cuda()),dim=1) 
    self.GT_is_padded=True
    if matrix_ops:
      self.GT=torch.stack(self.GT)
    
  def add_reverb(self,matrix_ops=True):
    ## Takes the ground truth (GT_files), and applies convolutions from IR_files ##
    ## GT_lengths is the amount of samples in each ground truth files ##
    ## IR_lengths is the amount of samples in each impulse response files ##
    if self.GT_is_padded: 
      if self.use_cuda:
        if matrix_ops:
          self.GT_IR = torch.nn.functional.conv1d(self.GT,torch.flip(self.IR,[2]),padding=self.IR.shape[2])[:,:,1:-self.IR.shape[2]]
          self.GT_IR = self.GT_IR.contiguous()
        else:
          self.GT_IR = []
          len_GT = len(self.GT)
          for gt in range(len_GT):
            if gt % 100 == 1:
              print(f"Adding reverb to GT {gt} of {len_GT}...")
            for ir in range(len(self.IR)):
              gt_ir = torch.nn.functional.conv1d(self.GT[gt].unsqueeze(0),torch.flip(self.IR[ir].unsqueeze(0),[2]),padding=self.IR[ir].shape[1])[:,:,1:-self.IR[ir].shape[1]]
              self.GT_IR.append(gt_ir)
          print("stacking GT_IR")
          self.GT_IR = torch.stack(self.GT_IR) 
          print("stacking GT")
          self.GT = torch.stack(self.GT)
          print("stacking IR")
          self.IR = torch.stack(self.IR)

    else:
      print("ERROR: you should pad GT first")
  

  def normalize(self):
    max_abs = torch.max(torch.abs(self.GT_IR),2)[0]
    max_abs = max_abs.unsqueeze(2) #Need to be shape (GT,IR,1)
    eps = 1e-12
    self.GT_IR = self.GT_IR / (max_abs + eps)

  def cpu(self):
    self.IR=self.IR.cpu()
    self.GT=self.GT.cpu()
    self.GT_IR=self.GT_IR.cpu()

  def spectrogram(self,n_fft=160, melScale=False):
    ##Turns raw audio signal into a Spectrogram. Either on the Hz-scale or Mel-scale.
    ##Also converts to logarithmic scale.
    if melScale:
      spect = ta.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_fft=n_fft)
    else:
      spect = ta.transforms.Spectrogram(n_fft=n_fft)
    #self.GT_spectrogram=spect(self.GT.squeeze(1))[:,:,:-1].log2()
    #self.GT_IR_spectrogram=spect(self.GT_IR.view(-1,1,self.GT_IR.shape[2]).squeeze(1))[:,:,:-1].log2()
    self.GT_spectrogram=spect(self.GT.squeeze(1))
    self.GT_IR_spectrogram=spect(self.GT_IR.view(-1,1,self.GT_IR.shape[2]).squeeze(1))
      
  def stft(self, window=None, n_fft=640):
    self.GT_stft = torch.stft(self.GT.squeeze(1), n_fft, hop_length=n_fft//2, window=window)
    self.GT_IR_stft = torch.stft(self.GT_IR.view(-1,1,self.GT_IR.shape[2]).squeeze(1), n_fft, hop_length=n_fft//2, window=window)
    #print(f"GT_stft.shape: {self.GT_stft.shape}")
    #print(f"GT_IR_stft.shape: {self.GT_IR_stft.shape}")
    self.GT_mag,self.GT_phase = ta.functional.magphase(self.GT_stft)
    self.GT_IR_mag,self.GT_IR_phase = ta.functional.magphase(self.GT_IR_stft)
    #print(f"GT_mag.shape: {self.GT_mag.shape}")
    #print(f"GT_IR_mag.shape: {self.GT_IR_mag.shape}")
    #print(f"GT_phase.shape: {self.GT_phase.shape}")
    #print(f"GT_IR_phase.shape: {self.GT_IR_phase.shape}")

# Iterator class:

In [0]:
class Iterator():
  def __init__(self, object, mode='waveform'):
    self.paths_GT = object.paths_GT
    self.paths_IR = object.paths_IR
    self.mode=mode
    if self.mode=='spectrogram':
      self.GT = object.GT_spectrogram.repeat_interleave(len(self.paths_IR),dim=0).unsqueeze(1)
      self.GT_IR = object.GT_IR_spectrogram.unsqueeze(1)
    elif self.mode=='waveform':
      self.GT = object.GT.repeat_interleave(len(self.paths_IR),dim=0)
      self.GT_IR = object.GT_IR
    elif self.mode=='stft':
      self.GT = object.GT_mag.repeat_interleave(len(self.paths_IR),dim=0).unsqueeze(1)
      self.GT_IR = object.GT_IR_mag.unsqueeze(1)
      self.GT_phase = object.GT_phase.repeat_interleave(len(self.paths_IR),dim=0).unsqueeze(1)
      self.GT_IR_phase = object.GT_IR_phase.unsqueeze(1)
    elif self.mode=='both':
      self.GT = object.GT.repeat_interleave(len(self.paths_IR),dim=0)
      self.GT_IR = object.GT_IR
      self.GT_mag = object.GT_mag.repeat_interleave(len(self.paths_IR),dim=0).unsqueeze(1)
      self.GT_IR_mag = object.GT_IR_mag.unsqueeze(1)
      self.GT_phase = object.GT_phase.repeat_interleave(len(self.paths_IR),dim=0).unsqueeze(1)
      self.GT_IR_phase = object.GT_IR_phase.unsqueeze(1)


  def setChunkSize(self,k):
    self.chunkSize=k
  def setBatchSize(self,k):
    self.batchSize=k

  def chunkify(self):
    if self.mode=='spectrogram':
      self.GT_IR = self.GT_IR.view(-1,1,self.GT_IR.shape[2],self.chunkSize)
      self.GT = self.GT.view(-1,1,self.GT_IR.shape[2],self.chunkSize)
    elif self.mode=='waveform':
      self.GT_IR = self.GT_IR.view(-1,1,self.chunkSize)
      self.GT = self.GT.view(-1,1,self.chunkSize)
    elif self.mode=='stft':
      self.GT_IR = self.GT_IR.view(-1,1,self.GT_IR.shape[2],self.chunkSize)
      self.GT = self.GT.view(-1,1,self.GT_IR.shape[2],self.chunkSize)
      self.GT_IR_phase = self.GT_IR_phase.view(-1,1,self.GT_IR_phase.shape[2],self.chunkSize)
      self.GT_phase = self.GT_phase.view(-1,1,self.GT_IR_phase.shape[2],self.chunkSize)
    elif self.mode=='both':
      self.GT_IR = self.GT_IR.view(-1,1,self.chunkSize[0])
      self.GT = self.GT.view(-1,1,self.chunkSize[0])
      self.GT_IR_mag = self.GT_IR_mag.view(-1,1,self.GT_IR_mag.shape[2],self.chunkSize[1])
      self.GT_mag = self.GT_mag.view(-1,1,self.GT_IR_mag.shape[2],self.chunkSize[1])
      self.GT_IR_phase = self.GT_IR_phase.view(-1,1,self.GT_IR_phase.shape[2],self.chunkSize[1])
      self.GT_phase = self.GT_phase.view(-1,1,self.GT_IR_phase.shape[2],self.chunkSize[1])
  
  def __iter__(self):
    self.n=0
    return self
  def __next__(self):
    b1=self.n*self.batchSize
    b2=(self.n+1)*self.batchSize
    if b2<=self.GT_IR.shape[0]:
      if self.mode == 'spectrogram':
        x = self.GT_IR[b1:b2,:,:,:]
        y = self.GT[b1:b2,:,:,:]
        self.n+=1
        return x,y, 0, 0 #last 2 zeros are placeholders, so the iterator is consistent
      elif self.mode == 'waveform':
        x = self.GT_IR[b1:b2,:,:]
        y = self.GT[b1:b2,:,:]
        self.n+=1
        return x,y, 0, 0 #last 2 zeros are placeholders, so the iterator is consistent
      elif self.mode == 'stft':
        x = self.GT_IR[b1:b2,:,:,:]
        y = self.GT[b1:b2,:,:,:]
        x_phase = self.GT_IR_phase[b1:b2,:,:,:]
        y_phase = self.GT_phase[b1:b2,:,:,:]
        self.n+=1
        return x,y, x_phase, y_phase
      elif self.mode == 'both':
        x_mag = self.GT_IR_mag[b1:b2,:,:,:]
        y_mag = self.GT_mag[b1:b2,:,:,:]
        x_phase = self.GT_IR_phase[b1:b2,:,:,:]
        y_phase = self.GT_phase[b1:b2,:,:,:]
        x = self.GT_IR[b1:b2,:,:]
        y = self.GT[b1:b2,:,:]
        self.n+=1
        return x_mag,y_mag, x_phase, y_phase,x,y


    else:
      raise StopIteration


# Custom functions (loss, weight init and RAdam)

In [0]:
#init xavier weights
def init_weights(m):
      if type(m) == nn.Conv1d:
          torch.nn.init.xavier_normal_(m.weight)
          m.bias.data.fill_(0)
      elif type(m) == nn.ConvTranspose1d:
          torch.nn.init.xavier_normal_(m.weight)
          m.bias.data.fill_(0)

#Custom sftf loss function for waveform
def stftMAE(y_pred,y_true,n_fft=160):
  pred_stft = torch.stft(y_pred.squeeze(1),n_fft,hop_length=n_fft//2, normalized = True)
  pred_stft = torch.abs(pred_stft)
  true_stft = torch.stft(y_true.squeeze(1),n_fft,hop_length=n_fft//2, normalized = True)
  true_stft = torch.abs(true_stft)
  mae = F.l1_loss(pred_stft,true_stft)
  return mae

def stftMAE_hann(y_pred,y_true,n_fft=160):
  pred_stft = torch.stft(y_pred.squeeze(1),n_fft,hop_length=n_fft//2, window=torch.hann_window(n_fft).cuda(), normalized = True)
  pred_stft = torch.abs(pred_stft)
  true_stft = torch.stft(y_true.squeeze(1),n_fft,hop_length=n_fft//2, window=torch.hann_window(n_fft).cuda(), normalized = True)
  true_stft = torch.abs(true_stft)
  mae = F.l1_loss(pred_stft,true_stft)
  return mae

def stftHuber(y_pred,y_true,n_fft=160):
  pred_stft = torch.stft(y_pred.squeeze(1),n_fft,hop_length=n_fft//2, normalized = True)
  pred_stft = torch.abs(pred_stft)
  true_stft = torch.stft(y_true.squeeze(1),n_fft,hop_length=n_fft//2, normalized = True)
  true_stft = torch.abs(true_stft)
  mae = F.smooth_l1_loss(pred_stft,true_stft)
  return mae
  
def stftMSE(y_pred,y_true,n_fft=160):
  pred_stft = torch.stft(y_pred,n_fft,hop_length=n_fft//2)
  true_stft = torch.stft(y_true,n_fft,hop_length=n_fft//2)
  mse = F.mse_loss(pred_stft,true_stft)
  return mse

def magMAE(y_pred,y_true,n_fft=160):
  pred_stft = torch.stft(y_pred.squeeze(1),n_fft,hop_length=n_fft//2)
  true_stft = torch.stft(y_true.squeeze(1),n_fft,hop_length=n_fft//2)
  pred_mag,pred_phase = ta.functional.magphase(y_pred)
  true_mag,true_phase = ta.functional.magphase(y_true)
  mae = F.l1_loss(pred_mag,true_mag)
  return mae


In [0]:
import math
import torch
from torch.optim.optimizer import Optimizer, required

class RAdam(Optimizer):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        
        self.degenerated_to_sgd = degenerated_to_sgd
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                exp_avg.mul_(beta1).add_(1 - beta1, grad)

                state['step'] += 1
                buffered = group['buffer'][int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    else:
                        step_size = -1
                    buffered[2] = step_size

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
                    p.data.copy_(p_data_fp32)
                elif step_size > 0:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
                    p_data_fp32.add_(-step_size * group['lr'], exp_avg)
                    p.data.copy_(p_data_fp32)

        return loss

class PlainRAdam(Optimizer):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
                    
        self.degenerated_to_sgd = degenerated_to_sgd
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)

        super(PlainRAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(PlainRAdam, self).__setstate__(state)

    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                exp_avg.mul_(beta1).add_(1 - beta1, grad)

                state['step'] += 1
                beta2_t = beta2 ** state['step']
                N_sma_max = 2 / (1 - beta2) - 1
                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)


                # more conservative since it's an approximated value
                if N_sma >= 5:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
                    step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
                    p.data.copy_(p_data_fp32)
                elif self.degenerated_to_sgd:
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
                    step_size = group['lr'] / (1 - beta1 ** state['step'])
                    p_data_fp32.add_(-step_size, exp_avg)
                    p.data.copy_(p_data_fp32)

        return loss

# Define neural network:

## Baseline

In [0]:
class Baseline_2000(nn.Module):
    def __init__(self,kernel_size):
        super(Baseline_2000, self).__init__()
        # We encode the data onto the latent space
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            #nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2) # <- note the 2*latent_features
        )            
        # The latent code must be decoded into the original image
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=1, kernel_size=kernel_size, padding=kernel_size//2, stride=1, output_padding=0),
            nn.Tanh()
        )
    def forward(self, x):
        enc_outs = []
        for i, l in enumerate(self.encoder):
            x = l(x)
            if i in [5, 11]:
              x = F.dropout(x, p=0.2)
            enc_outs.append(x)

        for i, l in enumerate(self.decoder):
          if (i % 2 == 0) and (i < len(self.decoder)-1):
            x = l(torch.cat((x, enc_outs[-i-1]), 1))
          else:
            x = l(x)
          if i in [5, 11]:
            x = F.dropout(x, p=0.2)
          #print(x.shape)

        return x

# Choose the shape of the autoencoder
net = Baseline_2000(11)
net.forward(torch.randn(128, 1, 2000))

if cuda:
    net = net.cuda()

print(net)

Baseline_2000(
  (encoder): Sequential(
    (0): Conv1d(1, 64, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): ReLU()
    (2): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (3): ReLU()
    (4): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (5): ReLU()
    (6): Conv1d(64, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (7): ReLU()
    (8): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (9): ReLU()
    (10): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (11): ReLU()
    (12): Conv1d(128, 256, kernel_size=(11,), stride=(2,), padding=(5,))
    (13): ReLU()
    (14): Conv1d(256, 256, kernel_size=(11,), stride=(2,), padding=(5,))
    (15): ReLU()
    (16): Conv1d(256, 256, kernel_size=(11,), stride=(2,), padding=(5,))
  )
  (decoder): Sequential(
    (0): ConvTranspose1d(512, 256, kernel_size=(11,), stride=(2,), padding=(5,), output_padding=(1,))
    (1): ReLU()
    (2): ConvTranspose1d(512, 256, ker

In [0]:
class Baseline_1000(nn.Module):
    def __init__(self,kernel_size):
        super(Baseline_1000, self).__init__()
        # We encode the data onto the latent space
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            #nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2) # <- note the 2*latent_features
        )            
        # The latent code must be decoded into the original image
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=1, kernel_size=kernel_size, padding=kernel_size//2, stride=1, output_padding=0),
            nn.Tanh()
        )
    def forward(self, x):
        enc_outs = []
        for i, l in enumerate(self.encoder):
            x = l(x)
            if i in [5, 11]:
              x = F.dropout(x, p=0.2)
            enc_outs.append(x)

        for i, l in enumerate(self.decoder):
          if (i % 2 == 0) and (i < len(self.decoder)-1):
            x = l(torch.cat((x, enc_outs[-i-1]), 1))
          else:
            x = l(x)
          if i in [5, 11]:
            x = F.dropout(x, p=0.2)
          #print(x.shape)

        return x

# Choose the shape of the autoencoder
net = Baseline_1000(11)
net.forward(torch.randn(128, 1, 1000))

if cuda:
    net = net.cuda()

print(net)

Baseline_1000(
  (encoder): Sequential(
    (0): Conv1d(1, 64, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): ReLU()
    (2): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (3): ReLU()
    (4): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (5): ReLU()
    (6): Conv1d(64, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (7): ReLU()
    (8): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (9): ReLU()
    (10): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (11): ReLU()
    (12): Conv1d(128, 256, kernel_size=(11,), stride=(2,), padding=(5,))
    (13): ReLU()
    (14): Conv1d(256, 256, kernel_size=(11,), stride=(2,), padding=(5,))
    (15): ReLU()
    (16): Conv1d(256, 256, kernel_size=(11,), stride=(2,), padding=(5,))
  )
  (decoder): Sequential(
    (0): ConvTranspose1d(512, 256, kernel_size=(11,), stride=(2,), padding=(5,), output_padding=(1,))
    (1): ReLU()
    (2): ConvTranspose1d(512, 256, ker

In [0]:
class Baseline_4000(nn.Module):
    def __init__(self,kernel_size):
        super(Baseline_4000, self).__init__()
        # We encode the data onto the latent space
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=1),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            #nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2) # <- note the 2*latent_features
        )            
        # The latent code must be decoded into the original image
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=1, kernel_size=kernel_size, padding=kernel_size//2, stride=1, output_padding=0),
            nn.Tanh()
        )
    def forward(self, x):
        enc_outs = []
        for i, l in enumerate(self.encoder):
            x = l(x)
            if i in [5, 11]:
              x = F.dropout(x, p=0.2)
            enc_outs.append(x)

        for i, l in enumerate(self.decoder):
          if (i % 2 == 0) and (i < len(self.decoder)-1):
            x = l(torch.cat((x, enc_outs[-i-1]), 1))
          else:
            x = l(x)
          if i in [5, 11]:
            x = F.dropout(x, p=0.2)
          #print(x.shape)

        return x

# Choose the shape of the autoencoder
net = Baseline_4000(11)
net.forward(torch.randn(128, 1, 4000))

if cuda:
    net = net.cuda()

print(net)

Baseline_4000(
  (encoder): Sequential(
    (0): Conv1d(1, 64, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): ReLU()
    (2): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (3): ReLU()
    (4): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (5): ReLU()
    (6): Conv1d(64, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (7): ReLU()
    (8): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (9): ReLU()
    (10): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (11): ReLU()
    (12): Conv1d(128, 256, kernel_size=(11,), stride=(2,), padding=(5,))
    (13): ReLU()
    (14): Conv1d(256, 256, kernel_size=(11,), stride=(2,), padding=(5,))
    (15): ReLU()
    (16): Conv1d(256, 256, kernel_size=(11,), stride=(2,), padding=(5,))
  )
  (decoder): Sequential(
    (0): ConvTranspose1d(512, 256, kernel_size=(11,), stride=(2,), padding=(5,), output_padding=(1,))
    (1): ReLU()
    (2): ConvTranspose1d(512, 256, ker

##Baseline Batchnorm

In [0]:
class Baseline_BN(nn.Module):
    def __init__(self,kernel_size):
        super(Baseline_BN, self).__init__()
        # We encode the data onto the latent space
        self.encoder = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2),
            nn.BatchNorm1d(256),
            nn.ReLU()
            #nn.Conv1d(in_channels=256, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2) # <- note the 2*latent_features
        )            
        # The latent code must be decoded into the original image
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=256*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=0),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=128, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=128*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=64, kernel_size=kernel_size, padding=kernel_size//2, stride=2, output_padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.ConvTranspose1d(in_channels=64*2, out_channels=1, kernel_size=kernel_size, padding=kernel_size//2, stride=1, output_padding=0),
            nn.Tanh()
        )
    def forward(self, x):
        enc_outs = []
        for i, l in enumerate(self.encoder):
            x = l(x)
            if i in [8, 17]:
              x = F.dropout(x, p=0.2)
            enc_outs.append(x)

        for i, l in enumerate(self.decoder):
          if (i % 3 == 0) and (i < len(self.decoder)-1):
            x = l(torch.cat((x, enc_outs[-i-1]), 1))
          else:
            x = l(x)
          if i in [8, 17]:
            x = F.dropout(x, p=0.2)
          #print(x.shape)

        return x

# Choose the shape of the autoencoder
net = Baseline_BN(11)
net.forward(torch.randn(128, 1, 2000))

if cuda:
    net = net.cuda()

print(net)

Baseline_BN(
  (encoder): Sequential(
    (0): Conv1d(1, 64, kernel_size=(11,), stride=(1,), padding=(5,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv1d(64, 64, kernel_size=(11,), stride=(2,), padding=(5,))
    (7): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): Conv1d(64, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (10): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU()
    (12): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (13): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Conv1d(128, 128, kernel_size=(11,), stride=(2,), padding=(5,))
    (

## 2D

In [6]:
def c_win(win_size, hop_len, win_type=np.hamming):
  w = win_type(win_size)
  w = np.sqrt(w)
  K = np.sqrt(hop_len/sum(pow(w,2)))
  w = w*K
  return w
  
class CNN2d(nn.Module):
    def __init__(self, kernel_size):
        super(CNN2d, self).__init__()
        
        #self.latent_features = latent_features
        #self.num_samples = num_samples
        self.kernelSize = kernel_size
        self.stride = 2
        self.padding = 2
        self.depadding = 2
        # We encode the data onto the latent space
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=self.kernelSize, padding=self.padding, stride=1),#self.stride),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=self.kernelSize, padding=self.padding, stride=self.stride)
        )            
        

        # The latent code must be decoded into the original image
        self.decoder = nn.Sequential(
            #nn.ConvTranspose1d(in_channels=256*2, out_channels=256, kernel_size=kernelSize, padding=2, stride=2),
            nn.ConvTranspose2d(in_channels=256*2, out_channels=256, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride, output_padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=256*2, out_channels=256, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride, output_padding=(0,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=256*2, out_channels=128, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride, output_padding=(0,1)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=128*2, out_channels=128, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride, output_padding=(0,1)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=128*2, out_channels=128, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride, output_padding=(0,0)),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=128*2, out_channels=64, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride, output_padding=(0,1)),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=64*2, out_channels=64, kernel_size=self.kernelSize, padding=self.depadding, stride=self.stride),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=64*2, out_channels=1, kernel_size=self.kernelSize, padding=self.depadding, stride=1)#self.stride)

        )

    def forward(self, x):
        enc_outs = []
        for i, l in enumerate(self.encoder):
            x = l(x)
            #if i in [8]:
              #x = F.dropout(x, p=0.2)
            enc_outs.append(x)

        for i, l in enumerate(self.decoder):
          if (i % 3 == 0):
            x = l(torch.cat((x, enc_outs[-i-1]), 1))
          else:
            x = l(x)
          #if i in [8]:
           #x = F.dropout(x, p=0.2)
          #print(x.shape)

        return x
      
        #x = self.input(x)
        #enc_outs = []
        #for i, l in enumerate(self.encoder):
        #    #print(l)
        #    #print(x)
        #    x = l(x)
        #    if i < len(self.decoder):
        #      x = F.relu(x)
        #    enc_outs.append(x)
        #for i, l in enumerate(self.decoder):
        #    #print(x.shape)
        #    #print( enc_outs[-i-1].shape)
        #    x = l(torch.cat((x, enc_outs[-i-1]), 1))
        #    if i < len(self.decoder)-1:
        #      x = F.relu(x)
        #   
        #    #print(l)
        ##x = self.output(x)
        ##print(x)
        #return x
      
    def reconstruct(self, x):
        x = x.detach().cpu()
        size = 251
        hop_len = 640//2
        window = torch.from_numpy(c_win(size, hop_len)).float()
        output = torch.zeros(x.shape)
        for frame in range(x.shape[-1]//hop_len):
          try:
            f = x[:,hop_len*frame:hop_len*frame+size]
            f = self.forward(f.unsqueeze(0).cuda()).detach().cpu()
            windowed = window.squeeze() * f.squeeze()
            output[:,hop_len*frame:hop_len*frame+size] += windowed
          except:
            pass
        return output
# Choose the shape of the autoencoder
with torch.no_grad():
  net = CNN2d(5)
  test = torch.randn(8, 1, 321, 251)
net.forward(test)
#print(net.reconstruct(torch.randn(8, 1, 321, 251)).shape)
if cuda:
    net = net.cuda()
print(net)
del net
del test
torch.cuda.empty_cache()

torch.Size([32, 1, 321, 251])
CNN2d(
  (encoder): Sequential(
    (0): Conv2d(1, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Conv2d(64, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (7): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU()
    (9): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (10): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU()
    (12): Conv2d(128, 128, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (13): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Conv2d(128, 256,

# Pipeline functions
Useful so we can make a loop where different hyper parameters are tested

Hyperparameters we want to test:

* Data type: waveform(Conv1D) or spectrogram(Conv2D) (CNN='1d' or CNN='2d')
* Input size/no of samples (chunk_size)
* Loss function (MAE, MSE, Huber, cosh. Measured on waveform/stft real+imaginary/stft mag/stft mag+phase)
* Regularization
* Optimizer (always Adam? Try AdamW (weight decay))
* Optimizer learning rate (optim.lr_scheduler)
* Batch size ?
* Epochs ?



## Data pipeline

In [0]:
#DataLoader
def PipelineData(SAMPLE_RATE=16000, CNN='2d',
                       batch_size=128, chunk_size=None,
                       GT_range=[[0,300]],
                       IR_range=[[0,12]], N_range=[[0,1]],
                       sequence_length_GT=16000*5,
                       sequence_length_IR=16000*1,
                       matrix_ops=True):
  
  #Data preprocessing:
  #total GT files: 4162
  #total IR files: 36
  Loader=DataLoader(GT_range=GT_range,
                         IR_range=IR_range,
                         N_range=N_range,
                         sequence_length_GT=sequence_length_GT,
                         sequence_length_IR=sequence_length_IR)

  #Loader.printFilenames('GT')
  #Loader.printFilenames('IR')

  Loader.cropAndPadIR(matrix_ops=matrix_ops)
  Loader.cropAndPadGT(matrix_ops=matrix_ops)
  Loader.add_reverb(matrix_ops=matrix_ops)
  Loader.normalize()
  Loader.cpu()
  if CNN=='2d':
    Loader.stft()
  if CNN=='2d_hann':
    Loader.stft(window=torch.hann_window())
  if CNN=='both':
    Loader.stft()
  
  
  #Iterator:
  if CNN=='2d':
    It=Iterator(Loader,mode='stft')
  elif CNN=='both':
    It=Iterator(Loader,mode='both')
  else:
    It=Iterator(Loader,mode='waveform')
    
  It.setBatchSize(batch_size)
  
  if chunk_size:
    It.setChunkSize(chunk_size)
    It.chunkify()
    
  print(f"Shape of GT: {It.GT.shape}")
  print(f"Shape of GT_IR: {It.GT_IR.shape}")
  if CNN=='both':
    print(f"Shape of GT_mag: {It.GT_mag.shape}")
    print(f"Shape of GT_IR_mag: {It.GT_IR_mag.shape}")
    print(f"Shape of GT_phase: {It.GT_phase.shape}")
    print(f"Shape of GT_IR_phase: {It.GT_IR_phase.shape}")
    print()
    print(f"Shape of GT_mag last: {It.GT_mag.shape[-1]}")
    print(f"Shape of GT last: {It.GT.shape[-1]}")
    print(f"Shape of GT_IR last: {It.GT_IR.shape[-1]}")
  
    

  return It

## Model pipeline

In [0]:
def PipelineModel(CNN='2d', kernel_size=11, optim='Adam',learning_rate=None,loss='Huber',use_cuda=True):

  #Define the model:
  assert CNN in ('2d', 'Baseline_2000', 'Baseline_1000', 'Baseline_4000', 'Baseline_BN')
  if CNN=='2d':
    net = CNN2d(kernel_size)
  elif CNN=='Baseline_BN':
    net = Baseline_BN(kernel_size)
  elif CNN=='Baseline_1000':
    net = Baseline_1000(kernel_size)
  elif CNN=='Baseline_2000':
    net = Baseline_2000(kernel_size)
  elif CNN=='Baseline_4000':
    net = Baseline_4000(kernel_size)
  net.apply(init_weights)

  if use_cuda:
    net = net.cuda()
  #Optimizer:
  if optim=='Adam':
    if learning_rate==None:
      learning_rate=0.0002
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
  elif optim=='AdamW':
    if learning_rate==None:
      learning_rate=0.0002
    optimizer = torch.optim.AdamW(net.parameters(), lr=learning_rate)
  elif optim=='RAdam':
    if learning_rate==None:
      learning_rate=0.0002
    optimizer = RAdam(net.parameters(), lr=learning_rate)
  #Loss:
  if loss=='L2':
    loss_function = nn.MSELoss()
  elif loss=='L1':
    loss_function = nn.L1Loss()
  elif loss=='Huber':
    loss_function = nn.SmoothL1Loss()
  elif loss=='stftMAE':
    loss_function = stftMAE
  elif loss=='stftMAE_hann':
    loss_function = stftMAE_hann
  elif loss=='stftHuber':
    loss_function = stftHuber
  elif loss=='magMAE':
    loss_function = magMAE
  else:
    print("Invalid loss function")

  return net, optimizer, loss_function

## Fitting pipeline

In [0]:
def save_checkpoint(state, old_loss, loss, is_best, filename='/content/drive/My Drive/reverberation/Models/test.pth.tar'):
    """Save checkpoint if a new best is achieved"""
    if is_best:
        print (f"=> Saving a new best loss improved from {round(old_loss,5)} to {round(loss,5)}")
    else:
        print ("=> Validation Accuracy did not improve")
    torch.save(state, filename)  # save checkpoint

In [0]:
# TO DO: 
## Return results
## Iterators can return 2 or 4 arguments
def PipelineFit(net, optimizer, loss_function,
                trainIterator, testIterator,
                num_epochs, save_name, 
                load_name=None,
                #scheduler_milestones=[10,50],
                #scheduler_gamma=1,
                plotting=False, return_results=True, 
                use_cuda=True):
  print("Training network...")
  tmp_img = "tmp_ae_out.png"
  train_loss = []
  valid_loss = []
  timings = []

  #Scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)
  #Scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=scheduler_milestones, gamma=scheduler_gamma)
  Scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=4) 

  if load_name:
    cp = torch.load(load_name)
    start_epoch = cp['epoch']
    Scheduler.load_state_dict(cp['scheduler'])
    net.load_state_dict(cp['state_dict'])
    optimizer.load_state_dict(cp['optimizer'])
    train_loss = cp['train_loss']
    valid_loss = cp['valid_loss']
    best_loss = cp['best_loss']
    best_state_dict = cp['best_state_dict']
  else:
    start_epoch = 0

  for epoch in range(start_epoch,num_epochs):
    start_time = time.time()
    batch_loss = []
    net.train()
    
    # Go through each batch in the training dataset using the loader
    # Note that y is not necessarily known as it is here
    #for ir in range(27):
    trainIter=iter(trainIterator)
    #count=0
    for x,target,x_phase,target_phase in trainIter:
      #if count % 1000 == 1:
        #print(count)
      #count+=1
      optimizer.zero_grad()
      x = Variable(x,requires_grad=True)
      if use_cuda:
        x = x.cuda()
        target=target.cuda()
      output = net(x)
      loss = loss_function(target, output)
      batch_loss.append(loss.item())
      loss.backward()
      optimizer.step()
    train_loss.append(np.mean(batch_loss))

    # Evaluate, do not propagate gradients
    with torch.no_grad():
      batch_loss = []
      net.eval()
      testIter=iter(testIterator)
      for x,target,x_phase,target_phase in testIter:
        x = Variable(x)
        if use_cuda:
          x = x.cuda()
          target = target.cuda()
        output = net(x)
        loss = loss_function(target, output)
        #Ferdi's stuff:
        batch_loss.append(loss.item())
      valid_loss.append(np.mean(batch_loss))
    
    Scheduler.step(np.mean(batch_loss))

    #if epoch % 2 == 0:
    #  path=f"/content/drive/My Drive/reverberation/Models/{save_name}.p"
    #  torch.save(net.state_dict(), path)
    if epoch+1 == 1:
      best_loss = valid_loss[epoch]
      best_state_dict = None

    # Get bool not ByteTensors
    is_best = bool(valid_loss[epoch] < best_loss)
    if is_best:
      old_loss = best_loss
      best_state_dict = net.state_dict()
    else:
      old_loss = 0
    # Get greater Tensor to keep track best acc
    best_loss = min(valid_loss[epoch], best_loss)

    #Time the epoch
    timer = time.time()-start_time  
    timings.append(timer)
    print(f"Epoch took {timer} seconds to run")

    # Save checkpoint if is a new best

    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict':  net.state_dict(),
        'best_state_dict': best_state_dict,
        'best_loss': best_loss,
        'time': timer,
        'valid_loss': valid_loss,
        'train_loss': train_loss,
        'optimizer': optimizer.state_dict(),
        'scheduler': Scheduler.state_dict()
    }, old_loss, best_loss, is_best, filename=f'/content/drive/My Drive/reverberation/Models/Evaluation/{save_name}.pth.tar')

    if epoch == 0:
      continue
    
    if plotting:

      # -- Plotting --
      f, ax1 = plt.subplots(figsize=(8,8))
    
      # Loss
      #ax = axarr[0]
      ax1.set_title("Loss")
      ax1.set_xlabel('Epoch')
      ax1.set_ylabel('Loss')
      #ax2.set_title("Validation error")
      #ax2.set_xlabel('Epoch')
      #ax2.set_ylabel('Error')


      ax1.plot(np.arange(epoch+1), train_loss, color="blue", linestyle="-.")
      ax1.plot(np.arange(epoch+1), valid_loss, color="red", linestyle="-.")
      ax1.legend(['Training','Validation'])
      plt.tight_layout()
      plt.savefig(tmp_img)
      plt.close(f)
      display(Image(filename=tmp_img))
      clear_output(wait=True)

      os.remove(tmp_img)
  
  
  if return_results:
    return train_loss,valid_loss, timings


## Evaluation pipeline

In [0]:
def P2R(radii, angles):
    complex_np=np.multiply(radii.detach().cpu().numpy(),np.exp(1j*angles.detach().cpu().numpy()))
    real=complex_np.real
    imag=complex_np.imag
    real=torch.from_numpy(real)
    imag=torch.from_numpy(imag)
    stack = torch.stack((real,imag),dim=4)
    return stack

def evaluationMetrics(net,testIterator,mode,window=None,cuda=True,cutoff=100):
  assert mode in ('waveform','stft')

  SAMPLE_RATE=16000
  net.eval()

  #Collect inputs, outputs and targets
  #inputs=np.empty(0)
  #outputs=np.empty(0)
  #targets=np.empty(0)
  lossPESQ = []
  lossSTOI = []
  dummyPESQ = []
  dummySTOI = []
  Iterator = iter(testIterator)
  count=0
  if mode=='waveform':
    for x,target,x_phase,target_phase in Iterator:
      if cuda:
        x=x.cuda()
      y=net(x)
      y=y.view(-1).cpu().detach().numpy()
      #outputs=np.append(outputs,y)
    
      x=x.view(-1).cpu().detach().numpy()
      #inputs=np.append(inputs,x)

      target=target.view(-1).cpu().detach().numpy()
      #targets=np.append(targets,target)
    
      #metrics:
      lossPESQ.append(pesq(SAMPLE_RATE, target, y, 'wb'))
      dummyPESQ.append(pesq(SAMPLE_RATE, target, x, 'wb'))
      lossSTOI.append(stoi(target, y, SAMPLE_RATE, extended=False))
      dummySTOI.append(stoi(target, x, SAMPLE_RATE, extended=False))

      count+=1
      if count % 100 == 1:
        print(count)
      if count==cutoff:
        break

  if mode=='stft':
    if window==None:
      window=torch.ones(640)
    for x_mag,target_mag,x_phase,target_phase,x,target in Iterator:
      if cuda:
        x_mag=x_mag.cuda()
      y_mag=net(x_mag)
      y=P2R(y_mag,x_phase)
      y=ta.functional.istft(y.squeeze(1),n_fft=640,hop_length=640//2,window=window)
      
      y=y.view(-1).cpu().detach().numpy()
      x=x.view(-1).cpu().detach().numpy()
      target=target.view(-1).cpu().detach().numpy()
      
      
      #metrics:
      lossPESQ.append(pesq(SAMPLE_RATE, target, y, 'wb'))
      dummyPESQ.append(pesq(SAMPLE_RATE, target, x, 'wb'))
      lossSTOI.append(stoi(target, y, SAMPLE_RATE, extended=False))
      dummySTOI.append(stoi(target, x, SAMPLE_RATE, extended=False))
      
      count+=1
      if count % 100 == 1:
        print(count)
      if count==cutoff:
        break

  
  print('The results below is based on the 64 reconstructed sound files')
  #print('Mean loss of MSE: ' + str(lossMSE))
  #print('Variance loss of MSE: '+ str(np.var(lossMSE)))
  print('Mean loss of PESQ: ' + str(np.mean(lossPESQ)))
  print('Variance loss of PESQ: '+ str(np.var(lossPESQ)))
  print('Mean loss of STOI: ' + str(np.mean(lossSTOI)))
  print('Variance loss of STOI: '+ str(np.var(lossSTOI)))


  print('The results below is based on the 64 noisy sound files')
  #print('Mean loss of MSE: ' + str(dummyMSE))
  #print('Variance loss of MSE: '+ str(np.var(lossMSE)))
  print('Mean loss of PESQ: ' + str(np.mean(dummyPESQ)))
  print('Variance loss of PESQ: '+ str(np.var(dummyPESQ)))
  print('Mean loss of STOI: ' + str(np.mean(dummySTOI)))
  print('Variance loss of STOI: '+ str(np.var(dummySTOI)))

# Scheduler

## Data step

In [12]:
 ## Get training and test iterators:
trainIterator=PipelineData(SAMPLE_RATE=16000, CNN='2d',
                       GT_range=[[0,20],[117,137],[226,246],
                                       [342,362],[451,471],[599,619],
                                       [728,748],[866,886],[997,1017],
                                       [1109,1129],[1232,1252],
                                       [1352,1372],[1472,1492],
                                       [1591,1611],[1701,1721],
                                       [1812,1832],[1928,1948],
                                       [2066,2086],[2197,2217],
                                       [2288,2309],[2406,2425], #Only 19 recordings from this speaker, so we use 21 from the previous speaker 
                                       [2425,2445],[2539,2559],
                                       [2653,2673],[2768,2788]
                                       ],
                       IR_range=[[0,6],[18,24]],
                       batch_size=8, chunk_size=None, 
                       matrix_ops=True) #Set matrix_ops to False, if cuda does not have enough memory

testIterator=PipelineData(SAMPLE_RATE=16000, CNN='2d',
                          GT_range=[[2876,2896],[2995,3015],
                                       [3110,3130],[3229,3249],
                                       [3347,3367]
                                       ],
                          IR_range=[[24,30]],
                          batch_size=8, chunk_size=None,
                          matrix_ops=True)

Found 500 ground truth files in data set
Found 12 impulse response files in data set
Found 1 noise files in data set

Running GPU.
GT
Processing file 1 of 500...
Processing file 101 of 500...
Processing file 201 of 500...
Processing file 301 of 500...
Processing file 401 of 500...
IR
Processing file 1 of 12...
N
Processing file 1 of 1...
Shape of GT: torch.Size([6000, 1, 81, 601])
Shape of GT_IR: torch.Size([6000, 1, 81, 601])
Found 100 ground truth files in data set
Found 6 impulse response files in data set
Found 1 noise files in data set

Running GPU.
GT
Processing file 1 of 100...
IR
Processing file 1 of 6...
N
Processing file 1 of 1...
Shape of GT: torch.Size([600, 1, 81, 601])
Shape of GT_IR: torch.Size([600, 1, 81, 601])


## Model step

In [0]:
 ## Get the model:
net,optimizer,loss_function=PipelineModel(CNN='2d', #Change this depending on the chunk size (1000, 2000 or 4000)
                                          kernel_size=5, #5 for 2d, 11 for 1d
                                          learning_rate=0.0003, #if running scheduler, set lr=0.0004. Otherwise, set lr=0.0002
                                          optim='AdamW',
                                          loss='Huber',
                                          use_cuda=cuda)

In [14]:
 ## See how many parameters are in the model (Optional)
pytorch_total_params = sum(p.numel() for p in net.parameters())
pytorch_trainable_params = sum(p.numel() for p in net.parameters() if p.requires_grad)

print(f"Total parameters in the model: {pytorch_total_params}")
print(f"Total trainable parameters in the model: {pytorch_trainable_params}")

Total parameters in the model: 9532097
Total trainable parameters in the model: 9532097


## Training step

In [15]:
 ## Perform the training loop:
train_loss,valid_loss,timings=PipelineFit(net,optimizer,loss_function,
                                  trainIterator,testIterator,
                                  num_epochs=80,
                                  save_name="2D_BN_Huber_lr3e-4", #Format: "Baseline_learningrate_chunksize_batchsize"
                                  #load_name='/content/drive/My Drive/reverberation/Models/Evaluation/LARGER_2D_BN_Huber_lr3e-4.pth.tar',
                                  plotting=True,return_results=True,use_cuda=cuda)

KeyboardInterrupt: ignored

## Evaluation step

Template for evaluating 1d models:

In [18]:
net = Baseline_BN(11)
if cuda:
  net = net.cuda()
cp = torch.load('/content/drive/My Drive/reverberation/Models/Evaluation/Baseline_BN_stftHuber_lr3e-4.pth.tar')
net.load_state_dict(cp['state_dict'])
print(cp['epoch'])

del cp

evalIterator=PipelineData(SAMPLE_RATE=16000, CNN='1d',
                          GT_range=[[2876,2896],[2995,3015],
                                    [3110,3130],[3229,3249],
                                    [3347,3367]
                                    ],
                          IR_range=[[24,30]],
                          batch_size=24, #With a batch size 24 and chunk size 2000, each batch has 48000 samples, which is the full 3 seconds of each noisy audio file 
                          chunk_size=2000,
                          matrix_ops=True)


evaluationMetrics(net,evalIterator,mode='waveform',window=None,cuda=True,cutoff=1000)



NameError: ignored

Template for evaluating 2d models:

In [14]:
net = CNN2d(5)
if cuda:
  net = net.cuda()
cp = torch.load('/content/drive/My Drive/reverberation/Models/Evaluation/LARGER_2D_BN_Huber_lr3e-4.pth.tar')
net.load_state_dict(cp['state_dict'])
del cp

evalIterator=PipelineData(SAMPLE_RATE=16000, CNN='both',
                          GT_range=[[2876,2896],[2995,3015],
                                    [3110,3130],[3229,3249],
                                    [3347,3367]
                                    ],
                          IR_range=[[24,30]],
                          batch_size=1, chunk_size=[80000,251],  
                          matrix_ops=True)


evaluationMetrics(net,evalIterator,mode='stft',window=None,cuda=True,cutoff=1000)



Found 100 ground truth files in data set
Found 6 impulse response files in data set
Found 1 noise files in data set

Running GPU.
GT
Processing file 1 of 100...
IR
Processing file 1 of 6...
N
Processing file 1 of 1...
Shape of GT: torch.Size([600, 1, 80000])
Shape of GT_IR: torch.Size([600, 1, 80000])
Shape of GT_mag: torch.Size([600, 1, 321, 251])
Shape of GT_IR_mag: torch.Size([600, 1, 321, 251])
Shape of GT_phase: torch.Size([600, 1, 321, 251])
Shape of GT_IR_phase: torch.Size([600, 1, 321, 251])

Shape of GT_mag last: 251
Shape of GT last: 80000
Shape of GT_IR last: 80000
1
101
201
301
401
501
The results below is based on the 64 reconstructed sound files
Mean loss of PESQ: 1.913664649128914
Variance loss of PESQ: 0.031005523979784465
Mean loss of STOI: 0.8944371952995724
Variance loss of STOI: 0.0005713530978530231
The results below is based on the 64 noisy sound files
Mean loss of PESQ: 2.043896409869194
Variance loss of PESQ: 0.10101924893385152
Mean loss of STOI: 0.887400764383

The results below is based on the 64 reconstructed sound files
Mean loss of PESQ: 1.8955407398939133
Variance loss of PESQ: 0.02847486814899287
Mean loss of STOI: 0.8931947293315599
Variance loss of STOI: 0.0006083316814258413
The results below is based on the 64 noisy sound files
Mean loss of PESQ: 2.043896409869194
Variance loss of PESQ: 0.10101924893385152
Mean loss of STOI: 0.887400764383231
Variance loss of STOI: 0.0008806964293024692

The results below is based on the 64 reconstructed sound files
Mean loss of PESQ: 2.2383980164925257
Variance loss of PESQ: 0.034944755567957723
Mean loss of STOI: 0.9329359117642793
Variance loss of STOI: 0.00029623403612361054
The results below is based on the 64 noisy sound files
Mean loss of PESQ: 2.043896409869194
Variance loss of PESQ: 0.10101924893385152
Mean loss of STOI: 0.887400764383231
Variance loss of STOI: 0.0008806964293024692

In [0]:
trainIter=iter(trainIterator)
x,target,x_phase,target_phase=next(trainIter)
print(x.shape)
print(x_phase.shape)
#y=net(x.cuda())
#print(y.shape)

torch.Size([64, 1, 161, 601])
torch.Size([64, 1, 161, 601])


In [0]:
def P2R(radii, angles):
    complex_np=np.multiply(radii.detach().cpu().numpy(),np.exp(1j*angles.detach().cpu().numpy()))
    real=complex_np.real
    imag=complex_np.imag
    real=torch.from_numpy(real)
    imag=torch.from_numpy(imag)
    stack = torch.stack((real,imag),dim=4)
    return stack
t_complex=P2R(target,target_phase)
#y_complex_x_ph=P2R(y,x_phase)
#y_complex_t_ph=P2R(y,target_phase)
x_complex=P2R(x,x_phase)
print(x_complex.shape)
print(x_complex[0,0,0,0:10])
print(x_complex[0,0,0,0:10,0])
#print(x_complex[0,0,0,0:10].real)
#print(x_complex[0,0,0,0:10].imag)

torch.Size([64, 1, 161, 601, 2])
tensor([[-6.2762e-03, -5.4869e-10],
        [-6.3913e-03, -5.5875e-10],
        [ 2.6923e-02,  0.0000e+00],
        [-6.4914e-03, -5.6750e-10],
        [-4.4784e-02, -3.9151e-09],
        [-1.5050e-02, -1.3157e-09],
        [-1.7641e-02, -1.5422e-09],
        [-7.7991e-03, -6.8182e-10],
        [-7.5171e-04, -6.5716e-11],
        [ 4.4923e-03,  0.0000e+00]])
tensor([-0.0063, -0.0064,  0.0269, -0.0065, -0.0448, -0.0150, -0.0176, -0.0078,
        -0.0008,  0.0045])


In [0]:
t_2=ta.functional.istft(t_complex.squeeze(1),n_fft=640,hop_length=640//2)
#yx_2=ta.functional.istft(y_complex_x_ph.squeeze(1),n_fft=640,hop_length=640//2)
#yt_2=ta.functional.istft(y_complex_t_ph.squeeze(1),n_fft=640,hop_length=640//2)
x_2=ta.functional.istft(x_complex.squeeze(1),n_fft=640,hop_length=640//2)

x_2.shape

torch.Size([64, 96000])

In [0]:
i=50

In [0]:
#Target
Audio(t_2[i,:].cpu().detach().numpy(),rate=16000)

In [0]:
#Input:
Audio(x_2[i,:].cpu().detach().numpy(),rate=16000)

In [0]:
#Output with input phase:
Audio(yx_2[i,:].cpu().detach().numpy(),rate=16000)

In [0]:
#Output with target phase:
Audio(yt_2[i,:].cpu().detach().numpy(),rate=16000)

In [0]:
target = target.view(1,1,-1)

In [0]:
i = 1

In [0]:
#Target
Audio(target[0,0,:].cpu().detach().numpy(),rate=16000)

In [0]:
y = y.view(1,1,-1)

In [0]:
#Input:
Audio(y[0,0,:].cpu().detach().numpy(),rate=16000)

NameError: ignored

In [0]:
x = x.view(1,1,-1)

In [0]:
#Output with input phase:
Audio(x[0,0,:].cpu().detach().numpy(),rate=16000)