# **MAAS (GPU) Filter Module**

This notebook details the process of filtering WAV audio files for use in the MAAS project. The first four steps implement the process described by Salamon and Gómez [2]. The rest is inspired by the research done by Wang [3].

DISCLAIMER: English was purposely used to write comments and cells as it suits our ideas better.

NOTE: This notebook is a module that MAAS_UI.ipynb requires in order to function.

In [None]:
# Define Mauricio's Audio Analysis System function
def MAAS_filter(data, sr = 44100, filename = None):
  """Turn an audio signal into (time, interval) pairs describing its melody.

  The pipeline follows the step headings below:
    1. Sinusoid extraction (equal-loudness filter, STFT, peak picking and
       phase-based frequency/amplitude correction).
    2. Salience function over 600 pitch bins per STFT frame.
    3. Grouping of salience peaks into pitch contours.
    4. Melody selection (voicing filter, octave duplicates, pitch outliers).
    5. Pitch quantization and interval timestamping.

  `cp` and `sig` are not defined in this block; presumably CuPy and a
  SciPy-compatible signal module imported elsewhere -- TODO confirm.

  Args:
    data: Audio samples. Must expose `.size` and be accepted by
      `sig.lfilter`; presumably a 1-D mono array -- TODO confirm.
    sr: Sample rate in Hz. The STFT constants below are documented as being
      tuned for 44.1 kHz.
    filename: Optional path. When not None, the result is also written there
      as text with a "Time,Interval" header and one pair per line.

  Returns:
    A two-element list `[Time_array, Interval_array]`. Times are expressed in
    STFT frame indices (not seconds) and intervals in salience-bin units
    (10 cents per bin, see `Bin`).
  """
  ## **Step 1.** Sinusoid Extraction
  # 1.1. Equal Loudness Filtering
  # Filter approximation by [1]

  # Coefficients of a 10th-order IIR section (yule_*) and a 2nd-order
  # section (butter_*), as published by [1].
  yule_b = cp.array([0.05418656406430, -0.02911007808948, -0.00848709379851, -0.00851165645469,
                     -0.00834990904936, 0.02245293253339, -0.02596338512915, 0.01624864962975,
                     -0.00240879051584, 0.00674613682247, -0.00187763777362])
  yule_a = cp.array([1.0, -3.47845948550071, 6.36317777566148, -8.54751527471874, 9.47693607801280,
                     -8.81498681370155, 6.85401540936998, -4.39470996079559, 2.19611684890774,
                     -0.75104302451432, 0.13149317958808])
  butter_b = cp.array([0.98500175787242, -1.97000351574484, 0.98500175787242])
  butter_a = cp.array([1.0, -1.96977855582618, 0.97022847566350])

  # Convolving coefficient vectors multiplies the polynomials, so the two
  # sections are cascaded into a single numerator/denominator pair.
  num = cp.convolve(yule_b, butter_b)
  den = cp.convolve(yule_a, butter_a)

  eq_data = sig.lfilter(num, den, data)
  # 1.2. Spectral Transform
  # Apply Short-Time Fourier Transform

  # Constants as calculated by [2]
  # Values specific to 44.1 kHz, should be scalable to other sample rates
  M = 2048   # window length (nperseg)
  N = 8192   # FFT length (nfft), i.e. zero-padded by a factor of 4
  H = 128    # used as the hop size in the phase formula further below

  # Second positional argument is `sym`; False requests a periodic window
  # (assuming sig.windows.hann follows scipy.signal.windows.hann).
  win = sig.windows.hann(M, False)
  # NOTE(review): H is passed as `noverlap`. If sig.stft follows
  # scipy.signal.stft, the resulting hop is nperseg - noverlap = M - H, not H,
  # which would disagree with the 2*pi*H*k/N term used below -- confirm.
  # return_onesided=False keeps the full two-sided spectrum, so axis 0 of Sx
  # presumably has N rows including negative-frequency bins -- TODO confirm.
  f, t, Sx = sig.stft(eq_data, fs=sr, window=win, nperseg=M, noverlap=H, nfft=N, return_onesided=False)

  # 1.3. Frequency/Amplitude Correction
  # Identify local maxima of each time frame

  # Local maxima of |Sx| along axis 0, compared against 2 neighbours per side.
  # `peaks` is a tuple (axis-0 indices, axis-1 indices).
  peaks = sig.argrelmax(abs(Sx), axis=0, order=2)
  # Keep only peaks louder than 1/20 of the loudest peak.
  # NOTE(review): this is the Python builtin `max`, which iterates the array
  # element by element; cp.max may have been intended -- confirm.
  pcp = max(abs(Sx[peaks[0], peaks[1]])) / 20
  peak_ids = abs(Sx[peaks[0], peaks[1]]) > pcp
  peaks_f = peaks[0][peak_ids]   # frequency-bin index of each kept peak
  peaks_t = peaks[1][peak_ids]   # frame index of each kept peak

  # x and y scale actually important. do not delete
  # (x_scale is not referenced again inside this function; y_scale is
  # re-assigned with the same expression a few lines below.)
  x_scale = data.size/Sx.shape[1]
  y_scale = sr/N

  # Correct frequency and amplitude based on peaks' phase
  # Done by computing instantaneous frequency (IF) and magnitude (Ai)
  # Further clarification about this implementation by [4] and [5]

  y_scale = sr/N   # Hz per FFT bin
  # Phase at the peak minus the phase advance expected for bin k over H samples...
  k_offset = cp.angle(Sx[peaks_f, peaks_t]) - (2*cp.pi*H*peaks_f/N)
  # ...minus the phase of the same bin in the previous frame (frame 0 has none).
  mask = peaks_t > 0
  k_offset[mask] -= cp.angle(Sx[peaks_f[mask], peaks_t[mask]-1])
  # Wrap the phase deviation into (-pi, pi].
  k_offset = k_offset%(2*cp.pi)
  mask = k_offset > cp.pi
  k_offset[mask] -= 2*cp.pi
  # Window-kernel value used to correct the amplitude.
  # NOTE(review): Wh is evaluated while k_offset is still a phase in radians;
  # the conversion to a bin offset only happens on the next line -- confirm
  # this ordering is intended.
  Wh = cp.sinc(k_offset) / (2*(1 - k_offset**2))
  # Convert the phase deviation to a (fractional) bin offset.
  k_offset *= N/(2*cp.pi*H)

  IF = (peaks_f + k_offset) * y_scale      # corrected frequency in Hz
  Ai = abs(Sx[peaks_f, peaks_t]) / (2*Wh)  # corrected magnitude

  ## **Step 2.** Salience Function Computation
  # Constant values given by [2]
  alpha = 0.8   # per-harmonic decay base, used as alpha**(h-1) in Weight
  beta = 1      # exponent applied to peak magnitudes
  gamma = 2     # log10 magnitude-ratio threshold passed to Threshold
  Nh = 20       # number of harmonics considered by Weight

  # Useful peak indexing per frame
  # Rows are (frame index, peak index), sorted by frame so that each frame's
  # peaks form one contiguous run.
  peaks_by_t = cp.stack((peaks_t, cp.arange(peaks_t.size)), axis=1)
  peaks_by_t = peaks_by_t[cp.argsort(peaks_by_t[:,0])]

  # Calculate number of peaks and highest peak per frame
  peaks_per_f = cp.zeros(Sx.shape[1], dtype=int)
  max_peak = cp.zeros(peaks_per_f.size, dtype=int)

  for i in range(peaks_t.size):
    peaks_per_f[peaks_t[i]] += 1
    # NOTE(review): this compares a frequency in Hz (IF[i]) with a stored peak
    # *index* (max_peak[...]). Threshold() later reads Ai[max_peak[t]] as the
    # frame's reference magnitude, so a magnitude comparison such as
    # Ai[i] > Ai[max_peak[...]] may have been intended -- confirm.
    if IF[i] > max_peak[peaks_t[i]]:
      max_peak[peaks_t[i]] = i
  # Function definitions

  # Bin: Computes (discrete) bin number of given frequency
  # 120 bins per octave (10 cents each), referenced to 55 Hz.
  def Bin(fi):
    return cp.floor(120*cp.log2(fi/55) + 1)

  # Threshold: Whether or not a given peak is loud enough
  # compared to highest peak in its frame
  # Returns 1 where log10(|reference / a|) < g, else 0. With g = 2 this is a
  # magnitude ratio below 100 (40 dB).
  def Threshold(a, t, g):
    tr = cp.log10(abs(Ai[max_peak[t]] / a))
    ea = cp.where(tr < g, 1, 0)
    return ea

  # Weight: Assigns (cos^2) weight to bin if the given peak is a
  # multiple of the bin's center frequency (harmonic)
  # a: harmonic decay base, b: target bin, h: number of harmonics,
  # fi: peak frequencies. Returns one summed weight per entry of fi.
  def Weight(a, b, h, fi):
    nh = cp.arange(h)+1
    # Distance between bin b and the bin of fi/h, in units of 10 bins.
    d = abs(Bin(cp.outer(fi, (1 / nh))) - b)/10
    w = (a**(nh-1)) * cp.cos(cp.pi*d/2)**2
    # Harmonics further than 10 bins from b contribute nothing.
    w[(d > 1)] = 0
    return cp.sum(w, axis=1)
  # Computation of every bin's salience per frame

  # 600 bins of 10 cents = 5 octaves above 55 Hz; one column per STFT frame.
  Sb = cp.zeros((600, Sx.shape[1]))

  # `index` walks through peaks_by_t frame by frame using the per-frame counts.
  index = 0
  for l in range(Sb.shape[1]):
    idx = slice(index, index + peaks_per_f[l])
    # NOTE(review): `ea` is computed here but never used in the sum below, so
    # the magnitude threshold currently has no effect on Sb -- confirm whether
    # cp.sum(ea * w * ab) was intended.
    ea = Threshold(Ai[peaks_by_t[idx,1]], l, gamma)
    ab = abs(Ai[peaks_by_t[idx,1]])**beta
    for b in range(Sb.shape[0]):
      w = Weight(alpha, b+1, Nh, IF[peaks_by_t[idx,1]])
      Sb[b, l] = cp.sum(w * ab)
    index += peaks_per_f[l]

  # Identify local maxima of each time frame

  # Local maxima of the salience along the bin axis (5 neighbours per side),
  # re-ordered by frame.
  Speaks = sig.argrelmax(Sb, axis=0, order=5)
  Speaks_arr = cp.array([Speaks[0], Speaks[1]]).T
  Speaks_arr = Speaks_arr[cp.argsort(Speaks_arr[:,1])]

  # Keep only salience peaks above 1/10 of the global maximum.
  pcp2 = max(abs(Sb[Speaks_arr[:,0], Speaks_arr[:,1]])) / 10
  Speak_ids = abs(Sb[Speaks_arr[:,0], Speaks_arr[:,1]]) > pcp2
  Speaks_b = Speaks_arr[Speak_ids, 0].flatten()   # bin index of each salience peak
  Speaks_t = Speaks_arr[Speak_ids, 1].flatten()   # frame index of each salience peak

  ## **Step 3.** Peak Streaming
  # Calculate highest salience peaks per frame
  # and filter low salience peaks

  peak_Sb = Sb[Speaks_b, Speaks_t]
  # max_Speak[frame] = index (into peak_Sb) of that frame's most salient peak.
  # Entries start at 0, so each frame is initially compared against peak 0.
  max_Speak = cp.zeros(Sb.shape[1], dtype=int)

  for i in range(peak_Sb.size):
    if peak_Sb[i] > peak_Sb[max_Speak[Speaks_t[i]]]:
      max_Speak[Speaks_t[i]] = i

  # Peaks below 90% of their frame's top salience go to the rejected set S_minus.
  max_id = max_Speak[Speaks_t]
  mask = peak_Sb < peak_Sb[max_id] * 0.9
  S_minus = cp.arange(peak_Sb.size)[mask]
  # Filter remaining peaks based on general frame salience

  # boolarr[i] is True while peak i is still accepted.
  boolarr = cp.ones(peak_Sb.size, dtype=bool)
  boolarr[S_minus] = False

  # NOTE(review): this overwrites the FFT-size constant N; the new value is
  # not read anywhere later in this function.
  N = peak_Sb.size - S_minus.size
  s_mean = cp.average(peak_Sb[boolarr])
  s_dev = cp.std(peak_Sb[boolarr])
  min_S = s_mean - 0.9*s_dev

  # Accepted peaks below (mean - 0.9 * std) are moved to S_minus as well.
  mask = (boolarr == 1) & (peak_Sb < min_S)
  S_minus = cp.append(S_minus, cp.arange(peak_Sb.size)[mask])
  boolarr[mask] = 0

  # Actual contour creation
  # Travel through the remaining ordered peaks and group them based on
  # time and pitch continuity
  # Might use some filtered peaks to maintain the continuities

  S_plus = cp.array(cp.where(boolarr == 1)).flatten()   # accepted peak indices
  contours = cp.zeros(S_plus.size, dtype=int)
  contour_ids = cp.zeros((0,2), dtype=int)              # one [start, end] row per contour
  first_f = min(Speaks_t)
  last_f = max(Speaks_t)
  contours_idx = 0

  # Each pass seeds a contour at the most salient remaining peak, grows it
  # forward and backward in time, then removes the consumed peaks.
  while S_plus.size:
    wpeak_Sb = cp.copy(peak_Sb[S_plus])
    it = cp.argmax(wpeak_Sb)   # position inside S_plus (not a peak index)
    it2 = 0
    # NOTE(review): `it` is a position in S_plus, yet it indexes Speaks_t
    # directly here; Speaks_t[S_plus[it]] may have been intended -- confirm.
    mask = Speaks_t[S_minus] == Speaks_t[it]
    if mask.any(): it2 = cp.arange(S_minus.size)[mask][0]

    c_contour = cp.zeros(1, dtype=int)   # peak indices forming this contour
    c_delete = cp.zeros(1, dtype=int)    # positions in S_plus to drop afterwards
    c_delete2 = cp.zeros(1, dtype=int)   # positions in S_minus to drop afterwards
    c_contour[0] = S_plus[it]
    c_delete[0] = it

    # Forward
    last_p = S_plus[it]   # peak index of the contour's current end
    last = it
    os = 1                # offset from `it` while scanning S_plus
    last2 = it2
    os2 = 1               # offset from `it2` while scanning S_minus
    gap = 1               # frames searched ahead of the current end

    # Stop once 35 frames pass without a match or the last frame is reached.
    while gap < 35 and (Speaks_t[last_p] + gap) <= last_f:
      # Candidate in the same frame as the current end: skip it.
      if (it+os) < S_plus.size and Speaks_t[S_plus[it+os]] == Speaks_t[last_p]:
        os += 1
      # Candidate exactly `gap` frames ahead: accept if within 9 bins in pitch.
      elif (it+os) < S_plus.size and Speaks_t[S_plus[it+os]] == Speaks_t[last_p] + gap:
        if abs(Speaks_b[S_plus[it+os]] - Speaks_b[last_p]) < 9:
          last = it + os
          last_p = S_plus[last]
          c_contour = cp.append(c_contour, last_p)
          c_delete = cp.append(c_delete, last)
          gap = 1
        os += 1
      else:
        # No usable accepted peak: try the rejected peaks with the same rules.
        if (it2+os2) < S_minus.size and Speaks_t[S_minus[it2+os2]] == Speaks_t[last_p]:
          os2 += 1
        elif (it2+os2) < S_minus.size and Speaks_t[S_minus[it2+os2]] == Speaks_t[last_p] + gap:
          if abs(Speaks_b[S_minus[it2+os2]] - Speaks_b[last_p]) < 9:
            last2 = it2 + os2
            last_p = S_minus[last2]
            c_contour = cp.append(c_contour, last_p)
            c_delete2 = cp.append(c_delete2, last2)
            gap = 1
          os2 += 1
        else: gap += 1

    # Backward
    # Reverse the contour so that backward matches can be appended at the
    # end; it is flipped back into time order after the loop.
    c_contour = cp.flip(c_contour)
    last_p = S_plus[it]
    last = it
    os = 1
    last2 = it2
    os2 = 1
    gap = 1

    # Mirror image of the forward pass, walking towards earlier frames.
    while gap < 35 and (Speaks_t[last_p] - gap) >= first_f:
      if (it-os) >= 0 and Speaks_t[S_plus[it-os]] == Speaks_t[last_p]:
        os += 1
      elif (it-os) >= 0 and Speaks_t[S_plus[it-os]] == Speaks_t[last_p] - gap:
        if abs(Speaks_b[S_plus[it-os]] - Speaks_b[last_p]) < 9:
          last = it - os
          last_p = S_plus[last]
          c_contour = cp.append(c_contour, last_p)
          c_delete = cp.append(c_delete, last)
          gap = 1
        os += 1
      else:
        if (it2-os2) >= 0 and Speaks_t[S_minus[it2-os2]] == Speaks_t[last_p]:
          os2 += 1
        elif (it2-os2) >= 0 and Speaks_t[S_minus[it2-os2]] == Speaks_t[last_p] - gap:
          if abs(Speaks_b[S_minus[it2-os2]] - Speaks_b[last_p]) < 9:
            last2 = it2 - os2
            last_p = S_minus[last2]
            c_contour = cp.append(c_contour, last_p)
            c_delete2 = cp.append(c_delete2, last2)
            gap = 1
          os2 += 1
        else: gap += 1

    c_contour = cp.flip(c_contour)
    # NOTE(review): `contours` is never resized, so contours.size is the same
    # on every iteration and each recorded [start, end] row begins at the same
    # offset -- confirm the intended bookkeeping.
    contour_ids = cp.append(contour_ids, [[contours.size, contours.size + c_contour.size]], axis=0)
    # NOTE(review): this assigns a whole index array to a single element of a
    # 1-D int array; it presumably fails whenever c_contour has more than one
    # entry. Later code reads `contours` both as a flat array
    # (contours[c[0]:c[1]]) and as a per-contour container (contours[c][0]) --
    # confirm the intended layout.
    contours[contours_idx] = c_contour

    S_plus = cp.delete(S_plus, c_delete)
    # NOTE(review): c_delete2 always keeps its initial entry 0, so position 0
    # of S_minus is deleted on every iteration even when no rejected peak was
    # used -- confirm.
    S_minus = cp.delete(S_minus, c_delete2)
    contours_idx += 1

  ## **Step 4.** Melody Selection
  # Calculate certain attributes for each contour

  # Per-contour statistics: pitch (bin) mean/std and salience mean/total/std.
  p_mean = cp.array([cp.mean(Speaks_b[contours[c[0]:c[1]]]) for c in contour_ids])
  p_std = cp.array([cp.std(Speaks_b[contours[c[0]:c[1]]]) for c in contour_ids])
  s_mean = cp.array([cp.mean(peak_Sb[contours[c[0]:c[1]]]) for c in contour_ids])
  s_total = cp.array([cp.sum(peak_Sb[contours[c[0]:c[1]]]) for c in contour_ids])
  s_std = cp.array([cp.std(peak_Sb[contours[c[0]:c[1]]]) for c in contour_ids])
  # 4.1. Voicing Detection

  # Give contours with considerable pitch deviation "immunity"
  # for the rest of this step
  # (contours with a pitch std of 4 bins or more are never removed here)
  not_vib = p_std < 4

  # Filter contours with low salience
  s_meanmean = cp.mean(s_mean)
  s_meanstd = cp.std(s_mean)
  v = 0.4
  mask = not_vib & (s_mean < (s_meanmean - v*s_meanstd))
  dell = cp.flip(cp.where(mask)[0]).flatten()
  # NOTE(review): no `axis` is given. If cp.delete follows numpy.delete this
  # flattens contour_ids and removes single elements rather than whole
  # [start, end] rows -- confirm (axis=0 may have been intended).
  contour_ids = cp.delete(contour_ids, dell)

  # 4.2. Octave Errors and Pitch Outliers
  # Function definitions

  # Derive melody pitch mean from given contours
  # Returns one value per frame: the salience-weighted pitch of the given
  # contours, smoothed by a moving average over frames [i-860, i+860] that
  # counts only non-zero frames.
  def CreatePt(cont):
    Pt = cp.zeros(Sb.shape[1])     # salience-weighted pitch sum per frame
    Pt_d = cp.zeros(Sb.shape[1])   # salience sum per frame (the normaliser)

    for c in cont:
      # NOTE(review): `contours[c]` indexes by the entry itself, whereas the
      # statistics above slice with contours[c[0]:c[1]] -- confirm which
      # layout of `contours` is intended.
      k = Speaks_t[contours[c]]
      Pt[k] += Speaks_b[contours[c]]*peak_Sb[contours[c]]
      Pt_d[k] += peak_Sb[contours[c]]
    # Normalise only the frames that received a contribution.
    mask = Pt_d != 0
    Pt[mask] /= Pt_d[mask]

    Pt_ = cp.zeros(Pt.size)
    for i in range(Pt_.size):
      rge = slice(max([0, i-860]), min([Pt.size, i+861]))
      Pt_[i] = cp.sum(Pt[rge])
      n = cp.count_nonzero(Pt[rge])
      if n != 0: Pt_[i] /= n

    return Pt_

  # Find contour overlap sections
  # Returns rows [i, i2, first_frame, last_frame], where i and i2 are
  # positions in `cont` of two contours that share at least one frame.
  def Overlaps(cont):
    overlaps = cp.zeros((0,4), dtype=int)
    # `timer` is a flat list of contour positions grouped by frame;
    # timer_ids[j] = [start, end) is the slice of `timer` for frame j.
    timer = cp.zeros(0, dtype=int)
    timer_ids = cp.zeros((Sb.shape[1], 2), dtype=int)

    for i in range(cont.size):
      c = cont[i]
      # Visit every frame between the contour's first and last peak.
      for j in range(Speaks_t[contours[c][0]], Speaks_t[contours[c][-1]]+1):
        # Contours already registered in frame j overlap with contour i.
        for a in range(timer_ids[j,0], timer_ids[j,1]):
          i2 = timer[a]
          c2 = cont[i2]
          # Record each (i, i2) pair once, at the first shared frame.
          done = (overlaps.T[0] == i) & (overlaps.T[1] == i2)
          if done.any(): continue

          last = min([Speaks_t[contours[c][-1]], Speaks_t[contours[c2][-1]]])
          overlaps = cp.append(overlaps, [[i, i2, j, last]], axis=0)

          new_x = overlaps.shape[0] - 1
          # NOTE(review): k runs over 0 .. shape[0]-2 while new_x starts at
          # shape[0]-1 and only changes inside the branch, so `k > new_x` is
          # never true and the swap below looks unreachable -- confirm
          # (`k < new_x` may have been intended).
          for k in range(overlaps.shape[0]-1):
            if (overlaps[k,0] == i or overlaps[k,1] == i or
                overlaps[k,0] == i2 or overlaps[k,1] == i2):
              if ((overlaps[k,3] - overlaps[k,2]) <
                  (overlaps[new_x,3] - overlaps[new_x,2])) and k > new_x:
                overlaps[k], overlaps[new_x] = overlaps[new_x], overlaps[k]
                new_x = k
        # Register contour i in frame j and shift the slices of later frames.
        timer = cp.insert(timer, timer_ids[j,1], i)
        timer_ids.T[1, j:] += 1
        timer_ids.T[0, (j+1):] += 1

    return overlaps

  # Detect pairs of octave duplicates and pick closest to Pt
  # over: one row [i, i2, first_frame, last_frame] from Overlaps().
  # Returns 0 when the pair is not an octave duplicate (or is tied),
  # 1 when contour over[0] is closer to Pt, 2 when contour over[1] is closer.
  def PickOctave(over, Pt):
    cont0 = contours[contour_ids[over[0]][0]:contour_ids[over[0]][1]]
    cont1 = contours[contour_ids[over[1]][0]:contour_ids[over[1]][1]]
    dif = cp.zeros(0)   # pitch distance between both contours, per shared frame
    pt0 = cp.zeros(0)   # distance of contour 0 from Pt, per shared frame
    pt1 = cp.zeros(0)   # distance of contour 1 from Pt, per shared frame
    i0 = 0
    i1 = 0

    # Advance both cursors to the first frame of the overlap.
    while Speaks_t[cont0[i0]] != over[2]:
      if (i0+1) >= (cont0.size): break
      else: i0 += 1
    while Speaks_t[cont1[i1]] != over[2]:
      if (i1+1) >= (cont1.size): break
      else: i1 += 1
    # Walk the overlap; frames where either contour has no peak are skipped.
    for t in range(over[2], over[3]+1):
      while Speaks_t[cont0[i0]] < t:
        if (i0+1) >= (cont0.size): break
        else: i0 += 1
      if Speaks_t[cont0[i0]] > t: continue
      while Speaks_t[cont1[i1]] < t:
        if (i1+1) >= (cont1.size): break
        else: i1 += 1
      if Speaks_t[cont1[i1]] > t: continue

      b0 = Speaks_b[cont0[i0]]
      b1 = Speaks_b[cont1[i1]]
      dif = cp.append(dif, abs(b0 - b1))
      pt0 = cp.append(pt0, abs(Pt[t] - b0))
      pt1 = cp.append(pt1, abs(Pt[t] - b1))

    # 120 bins is one octave; a mean distance within +-5 bins of that marks
    # the pair as octave duplicates.
    if cp.mean(dif) >= 115 and cp.mean(dif) <= 125:
      if cp.mean(pt0) == cp.mean(pt1): return 0
      elif cp.mean(pt0) < cp.mean(pt1): return 1
      else: return 2
    else: return 0
  # Filter out octave duplicates and pitch outliers
  # Repeat process three times with updated melody pitch means

  Pt = CreatePt(contour_ids)
  overlaps = Overlaps(contour_ids)

  for _ in range(3):
    # NOTE(review): contour_ids.size counts elements, not rows; for a 2-D
    # contour_ids this is twice the number of contours -- confirm.
    rest = range(contour_ids.size)

    dell = cp.zeros(0, dtype=int)
    # probably remove for
    for i in range(overlaps.shape[0]):
      # Skip pairs in which one contour is already marked for deletion.
      if ((dell == overlaps[i,0]) | (dell == overlaps[i,1])).any(): continue
      result = PickOctave(overlaps[i], Pt)
      # Result 1 keeps contour 0 and drops contour 1; result 2 the opposite.
      if result == 1: octave = overlaps[i,1]
      elif result == 2: octave = overlaps[i,0]
      if result != 0:
        dell = cp.append(dell, octave)
    rest = cp.delete(rest, dell)
    Pt = CreatePt(contour_ids[rest])

    # Drop contours whose mean distance from Pt exceeds 120 bins (one octave).
    dell = cp.zeros(0, dtype=int)
    for i in range(rest.size):
      cont = contours[contour_ids[rest[i]][0]:contour_ids[rest[i]][1]]
      ptdif = cp.zeros(0)
      t = Speaks_t[cont]
      ptdif = cp.append(ptdif, abs(Speaks_b[cont] - Pt[t]))
      if cp.mean(ptdif) > 120:
        dell = cp.append(dell, i)
    rest = cp.delete(rest, dell)
    Pt = CreatePt(contour_ids[rest])

  # 4.3. Final Melody Selection

  contour_ids = contour_ids[rest]
  s_total2 = s_total[rest]

  # For every remaining overlapping pair, drop the contour with the lower
  # total salience.
  overlaps = Overlaps(contour_ids)
  not_final = cp.where(s_total2[overlaps.T[0]] < s_total2[overlaps.T[1]],
                       overlaps.T[0], overlaps.T[1])

  final = cp.delete(range(contour_ids.size), not_final)
  final_contours = contour_ids[final]

  ## **Step 5.** Interval Timestamping
  # Sort contours by time

  # fstart[i] is read from the first peak of contour i and used as sort key.
  fstart = cp.zeros(final_contours.size, dtype=int)
  for i in range(fstart.size):
    fstart[i] = Speaks_t[contours[final_contours[i][0]]]
  f0 = cp.stack((final_contours, fstart), axis=1)
  f0 = f0[cp.argsort(f0[:,1])]

  # Concatenate the peaks of the sorted contours; fp_ids[i] = [start, end)
  # marks contour i's range inside final_peaks.
  final_peaks = cp.zeros(0, dtype=int)
  fp_ids = cp.zeros((fstart.size,2), dtype=int)
  for i in range(fstart.size):
    # NOTE(review): `len` shadows the Python builtin for the rest of this
    # function.
    len = contours[f0[i,0]].size
    fp_ids[i] = [final_peaks.size, final_peaks.size + len]
    final_peaks = cp.append(final_peaks, contours[f0[i,0]])
  # Use contour with lowest pitch standard deviation
  # as a basis for pitch quantization
  # (Poor man's autotune)

  final_t = Speaks_t[final_peaks]   # frame index of each melody peak
  final_b = Speaks_b[final_peaks]   # bin index of each melody peak

  p_std = cp.array([cp.std(final_b[fp[0]:fp[1]]) for fp in fp_ids])
  p_base_id = fp_ids[cp.argmin(p_std)]
  p_base = round(cp.mean(final_b[p_base_id[0]:p_base_id[1]]))

  qnt = 5   # Basically quantize by quarter tones
  # Shift so the grid passes through p_base, round to the nearest multiple
  # of qnt, then shift back.
  final_b -= p_base % qnt
  final_b = qnt*cp.floor((final_b/qnt) + 0.5) + (p_base % qnt)

  # Timestamp: for melody peaks a and b, returns the midpoint frame between
  # them, the pitch difference in bins, and the distance in frames.
  def Timestamp(a, b):
    duration = final_t[b] - final_t[a]
    time = final_t[a] + (duration / 2)
    interval = final_b[b] - final_b[a]
    return time, interval, duration
  # Split the melody into notes: runs of constant quantized pitch, stored as
  # rows [first_peak, last_peak]. Row 0 is a placeholder dropped by notes[1:].
  notes = cp.zeros((1, 2), dtype=int)
  it = 0
  this_p = 0
  while it < final_peaks.size:
    if final_b[it] != this_p:
      # Close the previous note and open a new one at this peak.
      notes[-1, 1] = it - 1
      notes = cp.append(notes, [[it, 0]], axis=0)
    this_p = final_b[it]
    it += 1
  notes[-1, 1] = final_peaks.size - 1
  notes = notes[1:]

  Time_array = cp.zeros(0)
  Interval_array = cp.zeros(0)
  # Pair the end of each note with the start of the following notes, as long
  # as the later note starts at most 50 frames after the immediate successor.
  for i in range(notes.shape[0] - 1):
    j = 1
    note1 = notes[i, 1]
    note2 = notes[i+j, 0]
    this_time, this_inter, base_dr = Timestamp(note1, note2)

    # Extra duration relative to the immediate successor (base_dr).
    dr_offset = 0
    while dr_offset <= 50:
      Time_array = cp.append(Time_array, this_time)
      Interval_array = cp.append(Interval_array, this_inter)
      j += 1
      if i+j >= notes.shape[0]: break

      note2 = notes[i+j, 0]
      this_time, this_inter, dr_offset = Timestamp(note1, note2)
      dr_offset -= base_dr

  # Optionally dump the pairs as comma-separated text.
  if (filename is not None):
    with open(filename, "w") as txt:
      txt.write("Time,Interval\n")
      for i in range(Time_array.size):
        txt.write(str(Time_array[i]) + "," + str(Interval_array[i]) + "\n")
  return [Time_array, Interval_array]

## References

[1] https://replaygain.hydrogenaud.io/equal_loudness.html \\
[2] https://www.justinsalamon.com/uploads/4/3/9/4/4394963/salamongomezmelodytaslp2012.pdf \\
[3] https://www.ee.columbia.edu/~dpwe/papers/Wang03-shazam.pdf \\
[4] https://dafx.de/paper-archive/2006/papers/p_247.pdf \\
[5] https://www.db-thueringen.de/servlets/MCRFileNodeServlet/dbt_derivate_00038847/ilm1-2017000136.pdf