In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter

In [2]:
dataset_path = "/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set"

FRAME_SIZE = 50
BPF_LOW = 50
BPF_HIGH = 1000

N_FFT = 1024
HOP_LENGTH = 256

MARGIN_I, MARGIN_V = 2, 10
POWER = 2

In [3]:
wav_files = [dataset_path + "/" + f.name for f in list(os.scandir(dataset_path)) if f.name.endswith('.wav')]
data_files = [f.replace(".wav", "REF.txt") for f in wav_files]

In [4]:
x_data = []
y_data = []

In [5]:
def get_segments(data_file):
  data = []
  with open(data_file, 'r') as f:
    raw_data = f.readlines()
    for line in raw_data:
      data.append([float(x) for x in line.strip().split()])
  
  annotations = []
  for i in range(1, len(data)):
    temp = int(data[i][0] * 1000) - int(data[i-1][0] * 1000)
    while (temp > 0):
      temp -= 1
      annotations.append(data[i-1][1])

  frames = []
  for i in range(0, len(annotations)-FRAME_SIZE, FRAME_SIZE):
    frame = [i/1000, (i+FRAME_SIZE)/1000, 0]
    pitch = 0
    for j in range(i, i+FRAME_SIZE):
      pitch += annotations[j]
    pitch /= FRAME_SIZE
    frame[2] = pitch
    frames.append(frame)
  
  segments = []
  for frame in frames:
    if frame[2] != 0:
      segments.append(frame)
  segments = np.array(segments)
  
  # plt.figure(figsize=(20, 5))
  # plt.scatter(segments[:, 0], segments[:, 2])
  # plt.title("Annotations")
  # plt.xlabel("Time (s)")
  # plt.ylabel("Pitch (Hz)")
  # plt.grid()
  # plt.show()
  
  return segments

In [6]:
def bandpass_filter(y, sr):
  nyquist = 0.5 * sr
  low = BPF_LOW / nyquist
  high = BPF_HIGH / nyquist
  b, a = butter(5, [low, high], btype='band')
  data = lfilter(b, a, y)
  return np.array(data)

In [7]:
def make_data(wav_file, data_file):
  y, sr = librosa.load(wav_file)
  y_filtered = bandpass_filter(y, sr)
  
  segments = get_segments(data_file)
  for segment in segments:
    start, end, pitch = segment
    y_segment = y_filtered[int(start*sr) : int(end*sr)]    
    s_full, phase = librosa.magphase(librosa.stft(y_segment, n_fft=N_FFT, hop_length=HOP_LENGTH))
    s_filter = librosa.decompose.nn_filter(s_full, aggregate=np.median, metric='cosine')
    s_filter = np.minimum(s_full, s_filter)
    mask_v = librosa.util.softmask(s_full-s_filter, MARGIN_V * s_filter, power=POWER)
    s_foreground = mask_v * s_full
    spec_db_foregound = librosa.amplitude_to_db(s_foreground, ref=np.max)
    
    # plt.figure(figsize=(20, 5))
    # librosa.display.specshow(spec_db_foregound, sr=sr, x_axis='time', y_axis='log')
    # plt.colorbar()
    # plt.title("Spectrogram")
    # plt.xlabel("Time (s)")
    # plt.ylabel("Frequency (Hz)")
    # plt.show()
    
    x_data.append(spec_db_foregound.flatten())
    y_data.append(pitch)

  print(wav_file)

In [8]:
import concurrent.futures

In [9]:
max_threads = 8
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
  futures = [executor.submit(make_data, wav_file, data_file) for wav_file, data_file in zip(wav_files, data_files)]
  concurrent.futures.wait(futures)

/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/jazz2.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/jazz1.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/midi4.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/daisy3.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/pop4.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/midi1.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/daisy2.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/opera_male5.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/jazz3.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/jazz4.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/opera_fem4.wav
/home/adit/Downloads/EE798P/Datasets/Melody Estimation/adc2004_full_set/daisy1.wav
/ho

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)

In [11]:
from sklearn.neural_network import MLPRegressor

In [12]:
model = MLPRegressor(hidden_layer_sizes=(100, 100, 100), max_iter=100, verbose=True)
model.fit(x_train, y_train)

Iteration 1, loss = 18553.55363229
Iteration 2, loss = 14046.81101381
Iteration 3, loss = 13911.93675717
Iteration 4, loss = 13807.75219208
Iteration 5, loss = 14626.05340489
Iteration 6, loss = 13838.14359192
Iteration 7, loss = 13402.54642544
Iteration 8, loss = 13024.61948730
Iteration 9, loss = 13286.29726332
Iteration 10, loss = 12550.14069251
Iteration 11, loss = 12218.16999043
Iteration 12, loss = 12552.31187962
Iteration 13, loss = 13110.23909429
Iteration 14, loss = 11984.27940785
Iteration 15, loss = 12296.51500547
Iteration 16, loss = 11891.90155636
Iteration 17, loss = 11553.74786334
Iteration 18, loss = 11148.56404451
Iteration 19, loss = 10892.04925496
Iteration 20, loss = 11312.78117622
Iteration 21, loss = 11187.00284014
Iteration 22, loss = 11223.79304632
Iteration 23, loss = 10271.21327164
Iteration 24, loss = 10831.01068701
Iteration 25, loss = 10301.90648861
Iteration 26, loss = 10404.04649447
Iteration 27, loss = 10417.07748462
Iteration 28, loss = 10421.80294340
I

In [13]:
# Evalutaing MAE
y_pred = model.predict(x_test)
mae = np.mean(np.abs(y_pred - y_test))
print("MAE: %.2f" % mae)

MAE: 96.97
