# 3. Normalise and Plot Melodies, and Save Data 

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import IPython.display as ipd
from IPython.display import Audio
import mir_eval.sonify
from mir_eval.sonify import pitch_contour
import os
import pandas as pd
from glob import glob1

### Pitch Estimation and normalisation

In [6]:
def _forward_fill_zeros(values):
    """Return a copy of 1-D ``values`` with every zero replaced by the nearest
    preceding non-zero entry.

    Zeros located before the first non-zero entry have no predecessor, so they
    take the first non-zero value instead. An all-zero input is returned
    unchanged (as a copy).
    """
    nonzero = values != 0
    if not nonzero.any():
        # Nothing to propagate; looping until the zeros disappear would never end.
        return values.copy()
    # For each position, the index of the most recent non-zero entry
    # (stays at 0 until the first non-zero entry is reached).
    last_seen = np.maximum.accumulate(
        np.where(nonzero, np.arange(values.size), 0)
    )
    filled = values[last_seen]  # fancy indexing returns a new array
    first = int(np.argmax(nonzero))
    filled[:first] = values[first]
    return filled


def estimate_and_normalise(audio, plot=True, sonify=True, fs=22050, n_frames=431):
    """Estimate the pitch contour of an audio signal and normalise it.

    The contour is estimated with pYIN, cleaned (outliers and unvoiced frames
    are replaced by the preceding pitch value), padded or truncated to a fixed
    number of frames, and finally scaled.

    Parameters
    ----------
    audio : np.array
        A loaded audio wav file (assumed to be a 1D mono signal).
    plot : bool, optional
        If true, plot the normalised contour against time.
    sonify : bool, optional
        If true, synthesise the cleaned (not yet normalised) contour and
        display an audio player for it.
    fs : int, optional
        Sample rate of ``audio`` in Hz. Used for the pitch estimation, the
        time axis and the sonification. The default, 22050, is the default
        sample rate of ``librosa.load``.
    n_frames : int, optional
        Number of frames every returned contour is padded or truncated to.

    Returns
    -------
    f0 : np.array
        Array of shape (n_frames, 1). The cleaned contour divided by its sum,
        then mapped through log(1 + x). An entirely unvoiced input yields all
        zeros.
    times : np.array
        1D array of length n_frames with the frame times in seconds
        (not normalised).

    """
    # estimate frequency per frame; unvoiced frames are reported as 0
    f0, _, _ = librosa.pyin(
        audio,
        fmin=librosa.note_to_hz('C3'),
        fmax=librosa.note_to_hz('C7'),
        sr=fs,
        fill_na=0,
    )

    # treat very large jumps (likely estimation errors) as unvoiced: frames
    # more than 2 standard deviations away from the mean are set to 0
    contour = pd.Series(f0)
    z_scores = (contour - contour.mean()) / contour.std()
    # copy=True guarantees a writable array that is independent of pandas
    f0 = contour.mask(z_scores.abs() > 2).fillna(0).to_numpy(dtype=float, copy=True)

    # replace zeros (unvoiced frames and removed outliers) with the last
    # non-zero value
    f0 = _forward_fill_zeros(f0)

    # make sure that all phrases are the same length
    if f0.shape[0] < n_frames:
        f0 = np.pad(f0, (0, n_frames - f0.shape[0]), 'constant')
    else:
        f0 = f0[:n_frames]

    # create time array
    times = librosa.times_like(f0, sr=fs)

    # sonify
    if sonify:
        synthesised = pitch_contour(times, f0, fs=fs)
        ipd.display(ipd.Audio(synthesised, rate=fs))

    # normalise the frequency axis: scale the contour to unit sum, then
    # compress with log(1 + x). An all-zero contour is left as zeros instead
    # of being divided by zero.
    total = np.sum(f0)
    if total > 0:
        f0 = f0 / total
    f0 = np.log1p(f0)

    # plot
    if plot:
        plt.plot(times, f0, '-')
        plt.xlabel('time (sec)')
        plt.ylabel('normalised frequency')
        plt.show()

    # return the contour as a column so that callers can stack contours
    # side by side
    f0 = f0.reshape(f0.shape[0], 1)

    return f0, times

### Read dataset (single-phrases) and store in data matrix
(each column is one song; the matrix is transposed when it is saved, so that each row is one song)

In [7]:
# Build the data matrix: one row per frame, one column per audio file.
audio_dir = 'data/equal_phrases/'

# List the directory once. Only the .wav files are processed, so only they
# count towards the progress percentage.
wav_files = [name for name in os.listdir(audio_dir) if name.endswith('.wav')]

contours = []
file_count = 0
try:
    for file_name in wav_files:
        # load audio
        audio, fs = librosa.load(os.path.join(audio_dir, file_name))
        audio, _ = librosa.effects.trim(audio)
        # pitch estimation and normalisation
        f0, times = estimate_and_normalise(audio, plot=False, sonify=False)
        # collect the pitch contour; the matrix is assembled once at the end
        # instead of being re-copied for every file
        contours.append(f0)

        file_count += 1
        print(f"progress: {file_count / len(wav_files) * 100:.2f} %", end='\r')
finally:
    # Assemble whatever has been processed, so that an interrupted run still
    # leaves the partial matrix in data_set.
    data_set = np.hstack(contours) if contours else np.empty((431, 0))

progress: 7.61 %

KeyboardInterrupt: 

In [None]:
# Draw the whole data matrix on one set of axes (one line per column).
plt.plot(data_set)
axes = plt.gca()
axes.set_xlabel('Relative Time')
axes.set_ylabel('Relative Frequency')
axes.set_title('Full Data Set')
plt.show()

### Save dataset
(each row is one song)

In [None]:
# add file names to dataset
seperator = '_'

# Use the same listing and filter as the loading loop (os.listdir + '.wav'
# suffix) so that the labels cover the same files as the columns of the data
# matrix. This assumes the directory has not changed since it was loaded.
wav_names = [name for name in os.listdir(audio_dir) if name.endswith('.wav')]

# remove .wav
data_labels = np.array([name[:-len('.wav')] for name in wav_names])

# the country is the part of the file name before the first separator
country = np.array([label.split(seperator, 1)[0] for label in data_labels])

In [None]:
# Write the transposed data matrix, so that each line of data.dat is one song.
np.savetxt('data.dat', np.transpose(data_set))
# Write the country labels as plain strings.
np.savetxt('country.dat', np.transpose(country), fmt="%s", delimiter=" ")