## Main steps
For several song samples:

- Load a song STFT from `data/stft/fft-window-size_fft-hop-length/Album-name/song-name-dir/startoffset-endoffset.stft`
- Use the method in [Hermes's paper](https://drive.google.com/file/d/1yYo9DB9Vh0tHoE63swfu78u2_p_fcAD5/view?usp=sharing) (see Fig. 1) to detect the one, two, or three pitches (fundamental frequencies) of the voices in the audio

In [None]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import os
import scipy.signal
import scipy.fft
import scipy
import librosa.display
import math
from scipy.signal import butter
from scipy.signal import argrelextrema
from scipy.fft import fft, ifft, rfft, irfft


# Module-level configuration shared by the functions below.
# The *_dir values are concatenated directly with file names (no path join),
# so a non-empty value must end with '/'.

# Prefix of the audio files opened by load_file.
data_dir = '/Akamai/voice/data/'
# Prefix of the pitch text files written by create_and_store.
pitch_dir = ''
# Prefix of the CREPE estimate text files read by load_crepe.
crepe_dir = ''
# Prefix of the correction text files read by load_corrections.
correction_dir = ''
# Interpolator installed by init_cubic_spline_interpolation and evaluated by
# calculate_P; None until the first call.
tck = None
# Highest and lowest candidate pitch scanned by hermes (same unit as the
# frequency axis derived from the sampling rate).
max_pitch = 2000
min_pitch = 60
# Relative half-width of the search window estimate_pitch places around a
# CREPE estimate: crepe_estimate * (1 -/+ crepe_interval_estimate).
crepe_interval_estimate = 0.2

In [None]:
def load_file(file):
    """Load the audio file ``data_dir + file`` at its native sampling rate.

    Returns a two-element list ``[samples, sampling_rate]`` holding what
    ``librosa.load(..., sr=None)`` produced.
    """
    samples, sampling_rate = librosa.load(data_dir + file, sr=None)
    return [samples, sampling_rate]

def load_crepe(name):
    """Read the CREPE estimate table that belongs to audio file `name`.

    The last three characters of `name` (its extension) are replaced by
    ``txt`` and the resulting file is read from ``crepe_dir``.

    Returns a tuple ``(t, freq)``: the first and second column of the table.
    Raises whatever ``np.loadtxt`` raises for a missing or malformed file,
    and ``IndexError`` when the table has fewer than two columns.
    """
    path = crepe_dir + name[:-3] + 'txt'
    # Parse the file once (it used to be parsed once per column).
    # ndmin=2 keeps a single-row file two-dimensional so the column slices
    # below keep working.
    table = np.loadtxt(path, ndmin=2)
    return (table[:, 0], table[:, 1])

def load_corrections(name):
    """Read the correction boxes that belong to audio file `name`.

    The last three characters of `name` are replaced by ``txt`` and the file
    is looked up in ``correction_dir``. Every non-blank line must consist of
    whitespace-separated integers and becomes one list of ints.

    Returns the list of parsed lines, or ``[]`` (after printing a notice)
    when the file cannot be opened. Raises ``ValueError`` when a line holds
    something that is not an integer.
    """
    path = f'{correction_dir}{name[:-3]}txt'
    try:
        correction_file = open(path)
    except OSError:
        print('no correction file for', path)
        return []
    # The context manager closes the handle; split() without an argument
    # tolerates repeated spaces and the trailing newline, and blank lines
    # are skipped instead of crashing int('').
    with correction_file:
        return [
            list(map(int, line.split()))
            for line in correction_file
            if line.strip()
        ]

def find_correction_box(time, corrections):
    """Return the bounds of the first correction whose span contains `time`.

    A correction matches when ``correction[0] <= time <= correction[2]``;
    the result is then ``(correction[1], correction[3])``. ``None`` is
    returned when no correction matches.
    """
    matching_bounds = (
        (entry[1], entry[3])
        for entry in corrections
        if entry[0] <= time <= entry[2]
    )
    return next(matching_bounds, None)

def smooth(x,window_len=11,window='hanning'):
    """Smooth a 1-D array by convolving it with a normalised window.

    The signal is extended at both ends with mirrored copies of itself so
    the convolution does not shrink it.

    Parameters:
        x: 1-D numpy array; should hold at least `window_len` samples so the
            mirrored extensions are complete.
        window_len: window size in samples. Values below 3 return `x`
            unchanged. An odd size yields an output as long as `x`; an even
            size yields one sample fewer.
        window: ``'flat'`` for a moving average, otherwise the name of a
            numpy window function taking the length as its only argument
            (``'hanning'``, ``'hamming'``, ``'bartlett'``, ``'blackman'``).

    Returns:
        The smoothed array (or `x` itself when `window_len` < 3).

    Raises:
        AttributeError: if `window` does not name a numpy attribute.
    """
    if window_len < 3:
        return x

    # Mirror window_len-1 samples at each end.
    padded = np.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]]

    if window == 'flat':  # moving average
        weights = np.ones(window_len, 'd')
    else:
        # Plain attribute lookup instead of eval() on a built-up string.
        weights = getattr(np, window)(window_len)

    convolved = np.convolve(weights / weights.sum(), padded, mode='valid')

    half = window_len // 2
    return convolved[half:-half]
    
def init_cubic_spline_interpolation(x_points, y_points):
    """Fit a cubic interpolant through the samples and publish it as `tck`.

    The interpolator is stored in the module-level global ``tck`` (nothing
    is returned). It is built with ``fill_value='extrapolate'``, so queries
    outside the range of `x_points` are extrapolated rather than rejected.
    """
    global tck
    interpolator = scipy.interpolate.interp1d(
        x_points,
        y_points,
        kind="cubic",
        fill_value='extrapolate',
    )
    tck = interpolator
    
    
def calculate_P(x):
    """Evaluate the global interpolator `tck` at `x`, weighted by arctan(x).

    `tck` must have been installed beforehand (see
    init_cubic_spline_interpolation). Works element-wise on arrays.
    """
    interpolated = tck(x)
    weighting = np.arctan(x)
    return interpolated * weighting

def calculate_h_n(n, decay=0.84):
    """Return the weight of harmonic number `n`: ``decay ** (n - 1)``.

    Parameters:
        n: harmonic number (1 for the fundamental) or an array of them.
        decay: per-harmonic attenuation factor; the default 0.84 reproduces
            the previously hard-coded value.

    Returns:
        A scalar for scalar `n`, an array of the same shape for array `n`.
    """
    return decay ** (n - 1)

def estimate_pitch(vals, min_pitch, crepe_estimate, correction_box):
    """Return the index of the strongest candidate inside the search window.

    `vals[k]` is taken to be the score of candidate pitch ``min_pitch + k``
    (one candidate per unit of pitch), so bounds are converted to indices by
    subtracting `min_pitch`.

    Parameters:
        vals: 1-D numpy array of candidate scores. It is MODIFIED IN PLACE:
            entries outside the search window are set to 0.
        min_pitch: pitch represented by ``vals[0]``.
        crepe_estimate: prior pitch estimate. The window is only applied
            when it is greater than `min_pitch`; otherwise all of `vals` is
            searched and `correction_box` is ignored.
        correction_box: ``None`` or an indexable ``(lower, upper)`` pair that
            replaces the default window of
            ``crepe_estimate * (1 -/+ crepe_interval_estimate)``.

    Returns:
        The index into `vals` of its largest remaining value (0 when the
        window excludes every candidate).
    """
    if crepe_estimate - min_pitch > 0:
        if correction_box is not None:
            lower_bound = correction_box[0]
            upper_bound = correction_box[1]
        else:
            lower_bound = int(crepe_estimate * (1 - crepe_interval_estimate))
            upper_bound = int(crepe_estimate * (1 + crepe_interval_estimate))

        # Clamp both edges into [0, len(vals)]. Without the lower clamp an
        # upper bound below min_pitch became a negative slice start and only
        # the tail of the array was masked.
        window_start = min(max(lower_bound - min_pitch, 0), len(vals))
        window_stop = min(max(upper_bound - min_pitch, 0), len(vals))
        vals[:window_start] = 0
        vals[window_stop:] = 0

    # NOTE: the global maximum is used on purpose. The earlier
    # `argrelextrema(vals, np.greater)[1:-1]` sliced the returned *tuple*,
    # which yields `()`, so `vals[()]` was the whole array and no
    # local-maximum filtering ever took place. Spelling out the argmax keeps
    # the results identical while removing the misleading code.
    return np.argmax(vals)

def preprocessing(sig, sr, threshold):
    """Low-pass filter `sig` by zeroing every FFT bin above `threshold`.

    Parameters:
        sig: 1-D real numpy array.
        sr: sampling rate of `sig`.
        threshold: cut-off frequency, in the same unit as `sr`; bins whose
            frequency is strictly greater are removed.

    Returns:
        The filtered signal, with exactly as many samples as `sig`.
    """
    n_samples = sig.size
    spectrum = rfft(sig)
    # rfftfreq yields one frequency per rfft bin (n//2 + 1 of them), so the
    # mask lines up with the spectrum and includes the Nyquist bin.
    bin_freqs = np.fft.rfftfreq(n_samples, d=1.0 / sr)
    spectrum[bin_freqs > threshold] = 0
    # Passing n keeps odd-length signals from losing a sample.
    return irfft(spectrum, n=n_samples)

In [None]:
from scipy.signal import argrelmax
from scipy.optimize import fminbound

def hermes(audio, sr, crepe_estimate, correction_box, graph=False):
    """Estimate the pitch of one audio frame by weighted harmonic summation.

    Steps: Hamming-window and zero-pad the frame, take its magnitude
    spectrum, smooth and cubic-interpolate it, then score every candidate
    pitch between ``min_pitch`` and ``max_pitch`` by summing the interpolated
    spectrum at its first 16 harmonics (weighted by ``calculate_h_n`` and the
    arctan factor in ``calculate_P``). The best candidate inside the window
    chosen by ``estimate_pitch`` is refined with a bounded minimiser.

    Parameters:
        audio: 1-D numpy array holding the frame's samples.
        sr: sampling rate of `audio`; it fixes the unit of the frequency
            axis and therefore of the returned pitch.
        crepe_estimate: prior pitch estimate handed to ``estimate_pitch``.
        correction_box: ``None`` or a ``(lower, upper)`` override of the
            search window, handed to ``estimate_pitch``.
        graph: when true, plot the negated score curve together with the
            result and the prior estimate.

    Returns:
        The refined pitch estimate as a float.

    Side effects:
        Replaces the module-level interpolator ``tck`` through
        ``init_cubic_spline_interpolation``.
    """
    # NOTE: an optional low-pass step, preprocessing(audio, sr, 800), used to
    # sit here and is currently disabled.
    N = 16       # number of harmonics summed per candidate
    pad = 5000   # zeros appended to the frame for a finer FFT grid

    # scipy.signal.hamming is gone from current SciPy releases;
    # scipy.signal.windows.hamming is the same window.
    window = scipy.signal.windows.hamming(audio.shape[0])
    frame = np.pad(np.multiply(audio, window), (0, pad), 'constant')

    spectrum = np.abs(scipy.fft.rfft(frame))
    # True centre frequency of every rfft bin. The previous
    # linspace(0, sr//2, n//2) axis was stretched by roughly n/(n-2), which
    # biased every estimate slightly upwards.
    freqs = np.fft.rfftfreq(frame.size, d=1.0 / sr)

    # Nothing above the highest harmonic of the highest candidate is needed.
    freqs = freqs[freqs <= (N + 1) * max_pitch]
    spectrum = spectrum[:freqs.size]

    init_cubic_spline_interpolation(freqs, smooth(spectrum))

    # One candidate per unit of pitch: estimate_pitch converts bounds to
    # indices by subtracting min_pitch, which relies on this spacing.
    harmonic_numbers = np.arange(1, N + 1)
    candidates = np.arange(min_pitch, max_pitch + 1)
    # Row i holds the N harmonic frequencies of candidates[i].
    harmonic_freqs = candidates[:, np.newaxis] * harmonic_numbers
    harmonic_weights = calculate_h_n(harmonic_numbers)

    # calculate_P works element-wise, so the whole grid is evaluated at once.
    scores = np.sum(calculate_P(harmonic_freqs) * harmonic_weights, axis=1)
    values = smooth(smooth(scores))

    # Negated so that the minimiser below finds the score maximum. Built
    # before estimate_pitch masks `values` in place, so the refinement runs
    # on the unmasked curve.
    negated_scores = scipy.interpolate.interp1d(
        candidates, -values, kind="cubic", fill_value='extrapolate')
    approximate_pitch = estimate_pitch(
        values, min_pitch, crepe_estimate, correction_box)
    pitch_est = fminbound(negated_scores,
                          min_pitch + approximate_pitch - 1,
                          min_pitch + approximate_pitch + 1,
                          full_output=True)

    if graph:
        plt.figure(figsize=(20, 5))
        plt.plot(candidates, -values)
        plt.axvline(pitch_est[0])
        plt.axvline(crepe_estimate, color='r')

    # With full_output=True the first element is the minimiser's location.
    return pitch_est[0]

In [None]:
def create_and_store(names):
    """Run the pitch tracker on every file in `names` and save the results.

    For each name the audio is read from ``data_dir``, CREPE estimates from
    ``crepe_dir`` and optional corrections from ``correction_dir``. One pitch
    is estimated every 0.01 s from a frame 1.75 times that long, and the
    lines ``"<time> <pitch>"`` are written to ``pitch_dir + name`` with the
    last four characters of the name replaced by ``.txt``.

    Files whose CREPE estimates cannot be read are skipped with a message.
    Frames beyond the end of the CREPE track use a prior estimate of 0.

    Parameters:
        names: iterable of audio file names (relative to ``data_dir``).
    """
    hop_seconds = 0.01
    done = 1
    for name in names:
        song_path = (pitch_dir + name)[:-4] + ".txt"

        # Read the (cheap) CREPE track first so that a missing one does not
        # cost an audio decode. Only the failures loadtxt/column slicing can
        # produce are caught; a bare except would also swallow Ctrl-C.
        try:
            _, cfreq = load_crepe(name)
        except (OSError, ValueError, IndexError) as err:
            print('skipping', name, '- cannot read crepe estimates:', err)
            continue

        y, sr = load_file(name)
        corrections = load_corrections(name)

        frame_len = int(hop_seconds * sr * 1.75)
        time_listing = np.arange(0, y.size/sr, hop_seconds)[:-1]
        n_frames = time_listing.size

        lines = []
        for i, t in enumerate(time_listing):
            start = int(t * sr)
            current_box = find_correction_box(t, corrections)
            crepe_estimate = cfreq[i] if i < len(cfreq) else 0
            pitch = hermes(y[start:start+frame_len], sr, crepe_estimate,
                           current_box, graph=False)
            lines.append(str(t) + ' ' + str(pitch))

            print(f"{i/n_frames * 100}%         ", end='\r')

        # Make sure the destination folder exists before writing.
        out_dir = os.path.dirname(song_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(song_path, "w") as fout:
            fout.write("\n".join(lines))

        print(f"{done} done            ")
        done += 1

In [None]:
# Scherbaum Mshavanadze
# Process every collection folder whose name contains '19': gather its wav
# files, point the module-level directory globals at that collection, and
# run the pitch extraction on them.
parent_data_dir = '/Akamai/voice/data/Scherbaum Mshavanadze/'
parent_pitch_dir = '/Akamai/voice/data/pitches-raw-crepe-assisted/hermes/Scherbaum Mshavanadze/'
parent_crepe_dir = '/Akamai/voice/data/pitches-vuv-new/crepe/Scherbaum Mshavanadze/'
parent_correction_dir = '/Akamai/voice/data/pitch-corrections/crepe/Scherbaum Mshavanadze/'

for collection in os.listdir(parent_data_dir):
    collection_path = f"{parent_data_dir}{collection}"
    if not os.path.isdir(collection_path) or '19' not in collection:
        continue

    parts = [part for part in os.listdir(collection_path) if part[-3:] == 'wav']

    data_dir = parent_data_dir + collection + '/'
    pitch_dir = parent_pitch_dir + collection + '/'
    crepe_dir = parent_crepe_dir + collection + '/'
    correction_dir = parent_correction_dir + collection + '/'
    create_and_store(parts)
        
        
# Teach Yourself Megrelian Songs
# parent_data_dir = '/Akamai/voice/data/Teach Yourself Megrelian Songs/'
# parent_pitch_dir = '/Akamai/voice/data/pitches-raw-crepe-assisted/hermes/Teach Yourself Megrelian Songs/'
# parent_crepe_dir = '/Akamai/voice/data/pitches-vuv-new/crepe/Teach Yourself Megrelian Songs/'
# parent_correction_dir = '/Akamai/voice/data/pitch-corrections/crepe/Teach Yourself Megrelian Songs/'


# for collection in os.listdir(parent_data_dir):
#     if os.path.isdir(f"{parent_data_dir}{collection}"):
#         if collection != 'mp3':
#             parts = []
#             for part in os.listdir(f"{parent_data_dir}{collection}"):
#                 if part[-3:] == 'wav':
#                     parts.append(part)

#             data_dir = parent_data_dir + collection + '/'
#             pitch_dir = parent_pitch_dir + collection + '/'
#             crepe_dir = parent_crepe_dir + collection + '/'
#             correction_dir = parent_correction_dir + collection + '/'

#             create_and_store(parts)
            
# Teach Yourself Gurian Songs
# parent_data_dir = '/Akamai/voice/data/Teach Yourself Gurian Songs/'
# parent_pitch_dir = '/Akamai/voice/data/pitches-raw-crepe-assisted/hermes/Teach Yourself Gurian Songs/'
# parent_crepe_dir = '/Akamai/voice/data/pitches-vuv-new/crepe/Teach Yourself Gurian Songs/'
# parent_correction_dir = '/Akamai/voice/data/pitch-corrections/crepe/Teach Yourself Gurian Songs/'


# for collection in os.listdir(parent_data_dir):
#     if os.path.isdir(f"{parent_data_dir}{collection}"):
#         if collection != 'mp3':
#             parts = []
#             for part in os.listdir(f"{parent_data_dir}{collection}"):
#                 if part[-3:] == 'wav':
#                     parts.append(part)

#             data_dir = parent_data_dir + collection + '/'
#             pitch_dir = parent_pitch_dir + collection + '/'
#             crepe_dir = parent_crepe_dir + collection + '/'
#             correction_dir = parent_correction_dir + collection + '/'
#             create_and_store(parts)