### Main steps
For several song samples:

- load a song stft from data/stft/fft-window-size_fft-hop-length/Album-name/song-name-dir/startoffset-endoffset.stft
- Use the method in [Hermes's paper](https://drive.google.com/file/d/1yYo9DB9Vh0tHoE63swfu78u2_p_fcAD5/view?usp=sharing) (see Fig. 1) to detect the one, two, or three pitches (fundamental frequencies) of the voices in the audio

In [1]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import os
import scipy.signal
import scipy.fft
import scipy
import librosa.display
import math
from scipy.signal import butter
from scipy.fft import fft, ifft, rfft, irfft
from scipy.signal import argrelmax
import librosa


data_dir = '/Akamai/voice/data/'
parent_pitch_dir = ''
parent_data_dir = ''
pitch_dir = ''
tck = None
max_pitch = 500
min_pitch = 60
candidates = 50
voiced_unvoiced_cost = 20
octave_jump_cost = 20

In [2]:
def load_file (file):
    global data_dir
    y, sr = librosa.load(parent_data_dir + file, sr=None)
    return [y, sr]

def smooth(x,window_len=11,window='hanning'):
    if window_len<3:
        return x
    s=np.r_[x[window_len-1:0:-1],x,x[-1:-window_len:-1]]
    if window == 'flat': #moving average
        w=np.ones(window_len,'d')
    else:
        w=eval('np.'+window+'(window_len)')
    y=np.convolve(w/w.sum(),s,mode='valid')
    
    return y[(window_len//2):-(window_len//2)]  
    
def init_cubic_spline_interpolation (x_points, y_points):
    # cubic spline interpolation
    # chose 168 different values since 48 reccommended for every octave
    
    global tck
    tck = scipy.interpolate.interp1d(x_points, y_points, kind="cubic", fill_value='extrapolate')
    
    
def calculate_P (x):
    return tck(x) * np.arctan(x)

def calculate_h_n (n):
    return 0.84**(n-1)

def preprocessing (sig, sr, threshold):
    yf = rfft(sig)
    N = sig.size
    T = 1/sr
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    yf[np.where(xf>threshold)] = 0
    pp_sig = irfft(yf)
    return pp_sig

In [3]:
def estimate_pitch (ss, values):
    poss = argrelmax(values)[0]
    precise_poss = [[0, 0]]
    
    tck_2 = scipy.interpolate.interp1d(ss, -values, kind="cubic", fill_value='extrapolate')
    for pos in poss:
        cval = fminbound(tck_2, min_pitch + pos - 1, min_pitch + pos + 1, full_output=True)[0]
        precise_poss.append([cval, -tck_2(cval)/np.max(values)])
        
    return precise_poss

In [4]:
def initialize_min_and_max (song):
    global data_dir, min_pitch, max_pitch
    
    if song[-6] == '1' or song[-6] == '2' or song[-6] == '3':
        try:
            pitch_data = open(data_dir + 'minmax').readlines()
        except FileNotFoundError:
            print('Please create a minmax file. Using default values for now')
            max_pitch = 500
            min_pitch = 60
            return
        for each in pitch_data:
            if each[0] == song[-6]:
                min_pitch = int(each.strip().split(' ')[1])
                max_pitch = int(each.strip().split(' ')[2])
    else:
        max_pitch = 500
        min_pitch = 60

In [5]:
from scipy.signal import argrelmax
from scipy.optimize import fminbound

def hermes (audio, sr, graph=False):
#     audio = preprocessing(audio, sr, 800)
    
    N = 16
    hamming_window = scipy.signal.hamming(audio.shape[0])
    pad = 5000
    prod = np.multiply(audio, hamming_window)
    prod = np.pad(prod, (0, pad), 'constant')
    x = np.linspace(0, sr//2, prod.size//2)
    ham_fft = np.abs(scipy.fft.rfft(prod))[:-1]
        
    x = x[x <= (N+1) * max_pitch]
    ham_fft = ham_fft[:x.size]
#     ham_fft_original = np.array(ham_fft, copy=True)
    
    if graph:
        plt.figure(figsize=(20, 5))
        plt.plot(ham_fft[:500])
    # detect maximums
#     maximi = argrelmax(ham_fft)[0]
#     for i in range(0, maximi.size-1):
#         s = int(maximi[i] + (20/x[1]))
#         while s < int(maximi[i+1] - (20/x[1])):
#             ham_fft[s] = 0
#             s += 1
    #detect maximums
    
    if graph:
        plt.plot(ham_fft[:500])

    han_fft = smooth(ham_fft)
    init_cubic_spline_interpolation(x, han_fft)
    
    m = 1
    n = np.arange(1, N+1)
    ss = np.arange(min_pitch, max_pitch+1, m)
    ns = np.tile(n, (ss.size, 1))
    ms = np.transpose(np.multiply(np.transpose(ns), ss))
    os = np.tile(calculate_h_n(n), (ss.size, 1))
    
    fin2 = np.apply_along_axis(calculate_P, 1, ms)
    values = smooth(smooth(np.sum(np.multiply(os, fin2), axis=1)))
    
#     tck_2 = scipy.interpolate.interp1d(ss, -values, kind="cubic", fill_value='extrapolate')
    pitch_estimates = estimate_pitch(ss, values)
        
    if graph:
        plt.figure(figsize=(20, 5))
        plt.plot(ss, -values)
        plt.axvline(pitch_est[0])
        #     print("Estimated pitch: " + str(pitch_est[0]) + " Hz")
    return pitch_estimates # min_pitch + estimate_pitch(values) * m

### Viterbi Algorithm

In [50]:
dp = None

def transition_cost (f1, f2):
    global voiced_unvoiced_cost, octave_jump_cost
        
    if (f1 == 0 and f2 == 0):
        return 10
    elif f1 == 0 and f2 != 0:
        return 0
    elif f1 != 0 and f2 == 0:
        return voiced_unvoiced_cost
    else:
#         print(octave_jump_cost * abs(math.log2(f1/f2)))
        return octave_jump_cost * abs(math.log2(f1/f2))
    
    
def viterbi (all_pairs):
    global dp
    
    for f in range(all_pairs.shape[0]):
        for c in range(len(all_pairs[f])):
            if f == 0:
                dp[f][c] = (-all_pairs[f][c][1], None)
            else:
                cost, history = (float('inf'), float('inf'))
                for p in range(len(all_pairs[f-1])):
                    n_cost = dp[f-1][p][0] + transition_cost(all_pairs[f-1][p][0], all_pairs[f][c][0]) - all_pairs[f][c][1]
                    if n_cost < cost:
                        cost = n_cost
                        history = p
                dp[f][c] = (cost, history)

                
def back_track (all_pairs):
    global dp
    ans = []
    c, h = (float('inf'), 0)
    for i, each in enumerate(dp[len(dp)-1]):
        if (each != np.inf and each[0] < c):
            c = each[0]
            h = i 
    for i in range(len(dp)-1, 0, -1):
        c = all_pairs[i][h][0]
        ans.append(c)
        h = dp[i][h][1]
    return ans[::-1]

In [7]:
def create_and_store (names):
    global dp
    
    done = 1
    for name in names:
        initialize_min_and_max(name)
        y, sr = load_file(name)
        time_listing = np.arange(0, y.size/sr, 0.01)[:-1]
        f = [None for i in range(time_listing.size)] #np.empty(time_listing.size)
        i = 0
        for t in time_listing:
            a = int(t * sr)
            f[i] = hermes(y[a:a+int(0.01 * sr * 1.75)], sr)
            i += 1
            print(f"{i/len(f) * 100}%         ", end='\r')
        
        
        f = np.array(f)
        o = np.array(f)
        dp = np.full((f.shape[0], candidates), np.inf).tolist()
        viterbi(f)
        f = np.array(back_track(f))
        
        f = f.tolist()
        for i in range(len(f)):
            f[i] = str(time_listing[i]) + ' ' + str(f[i])
                    
        song_dir = pitch_dir + name
        song_dir = song_dir[:-4] + ".txt"
        
        fout = open(song_dir, "w+")
        fout.write("\n".join(f))
        fout.close()
        
        print(f"{done} done     ")
        done += 1

        return o

In [8]:
# Scherbaum Mshavandaze
# parent_data_dir = '/Akamai/voice/data/Scherbaum Mshavanadze/'
# parent_pitch_dir = '/Akamai/voice/data/pitches/hermes/Scherbaum Mshavanadze/'

# for collection in os.listdir(parent_data_dir):
#     if os.path.isdir(f"{parent_data_dir}{collection}"):
#         parts = []
#         for part in os.listdir(f"{parent_data_dir}{collection}"):
#             if part[-3:] == 'wav':
#                 parts.append(part)
                
#         data_dir = parent_data_dir + collection + '/'
#         pitch_dir = parent_pitch_dir + collection + '/'
#         create_and_store(parts)
        
# Teach Yourself Megrelian Songs
# parent_data_dir = '/Akamai/voice/data/Teach Yourself Megrelian Songs/'
# parent_pitch_dir = '/Akamai/voice/data/pitches/hermes/Teach Yourself Megrelian Songs/'

# for collection in os.listdir(parent_data_dir):
#     if os.path.isdir(f"{parent_data_dir}{collection}"):
#         if collection != 'mp3':
#             parts = []
#             for part in os.listdir(f"{parent_data_dir}{collection}"):
#                 if part[-3:] == 'wav':
#                     parts.append(part)
                
#             data_dir = parent_data_dir + collection + '/'
#             pitch_dir = parent_pitch_dir + collection + '/'
#             create_and_store(parts)
            
# Teach Yourself Gurian Songs
# parent_data_dir = '/Akamai/voice/data/Teach Yourself Gurian Songs/'
# parent_pitch_dir = '/Akamai/voice/data/pitches/hermes/Teach Yourself Gurian Songs/'

# check = True

# for collection in os.listdir(parent_data_dir):
#     if os.path.isdir(f"{parent_data_dir}{collection}"):
#         if collection == "Masp'indzelsa Mkhiarulsa":
#             check = False
            
#         if not check:
#             parts = []
#             for part in os.listdir(f"{parent_data_dir}{collection}"):
#                 if part[-3:] == 'wav':
#                     parts.append(part)
            
#             print(parts)
#             data_dir = parent_data_dir + collection + '/'
#             pitch_dir = parent_pitch_dir + collection + '/'
#             create_and_store(parts)
            

### Testing and Debugging

In [29]:
parent_data_dir = "/Akamai/voice/data/Teach Yourself Megrelian Songs/Ak'a Si Rekisho/"
parent_pitch_dir = "/Akamai/voice/data/pitches/hermes/Teach Yourself Megrelian Songs/Ak'a Si Rekisho/Ak'a Si Rekisho.wav"

# td = create_and_store(['Adila-Alipasha_AHDS1M.wav'])
y, sr = load_file("Ak'a Si Rekisho_AHDS1M.wav")
time_listing = np.arange(0, y.size/sr, 0.01)[:-1]
f = [None for i in range(time_listing.size)] 
i = 0
for t in time_listing:
    a = int(t * sr)
    f[i] = hermes(y[a:a+int(0.01 * sr * 1.75)], sr)
    i += 1
    print(f"{i/len(f) * 100}% ", end='\r')

100.0% 2535575679%    

In [52]:
%matplotlib inline

def test (jump_cost, uv_cost):
    global octave_jump_cost, voiced_unvoiced_cost
    
    octave_jump_cost = jump_cost
    voiced_unvoiced_cost = uv_cost
    strength_classifier = 10

    td = np.array(f)
    dp = np.full((td.shape[0], candidates), np.inf).tolist()
    viterbi(td)
    td = np.array(back_track(td))

    def separate(adir):
        conv={}
        conv[0] = lambda s: float(s.strip() or 0)
        x,y = np.loadtxt(adir, unpack=True, usecols=(0,1), converters=conv)
        return (x,y)

    x, y = separate("/Akamai/voice/data/pitches/crepe/Teach Yourself Megrelian Songs/Ak'a Si Rekisho/Ak'a Si Rekisho_AHDS1M.txt")
    # a, b = separate('/Akamai/voice/data/pitches/hermes/Teach Yourself Gurian Songs/Adila-Alipasha/Adila-Alipasha_AHDS1M.txt')

    def apply_detection (uy, ay):
        uy = uy[:min(uy.size, ay.size)]
        ay = ay[:min(uy.size, ay.size)]
        ay[uy == 0] = 0
        return ay

    td = apply_detection(y, td)

    plt.figure(figsize=(20, 5))
    plt.plot(td, '.', markersize=2, label="hermes")
    plt.plot(y ,'.', markersize=2, label="crepe")
    plt.legend()

#     plt.figure(figsize=(20, 5))
#     plt.plot(y, '.', markersize=2, label="crepe")
#     plt.plot(td, '.', markersize=2, label="hermes")
#     plt.legend()

test(5, 1)

TypeError: 'NoneType' object is not subscriptable