In [None]:
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import os
from collections import Counter

In [None]:
def standard_deviation(estimates):
    """Return the standard deviation of `estimates` as computed by `np.std`.

    The values are converted to a NumPy array first; NumPy's defaults are
    used (no `ddof` is passed).
    """
    values = np.array(estimates)
    return np.std(values)


def median(estimates):
    """Return the median of `estimates` as computed by `np.median`."""
    return np.median(estimates)

        
def separate(adir):
    """Read the first two columns of the text file `adir` with `np.loadtxt`.

    Returns:
        A tuple `(x, y)` holding column 0 and column 1 respectively.
    """
    def _parse_first_column(field):
        # A field that is empty after stripping is converted to 0.
        return float(field.strip() or 0)

    columns = np.loadtxt(
        adir,
        unpack=True,
        usecols=(0, 1),
        converters={0: _parse_first_column},
    )
    x, y = columns
    return (x, y)

In [None]:
# old model
# def estimate_pitch (estimates):
#     estimates = sorted(estimates)
#     X = np.array(estimates).reshape(-1, 1)
#     clustering = AgglomerativeClustering(n_clusters=None, affinity='l1', linkage='average', distance_threshold=5).fit(X)
#     clusters = []
#     for i in range(np.max(clustering.labels_)+1):
#         ccount = 0
#         csum = 0
#         for j, e in enumerate(clustering.labels_):
#             if e == i:
#                 ccount += 1
#                 csum += estimates[j] 
#         clusters.append((ccount, csum / ccount))
#     clusters.sort(reverse=True)
#     return clusters[0][1]

In [None]:
# new model
def calculate_labels(estimates, initial_threshold=5):
    """Cluster 1-D pitch estimates and return one cluster label per estimate.

    The estimates are clustered with average-linkage agglomerative clustering
    and a distance threshold. While the two most populous clusters have the
    same size, the threshold is raised by 1 and the clustering is repeated,
    so the returned labelling has a unique largest cluster (or one cluster).

    Args:
        estimates: sequence of numeric pitch estimates.
        initial_threshold: distance threshold used for the first clustering
            attempt (defaults to 5, the value previously hard-coded).

    Returns:
        Array of integer cluster labels, aligned with `estimates`.
    """
    X = np.array(estimates).reshape(-1, 1)

    # AgglomerativeClustering refuses fewer than two samples; zero or one
    # estimate trivially forms at most one cluster, so label it directly.
    if X.shape[0] < 2:
        return np.zeros(X.shape[0], dtype=int)

    current_threshold = initial_threshold
    while True:
        # The distance is left at the estimator default (euclidean). The old
        # `affinity=` keyword was renamed to `metric=` and later removed, so
        # naming neither keeps this working across scikit-learn versions.
        model = AgglomerativeClustering(n_clusters=None,
                                        linkage='average',
                                        distance_threshold=current_threshold)
        labels = model.fit(X).labels_
        stats = Counter(labels).most_common()

        # Done once there is a single cluster or a strictly largest one.
        if len(stats) < 2 or stats[0][1] != stats[1][1]:
            return labels

        # Tie between the two biggest clusters: merge more aggressively.
        current_threshold += 1
            

def estimate_pitch(estimates):
    """Return the mean of the most populous cluster of `estimates`.

    The estimates are sorted, labelled by `calculate_labels`, and summarised
    per cluster as `(member count, mean)`. The largest such pair wins, so a
    tie on count is decided by the higher mean.
    """
    ordered = sorted(estimates)
    labels = calculate_labels(ordered)

    summaries = []
    for label in range(np.max(labels) + 1):
        members = [value for value, assigned in zip(ordered, labels) if assigned == label]
        # Accumulate explicitly, left to right, starting from 0.
        total = 0
        for value in members:
            total += value
        summaries.append((len(members), total / len(members)))

    return max(summaries)[1]

In [None]:
def find_optimal_pitch(current_collections, collection, part, algos=None):
    """Fuse the per-algorithm pitch tracks of one part into frame-wise statistics.

    Args:
        current_collections: nested mapping indexed as
            `current_collections[collection][algorithm][part] -> (time, pitch)`.
        collection: key of the collection to process.
        part: key of the part (voice track) to process.
        algos: algorithm names whose tracks are fused. Defaults to the
            notebook-level `algorithms` list, as before.

    Returns:
        `(time, best_estimate, std, med)`: the shortest time axis among the
        available tracks and, for each frame on it, the `estimate_pitch`,
        `standard_deviation` and `median` of the algorithms' pitch values.

    Raises:
        KeyError: if `collection` is unknown or no algorithm has `part`.
    """
    if algos is None:
        algos = algorithms
    per_algo = current_collections[collection]

    # Algorithms missing from this collection, or lacking this part, are skipped.
    tracks = [per_algo[algo][part]
              for algo in algos
              if algo in per_algo and part in per_algo[algo]]
    if not tracks:
        raise KeyError(f"no pitch track for part {part!r} in collection {collection!r}")

    # Reference time axis: the shortest one available. boersma's axis is
    # considered first so it still wins ties, as in the original code, but it
    # is no longer required to exist.
    time_axes = [track[0] for track in tracks]
    boersma_tracks = per_algo.get('boersma', {})
    if part in boersma_tracks:
        time_axes.insert(0, boersma_tracks[part][0])
    time = min(time_axes, key=len)  # min() keeps the first of equally short axes

    n_frames = len(time)
    best_estimate = np.empty(n_frames)
    std = np.empty(n_frames)
    med = np.empty(n_frames)

    pitch_tracks = [track[1] for track in tracks]
    for i in range(n_frames):
        # One value per algorithm for frame i.
        current_pitches = [pitches[i] for pitches in pitch_tracks]
        best_estimate[i] = estimate_pitch(current_pitches)
        std[i] = standard_deviation(current_pitches)
        med[i] = median(current_pitches)

    return (time, best_estimate, std, med)

### Scherbaum Mshavanadze

In [None]:
algorithms = ['boersma', 'crepe', 'hermes', 'maddox', 'noll', 'praat']
data_dir = '/Akamai/voice/data/pitches-vuv/'

# One empty dict per algorithm for every recording id; the loading loop
# fills these in.
collections = {
    recording: {algo: {} for algo in algorithms}
    for recording in ("GVM009", "GVM017", "GVM019", "GVM031", "GVM097")
}

# Walk <data_dir>/<algorithm>/Scherbaum Mshavanadze/<song>/<file> and store
# every recognised track as collections[recording][algorithm][part] = (x, y).
# The recording id is the first 6 characters of the file name; the part key
# is the 6 characters starting at the first matching tag below.
part_tags = ('AHDS', 'ALRX', 'AOLS', 'VSOA')  # checked in this order

for algorithm in os.listdir(data_dir):
    if algorithm not in algorithms:
        continue
    for collection in os.listdir(f"{data_dir}{algorithm}"):
        if collection != 'Scherbaum Mshavanadze':
            continue
        collection_dir = f"{data_dir}{algorithm}/{collection}"
        for song in os.listdir(collection_dir):
            song_dir = f"{collection_dir}/{song}"
            for part in os.listdir(song_dir):
                print(part)
                recording = part[:6]
                if recording not in collections:
                    continue
                tag = next((candidate for candidate in part_tags if candidate in part), None)
                if tag is None:
                    continue
                x, y = separate(f"{song_dir}/{part}")
                start = part.index(tag)
                collections[recording][algorithm][part[start:start + 6]] = (x, y)

In [None]:
parts = ['AHDS1M', 'AHDS2M', 'AHDS3M', 'ALRX1M', 'ALRX2M', 'ALRX3M', 'VSOAX4', 'AOLS5S']

res_dir = '/Akamai/voice/data/ground-estimate/Scherbaum Mshavanadze/'
std_dir = '/Akamai/voice/data/statistics/standard-deviation/Scherbaum Mshavanadze/'
med_dir = '/Akamai/voice/data/statistics/median/Scherbaum Mshavanadze/'

# For every (collection, part): compute the fused estimate plus per-frame
# statistics and write each as a two-column "time value" text file under
# <root>/<collection>/<part>.txt.
for collection in collections:
    for part in parts:
        t, estimate, cstd, cmed = find_optimal_pitch(collections, collection, part)

        # (output root, values): ground estimate, standard deviation, median
        outputs = ((res_dir, estimate), (std_dir, cstd), (med_dir, cmed))
        for out_root, values in outputs:
            out_dir = os.path.join(out_root, collection)
            # makedirs also creates missing parent directories and does not
            # fail when the directory already exists.
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt(os.path.join(out_dir, part + '.txt'), np.c_[t, values], delimiter=' ', fmt='%f')

        print(f"{part}:{collection} done")

### Teach Yourself Megrelian Songs

In [None]:
algorithms = ['boersma', 'crepe', 'hermes', 'maddox', 'noll']
data_dir = '/Akamai/voice/data/pitches-vuv/'

song_titles = [
    "Ak'a Si Rekisho",
    'Gepshvat Ghvini',
    'Io _ Chkin Kiana',
    'Mesishi Vardi',
    'Meureme',
    'Mi Re Sotsodali_',
    "Mole Chit'i Gilakhe",
    'O Da',
    'Vojanudi Chkim Jargvals',
]

# One empty dict per algorithm for every song; the loading loop fills these in.
collections2 = {title: {algo: {} for algo in algorithms} for title in song_titles}

def root_name(name):
    """Return the leading part of `name`, cut at a '_A' marker or a dot.

    If `name` contains '_A' starting at index 1 or later, everything before
    the right-most such occurrence is returned. Otherwise the name is cut at
    the left-most '.' found between index 1 and the second-to-last character;
    with no such dot the name is returned unchanged.
    """
    marker = name.rfind('_A', 1)
    if marker != -1:
        return name[:marker]

    dot = name.find('.', 1, len(name) - 1)
    return name if dot == -1 else name[:dot]

# Walk <data_dir>/<algorithm>/Teach Yourself Megrelian Songs/<song>/<file> and
# store each track as collections2[root_name(file)][algorithm][part] = (x, y).
# Files whose name contains 'AHDS' are keyed by the 6 characters starting at
# that tag; every other file is stored under 'VSOAX4'.
for algorithm in os.listdir(data_dir):
    if algorithm not in algorithms:
        continue
    for collection in os.listdir(f"{data_dir}{algorithm}"):
        if collection != 'Teach Yourself Megrelian Songs':
            continue
        collection_dir = f"{data_dir}{algorithm}/{collection}"
        for song in os.listdir(collection_dir):
            song_dir = f"{collection_dir}/{song}"
            for part in os.listdir(song_dir):
                title = root_name(part)
                if title not in collections2:
                    continue
                print(part)
                x, y = separate(f"{song_dir}/{part}")
                if 'AHDS' in part:
                    start = part.index('AHDS')
                    part_key = part[start:start + 6]
                else:
                    part_key = 'VSOAX4'
                collections2[title][algorithm][part_key] = (x, y)

In [None]:
parts = ['AHDS1M', 'AHDS2M', 'AHDS3M', 'VSOAX4']

res_dir = '/Akamai/voice/data/ground-estimate/Teach Yourself Megrelian Songs/'
std_dir = '/Akamai/voice/data/statistics/standard-deviation/Teach Yourself Megrelian Songs/'
med_dir = '/Akamai/voice/data/statistics/median/Teach Yourself Megrelian Songs/'

# For every (collection, part): compute the fused estimate plus per-frame
# statistics and write each as a two-column "time value" text file under
# <root>/<collection>/<part>.txt.
for collection in collections2:
    for part in parts:
        t, estimate, cstd, cmed = find_optimal_pitch(collections2, collection, part)

        # (output root, values): pitch estimate, standard deviation, median
        outputs = ((res_dir, estimate), (std_dir, cstd), (med_dir, cmed))
        for out_root, values in outputs:
            out_dir = os.path.join(out_root, collection)
            # makedirs also creates missing parent directories and does not
            # fail when the directory already exists.
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt(os.path.join(out_dir, part + '.txt'), np.c_[t, values], delimiter=' ', fmt='%f')

        print(f"{part}:{collection} done")

### Teach Yourself Gurian Songs

In [None]:
algorithms = ['boersma', 'crepe', 'hermes', 'maddox', 'noll']
data_dir = '/Akamai/voice/data/pitches-vuv/'

song_titles = [
    "Adila-Alipasha",
    "Indi-Mindi",
    'Mival Guriashi (1)',
    'Pikris Simghera',
    "Alaverdi",
    "K'alos Khelkhvavi",
    'Mival Guriashi (2)',
    "Sabodisho",
    "Khasanbegura",
    "Mok'le Mravalzhamieri",
    'Sadats Vshobilvar',
    "Beri Ak'vans Epareba",
    "Lat'aris Simghera",
    "Mts'vanesa Da Ukudosa",
    "Shermanduli",
    "Brevalo",
    "Manana",
    'Nanina (1)',
    "Shvidk'atsa",
    "Chven-Mshvidoba",
    "Maq'ruli",
    'Nanina (2)',
    'Supris Khelkhvavi',
    'Didi Khnidan',
    "Masp'indzelsa Mkhiarulsa",
    "Orira",
    "Ts'amok'ruli",
    "Gakhsovs, T'urpa",
    "Me-Rustveli",
    "P'at'ara Saq'varelo",
]

# One empty dict per algorithm for every song; the loading loop fills these in.
collections3 = {title: {algo: {} for algo in algorithms} for title in song_titles}

# NOTE(review): this repeats the `root_name` definition from the
# 'Teach Yourself Megrelian Songs' cell; consider keeping a single copy.
def root_name(name):
    """Return the leading part of `name`, cut at a '_A' marker or a dot.

    If `name` contains '_A' starting at index 1 or later, everything before
    the right-most such occurrence is returned. Otherwise the name is cut at
    the left-most '.' found between index 1 and the second-to-last character;
    with no such dot the name is returned unchanged.
    """
    marker = name.rfind('_A', 1)
    if marker != -1:
        return name[:marker]

    dot = name.find('.', 1, len(name) - 1)
    return name if dot == -1 else name[:dot]

# Walk <data_dir>/<algorithm>/Teach Yourself Gurian Songs/<song>/<file> and
# store each track as collections3[root_name(file)][algorithm][part] = (x, y).
# Files whose name contains 'AHDS' are keyed by the 6 characters starting at
# that tag; every other file is stored under 'VSOAX4'.
for algorithm in os.listdir(data_dir):
    if algorithm not in algorithms:
        continue
    for collection in os.listdir(f"{data_dir}{algorithm}"):
        if collection != 'Teach Yourself Gurian Songs':
            continue
        collection_dir = f"{data_dir}{algorithm}/{collection}"
        for song in os.listdir(collection_dir):
            song_dir = f"{collection_dir}/{song}"
            for part in os.listdir(song_dir):
                print(part)
                title = root_name(part)
                if title not in collections3:
                    continue
                x, y = separate(f"{song_dir}/{part}")
                if 'AHDS' in part:
                    start = part.index('AHDS')
                    part_key = part[start:start + 6]
                else:
                    part_key = 'VSOAX4'
                collections3[title][algorithm][part_key] = (x, y)

In [None]:
parts = ['AHDS1M', 'AHDS2M', 'AHDS3M', 'VSOAX4']

res_dir = '/Akamai/voice/data/ground-estimate/Teach Yourself Gurian Songs/'
std_dir = '/Akamai/voice/data/statistics/standard-deviation/Teach Yourself Gurian Songs/'
med_dir = '/Akamai/voice/data/statistics/median/Teach Yourself Gurian Songs/'

# For every (collection, part): compute the fused estimate plus per-frame
# statistics and write each as a two-column "time value" text file under
# <root>/<collection>/<part>.txt.
for collection in collections3:
    for part in parts:
        t, estimate, cstd, cmed = find_optimal_pitch(collections3, collection, part)

        # (output root, values): pitch estimate, standard deviation, median
        outputs = ((res_dir, estimate), (std_dir, cstd), (med_dir, cmed))
        for out_root, values in outputs:
            out_dir = os.path.join(out_root, collection)
            # makedirs also creates missing parent directories and does not
            # fail when the directory already exists.
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt(os.path.join(out_dir, part + '.txt'), np.c_[t, values], delimiter=' ', fmt='%f')

        print(f"{part}:{collection} done")

### Testing and Debugging

In [None]:
# Visual sanity check: plot one of the written ground-estimate files.
# A separate name is used so the `data_dir` read by the loading cells is not
# overwritten with a file path, and the listings are sorted because
# os.listdir returns entries in arbitrary order.
ground_dir = '/Akamai/voice/data/ground-estimate/Scherbaum Mshavanadze/'
first_collection_dir = os.path.join(ground_dir, sorted(os.listdir(ground_dir))[0])
sample_path = os.path.join(first_collection_dir, sorted(os.listdir(first_collection_dir))[0])

x, y = separate(sample_path)
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(x, y, '.', markersize=2)
# Column 0 / column 1 of the file; units are whatever the input tracks used.
ax.set(title=sample_path, xlabel='time', ylabel='estimated pitch');

In [None]:
# Smoke test: four estimates lying within about 2 of each other plus one
# far-off value (67.7). Presumably the result should be the mean of the four
# close values (about 201.107) - confirm by running.
sample_estimates = [201.9599, 201.3181, 201.1465, 200.005, 67.73271]
estimate_pitch(sample_estimates)