### This notebook populates the ground estimates and MAD statistics.

It takes Crepe estimates from `/Akamai/voice/data/pitches-postprocessed/`
and estimates for Boersma, Hermes, Noll, and Yin from `/Akamai/voice/data/pitches-vuv-crepe-assisted/`.

In [None]:
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import os
from collections import Counter
from scipy.stats import median_abs_deviation
from statistics import median

In [None]:
# Root directory of the Crepe pitch-override (correction) files. Each dataset
# cell below joins its own dataset folder name onto this path.
parent_correction_dir = '/Akamai/voice/data/pitch-overrides/crepe/'

In [None]:
def match_dir(parent_dir, curr_dir):
    """Return the first entry of ``parent_dir`` whose name contains ``curr_dir``.

    Entries are examined in the order ``os.listdir`` yields them. When no
    entry matches, ``curr_dir`` itself is returned unchanged.
    """
    candidates = (entry for entry in os.listdir(parent_dir) if curr_dir in entry)
    return next(candidates, curr_dir)

In [None]:
def check_correction_box (corrections, time):
    """Tell whether ``time`` lies inside the time span of any correction.

    Corrections are indexed positionally: element 0 is the lower time bound
    and element 2 the upper time bound, both inclusive.
    """
    return any(box[0] <= time <= box[2] for box in corrections)

def check_correction_pitch (corrections, time, pitch):
    """Decide whether ``pitch`` is acceptable at ``time``.

    The first correction whose time span (elements 0 and 2, inclusive)
    contains ``time`` decides: the pitch is accepted only when it lies
    within that correction's pitch bounds (elements 1 and 3, inclusive).
    When no correction covers ``time`` the pitch is accepted.
    """
    covering = next((box for box in corrections if box[0] <= time <= box[2]), None)
    if covering is None:
        return True
    return (covering[1] <= pitch <= covering[3])

In [None]:
def load_corrections(name):
    """Read a correction file into a list of rows of floats.

    Every non-blank line is split on whitespace and each field is converted
    with ``float``. The meaning of the columns is not enforced here; the
    consumers in this file (``check_correction_box`` and
    ``check_correction_pitch``) read elements 0 and 2 as time bounds and
    elements 1 and 3 as pitch bounds.

    Args:
        name: path of the correction file.

    Returns:
        A list with one list of floats per non-blank line, or ``[]`` when the
        file cannot be opened (a message is printed in that case).

    Raises:
        ValueError: if a field cannot be parsed as a float.
    """
    try:
        correction_file = open(str(name))
    except OSError:
        # A missing override file is expected for most tracks.
        print('no correction file for', name)
        return []
    # The context manager closes the handle once all lines are parsed.
    with correction_file:
        # split() without an argument tolerates repeated spaces, tabs and the
        # trailing newline; blank lines are skipped instead of crashing float('').
        return [
            list(map(float, line.split()))
            for line in correction_file
            if line.strip()
        ]

In [None]:
def find_mean (estimates):
    """Mean of the non-negative entries of each column of ``estimates``.

    ``estimates`` is transposed first, so one value is produced per column of
    the input. Negative entries are left out of the mean.
    """
    columns = estimates.T
    return np.array(
        [np.mean(column[column >= 0]) for column in columns],
        dtype=float,
    )

def find_median (estimates):
    """Median of the non-negative entries of each column of ``estimates``.

    ``estimates`` is transposed first, so one value is produced per column of
    the input. Negative entries are left out of the median.
    """
    columns = estimates.T
    medians = np.empty(len(columns))
    for position, column in enumerate(columns):
        kept = column[column >= 0]
        medians[position] = np.median(kept)
    return medians

def find_std (estimates):
    """Standard deviation of the non-negative entries of each column.

    ``estimates`` is transposed first, so one value is produced per column of
    the input. Negative entries are left out; ``np.std`` is used with its
    default settings.
    """
    columns = estimates.T
    return np.array(
        [np.std(column[column >= 0]) for column in columns],
        dtype=float,
    )

def find_mad (estimates):
    """Median absolute deviation of each column of ``estimates``.

    ``estimates`` is transposed first, so one value is produced per column of
    the input. Entries below 1 are replaced by 0 before the deviation is
    computed, because some "zero" estimates are not exactly 0.
    """
    columns = estimates.T
    mads = np.empty(len(columns))
    for position, column in enumerate(columns):
        # Anything under 1 is treated as a zero reading.
        floored = np.where(column < 1, column * 0, column)
        mads[position] = median_abs_deviation(floored)
    return mads

In [None]:
def separate(adir):
    """Load the first two columns of the text file ``adir``.

    The file is read with ``np.loadtxt``; a first-column field that is empty
    after stripping is read as 0. Returns the two columns as a tuple.
    """
    def _first_column(field):
        # An empty field falls back to 0 instead of failing in float().
        return float(field.strip() or 0)

    first, second = np.loadtxt(
        adir,
        unpack=True,
        usecols=(0, 1),
        converters={0: _first_column},
    )
    return (first, second)

In [None]:
# new model
def calculate_labels (estimates):
    """Cluster scalar estimates and return one cluster label per estimate.

    The values are clustered with average-linkage agglomerative clustering
    using a distance threshold that starts at 5. While the two most populated
    clusters have the same size, the threshold is raised by 1 and the
    clustering is repeated, so the result has a single largest cluster (or
    only one cluster).

    Args:
        estimates: sequence of scalar values.

    Returns:
        The ``labels_`` array of the final clustering.
    """
    current_threshold = 5
    X = np.array(estimates).reshape(-1, 1)

    while True:
        # Euclidean distance is the estimator's default, so it is not passed
        # explicitly: the old ``affinity=`` keyword no longer exists in recent
        # scikit-learn releases, and its replacement ``metric=`` does not
        # exist in older ones.
        aglo_clust = AgglomerativeClustering(n_clusters=None,
                                             linkage='average',
                                             distance_threshold=current_threshold)
        current_clustering = aglo_clust.fit(X)
        stats = Counter(current_clustering.labels_).most_common()

        # A tie between the two biggest clusters is resolved by merging more.
        if len(stats) >= 2 and stats[0][1] == stats[1][1]:
            current_threshold += 1
        else:
            break

    return current_clustering.labels_
            

def estimate_pitch (estimates):
    """Return the median of ``estimates`` as computed by ``statistics.median``."""
    middle = median(estimates)
    return middle

#     if min(estimates) < 0:
#         print("estimates:", list(map(lambda x: round(x, 2), estimates)))

#     if len(estimates) == 1:
#         print(estimates)
#         return estimates[0]
    
#     estimates = sorted(estimates)
#     clustering = calculate_labels(estimates)
#     clusters = []
#     for i in range(np.max(clustering)+1):
#         ccount = 0
#         csum = 0
#         for j, e in enumerate(clustering):
#             if e == i:
#                 ccount += 1
#                 csum += estimates[j] 
#         clusters.append((ccount, csum / ccount))
#     clusters.sort(reverse=True)
    
#     if clusters[0][1] > 900 or min(estimates) < 0:
#         print("estimates:", list(map(lambda x: round(x, 2), estimates)), "Estimate:", round(clusters[0][1], 2))
#     return clusters[0][1]

In [None]:
def find_optimal_pitch (current_collections, collection, part):
    """Build the ground pitch estimate for one part as a per-frame median.

    For every frame, the pitch values of all algorithms that have a track for
    ``part`` are gathered. Zero readings from boersma, yin and hermes are
    dropped, values rejected by ``check_correction_pitch`` for that frame are
    dropped, and the median of the remainder is stored. Frames with no
    surviving value keep the estimate 0 and are reported on stdout.

    Reads the module-level ``algorithms`` and ``correction_dir``.

    Args:
        current_collections: mapping collection -> algorithm -> part ->
            ``(times, pitches)``. The boersma entry for ``part`` must exist.
        collection: key of the collection to process.
        part: key of the part to process.

    Returns:
        ``(frames, best_estimate)`` where ``frames`` is ``np.arange(0, n)``
        and ``n`` is the length of the shortest track considered.
    """
    # Algorithms whose 0 readings are left out of the median.
    zero_excluded = ('boersma', 'yin', 'hermes')

    time = len(current_collections[collection]['boersma'][part][0])

    full_collection = match_dir(correction_dir, collection)
    full_part = match_dir(os.path.join(correction_dir, full_collection), part)
    current_corrections = load_corrections(os.path.join(correction_dir,
                                                        full_collection,
                                                        full_part))

    # Keep every track paired with the algorithm that produced it. Looking the
    # name up by position in ``algorithms`` goes wrong as soon as one
    # algorithm has no track for this part, because later tracks shift by one.
    tracks = []
    for algo in algorithms:
        if part not in current_collections[collection][algo]:
            continue
        entry = current_collections[collection][algo][part]
        tracks.append((algo, entry[1]))
        # Crepe is measured by its pitch column, the others by their time column.
        track_len = len(entry[1]) if algo == 'crepe' else len(entry[0])
        time = min(time, track_len)

    best_estimate = np.zeros(time)

    for i in range(time):
        # i/100 is the time value handed to the correction check
        # (presumably 100 frames per second -- confirm against the data).
        current_pitches = [
            pitches[i]
            for algo, pitches in tracks
            if not (algo in zero_excluded and pitches[i] == 0)
        ]
        current_pitches = [
            pitch for pitch in current_pitches
            if check_correction_pitch(current_corrections, i/100, pitch)
        ]

        if current_pitches:
            best_estimate[i] = median(current_pitches)
        else:
            # median() of an empty list would raise; the frame stays at 0.
            print("No pitches for time:", i/100)

    return (np.arange(0, time), best_estimate)

In [None]:
def find_statistics (current_collections, collection, part, debug=False):
    """Compute per-frame mean, median, std and MAD across the algorithms.

    The tracks of all algorithms that have an entry for ``part`` are cut to
    the shortest length and stacked. Crepe frames whose time falls inside a
    correction box are set to -1 first; ``find_mean``, ``find_median`` and
    ``find_std`` ignore negative entries, and ``find_mad`` treats entries
    below 1 as 0.

    Reads the module-level ``algorithms`` and ``correction_dir``.

    Args:
        current_collections: mapping collection -> algorithm -> part ->
            ``(times, pitches)``. The boersma entry for ``part`` must exist.
        collection: key of the collection to process.
        part: key of the part to process.
        debug: when true, print a fixed slice of the stacked estimates.

    Returns:
        ``(mean, median, std, mad)``, each with one value per frame.
    """
    pitch_estimates = []
    time = len(current_collections[collection]['boersma'][part][0])

    full_collection = match_dir(correction_dir, collection)
    full_part = match_dir(os.path.join(correction_dir, full_collection), part)
    current_corrections = load_corrections(os.path.join(correction_dir,
                                                        full_collection,
                                                        full_part))

    for algo in algorithms:
        if part not in current_collections[collection][algo]:
            continue
        entry = current_collections[collection][algo][part]
        if algo == 'crepe':
            # Mask a copy: writing -1 into the stored track would change what
            # any later call (for example find_optimal_pitch on a re-run)
            # reads from current_collections.
            crepe_pitches = np.array(entry[1])
            for i in range(len(crepe_pitches)):
                if check_correction_box(current_corrections, i/100):
                    crepe_pitches[i] = -1
            pitch_estimates.append(crepe_pitches)
            time = min(time, len(crepe_pitches))
        else:
            pitch_estimates.append(entry[1])
            time = min(time, len(entry[0]))

    # One row per algorithm, all cut to the shortest track.
    pitch_estimates = np.array([pitch_estimate[:time] for pitch_estimate in pitch_estimates])
    current_mean = find_mean(pitch_estimates)
    current_median = find_median(pitch_estimates)
    current_std = find_std(pitch_estimates)
    print("Finding mad")
    current_mad = find_mad(pitch_estimates)

    if debug:
        # Hard-coded frame window left over from a debugging session.
        print(pitch_estimates[:,6393:6407])

    return current_mean, current_median, current_std, current_mad

### Scherbaum Mshavanadze

In [None]:
# Algorithms whose tracks are loaded, and the directories they are read from
# (crepe tracks come from cdata_dir, all others from data_dir).
algorithms = ['boersma', 'crepe', 'hermes', 'noll', 'yin']
data_dir = '/Akamai/voice/data/pitches-vuv-crepe-assisted/'
cdata_dir = '/Akamai/voice/data/pitches-postprocessed/'

# collection id -> algorithm -> part -> track; the innermost dicts start
# empty and are filled by the loader loop that follows.
collections = {
    song_id: {algo_name: {} for algo_name in algorithms}
    for song_id in ("GVM009", "GVM017", "GVM019", "GVM031", "GVM097")
}

# Walk <data_dir>/<algorithm>/Scherbaum Mshavanadze/<song>/<part file> and
# store every recognised track under collections[song id][algorithm][part key].
# The directory listing always comes from data_dir; the file itself is read
# from cdata_dir for crepe and from data_dir for every other algorithm.
for algorithm in os.listdir(data_dir):
    if algorithm not in algorithms:
        continue
    algo_root = f"{data_dir}{algorithm}"
    for collection in os.listdir(algo_root):
        if collection != 'Scherbaum Mshavanadze':
            continue
        for song in os.listdir(f"{algo_root}/{collection}"):
            for part in os.listdir(f"{algo_root}/{collection}/{song}"):
                if "shifted" in part: continue
                print(part)
                cur_data_dir = cdata_dir if algorithm == 'crepe' else data_dir

                # The first six characters of the file name are the collection id.
                song_id = part[:6]
                if song_id not in collections:
                    continue

                # First tag found in the file name, checked in this fixed order.
                tag = next((t for t in ('AHDS', 'ALRX', 'AOLS', 'VSOA') if t in part), None)
                if tag is None:
                    continue

                x, y = separate(f"{cur_data_dir}{algorithm}/{collection}/{song}/{part}")

                # NOTE(review): AHDS tracks are read for every algorithm but
                # kept for crepe only -- confirm this is intended.
                if tag == 'AHDS' and 'crepe' not in algorithm:
                    continue

                # The part key is the tag plus the two characters after it.
                start = part.index(tag)
                collections[song_id][algorithm][part[start:start+6]] = (x, y)

In [None]:
parts = ['AHDS1M', 'AHDS2M', 'AHDS3M', 'ALRX1M', 'ALRX2M', 'ALRX3M', 'VSOAX4', 'AOLS5S']

# Output roots: the ground estimate itself and one folder per statistic.
res_dir = '/Akamai/voice/html/georgian/data/ground-estimate/Scherbaum Mshavanadze/'
std_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/standard-deviation/Scherbaum Mshavanadze/'
mean_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/mean/Scherbaum Mshavanadze/'
med_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/median/Scherbaum Mshavanadze/'
mad_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/mad/Scherbaum Mshavanadze/'

# find_optimal_pitch and find_statistics read this module-level name.
correction_dir = os.path.join(parent_correction_dir, 'Scherbaum Mshavanadze')

for collection in collections:
    for part in parts:
        # Parts without a boersma track are skipped.
        if part not in collections[collection]['boersma']:
            print(f"{part} not found in boermsa. Skipping this voice type.")
            continue

        t, estimate = find_optimal_pitch(collections, collection, part)
        cmean, cmed, cstd, cmad = find_statistics(collections, collection, part)

        # (output root, values) in the order the files are written:
        # pitch estimate, std, median, mean, mad.
        outputs = [
            (res_dir, estimate),
            (std_dir, cstd),
            (med_dir, cmed),
            (mean_dir, cmean),
            (mad_dir, cmad),
        ]

        # Create every per-collection folder before anything is written.
        for out_root, _ in outputs:
            target = out_root + collection
            if not os.path.isdir(target):
                os.mkdir(target)

        # Each file holds two columns: frame index and value.
        for out_root, values in outputs:
            np.savetxt(out_root + collection + '/' + part + '.txt', np.c_[t, values], delimiter=' ', fmt='%f')

        print(f"{part}:{collection} done")

### Teach Yourself Megrelian Songs

In [None]:
# Algorithms whose tracks are loaded, and the directories they are read from
# (crepe tracks come from cdata_dir, all others from data_dir).
algorithms = ['boersma', 'crepe', 'hermes', 'yin', 'noll']
data_dir = '/Akamai/voice/data/pitches-vuv-crepe-assisted/'
cdata_dir = '/Akamai/voice/data/pitches-postprocessed/'

# song name -> algorithm -> part -> track; the innermost dicts start empty
# and are filled by the loader loop later in this cell.
collections2 = {
    song_name: {algo_name: {} for algo_name in algorithms}
    for song_name in (
        "Ak'a Si Rekisho",
        "Gepshvat Ghvini",
        "Io _ Chkin Kiana",
        "Mesishi Vardi",
        "Meureme",
        "Mi Re Sotsodali_",
        "Mole Chit'i Gilakhe",
        "O Da",
        "Vojanudi Chkim Jargvals",
    )
}

def root_name (name):
    """Cut a file name down to the text before its part marker or extension.

    The name is cut at the right-most ``_A`` that starts at index 1 or later.
    When there is no such marker, it is cut at the left-most ``.`` found
    between index 1 and the second-to-last character. When neither is
    present the name is returned unchanged.
    """
    marker = name.rfind('_A', 1)
    if marker != -1:
        return name[:marker]
    dot = name.find('.', 1, len(name) - 1)
    return name if dot == -1 else name[:dot]

# Walk <data_dir>/<algorithm>/Teach Yourself Megrelian Songs/<song>/<part file>
# and store every track under collections2[song name][algorithm][part key].
# The directory listing always comes from data_dir; the file itself is read
# from cdata_dir for crepe and from data_dir for every other algorithm.
for algorithm in os.listdir(data_dir):
    if algorithm not in algorithms:
        continue
    algo_root = f"{data_dir}{algorithm}"
    for collection in os.listdir(algo_root):
        if collection != 'Teach Yourself Megrelian Songs':
            continue
        for song in os.listdir(f"{algo_root}/{collection}"):
            for part in os.listdir(f"{algo_root}/{collection}/{song}"):
                if "shifted" in part: continue
                print(part)

                song_key = root_name(part)
                if song_key not in collections2:
                    continue

                cur_data_dir = cdata_dir if algorithm == 'crepe' else data_dir
                x, y = separate(f"{cur_data_dir}{algorithm}/{collection}/{song}/{part}")

                # AHDS files are keyed by the tag plus the two characters
                # after it; every other file is stored as 'VSOAX4'.
                if 'AHDS' in part:
                    start = part.index('AHDS')
                    part_key = part[start:start+6]
                else:
                    part_key = 'VSOAX4'
                collections2[song_key][algorithm][part_key] = (x, y)

In [None]:
parts = ['AHDS1M', 'AHDS2M', 'AHDS3M', 'VSOAX4']

# Output roots: the ground estimate itself and one folder per statistic.
res_dir = '/Akamai/voice/html/georgian/data/ground-estimate/Teach Yourself Megrelian Songs/'
std_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/standard-deviation/Teach Yourself Megrelian Songs/'
med_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/median/Teach Yourself Megrelian Songs/'
mean_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/mean/Teach Yourself Megrelian Songs/'
mad_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/mad/Teach Yourself Megrelian Songs/'

# find_optimal_pitch and find_statistics read this module-level name.
correction_dir = os.path.join(parent_correction_dir, 'Teach Yourself Megrelian Songs')

for collection in collections2:
    for part in parts:
        # Parts without a boersma track are skipped.
        if part not in collections2[collection]['boersma']:
            print(f"{part} not found in boermsa. Skipping this voice type.")
            continue

        t, estimate = find_optimal_pitch(collections2, collection, part)
        cmean, cmed, cstd, cmad = find_statistics(collections2, collection, part)

        # (output root, values) in the order the files are written:
        # pitch estimate, std, median, mean, mad.
        outputs = [
            (res_dir, estimate),
            (std_dir, cstd),
            (med_dir, cmed),
            (mean_dir, cmean),
            (mad_dir, cmad),
        ]

        # Create every per-collection folder before anything is written.
        for out_root, _ in outputs:
            target = out_root + collection
            if not os.path.isdir(target):
                os.mkdir(target)

        # Each file holds two columns: frame index and value.
        for out_root, values in outputs:
            np.savetxt(out_root + collection + '/' + part + '.txt', np.c_[t, values], delimiter=' ', fmt='%f')

        print(f"{part}:{collection} done")

### Teach Yourself Gurian Songs

In [None]:
# Algorithms whose tracks are loaded, and the directories they are read from
# (crepe tracks come from cdata_dir, all others from data_dir).
algorithms = ['boersma', 'crepe', 'hermes', 'yin', 'noll']
data_dir = '/Akamai/voice/data/pitches-vuv-crepe-assisted/'
cdata_dir = '/Akamai/voice/data/pitches-postprocessed/'
# When non-empty, the loader below only reads the song folder with this name.
working_song=''

# Single-song variant; not referenced anywhere else in this cell.
collections3_test = {"Sabodisho": {}}

# song name -> algorithm -> part -> track; the innermost dicts start empty
# and are filled by the loader loop later in this cell.
collections3 = {
    song_name: {algo_name: {} for algo_name in algorithms}
    for song_name in (
        "Adila-Alipasha",
        "Indi-Mindi",
        "Mival Guriashi (1)",
        "Pikris Simghera",
        "Alaverdi",
        "K'alos Khelkhvavi",
        "Mival Guriashi (2)",
        "Sabodisho",
        "Khasanbegura",
        "Mok'le Mravalzhamieri",
        "Sadats Vshobilvar",
        "Beri Ak'vans Epareba",
        "Lat'aris Simghera",
        "Mts'vanesa Da Ukudosa",
        "Shermanduli",
        "Brevalo",
        "Manana",
        "Nanina (1)",
        "Shvidk'atsa",
        "Chven-Mshvidoba",
        "Maq'ruli",
        "Nanina (2)",
        "Supris Khelkhvavi",
        "Didi Khnidan",
        "Masp'indzelsa Mkhiarulsa",
        "Orira",
        "Ts'amok'ruli",
        "Gakhsovs, T'urpa",
        "Me-Rustveli",
        "P'at'ara Saq'varelo",
    )
}

# The rest of the cell works on whichever mapping ``colls`` points at.
colls=collections3

def root_name (name):
    """Cut a file name down to the text before its part marker or extension.

    The name is cut at the right-most ``_A`` that starts at index 1 or later.
    When there is no such marker, it is cut at the left-most ``.`` found
    between index 1 and the second-to-last character. When neither is
    present the name is returned unchanged.
    """
    marker = name.rfind('_A', 1)
    if marker != -1:
        return name[:marker]
    dot = name.find('.', 1, len(name) - 1)
    return name if dot == -1 else name[:dot]

# Walk <data_dir>/<algorithm>/Teach Yourself Gurian Songs/<song>/<part file>
# and store every track under colls[song name][algorithm][part key].
# The directory listing always comes from data_dir; the file itself is read
# from cdata_dir for crepe and from data_dir for every other algorithm.
for algorithm in os.listdir(data_dir):
    if algorithm not in algorithms:
        continue
    algo_root = f"{data_dir}{algorithm}"
    for collection in os.listdir(algo_root):
        if collection != 'Teach Yourself Gurian Songs':
            continue
        for song in os.listdir(f"{algo_root}/{collection}"):
            # A non-empty working_song restricts loading to that one folder.
            if working_song != "" and song != working_song:
                continue
            for part in os.listdir(f"{algo_root}/{collection}/{song}"):
                if "shifted" in part: continue
                print(part)

                song_key = root_name(part)
                if song_key not in colls:
                    continue

                cur_data_dir = cdata_dir if algorithm == 'crepe' else data_dir
                x, y = separate(f"{cur_data_dir}{algorithm}/{collection}/{song}/{part}")

                # AHDS files are keyed by the tag plus the two characters
                # after it; every other file is stored as 'VSOAX4'.
                if 'AHDS' in part:
                    start = part.index('AHDS')
                    part_key = part[start:start+6]
                else:
                    part_key = 'VSOAX4'
                colls[song_key][algorithm][part_key] = (x, y)

In [None]:
parts = ['AHDS1M', 'AHDS2M', 'AHDS3M', 'VSOAX4']

# Output roots: the ground estimate itself and one folder per statistic.
res_dir = '/Akamai/voice/html/georgian/data/ground-estimate/Teach Yourself Gurian Songs/'
mean_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/mean/Teach Yourself Gurian Songs/'
std_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/standard-deviation/Teach Yourself Gurian Songs/'
med_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/median/Teach Yourself Gurian Songs/'
mad_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/mad/Teach Yourself Gurian Songs/'
# find_optimal_pitch and find_statistics read this module-level name.
correction_dir = os.path.join(parent_correction_dir, 'Teach Yourself Gurian Songs')

for collection in colls:
    for part in parts:
        # Parts without a boersma track are skipped.
        if part not in colls[collection]['boersma']:
            print(f"{part} not found in boermsa. Skipping this voice type.")
            continue

        t, estimate = find_optimal_pitch(colls, collection, part)
        cmean, cmed, cstd, cmad = find_statistics(colls, collection, part)

        # (output root, values) in the order the files are written:
        # pitch estimate, std, median, mean, mad.
        outputs = [
            (res_dir, estimate),
            (std_dir, cstd),
            (med_dir, cmed),
            (mean_dir, cmean),
            (mad_dir, cmad),
        ]

        # Create every per-collection folder before anything is written.
        for out_root, _ in outputs:
            target = out_root + collection
            if not os.path.isdir(target):
                os.mkdir(target)

        # Each file holds two columns: frame index and value.
        for out_root, values in outputs:
            np.savetxt(out_root + collection + '/' + part + '.txt', np.c_[t, values], delimiter=' ', fmt='%f')

        print(f"{part}:{collection} done")

### Testing and Debugging

In [None]:
# WARNING: destructive cell. For every entry mean/<a>/<b>/<c> it removes the
# directory with the same relative path under mad/. os.rmdir only removes
# empty directories and raises for anything else.
# NOTE(review): presumably a one-off clean-up of directories created under
# mad/ by mistake -- confirm before running.
# This also rebinds the module-level data_dir that the loader cells use; the
# loader cells assign data_dir again before they read it.
data_dir = '/Akamai/voice/html/georgian/data/ground-estimate-statistics/'
for a in os.listdir(os.path.join(data_dir, 'mean')):
    # Skip hidden entries such as dot-files.
    if not a.startswith('.'):
        for b in os.listdir(os.path.join(data_dir, 'mean', a)):
            for c in os.listdir(os.path.join(data_dir, 'mean', a, b)):
                os.rmdir(os.path.join(data_dir, 'mad', a, b, c))

In [None]:
# Print how many values in the stored crepe pitch column of one part are negative.
a = collections3["Lat'aris Simghera"]['crepe']['AHDS1M'][1]
print(len(a[a < 0]))

In [None]:
# Sanity check of the MAD computation on a small hand-made sample.
# The function imported at the top of this notebook is
# scipy.stats.median_abs_deviation; the previous name used here was never
# imported and raised NameError. Its default scale is 1.0, so multiplying by
# 1.4826 gives the estimate that is consistent with the standard deviation
# for normally distributed data.
a = np.array([0, 0, 1.5, 1.6, 2])
median_abs_deviation(a)*1.4826