# EpaDB 

In [27]:
# Data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Utils
import os
import subprocess
from tqdm import tqdm
from pathlib import Path
from IPython import embed

# Audio
import librosa
import librosa.display
import soundfile as sf
from scipy.signal.windows import tukey
from IPython.display import Audio
from pymediainfo import MediaInfo
from IPython.display import Audio, display


# Files
import json
import pickle
import joblib
from addict import Dict
import shutil

## Load EpaDB

In this section we load the annotations, labels and phone durations from the files in the labels folder. Alternatively, they can be loaded directly from the textgrid files in the transcriptions folder.  

The labels in the folder are computed using the assign_reference.py script, which finds the best match between the manual annotations and the list of possible target pronunciations for that sentence in the transcriptions_sae.txt file. This allows us to consider a set of possible target pronunciations instead of just one. 

This matching step can be replaced by any dynamic alignment algorithm. For example, we also use a version of the ALINE algorithm (Grzegorz Kondrak, 2000), ported to Python and adapted to work with ARPAbet symbols. 

Note that the target pronunciations were chosen with respect to a so-called 'Standard American English'. They are by no means intended to be representative of all Englishes; they were mostly chosen to match the dictionaries in existing forced alignment tools. Feel free to change them or to suggest new ones to us. 

In [6]:
# Path to where EpaDB is.
# Set the EPADB_PATH environment variable to point at your own copy of the
# dataset; the fallback below is a machine-specific absolute path.
EPA_PATH = os.environ.get('EPADB_PATH', '/home/jazmin/Documents/Code/repos/EpaDB/train')

In [7]:
# Load EpaDB from labels
# Collect every CSV that sits in a `labels/` directory anywhere under EPA_PATH.
label_files = list(Path(EPA_PATH).rglob('*/labels/*.csv'))
if not label_files:
    # Without this check pd.concat([]) fails with the unhelpful
    # "No objects to concatenate" when EPA_PATH is wrong.
    raise FileNotFoundError(
        f"No label files matching '*/labels/*.csv' found under {EPA_PATH!r}; check EPA_PATH."
    )
# The label files are read as space-separated tables without a header row.
df_epadb_0 = pd.concat([pd.read_csv(file, sep=' ', header=None) for file in label_files])

In [13]:
def _labels2int(x):
    """Map an annotation label to an integer: "+" becomes 1, anything else 0."""
    return 1 if x == "+" else 0

def _logid(x):
    """Return the first two underscore-separated fields of ``x`` joined by "_".

    For a phone id such as "spkr17_58_0" this gives "spkr17_58".
    """
    # Only the first two fields are needed, so stop splitting after them.
    fields = x.split("_", 2)
    return f"{fields[0]}_{fields[1]}"

def _name(x):
    """Return the part of ``x`` before its first underscore (all of ``x`` if there is none)."""
    head, _sep, _rest = x.partition("_")
    return head

# Build the working table from a copy of the raw label frame:
# name the six positional columns, derive speaker/utterance ids, turn the
# "+" label into 1/0, attach each row's waveform path and phone duration,
# and finally index the rows by utterance id (logid).
df_epadb_1 = (
    df_epadb_0
    .copy()
    .rename(columns={0: 'phone_id', 1: 'reference', 2: 'annotation', 3: 'label', 4: 'start', 5: 'end'})
    .assign(
        speaker_id=lambda frame: frame['phone_id'].map(_name),
        label=lambda frame: frame['label'].map(_labels2int),
        logid=lambda frame: frame['phone_id'].map(_logid),
        # <EPA_PATH>/<speaker_id>/waveforms/<logid>.wav
        wav_path=lambda frame: frame.apply(
            lambda row: str(Path(EPA_PATH, row['speaker_id'], 'waveforms', f"{row['logid']}.wav")),
            axis=1,
        ),
        duration=lambda frame: frame['end'] - frame['start'],
    )
    .set_index('logid')
)

# Optional filters, currently disabled:
# df_epadb_1 = df_epadb_1[df_epadb_1.reference != "0"]
# df_epadb_1 = df_epadb_1[df_epadb_1.annotation != "0"]

df_epadb_1

Unnamed: 0_level_0,phone_id,reference,annotation,label,start,end,speaker_id,wav_path,duration
logid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
spkr17_58,spkr17_58_0,DH,DH,1,1.325105,1.390000,spkr17,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.064895
spkr17_58,spkr17_58_1,AX,EH,0,1.390000,1.450000,spkr17,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.060000
spkr17_58,spkr17_58_2,L,L,1,1.450000,1.519755,spkr17,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.069755
spkr17_58,spkr17_58_3,OW,AW,0,1.519755,1.680553,spkr17,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.160798
spkr17_58,spkr17_58_4,ER,AX,0,1.680553,1.830000,spkr17,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.149447
...,...,...,...,...,...,...,...,...,...
spkr31_45,spkr31_45_6,Kh,Kh,1,1.320000,1.450000,spkr31,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.130000
spkr31_45,spkr31_45_7,OW,OW,1,1.450000,1.600000,spkr31,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.150000
spkr31_45,spkr31_45_8,T,T,1,1.600000,1.680000,spkr31,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.080000
spkr31_45,spkr31_45_9,AO,AO,1,1.710000,1.770000,spkr31,/home/jazmin/Documents/Code/repos/EpaDB/train/...,0.060000


## Explore EpaDB

In [None]:
# Listen to the first utterance in the table.
# 'wav_path' is the column that holds each row's waveform path; .iloc[0] takes
# the first row by position, since the frame is indexed by logid strings.
wav_file = df_epadb_1['wav_path'].iloc[0]
# Use the Audio class imported via `from IPython.display import Audio`; the bare
# `IPython` module name is never imported in this notebook.
Audio(wav_file)

## Extract Metadata

In [19]:
# MediaInfo fields to keep, written as '<track type>/<field name>'.
# get_mediainfo walks this tuple in order, so the order is kept as is.
MEDIAINFO_KEYS = (
    'general/audio_codecs',
    'general/codec',
    'general/complete_name',
    'general/duration',
    'general/file_size',
    'general/file_name',
    'general/file_extension',
    'general/overall_bit_rate',
    'audio/bit_rate_mode',
    'audio/bit_depth',
    'audio/sampling_rate',
    'general/other_duration',
    'audio/channel_s',
)

# File suffixes that dir_scan_media treats as media files.
MEDIA_SUFFIX = [
    '.wav', '.aac', '.mp3',
    '.webm', '.ogg', '.flac',
]

def get_mediainfo(audio_file):
    """
    Collect the MEDIAINFO_KEYS fields that MediaInfo reports for one file.

    Returns a flat dict keyed by field name (the part after the '/' in each
    MEDIAINFO_KEYS entry). Three fields are post-processed when present:
    'other_duration' is reduced to its last element, 'duration' is divided by
    1000 and stored under 'mediainfo_duration' instead, and 'overall_bit_rate'
    is divided by 1000 and rounded.
    """
    parsed = MediaInfo.parse(audio_file).to_data()
    info = {}
    for key in MEDIAINFO_KEYS:
        parts = key.split('/')
        for track in parsed['tracks']:
            # Only look at tracks of the type named before the '/'.
            if track['track_type'].lower() != parts[0]:
                continue
            field = parts[1]
            if field in track:
                info[field] = track[field]

    if 'other_duration' in info:
        info['other_duration'] = info['other_duration'][-1]
    if 'duration' in info:
        # Presumably milliseconds -> seconds; the notebook sums this column as seconds.
        info['mediainfo_duration'] = info.pop('duration') / 1000
    if 'overall_bit_rate' in info:
        info['overall_bit_rate'] = round(info['overall_bit_rate'] / 1000)
    return info

def dir_scan_media(dir_path, restrict_suffix=True):
    """
    Recursively yield the files found under ``dir_path``.

    Parameters
    ----------
    dir_path : str or Path
        Directory to walk (via ``Path.rglob('*')``).
    restrict_suffix : bool, default True
        When true, only files whose suffix is listed in MEDIA_SUFFIX are
        yielded; the comparison ignores case, so 'clip.WAV' is accepted too.
        When false, every file is yielded.

    Yields
    ------
    Path
        One path per matching file. Directories are never yielded, even if
        their name ends in a media suffix.
    """
    allowed_suffixes = {suffix.lower() for suffix in MEDIA_SUFFIX}
    for path in Path(dir_path).rglob('*'):
        # Skip directories in both modes so callers only ever receive files.
        if not path.is_file():
            continue
        if not restrict_suffix or path.suffix.lower() in allowed_suffixes:
            yield path

def dir2mediainfo(dir_path):
    """
    Build a DataFrame with one row of MediaInfo metadata per media file in a directory.

    Each row is the dict returned by get_mediainfo for a path produced by
    dir_scan_media(dir_path), extended with 'path' (the path as a string) and
    'audio_file' (the file name).
    """
    records = []
    for media_path in dir_scan_media(dir_path):
        record = get_mediainfo(media_path)
        record.update(path=str(media_path), audio_file=str(media_path.name))
        records.append(record)
    return pd.DataFrame(records)

In [24]:
# Scan the media files under EPA_PATH and tabulate their MediaInfo metadata.
epa_media_info = dir2mediainfo(dir_path=EPA_PATH)
epa_media_info

Unnamed: 0,audio_codecs,codec,complete_name,file_size,file_name,file_extension,overall_bit_rate,bit_rate_mode,bit_depth,sampling_rate,other_duration,channel_s,mediainfo_duration,path,audio_file
0,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,237612,spkr17_55,wav,768,CBR,16,48000,00:00:02.474,1,2.474,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr17_55.wav
1,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,311340,spkr17_62,wav,768,CBR,16,48000,00:00:03.242,1,3.242,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr17_62.wav
2,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,598060,spkr17_46,wav,768,CBR,16,48000,00:00:06.229,1,6.229,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr17_46.wav
3,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,442412,spkr17_19,wav,768,CBR,16,48000,00:00:04.608,1,4.608,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr17_19.wav
4,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,204844,spkr17_45,wav,768,CBR,16,48000,00:00:02.133,1,2.133,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr17_45.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,294956,spkr31_61,wav,706,CBR,16,44100,00:00:03.343,1,3.343,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr31_61.wav
1890,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,245804,spkr31_38,wav,706,CBR,16,44100,00:00:02.786,1,2.786,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr31_38.wav
1891,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,229420,spkr31_67,wav,706,CBR,16,44100,00:00:02.600,1,2.600,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr31_67.wav
1892,PCM,Wave,/home/jazmin/Documents/Code/repos/EpaDB/train/...,376876,spkr31_50,wav,706,CBR,16,44100,00:00:04.272,1,4.272,/home/jazmin/Documents/Code/repos/EpaDB/train/...,spkr31_50.wav


In [26]:
# Total duration for 30 speakers in seconds
# (sum of the per-file 'mediainfo_duration' column).
total_duration = epa_media_info['mediainfo_duration'].sum()
total_duration

5902.526