# Preprocess

In [1]:
# !mkdir ../dataset/mass_textgrid/
# ! mv ../dataset/wav/*.TextGrid ../dataset/maus_textgrid/

# Utils

In [2]:
# -*- encoding: utf8 -*-
#Collection of supporting functions for the coupe_verset audio slicer
#2.0v 15/04/2019 MZ BOITO

import glob, os, codecs
from praatio import tgio
from pprint import pprint

class Element():
    def __init__ (self, text_key, key, interval):
        self.text_key = text_key
        self.key = key
        self.interval = interval

    def to_string(self):
        return " ".join([self.text_key, str(self.interval.start), str(self.interval.end)])
    
    def _shift_interval(self, interval, value):
        if (interval.start - value) < 0 or (interval.end - value) < 0:
            raise Exception("Invalid value for shift interval function")
        return tgio.Interval(format_number(interval.start - value), format_number(interval.end - value), interval.label)

class TextgridWord(Element):
    def __init__(self, text_key, key, graphemic, phonetic, phones_list):
        Element.__init__(self, text_key, key, graphemic)
        self.graphemic = self.interval
        self.phonetic = phonetic
        self.phones_list = phones_list

    def shift_interval(self, value):
        self.interval = self._shift_interval(self.interval, value)
        self.graphemic = self.interval
        self.phonetic = self._shift_interval(self.phonetic, value)
        self.phones_list = [self._shift_interval(element, value) for element in self.phones_list]

class TextgridSilence(Element):
    def __init__(self, text_key, key, interval):
        Element.__init__(self, text_key, key, interval)

    def shift_interval(self, value):
        self.interval = self._shift_interval(self.interval, value)

def get_files_list(path):
    return glob.glob(path + "/*")

def get_prefix(file_name):
    return file_name.split("/")[-1].split(".")[0]

def shift_intervals(texgrid_list, value):
    for word_obj in texgrid_list:
        word_obj.shift_interval(value)

def create_textgrid_obj(textgrid_list):
    new_dict = dict()
    keys = ["ORT-MAU", "KAN-MAU", "MAU"]
    for key in keys:
        new_dict[key] = tgio.TextgridTier(key, [], 0.0, textgrid_list[-1].interval.end)
        new_dict[key].tierType = tgio.INTERVAL_TIER

    for element in textgrid_list: 
        new_dict["ORT-MAU"].entryList.append(element.interval)
        try:
            phonetic = element.phonetic
            phones_list = element.phones_list
        except AttributeError:
            phonetic = element.interval
            phones_list = [element.interval]
    
        new_dict["KAN-MAU"].entryList.append(phonetic)
        new_dict["MAU"].entryList += phones_list

    textgrid_obj = tgio.Textgrid()
    for key in keys:
        textgrid_obj.addTier(new_dict[key])


    return textgrid_obj

def print_elements_dictionary(elements_dictionary, key):
    for element in elements_dictionary[key]:
        print(element.to_string())

def format_number(float_number):
    return float("{:.2f}".format(float_number))

def elements_counter(elements_dictionary):
    sil = 0
    words = 0
    for element_list in elements_dictionary.values():
        for element in element_list:
            try:
                element.graphemic
                words +=1
            except AttributeError:
                sil +=1
    return words, sil

def create_log_file(file_name, dictionary_sequence, textgrid_text):
    with codecs.open(file_name, "w","utf-8") as log:
        log.write("{}\t{}\n".format(len(dictionary_sequence.split(" ")), len(textgrid_text.split(" ")) ) )
        try:
            for i in range(len(dictionary_sequence.split(" "))):
                log.write("\t".join([dictionary_sequence.split(" ")[i], textgrid_text.split(" ")[i]]) + "\n")
        except Exception:
            pass

def check_root(root_directory):
    try:
        os.stat(root_directory)
    except:
        os.makedirs(root_directory)

# Config

In [3]:
TEXTGRID_SUFFIX = ".TextGrid"
# WAV_SUFFIX = "_one_channel.wav"
WAV_SUFFIX = ".wav"
SIL_KEY = "SIL"
SEP_STR = "_verse_"
langs = ["en","es","eu","fr","ro","ru","hu","fi"]

# Parser

In [4]:
# -*- encoding: utf8 -*-
#Collection of cleaning/parsing functions for the coupe_verset audio slicer
#2.0v 15/04/2019 MZ BOITO

import re, codecs

def split_lab(text, language=None):
    p_lines = []
    regexp = re.compile(r'([0-9]+-[0-9]+)')
    number_flag = False
    last_number = 0
    if language == "eu":

        if re.compile(r'[0-9]+\s*-\s*[0-9]+').search(text):
            #print(text)
            text = re.sub(r'([0-9]+)\s*-\s*([0-9]+)', r'(\1-\2)', text) 
            #text = re.sub(r'(\D)\s*-\s*(\D)')
            #print(text)
            #exit(1)

    for line in text.split("    "): #4 space
#     for line in text.strip().split(): #4 space
#         ipdb.set_trace()
        if regexp.search(line) and language != "hu": #e.g.   (1-39)  text
            l = line.split("(")
            l1, number, l3 = l[0], l[1].split(")")[0], l[1].split(")")[1]
            number = number.split("-")[-1] #removes first part of "(START-END)"
            if int(number) > int(last_number):
                last_number = number
            if not number_flag:
                p_lines += [l1, last_number, l3]
            else: #adds the number to match transcription
                p_lines += [l1, last_number, l[1]]

        else:
            if line.replace(" ","").isdigit():
                number_flag = True
                last_number = line.replace(" ","")
            else:
                number_flag = False
            p_lines.append(line)
    return p_lines

def txt_to_dict(txt_path, language=None):
    output_dict = dict()
    last_key = 0 # zero is the key for the chapter's title
    with codecs.open(txt_path, "r", "utf-8") as txt_file:
        for line in txt_file:
            for possible_line in split_lab(line, language=language): 
                line = clean(possible_line,language=language)
                
#                 ipdb.set_trace()
#                 if isinstance(line, int): #verse number
#                     last_key = line 
#                 elif line: #text from the last verse
#                     output_dict[last_key] = line
                    
                output_dict[last_key] = line
                last_key += 1
    return output_dict

def remove_double_space(text, language=None):
    if language == "es":
        split_entry = text.split(" ")
        i  = 0
        while(i < len(split_entry)):
            if split_entry[i] == '\xad': #\xad is a 'soft hyphen', but due to coding problem it is printed as an invisible character
                del split_entry[i] 
            i+=1
        text = " ".join(split_entry)
    while "  " in text:
        text = text.replace("  "," ")
    return text

def clean_textgrid(dictionary_case, language):
    if language == "es" or language == "hu":
        token = '\xad' if language == "es" else '\x92'
        i  = 0
        while(i < len(dictionary_case.entryList)):
            if dictionary_case.entryList[i].label == token: 
                del dictionary_case.entryList[i]
            i+=1
    return dictionary_case

def clean(line, language=None):
    marks = ["“", "”","’"]
    punc = [".","!","?",","]

    if language == "en":
        line = re.sub(r'(\D)’s', r'\1 ’s', line) #space before the apostrophe missing 
        line = line.replace("—"," ")
        for symbol in [" ", ".", ",", "?", "!"]:
            line = line.replace(" ’ s" + symbol, " ’s" + symbol)
    elif language == "ru":
        line = line.replace("\'","").replace("--","")
    elif language == "es":
        line = line.replace("»","").replace("«","").replace("–","").replace('\xad',"")
        line = re.sub(r'(\D)¿(\D)', r'\1 ¿\2', line)
    elif language == "fr":
        line = line.replace("»","").replace("«","").replace("–","").replace('\xad',"").replace("…","")
    elif language == "eu":
        line = line.replace("»"," ").replace("«"," ").replace("—"," ").replace("-","").replace(":","").replace("/", " ").replace("…","") #« between words—
    elif language == "fi":
        line = line.replace("-","").replace("‘","").replace(":","")
    elif language == "hu":
        line = line.replace("\""," ").replace(":"," ").replace("-","").replace("\x92"," ").replace(",", " ")
    elif language == "ro":
        line = line.replace("–","").replace(":","").replace("»","").replace("…","")

    line = re.sub(r',(\D)', r', \1', line) #space missing after a comma
    line = re.sub(r'(\D)!(\D)', r'\1! \2', line) #space missing after exclamation point 
    line = re.sub(r'(\D)’(\D)', r'\1’ \2', line) #space missing after ending of a quote 
    line = re.sub(r'(\D)”(\D)', r'\1” \2', line) #space missing after ending of a quote 
    line = re.sub(r'(\D)“(\D)', r'\1 “\2', line) #space missing before beginning of a quote 

    line = line.replace("’ "," ")

    for symbol in punc + [")","(",";", "]", "["] + marks:
        line = line.replace(symbol, "")

    line = remove_double_space(line)
    line = line.replace("\t","")

    if line and line[0] == " ":
        line = line[1:]
    if line and line[-1] == " ":
        line = line[:-1]

    try:
        line = int(line) #verse
        return line
    except ValueError: #real text
        return line.strip("\n")

# Main

In [5]:
# !pip install alignment

In [6]:
# -*- encoding: utf8 -*-
# 2.0v last modified the 15/09/2019 MZ BOITO

import os, sys, codecs, argparse
from praatio import tgio
from pprint import pprint
# from multiprocessing import Process
# from utils import *
# from parser import *
# from config import langs, TEXTGRID_SUFFIX, WAV_SUFFIX, SIL_KEY, SEP_STR
import ipdb

from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

def imperfect_raw_grid_align(dictionary_sequence, textgrid_sequence, verbose=False):
    
    '''
    /!\ ALLOWING NOT PERFECT ALIGNMENTS 
    remove "imperfect_" from the function name and comment (or remove) the raw_grid_align function
    add the following imports at the beginning of this script:

    from alignment.sequence import Sequence
    from alignment.vocabulary import Vocabulary
    from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

    /!\ DO NOT USE THE --force option with this function
    You might need to remove some asserts (and have some headaches) to make this option work
    We do not advise using it
    '''
#     a = Sequence(dictionary_sequence.split()) #dictionary
#     b = Sequence(textgrid_sequence.split()) #textgrid
    
    a = Sequence(dictionary_sequence.strip().split())
    b = Sequence(textgrid_sequence.strip().split())
    
    
#     ipdb.set_trace()
    
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    _, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    if not encodeds:
        raise Exception("Alignment Module failed")
    
    # Iterate over optimal alignments, print them if verbose
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        if verbose:
            for tup in list(alignment):
                print(tup)
            print ('Alignment score:', alignment.score)
            print ('Percent identity:', alignment.percentIdentity())
    
#     ipdb.set_trace()
    return alignment

def raw_grid_align(dictionary_sequence, textgrid_sequence, verbose=False):
    '''
    This function considers perfect textual alignment between chapter 
    raw text and textgrid (after parser.py cleaning). If not the case, 
    please check the README and imperfect_raw_grid_alignment for more 
    information.
    '''
    
    dictionary_sequence = dictionary_sequence.split(" ")
    textgrid_sequence = textgrid_sequence.split(" ")
        
#     dictionary_sequence = dictionary_sequence.strip().split()
#     textgrid_sequence = textgrid_sequence.strip().split()
    
    alignment = list()
    
    for i in range(min(len(textgrid_sequence), len(dictionary_sequence))):
        alignment.append((dictionary_sequence[i], textgrid_sequence[i]))
            
    return alignment

#     if (len(textgrid_sequence) != len(dictionary_sequence)):
#         ipdb.set_trace()
#     add_empty = 0    
#         if (len(textgrid_sequence) > len(dictionary_sequence)):
#         if (len(dictionary_sequence) > len(textgrid_sequence)):
#             add_empty = abs((len(dictionary_sequence) - len(textgrid_sequence)))
#     #         for i in range(add_empty):
#     #             textgrid_sequence.append("")                    
#     for i in range(len(dictionary_sequence)):

    

def get_tier_by_interval(start, end, tier_dictionary):
    return [element for element in tier_dictionary.entryList if element.start >= start and element.end <= end]

def get_key_by_index(dictionary, index):
    keys = list(dictionary.keys())
    key_index = 0
    while(index >= 0 and key_index < len(keys)):
        line = dictionary[keys[key_index]].split(" ")
        l_length = len(line)
        if index >= l_length:
            index -= l_length
            key_index +=1
        else: #index < l_length, the word is at line[index], key is at keys[key_index]
            return line[index], keys[key_index]
    raise Exception("Key not found: Alignment index problem")

def add_time_windows(dictionary, textgrid, alignment):
    #ORT-MAU -> words; #KAN-MAU -> phonetic transcription; #MAU -> phoneme alignment
    richer_alignment = []
    last_verse = 0
#     ipdb.set_trace()
#     for i in range(len(alignment)):
    for i in range(
        min(len(alignment), len(textgrid.tierDict["ORT-MAU"].entryList))):
        dict_word, tg_word = alignment[i]
        try:
            word, verse = get_key_by_index(dictionary, i)
            last_verse = verse
        except Exception: #didn't find the match on the dictionary, uses last alignment found
            verse = last_verse

        if args.force:
            assert word == dict_word, "Alignment mismatch between the dictionary and the textgrid"
        
        graphemic_transcription = textgrid.tierDict["ORT-MAU"].entryList[i]
#         graphemic_transcription = textgrid.tierDict["ORT"].entryList[i]

        if args.force:
            assert graphemic_transcription.label == tg_word, "Graphemic alignment mismatch"

        phonetic_transcription = get_tier_by_interval(graphemic_transcription.start, graphemic_transcription.end, textgrid.tierDict["KAN-MAU"])[0]
        phones_list = get_tier_by_interval(graphemic_transcription.start, graphemic_transcription.end, textgrid.tierDict["MAU"])
        tg_word = TextgridWord(tg_word, verse, graphemic_transcription, phonetic_transcription, phones_list)
        richer_alignment.append(tg_word)

    return richer_alignment

def merge_silence(textgrid, alignment):
    merged_list = []
    silence_list = textgrid.tierDict["ORT-MAU"].getNonEntries()
#     silence_list = textgrid.tierDict["ORT"].getNonEntries()
    sil_index = 0
    text_index = 0
    last_verse = 0
    while(text_index < len(alignment) or sil_index < len(silence_list)):
        if sil_index == len(silence_list): #finished with the silence
            merged_list.append(alignment[text_index])
            text_index +=1
        elif text_index == len(alignment) or alignment[text_index].graphemic.start > silence_list[sil_index].start: 
            #finished with the text or the silence comes first
            sil_obj = TextgridSilence(SIL_KEY,last_verse, silence_list[sil_index])
            merged_list.append(sil_obj)
            sil_index +=1
        else: #word comes first
            merged_list.append(alignment[text_index])
            last_verse = alignment[text_index].key
            text_index +=1
    return merged_list

def split_by_verse(alignment):
    dictionary = dict()
    for element in alignment:
        try:
            dictionary[element.key].append(element)
        except KeyError:
            dictionary[element.key] = [element]
    return dictionary

def split_silence(silence_object):
    old_interval = silence_object.interval
    new_ending = format_number((old_interval.start + (old_interval.end - old_interval.start)/2.0))
    new_interval = tgio.Interval(old_interval.start, new_ending, old_interval.label)
    new_obj = TextgridSilence(silence_object.text_key, silence_object.key, new_interval)
    new_interval = tgio.Interval(new_ending, old_interval.end, old_interval.label)
    carry = TextgridSilence(silence_object.text_key, -1, new_interval)
    return new_obj, carry

def split_boundary_silence(alignment_dictionary):
    new_dictionary = dict()
    keys = list(alignment_dictionary.keys())
    carry = None
    for key in keys:
        if carry:
            carry.key = key
            new_dictionary[key] = [carry]
            carry = None #consumes carry
        else:
            new_dictionary[key] = []
        
        if key == keys[-1]: #last key, nothing to pass for the next 
            new_dictionary[key] += alignment_dictionary[key]
        else:
            if alignment_dictionary[key][-1].text_key == SIL_KEY: #if the verse ends with silence
                new_dictionary[key] += alignment_dictionary[key][:-1] #everything but the silence goes to the next dictionary
                new_obj, carry = split_silence(alignment_dictionary[key][-1])
                new_dictionary[key].append(new_obj)

            else: #the verse doesn't start or end with silence
                new_dictionary[key] += alignment_dictionary[key]

    return new_dictionary

def align(file, lab_dictionary, grid, verbose=False, language=None):
    dictionary_sequence = " ".join(lab_dictionary.values()) #get the text from the dictionary
    tg=tgio.openTextgrid(grid)
    
    tg.tierDict["ORT-MAU"] = clean_textgrid(tg.tierDict["ORT-MAU"], language) #remove enconding problems for alignment's sake
#     tg.tierDict["ORT"] = clean_textgrid(tg.tierDict["ORT"], language) #remove enconding problems for alignment's sake

    entryList = tg.tierDict["ORT-MAU"].entryList
#     entryList = tg.tierDict["ORT"].entryList
    concatenated_ort = " ".join([entry.label for entry in entryList if entry != "­"])
    
    if args.verbose:
        print("\tDICIONARY OUTPUT")
        pprint(lab_dictionary)
        print("\tTEXTGRID OUTPUT")
        print(concatenated_ort)

    sys.setrecursionlimit(2000) #/!\ this might be a problem
#     split_entry = concatenated_ort.split(" ")
    split_entry = concatenated_ort.strip().split(" ")

    if args.force:
        try:
            assert len(dictionary_sequence.split(" ")) == len(split_entry), "Number of words mismatch between lab and textgrid"
        except AssertionError:
            create_log_file(file.split("/")[-1] + "_error_log", dictionary_sequence, concatenated_ort)
            exit(1)
    
    
    alignment = raw_grid_align(dictionary_sequence, concatenated_ort)
#     alignment = imperfect_raw_grid_align(dictionary_sequence, concatenated_ort)
#     ipdb.set_trace()
    
    if args.force:
        assert len(alignment) == len(dictionary_sequence.split(" ")), "Number of words mismatch between final alignment and dictionary"
    
#     ipdb.set_trace()
    richer_alignment = add_time_windows(lab_dictionary, tg, alignment)
    complete_alignment = merge_silence(tg, richer_alignment)
    splitted_alignment = split_by_verse(complete_alignment)
    final_alignment = split_boundary_silence(splitted_alignment)

    words, sil = elements_counter(final_alignment)
    
    if verbose:
        print("Final alignment has %d words and %d silence marks" % (words, sil))
    
    assert words == len(alignment), "The script lost part of the words during the alignment"
    
    return final_alignment

def generate_audio_cuts(alignment_dictionary):
    return [(key, alignment_dictionary[key][0].interval.start, alignment_dictionary[key][-1].interval.end) for key in alignment_dictionary.keys()]

def slice_audio(audio, output_prefix, windows, verbose=False):
    if verbose:
        print("Cutting audio %s" % (audio))
    for (key, start, end) in windows:
        output_file = output_prefix + SEP_STR + str(key) + WAV_SUFFIX
        if verbose:
            print(output_file, start, end)
#         ipdb.set_trace()
        os.system("sox {} {} trim {} ={}".format(audio, output_file, start, end))

def write_new_textgrids(output_prefix, windows, alignment_dictionary):
    assert len(windows) == len(alignment_dictionary.keys()), "Size Mismatch between audio windows and textgrids"
    for (key, start, _) in windows:
        if start != 0:
            shift_intervals(alignment_dictionary[key], start)
        
#         ipdb.set_trace()
        
        obj = create_textgrid_obj(alignment_dictionary[key])
        output_file = output_prefix + SEP_STR + str(key) + TEXTGRID_SUFFIX
        obj.save(output_file)

def write_text_files(output_prefix, lab_dictionary):
    for key in lab_dictionary.keys():
        output_file = output_prefix + SEP_STR + str(key) + ".txt"
        with codecs.open(output_file, "w","utf-8") as output_file:
            output_file.write(lab_dictionary[key] + "\n")

def process_document(lab_file, args):
    if args.verbose:
            print(lab_file)
    
    # we get the prefix of the file (e.g 'B05___05_Acts________ENGESVN1DA')
    file_prefix = get_prefix(lab_file)
    
    # we get the text file in dictionary form {0: 'Acts 5', 1:'But a man ...' ...}
    lab_dictionary = txt_to_dict(lab_file, args.language)
    
    # get the the texgrid file conresponding to the 'file_prefix'
    textgrid_file = os.path.join(args.textgrid, file_prefix + TEXTGRID_SUFFIX)
    
#     ipdb.set_trace()
    
    # 
    alignment_dictionary = align(lab_file, lab_dictionary, textgrid_file, language=args.language, verbose= args.verbose)
    
    # 
    windows = generate_audio_cuts(alignment_dictionary)
    
    output_prefix = os.path.join(args.output, file_prefix) 
    slice_audio(os.path.join(args.wav,  file_prefix + WAV_SUFFIX), output_prefix, windows, verbose=args.verbose)
    
    write_new_textgrids(output_prefix, windows, alignment_dictionary)
    write_text_files(output_prefix, lab_dictionary)

def process(args):   
    labs = get_files_list(args.lab)
    textgrids = get_files_list(args.textgrid)
    wavs = get_files_list(args.wav)

    assert len(labs) == len(textgrids) and len(textgrids) == len(wavs), "Different number of files inside the folders"

    for lab_file in labs:
        # 1) REMOVE THE COMMENT BELOW TO REMOVE MULTIPROCESSING
        process_document(lab_file, args)
        # 2) COMMENT THE FOLLOWING 3 LINES TO REMOVE MULTIPROCESSING
        #p = Process(target=process_document, args=(lab_file, args))
        #p.start()
    #p.join()

In [7]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--lab', type=str, nargs='?', help='lab folder')
    parser.add_argument('--textgrid', type=str, nargs='?', help='textgrid folder')
    parser.add_argument('--wav', type=str, nargs='?', help='wav folder')
    parser.add_argument("--verbose", "-v", help="increases output verbosity", action="store_true")
    parser.add_argument("--force", "-f", help="forces a perfect alignment between textgrid and lab", action="store_true")
    parser.add_argument('--output', type=str, nargs='?', help="name for the output folder")
    parser.add_argument('--language', type=str, nargs='?', help='specifies language for cleaning and alignment')
    
    # https://github.com/spyder-ide/spyder/issues/3883
    import sys
    sys.argv=['']#; del sys 

    args = parser.parse_args()

    args.lab = "../dataset/english/"
    args.textgrid = "../dataset/maus_textgrid/"
    args.wav = "../dataset/wav/"
    args.output = "../dataset/allign/"
    args.language = "English"
    args.verbose = False
    args.force = False
    
#     args.lab = "../dataset_old/english/"
#     args.textgrid = "../dataset_old/textgrid/"
#     args.wav = "../dataset_old/wav_verse/"
#     args.output = "../dataset_old/allign/"
#     args.language = "English"
#     args.verbose = True
#     args.force = True
        
    if not (args.lab and args.textgrid and args.wav and args.output):
        parser.print_help()
        print("LIST OF SUPPORTED LANGUAGES: %s" % (" ".join(langs)))
        exit(1)
    
    check_root(args.output)
    process(args)

IndexError: list index out of range

# Move the splited speech and text

In [9]:
!mkdir ../dataset/wav_verse
!mkdir ../dataset/English

!mv ../dataset/allign/*.wav ../dataset/wav_verse/
!mv ../dataset/allign/*.txt ../dataset/English/

mkdir: cannot create directory ‘../dataset/wav_verse’: File exists
mkdir: cannot create directory ‘../dataset/English’: File exists
