In [31]:
import os,time,itertools,random
from allosaurus.app import read_recognizer
import pandas as pd
from datetime import datetime

In [2]:
#ok, so here is a generator which iterates through the names of the wavefiles and their texts
def get_wave_names_and_texts( text_path="./bigdata/ALFFA_PUBLIC/ASR/SWAHILI/data/train/text" ):
    """Yield one ``{'wave_name': ..., 'text': ...}`` dict per transcript line.

    Each usable line has the form ``<wave name><TAB><transcript>``.  Lines
    without a tab, or whose name or transcript is empty after stripping, are
    skipped instead of raising.

    Args:
        text_path: Path of the transcript file.  Defaults to the ALFFA Swahili
            training transcript used throughout this notebook.

    Yields:
        dict: ``wave_name`` (text before the first tab) and ``text`` (text
        after it), both stripped of surrounding whitespace.
    """
    with open( text_path, "rt", encoding="utf-8" ) as text_in:
        for line in text_in:
            # Split on the first tab; `separator` is empty when the line has none.
            wave_name, separator, text = line.partition( "\t" )
            if not separator:
                continue

            wave_name = wave_name.strip()
            text = text.strip()
            # A record is only useful when both halves carry content.
            if wave_name and text:
                yield {'wave_name':wave_name, 'text': text }


In [3]:
#I now want to filter out all the sentences which have a less-than sign ('<') in them
#because I don't want to work with music or untranslatable stuff.
def drop_music_and_stuffs( source ):
    """Lazily pass through only the records whose ``'text'`` has no ``'<'``.

    Args:
        source: Iterable of dicts that each carry a ``'text'`` entry.

    Yields:
        The records from ``source``, in order, whose text contains no ``'<'``
        character (per the notebook comment, such tags mark music or
        untranslatable segments).
    """
    yield from filter( lambda record: '<' not in record['text'], source )
            

In [4]:
#now I need the full paths to these wave files.
def add_full_paths( source, search_path='./bigdata/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/' ):
    """Attach the on-disk path of each record's wave file as ``'full_path'``.

    The directory tree under ``search_path`` is walked once (when the first
    record is requested) and indexed by file name, so each record costs one
    dict lookup instead of a full directory walk.

    Args:
        source: Iterable of dicts that each carry a ``'wave_name'`` entry
            (file name without the ``.wav`` extension).
        search_path: Root directory searched recursively for ``.wav`` files.
            Defaults to the ALFFA Swahili training wav directory.

    Yields:
        The same dict objects, updated in place with ``'full_path'``.  Records
        whose wave file is not found are reported on stdout and dropped.
    """
    # Index every .wav file once.  setdefault keeps the first path os.walk
    # reports for a given file name, so duplicate names resolve first-found.
    wav_paths = {}
    for root, _, files in os.walk( search_path ):
        for file in files:
            if file.endswith( ".wav" ):
                wav_paths.setdefault( file, os.path.join( root, file ) )

    for thing in source:
        full_path = wav_paths.get( thing['wave_name'] + ".wav" )
        if full_path is not None:
            thing['full_path'] = full_path
            yield thing
        else:
            print( f"Couldn't find {thing['wave_name']}")
        

In [5]:
#here we run allo
def add_allo( source, emit=1 ):
    """Attach the recognizer's output for each record as ``'allosaurus'``.

    A recognizer is created with ``read_recognizer()`` when the first record
    is requested and reused for every record after that.

    Args:
        source: Iterable of dicts that each carry a ``'full_path'`` entry.
        emit: Forwarded unchanged as the ``emit`` keyword of ``recognize``.

    Yields:
        The same dict objects, updated in place with ``'allosaurus'``: the
        recognizer's result with every space character removed.
    """
    recognizer = read_recognizer()

    for record in source:
        phones = recognizer.recognize( record['full_path'], emit=emit )
        # Splitting on a single space and re-joining removes every space.
        record['allosaurus'] = ''.join( phones.split( ' ' ) )
        yield record
    

In [6]:
#snag the epitran translation from Colin's spreadsheet
def add_epitran( source, epitran_table=None ):
    """Attach the epitran transcript from Colin's spreadsheet as ``'epitran'``.

    The spreadsheet is turned into a ``filename -> transcript`` dict once, so
    each record costs one dict lookup instead of a scan of the whole table.

    Args:
        source: Iterable of dicts that each carry a ``'wave_name'`` entry.
        epitran_table: Optional DataFrame with ``'filename'`` and
            ``'cleaned_transcript_epitran'`` columns.  When omitted, the table
            is read from ``./data/ALFFA_dataset_ allosaurus vs epitran.ods``.

    Yields:
        The same dict objects, updated in place with ``'epitran'`` (the
        stripped transcript).  Records with no usable transcript are reported
        on stdout and dropped.
    """
    if epitran_table is None:
        colin_epitran_filename = './data/ALFFA_dataset_ allosaurus vs epitran.ods'
        #we can read directly from ods
        #https://stackoverflow.com/questions/17834995/how-to-convert-opendocument-spreadsheets-to-a-pandas-dataframe
        epitran_table = pd.read_excel(colin_epitran_filename, engine="odf")

    # First usable row per filename wins.  Blank cells come back as NaN rather
    # than str, so they are skipped here instead of crashing on .strip().
    transcripts = {}
    for filename, transcript in zip( epitran_table['filename'],
                                     epitran_table['cleaned_transcript_epitran'] ):
        if isinstance( transcript, str ) and filename not in transcripts:
            transcripts[filename] = transcript

    for thing in source:
        transcript = transcripts.get( thing['wave_name'] )
        if transcript is not None:
            thing['epitran'] = transcript.strip()
            yield thing
        else:
            print( f"No epitran for {thing['wave_name']}")

{'wave_name': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10', 'text': 'rais wa tanzania jakaya mrisho kikwete', 'full_path': './bigdata/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10.wav', 'allosaurus': 'ʔɒl̪ɒiɪs̪iiːk͡p̚waɒt̪tʂʌŋɴdtsʌɒɴniiːjʔaɒtʂiekʰɒaɒjamәɴiːnʂɻ̩k͡p̚tʂiːəkk͡p̚uəeɪt̪tʂen', 'epitran': 'ɾais wa tanzania ʄakaja mɾiʃo kikwete'}
{'wave_name': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part100', 'text': 'yanayo andaliwa nami pendo pondo idhaa ya kiswahili', 'full_path': './bigdata/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101106/SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part100.wav', 'allosaurus': 'ʔeɪnʌaɪɴnɪlʲɪb̞anɪðlmiːɪiːpenɴdouəpʁuəouənɴduəoviːweɪivaæɪksk͡p̚uəonjuəiːɴllʲi', 'epitran': 'janajo andaliwa nami pendo pondo iðaa ja kiswahili'}
{'wave_name': 'SWH-05-20101106_16k-emission_swahili_05h30_-_06h00_tu_20101106_part10

In [37]:
def drop_column( source, key ):
    """Remove ``key`` from every record as it streams past.

    Args:
        source: Iterable of dicts.
        key: Entry to delete from each dict; a record lacking it raises
            ``KeyError``.

    Yields:
        The same dict objects, modified in place, without ``key``.
    """
    for record in source:
        del record[key]
        yield record

def shuffle_it( source, seed=None ):
    """Return the items of ``source`` as a new list in random order.

    Args:
        source: Any iterable; it is fully consumed and never modified.
        seed: Optional seed.  When given, a private ``random.Random(seed)``
            does the shuffling, so the order is reproducible and the module
            level random state is untouched.  When ``None`` (the default) the
            shared ``random`` module state is used.

    Returns:
        list: A shuffled copy of the items.
    """
    items = list(source)
    if seed is None:
        random.shuffle(items)
    else:
        random.Random(seed).shuffle(items)
    return items
    
def print_progress( source, total_length, skip_length=5 ):
    """Pass records through unchanged while printing periodic progress lines.

    Args:
        source: Iterable of records.
        total_length: Record count used as the denominator of the report and
            for extrapolating the finish time.
        skip_length: Print a line each time this many records have been
            yielded.

    Yields:
        Every item of ``source``, in order.
    """
    started_at = datetime.now()
    for done, item in enumerate( source ):
        # `done` items have already been handed out when `item` arrives.
        if done > 0 and done % skip_length == 0:
            elapsed_time = datetime.now() - started_at
            # Linear extrapolation: scale the elapsed time by total/done.
            projected_end = total_length / done * elapsed_time + started_at
            end_stamp = projected_end.strftime('%m/%d/%Y, %H:%M:%S')
            print( f"{done}/{total_length} Elapsed {elapsed_time} Estimated end time: {end_stamp}")

        yield item

In [35]:
# Read the transcripts, drop the '<'-tagged ones and shuffle.  The list is
# materialised here so its length can feed the progress report below.
x = list( shuffle_it( drop_music_and_stuffs( get_wave_names_and_texts() ) ) )
length = len( x )

# Chain the lazy stages: wav path lookup -> allosaurus -> epitran -> progress.
# Nothing runs until a record is pulled from the end of the chain.
x = print_progress( add_epitran( add_allo( add_full_paths( x ), emit=1.5 ) ), length )

# Smoke test: pull three records through the whole pipeline.
print( next(x) )
print( next(x) )
print( next(x) )

{'wave_name': 'SWH-15-20110310_16k-emission_swahili_15h00_-_16h00_tu_20110310_part392', 'text': 'kwa nini kuna ugumu kwa nato kuingia moja kwa moja', 'full_path': './bigdata/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-15-20110310/SWH-15-20110310_16k-emission_swahili_15h00_-_16h00_tu_20110310_part392.wav', 'allosaurus': 'xuəaɴliɴiːɪk͡p̚uəɴɒouəɡuəmәuəxuəaɒɴnɒatʂuəouəðək͡p̚uəeiɴtɕijʔəɒmuəotɕieʁkʰuəʌnmuəoɴijiaɒn', 'epitran': 'kwa nini kuna uɠumu kwa nato kuinɡia moʄa kwa moʄa'}
{'wave_name': 'SWH-05-20101113_16k-emission_swahili_05h30_-_06h00_tu_20101113_part148', 'text': 'na ndio maana kumekuwa na umuhimu wa kufika', 'full_path': './bigdata/ALFFA_PUBLIC/ASR/SWAHILI/data/train/wav/SWH-05-20101113/SWH-05-20101113_16k-emission_swahili_05h30_-_06h00_tu_20101113_part148.wav', 'allosaurus': 'teæɴnɴtɕiomanaskomeɪskuəonouəŋ̟mouːniiːŋ̟mopuəfuəixʁeɪn', 'epitran': 'na ndio maana kumekuwa na umuhimu wa kufika'}
{'wave_name': 'SWH-05-20110321_16k-emission_swahili_05h30_-_06h00_tu_20110321_part68', 'te

In [None]:
# Read the transcripts, drop the '<'-tagged ones and shuffle.  The list is
# materialised here so its length can feed the progress report.
x = list( shuffle_it( drop_music_and_stuffs( get_wave_names_and_texts() ) ) )
length = len( x )

# Chain the lazy stages: wav path lookup -> allosaurus -> epitran -> progress,
# then strip the local path, which is not wanted in the saved sheet.
x = print_progress( add_epitran( add_allo( add_full_paths( x ), emit=1.5 ) ), length )
x = drop_column( x, 'full_path' )

# To try the pipeline on a handful of records first, uncomment:
# x = [next(x), next(x), next(x), next(x)]

# Building the frame is what drains the generator chain and does the real work.
expanded_epitran_filename = './data/ALFFA_dataset_more_phonemes.ods'
pandas_to_save = pd.DataFrame.from_records( x )

# Write the result back out as an OpenDocument spreadsheet.
with pd.ExcelWriter(expanded_epitran_filename, engine="odf") as writer:
    pandas_to_save.to_excel(writer, index=False)

print( "done. :-)")

5/9423 Elapsed 0:00:09.173124 Estimated end time: 11/27/2022, 17:10:54
10/9423 Elapsed 0:00:10.205455 Estimated end time: 11/27/2022, 15:03:03
15/9423 Elapsed 0:00:11.265704 Estimated end time: 11/27/2022, 14:20:44
20/9423 Elapsed 0:00:12.175580 Estimated end time: 11/27/2022, 13:58:23
25/9423 Elapsed 0:00:13.234745 Estimated end time: 11/27/2022, 13:45:55
30/9423 Elapsed 0:00:14.235145 Estimated end time: 11/27/2022, 13:37:18
35/9423 Elapsed 0:00:15.158274 Estimated end time: 11/27/2022, 13:30:47
40/9423 Elapsed 0:00:16.139981 Estimated end time: 11/27/2022, 13:26:09
45/9423 Elapsed 0:00:17.197050 Estimated end time: 11/27/2022, 13:22:47
50/9423 Elapsed 0:00:18.139469 Estimated end time: 11/27/2022, 13:19:45
55/9423 Elapsed 0:00:19.044509 Estimated end time: 11/27/2022, 13:17:09
60/9423 Elapsed 0:00:20.249767 Estimated end time: 11/27/2022, 13:15:47
65/9423 Elapsed 0:00:21.178757 Estimated end time: 11/27/2022, 13:13:57
70/9423 Elapsed 0:00:22.292171 Estimated end time: 11/27/2022, 13