In [None]:
#default_exp utils.imports.common_voice

In [None]:
#export
import os, sys
import shutil
import multiprocessing
import math

import pandas as pd
import tensorflow as tf

import ffmpeg
import json

from typing import TextIO

import audioengine
from audioengine.utils.schema import verify_audioengine_dataset
from audioengine.utils.misc import (log_init, log_error, log_info, log_debug,
                                    change_file_extension, get_json_file_integrity)

2021-07-23 14:35:43.939268: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [None]:
#This is not to be exported
from IPython.display import display

In [None]:
DEBUG = True

if DEBUG:
    log_init()

    # Where the converted dataset is written and where the Common Voice export is read from.
    dataset_output_location = '/project/Datasets/audioengine_single_word'
    dataset_input_location = '/project/Datasets/common_voice_single_word'
    dataset_audio_clips_directory = 'clips'
    #dataset_name = 'dev.tsv'

    # Input descriptor files; an empty string means no input file is read for that split.
    train_dataset_name = 'train.tsv'
    test_dataset_name = 'test.tsv'
    val_dataset_name = ''

    # Which output splits should be produced.
    generate_train_dataset = True
    generate_test_dataset = True
    generate_validation_dataset = True

    # Fractions of the combined data assigned to (train, test, validation).
    train_test_val_split = (0.7, 0.2, 0.1)

    # Compare with a tolerance: 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999
    # in binary floating point, so an exact `== 1.0` comparison fails.
    assert math.isclose(sum(train_test_val_split), 1.0), 'train/test/val split must sum to 1.0'

    # Start from an empty output directory on every run.
    if os.path.isdir(dataset_output_location):
        shutil.rmtree(dataset_output_location)
    os.mkdir(dataset_output_location)

[INFO] Logging now set to file: /project/Development/ML/audio/logs/audioengine.log with level DEBUG


AssertionError: 

In [None]:
#export

def read_tsv(tsv_filepointer: TextIO) -> pd.DataFrame:
    """Parse tab-separated text from an open file object into a DataFrame.

    Args:
        tsv_filepointer: A readable text stream positioned at its start
            (opened by the caller, nothing consumed yet).

    Returns:
        The parsed table; the first line of the stream supplies the column names.
    """
    return pd.read_csv(tsv_filepointer, sep='\t')

In [None]:
#export

def delete_pandas_columns(df: pd.DataFrame, column_list: list) -> pd.DataFrame:
    """Remove the named columns from ``df`` in place and hand the same frame back.

    Args:
        df: Frame to modify; it is mutated, not copied.
        column_list: Column labels to delete, processed in order.

    Returns:
        The very same ``df`` object, now without the listed columns.

    Raises:
        KeyError: If a label is not a column of ``df``. Columns listed before
            the missing one have already been removed at that point.
    """
    for unwanted in column_list:
        del df[unwanted]
    return df

In [None]:
if DEBUG:
    # --- Load every configured input descriptor --------------------------------
    # A split without an input file but with its generate flag set only receives
    # a default name, which is later used for its output descriptor.
    frames = []
    if train_dataset_name != '':
        common_voice_train_filepath = os.path.join(dataset_input_location, train_dataset_name)
        # `with` closes the file as soon as it has been parsed.
        with open(common_voice_train_filepath, 'r') as common_voice_train_filepointer:
            common_voice_train_df = read_tsv(common_voice_train_filepointer)
        frames.append(common_voice_train_df)
    elif generate_train_dataset:
        train_dataset_name = 'train.json'

    if test_dataset_name != '':
        common_voice_test_filepath = os.path.join(dataset_input_location, test_dataset_name)
        with open(common_voice_test_filepath, 'r') as common_voice_test_filepointer:
            common_voice_test_df = read_tsv(common_voice_test_filepointer)
        frames.append(common_voice_test_df)
    elif generate_test_dataset:
        test_dataset_name = 'test.json'

    if val_dataset_name != '':
        common_voice_val_filepath = os.path.join(dataset_input_location, val_dataset_name)
        with open(common_voice_val_filepath, 'r') as common_voice_val_filepointer:
            common_voice_val_df = read_tsv(common_voice_val_filepointer)
        frames.append(common_voice_val_df)
    elif generate_validation_dataset:
        val_dataset_name = 'val.json'

    if not frames:
        # log_critical is not among the logging helpers this notebook imports,
        # so the failure is reported through log_error instead.
        log_error('No input files to convert')
        sys.exit(1)

    # ignore_index renumbers the rows 0..n-1. Without it, rows coming from
    # different input files share index labels, and those labels are used as the
    # audio 'id' when the frame is converted to the audioengine structure.
    common_voice_df = pd.concat(frames, ignore_index=True)
    drop_columns = ['age', 'gender', 'accent', 'locale', 'segment', 'up_votes', 'down_votes', 'client_id']

    # --- Split into train / test / validation ----------------------------------
    # Split sizes are fractions of the FULL row count, so capture it before any
    # rows are removed from the pool.
    # NOTE(review): rows are taken in file order, no shuffling is applied — confirm this is intended.
    total_rows = len(common_voice_df)

    if generate_train_dataset:
        train_set_size = math.floor(train_test_val_split[0] * total_rows)
        if train_set_size < 1:
            log_error('Size of train set less than one')
            sys.exit(1)
        # Copy the slice so that deleting columns acts on an independent frame.
        common_voice_train_df = delete_pandas_columns(common_voice_df.iloc[:train_set_size].copy(), drop_columns)
        # Remove the train rows from the pool so they cannot reappear in test/val.
        common_voice_df = common_voice_df.iloc[train_set_size:]

    if generate_test_dataset:
        test_set_size = math.floor(train_test_val_split[1] * total_rows)
        if test_set_size < 1:
            log_error('Size of test set less than one')
            sys.exit(1)
        common_voice_test_df = delete_pandas_columns(common_voice_df.iloc[:test_set_size].copy(), drop_columns)
        common_voice_df = common_voice_df.iloc[test_set_size:]

    if generate_validation_dataset:
        # Whatever has not been handed to train or test becomes the validation set.
        if len(common_voice_df) < 1:
            log_error('Zero length val dataset')
            sys.exit(1)
        common_voice_val_df = delete_pandas_columns(common_voice_df.copy(), drop_columns)

    display(common_voice_train_df)

In [None]:
def convert_df_to_audioengine_label(df: pd.DataFrame, path_column_name: str, label_column_name: str) -> dict:
    """Build an audioengine dataset dictionary from a frame of labelled audio clips.

    Args:
        df: One row per clip.
        path_column_name: Column holding the clip file name; its extension is
            replaced with '.wav' via ``change_file_extension``.
        label_column_name: Column holding the clip's label. Every distinct label
            becomes one category.

    Returns:
        A dict with the keys 'info', 'licenses', 'audio' and 'categories'.
        Category ids start at 1 and follow the order in which labels are first
        seen. Each audio entry uses the row's index label as its 'id', so the
        index of ``df`` should be unique.
    """
    # Label -> category id. The dict doubles as the "already seen" set, which
    # keeps the per-row membership test constant-time.
    categories_id_map = {}
    categories_json_list = []
    audio_json_list = []

    for i, v in df.iterrows():
        filename = change_file_extension(v[path_column_name], '.wav')
        label = v[label_column_name]

        if label not in categories_id_map:
            new_category_id = len(categories_id_map) + 1
            categories_id_map[label] = new_category_id
            categories_json_list.append({'id': new_category_id,
                                         'name': label,
                                         'supercategory': 'Word'})

        audio_json_list.append({'id': i,
                                'category_id': categories_id_map[label],
                                'file_name': filename})

    licenses_json = {
        'id': 1,
        'name': 'CC0',
        'url': 'https://creativecommons.org/share-your-work/public-domain/cc0/'
    }
    info_json = {
        'year': 2021,
        'version': '1.0',
        'description': 'Mozilla single word dataset',
        'contributor': 'Mozilla and contributors',
        'url': 'https://commonvoice.mozilla.org/en',
        'date_created': '2021-06-30',
        'task': 'classification',
    }
    audioengine_dataset = {'info': info_json,
                          'licenses': [licenses_json],
                          'audio': audio_json_list,
                          'categories': categories_json_list}
    return audioengine_dataset

In [None]:
if DEBUG:
    # Every requested split is converted to the audioengine structure and then
    # checked with verify_audioengine_dataset; the outcome is only logged.
    if generate_train_dataset:
        audioengine_train_json = convert_df_to_audioengine_label(common_voice_train_df, 'path', 'sentence')
        if verify_audioengine_dataset(audioengine_train_json):
            log_info('Success the common voice descriptor file has successfully been converted into the audioengine JSON format')
        else:
            log_error('The common voice import failed because the audioengine_json did not match the audioengine_dataset schema')

    if generate_test_dataset:
        audioengine_test_json = convert_df_to_audioengine_label(common_voice_test_df, 'path', 'sentence')
        if verify_audioengine_dataset(audioengine_test_json):
            log_info('Success the common voice descriptor file has successfully been converted into the audioengine JSON format')
        else:
            log_error('The common voice import failed because the audioengine_json did not match the audioengine_dataset schema')

    if generate_validation_dataset:
        audioengine_val_json = convert_df_to_audioengine_label(common_voice_val_df, 'path', 'sentence')
        if verify_audioengine_dataset(audioengine_val_json):
            log_info('Success the common voice descriptor file has successfully been converted into the audioengine JSON format')
        else:
            log_error('The common voice import failed because the audioengine_json did not match the audioengine_dataset schema')

In [None]:
def convert_audio_file_to_wav(in_filename: str, out_filename: str) -> bool:
    """Convert one audio file with ffmpeg, reporting the outcome as a bool.

    Args:
        in_filename: Path of the existing source audio file.
        out_filename: Path the converted file is written to.

    Returns:
        True if ``out_filename`` exists when the function returns (either it
        was already there, in which case nothing is converted, or ffmpeg just
        produced it). False if the source is missing or the conversion failed;
        the reason is logged with ``log_error``.
    """
    if not os.path.isfile(in_filename):
        log_error('Failed to convert {} file does not exist'.format(in_filename))
        return False

    if os.path.isfile(out_filename):
        return True #File exists

    try:
        ffmpeg.output(ffmpeg.input(in_filename), out_filename).run()
    except ffmpeg.Error:
        # A failing ffmpeg process raises; turn that into the documented False
        # result so a single bad clip cannot abort a whole batch of conversions.
        log_error('Failed to convert {} to {}'.format(in_filename, out_filename))
        return False

    if os.path.isfile(out_filename):
        return True

    log_error('Failed to convert {} to {}'.format(in_filename, out_filename))
    return False

def convert_audio_file_to_wav_multiprocessing_wrapper(filepath: dict) -> bool:
    """Single-argument adapter around ``convert_audio_file_to_wav`` for pool workers.

    Args:
        filepath: Mapping with the source path under 'old' and the
            destination path under 'new'.

    Returns:
        Whatever ``convert_audio_file_to_wav`` returns for that pair of paths.
    """
    source_path = filepath['old']
    target_path = filepath['new']
    return convert_audio_file_to_wav(source_path, target_path)

In [None]:
def multiprocess_convert_files_to_wav(df: pd.DataFrame, 
                                      path_column_name: str, 
                                      old_directory: str,
                                      new_directory: str,
                                      num_cores: int=0) -> list:
    """Convert every file listed in ``df`` to '.wav' using a pool of worker processes.

    Args:
        df: Frame listing the files to convert.
        path_column_name: Column of ``df`` holding each file name, relative to
            ``old_directory``.
        old_directory: Directory containing the source files.
        new_directory: Directory receiving the converted files; it is created
            (including missing parent directories) if it does not exist.
        num_cores: Number of worker processes. Values <= 0 select about 90% of
            the available CPU cores, with a minimum of one.

    Returns:
        One bool per row of ``df`` indicating whether that conversion succeeded.
        Results arrive in completion order, so positions do NOT correspond to
        the row order of ``df``.
    """
    if num_cores <= 0:
        num_cores = max(1, math.floor(multiprocessing.cpu_count() * 0.9))

    # One {'old': source, 'new': destination} job description per row.
    filepaths = []
    for old_filename in df[path_column_name]:
        new_filename = change_file_extension(old_filename, '.wav')
        filepaths.append({'old': os.path.join(old_directory, old_filename),
                          'new': os.path.join(new_directory, new_filename)})

    # makedirs creates missing parents too and does not fail if the directory
    # already exists (or appears between a check and the creation).
    os.makedirs(new_directory, exist_ok=True)

    # All results are consumed inside the `with` block, before the pool is torn
    # down on exit.
    with multiprocessing.Pool(processes=num_cores) as pool:
        success_list = list(pool.imap_unordered(convert_audio_file_to_wav_multiprocessing_wrapper, filepaths))
    return success_list

In [None]:
if DEBUG:
    # Input and output clip folders use the same sub-directory name below
    # their respective dataset roots.
    input_clips_path = os.path.join(dataset_input_location, dataset_audio_clips_directory)
    output_clips_path = os.path.join(dataset_output_location, dataset_audio_clips_directory)

    # Convert the clips of every requested split; the per-file success lists
    # returned by the converter are not inspected here.
    if generate_train_dataset:
        multiprocess_convert_files_to_wav(df=common_voice_train_df,
                                          path_column_name='path',
                                          old_directory=input_clips_path,
                                          new_directory=output_clips_path)
    if generate_test_dataset:
        multiprocess_convert_files_to_wav(df=common_voice_test_df,
                                          path_column_name='path',
                                          old_directory=input_clips_path,
                                          new_directory=output_clips_path)
    if generate_validation_dataset:
        multiprocess_convert_files_to_wav(df=common_voice_val_df,
                                          path_column_name='path',
                                          old_directory=input_clips_path,
                                          new_directory=output_clips_path)

In [None]:
if DEBUG:
    # For every requested split: derive the '.json' output name, write the
    # in-memory dataset to disk, then check the written file against the
    # in-memory version with get_json_file_integrity and log the outcome.
    if generate_train_dataset:
        new_train_dataset_filename = change_file_extension(train_dataset_name, '.json')
        new_train_dataset_filepath = os.path.join(dataset_output_location, new_train_dataset_filename)

        # The context manager closes the file even if json.dump raises.
        with open(new_train_dataset_filepath, 'w') as audioengine_train_file:
            json.dump(audioengine_train_json, audioengine_train_file, indent = 4)

        if(get_json_file_integrity(audioengine_train_json, new_train_dataset_filepath) == False):
            log_error('The JSON in the file {} does not match the json in memory'.format(new_train_dataset_filename))
        else:
            log_debug('|VALID| The JSON in the file {} matches the version in memory'.format(new_train_dataset_filename))

    if generate_test_dataset:
        new_test_dataset_filename = change_file_extension(test_dataset_name, '.json')
        new_test_dataset_filepath = os.path.join(dataset_output_location, new_test_dataset_filename)

        with open(new_test_dataset_filepath, 'w') as audioengine_test_file:
            json.dump(audioengine_test_json, audioengine_test_file, indent = 4)

        if(get_json_file_integrity(audioengine_test_json, new_test_dataset_filepath) == False):
            log_error('The JSON in the file {} does not match the json in memory'.format(new_test_dataset_filename))
        else:
            log_debug('|VALID| The JSON in the file {} matches the version in memory'.format(new_test_dataset_filename))

    if generate_validation_dataset:
        new_val_dataset_filename = change_file_extension(val_dataset_name, '.json')
        new_val_dataset_filepath = os.path.join(dataset_output_location, new_val_dataset_filename)

        with open(new_val_dataset_filepath, 'w') as audioengine_val_file:
            json.dump(audioengine_val_json, audioengine_val_file, indent = 4)

        if(get_json_file_integrity(audioengine_val_json, new_val_dataset_filepath) == False):
            log_error('The JSON in the file {} does not match the json in memory'.format(new_val_dataset_filename))
        else:
            log_debug('|VALID| The JSON in the file {} matches the version in memory'.format(new_val_dataset_filename))