In [None]:
#default_exp utils.imports.common_voice

In [None]:
#export
import os, sys
import shutil
import multiprocessing
import math

import pandas as pd
import tensorflow as tf
import tensorflow_io as tfio

import ffmpeg
import json

from typing import TextIO

import audioengine
from audioengine.utils.schema import verify_audioengine_dataset
from audioengine.utils.misc import (log_init, log_error, log_info, log_debug,
                                    change_file_extension, get_json_file_integrity)

In [None]:
#This is not to be exported
from IPython.display import display

In [None]:
DEBUG = True

if DEBUG:
    # Initialise audioengine logging before any other debug step runs.
    log_init()

    # Locations and names used by the debug cells further down the notebook.
    dataset_output_location = '/project/Datasets/audioengine_single_word'
    dataset_input_location = '/project/Datasets/common_voice_single_word'
    dataset_audio_clips_directory = 'clips'
    dataset_name = 'dev.tsv'

    # Start every debug run from an empty output directory.
    if os.path.isdir(dataset_output_location):
        shutil.rmtree(dataset_output_location)
    os.mkdir(dataset_output_location)

[INFO] Logging now set to file: /project/Development/ML/audio/logs/audioengine.log with level DEBUG


In [None]:
#export

def read_tsv(tsv_filepointer: TextIO) -> pd.DataFrame:
    """Parse tab-separated text from an open file handle into a DataFrame.

    The handle must already be open with no lines consumed, since parsing
    starts from the handle's current position.
    """
    return pd.read_csv(tsv_filepointer, sep='\t')

In [None]:
#export

def delete_pandas_columns(df: pd.DataFrame, column_list: list) -> pd.DataFrame:
    """Remove the named columns from ``df`` in place and return that same frame.

    The caller's DataFrame is mutated. A name that is not a column raises
    ``KeyError``; columns listed before it have already been removed by then.
    """
    for unwanted_column in column_list:
        del df[unwanted_column]
    return df

In [None]:
if DEBUG:
    common_voice_train_filepath = os.path.join(dataset_input_location, dataset_name)
    # The context manager closes the TSV handle as soon as it has been parsed.
    with open(common_voice_train_filepath, 'r') as common_voice_train_filepointer:
        common_voice_train_df = read_tsv(common_voice_train_filepointer)
    # Only the clip path and the spoken sentence are needed downstream.
    drop_columns = ['age', 'gender', 'accent', 'locale', 'segment', 'up_votes', 'down_votes', 'client_id']
    common_voice_train_df = delete_pandas_columns(common_voice_train_df, drop_columns)
    display(common_voice_train_df)

Unnamed: 0,path,sentence
0,common_voice_en_22216022.mp3,two
1,common_voice_en_22216028.mp3,zero
2,common_voice_en_22216029.mp3,three
3,common_voice_en_22216033.mp3,eight
4,common_voice_en_22216036.mp3,Firefox
...,...,...
8212,common_voice_en_22174176.mp3,two
8213,common_voice_en_22174177.mp3,six
8214,common_voice_en_22174227.mp3,no
8215,common_voice_en_22174230.mp3,four


In [None]:
def convert_df_to_audioengine_label(df: pd.DataFrame, path_column_name: str, label_column_name: str) -> dict:
    """Build an audioengine dataset description from a clip/label DataFrame.

    Args:
        df: Frame with one row per audio clip.
        path_column_name: Column holding each clip's file name.
        label_column_name: Column holding each clip's label.

    Returns:
        A dict with the keys ``info``, ``licenses``, ``audio`` and
        ``categories``. Category ids are 1-based and assigned in order of
        first appearance. Each ``audio`` entry uses the row's index label as
        its ``id`` and the clip's file name with the extension replaced by
        ``.wav``.
    """
    categories_id_map = {}
    categories_json_list = []
    audio_json_list = []

    for row_index, row in df.iterrows():
        # splitext strips only a genuine trailing extension, so a name without
        # one (or with dots elsewhere in the path) keeps its full stem.
        filename = os.path.splitext(row[path_column_name])[0] + '.wav'
        label = row[label_column_name]

        if label not in categories_id_map:
            # First sighting of this label: give it the next 1-based id.
            new_category_id = len(categories_id_map) + 1
            categories_id_map[label] = new_category_id
            categories_json_list.append({'id': new_category_id,
                                         'name': label,
                                         'supercategory': 'Word'})

        audio_json_list.append({'id': row_index,
                                'category_id': categories_id_map[label],
                                'file_name': filename})

    licenses_json = {
        'id': 1,
        'name': 'CC0',
        'url': 'https://creativecommons.org/share-your-work/public-domain/cc0/'
    }
    info_json = {
        'year': 2021,
        'version': '1.0',
        'description': 'Mozilla single word dataset',
        'contributor': 'Mozilla and contributors',
        'url': 'https://commonvoice.mozilla.org/en',
        'date_created': '2021-06-30',
        'task': 'classification',
    }
    audioengine_dataset = {'info': info_json,
                           'licenses': [licenses_json],
                           'audio': audio_json_list,
                           'categories': categories_json_list}
    return audioengine_dataset

In [None]:
if DEBUG:
    audioengine_json = convert_df_to_audioengine_label(common_voice_train_df, 'path', 'sentence')
    # Check the generated structure with the audioengine verifier and log the outcome.
    if verify_audioengine_dataset(audioengine_json):
        log_info('Success the common voice descriptor file has successfully been converted into the audioengine JSON format')
    else:
        log_error('The common voice import failed because the audioengine_json did not match the audioengine_dataset schema')

In [None]:
def convert_audio_file_to_wav(in_filename: str, out_filename: str) -> bool:
    """Convert one audio file with ffmpeg, skipping work that is already done.

    Args:
        in_filename: Path of the existing audio file to convert.
        out_filename: Path the converted file is written to.

    Returns:
        True when ``out_filename`` already existed or was produced by the
        conversion. False (after logging an error) when the input file is
        missing, ffmpeg reports a failure, or no output file appears.
    """
    if not os.path.isfile(in_filename):
        log_error('Failed to convert {} file does not exist'.format(in_filename))
        return False

    if os.path.isfile(out_filename):
        # Already converted; only existence is checked, not content.
        return True

    try:
        ffmpeg.output(ffmpeg.input(in_filename), out_filename).run()
    except ffmpeg.Error:
        # run() raises when ffmpeg exits non-zero. Report it like every other
        # failure so one bad clip does not abort a whole batch of conversions.
        # NOTE(review): a failed run may leave a partial out_filename behind,
        # which the existence check above would accept on a later call - confirm.
        log_error('Failed to convert {} to {}'.format(in_filename, out_filename))
        return False

    if os.path.isfile(out_filename):
        return True

    log_error('Failed to convert {} to {}'.format(in_filename, out_filename))
    return False

def convert_audio_file_to_wav_multiprocessing_wrapper(filepath: dict) -> bool:
    """Single-argument adapter so the converter can be mapped over a pool.

    ``filepath`` carries the source path under ``'old'`` and the destination
    path under ``'new'``; the converter's boolean result is returned as is.
    """
    source_path = filepath['old']
    destination_path = filepath['new']
    return convert_audio_file_to_wav(source_path, destination_path)

In [None]:
def multiprocess_convert_files_to_wav(df: pd.DataFrame, 
                                      path_column_name: str, 
                                      old_directory: str,
                                      new_directory: str,
                                      num_cores: int=0) -> list:
    """Convert every clip listed in ``df`` to ``.wav`` using a process pool.

    Args:
        df: Frame with one row per clip.
        path_column_name: Column holding each clip's file name.
        old_directory: Directory the source clips are read from.
        new_directory: Directory the converted clips are written to; it is
            created (including missing parents) when absent.
        num_cores: Number of worker processes. Values <= 0 select roughly
            90% of the available CPUs, with a minimum of one.

    Returns:
        One boolean per row of ``df``, in row order, telling whether that
        clip's conversion succeeded. An empty frame yields an empty list.
    """
    if num_cores <= 0:
        # Leave some headroom on the machine, but always run at least one worker.
        num_cores = max(1, math.floor(multiprocessing.cpu_count() * 0.9))

    filepaths = []
    for _, row in df.iterrows():
        old_filename = row[path_column_name]
        # splitext strips only a genuine trailing extension, so a name without
        # one (or with dots elsewhere in the path) keeps its full stem.
        new_filename = os.path.splitext(old_filename)[0] + '.wav'
        filepaths.append({'old': os.path.join(old_directory, old_filename),
                          'new': os.path.join(new_directory, new_filename)})

    # exist_ok avoids a check-then-create race and also builds missing parents.
    os.makedirs(new_directory, exist_ok=True)

    if not filepaths:
        # Nothing to convert: skip spawning worker processes altogether.
        return []

    with multiprocessing.Pool(processes=num_cores) as pool:
        # imap keeps results in submission order, so each flag lines up with
        # the corresponding row of df.
        return list(pool.imap(convert_audio_file_to_wav_multiprocessing_wrapper, filepaths))

In [None]:
if DEBUG:
    # Clips live in the same sub-directory name on both the input and output side.
    input_clips_path = os.path.join(dataset_input_location, dataset_audio_clips_directory)
    output_clips_path = os.path.join(dataset_output_location, dataset_audio_clips_directory)
    multiprocess_convert_files_to_wav(
        df=common_voice_train_df,
        path_column_name='path',
        old_directory=input_clips_path,
        new_directory=output_clips_path,
    )

/project/Datasets/audioengine_single_word/clips


In [None]:
if DEBUG:
    new_dataset_filename = change_file_extension(dataset_name, '.json')
    new_dataset_filepath = os.path.join(dataset_output_location, new_dataset_filename)
    # The context manager flushes and closes the file even if serialisation fails.
    with open(new_dataset_filepath, 'w') as audioengine_file:
        json.dump(audioengine_json, audioengine_file, indent = 4)

    # Re-check the file on disk against the in-memory structure.
    # NOTE(review): the explicit comparison with False is kept because the
    # helper's return contract is not visible here - confirm before simplifying.
    if(get_json_file_integrity(audioengine_json, new_dataset_filepath) == False):
        log_error('The JSON in the file {} does not match the json in memory'.format(new_dataset_filename))
    else:
        log_debug('|VALID| The JSON in the file {} matches the version in memory'.format(new_dataset_filename))