In [6]:
from os.path import join
from praatio import tgio
import praatio
from pydub import AudioSegment
import pandas as pd
import os
import re

In [7]:
# Configuration: every location this notebook reads from or writes to.
# The paths are declared together first and reported afterwards, so the
# printed log is unchanged.
source_dir = '/srv/scratch/chacmod/checked'
save_dir = '/srv/scratch/chacmod/auskidtalk_scripted_audio_new'
save_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dataframe_new_175_speakers.csv'

print("\n------> Loading directories and files... ------\n")
print(f'.wav files and TextGrid files are stored at: {source_dir}')

print(f'Split .wav files are stored at: {save_dir}')

print(f'Dataframe file is stored at: {save_df_fp}')


------> Loading directories and files... ------

.wav files and TextGrid files are stored at: /srv/scratch/chacmod/checked
Split .wav files are stored at: /srv/scratch/chacmod/auskidtalk_scripted_audio_new
Dataframe file is stored at: /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dataframe_new_175_speakers.csv


In [8]:
# Parallel accumulators: entry i of every list describes the same audio clip.
# Each name is bound to its own fresh, empty list.
extracted_audio, extracted_transcription, extracted_speakerID = [], [], []
extracted_filepath, extracted_duration = [], []

In [9]:
# Split every recording found under `source_dir` into one .wav file per
# non-silent TextGrid interval. Clips are written to `save_dir`, and the
# metadata of each clip is appended to the parallel `extracted_*` lists.

# Punctuation that is replaced by a space in every transcription. Defined once
# as a raw string: the pattern never changes between intervals, and the raw
# prefix avoids Python's invalid-escape-sequence warnings for "\," and friends.
chars_to_ignore = r'[\,\?\.\!\-\;\_\:"]'

# Exporting a clip fails when the destination folder is missing.
os.makedirs(save_dir, exist_ok=True)

for root, dirs, files in os.walk(source_dir):
    for filename in files:
        if not filename.endswith('.wav'):
            continue

        wav_filepath = os.path.join(root, filename)
        # The name of the folder holding the recording is used as the speaker ID.
        speaker_id = os.path.basename(root)
        # NOTE(review): the TextGrid name depends only on the folder name, so a
        # folder with several .wav files would be processed once per file against
        # the same TextGrid and overwrite its own clips -- confirm that each
        # folder holds a single recording.

        # Prefer <speakerID>_task1_kaldi.TextGrid, fall back to
        # <speakerID>_task1_GT.TextGrid, and skip the recording if neither exists.
        textgrid_filename = speaker_id + "_task1_kaldi" + '.TextGrid'
        textgrid_filepath = os.path.join(root, textgrid_filename)
        if not os.path.exists(textgrid_filepath):
            textgrid_filename = speaker_id + "_task1_GT" + '.TextGrid'
            textgrid_filepath = os.path.join(root, textgrid_filename)
            if not os.path.exists(textgrid_filepath):
                continue

        try:
            tg = tgio.openTextgrid(textgrid_filepath)
        except AssertionError:
            print("AssertionError occurred:")
            print(f"The TextGrid file is {textgrid_filepath} \n")
            # Skip this recording. Falling through would leave `tg` undefined
            # (first file) or still bound to the previous speaker's TextGrid,
            # which would cut this audio with someone else's intervals.
            continue
        tier_names = tg.tierNameList

        # Decode the audio only once the TextGrid is known to be readable.
        audio = AudioSegment.from_wav(wav_filepath)

        print('Currently Processing:')
        print('TextGrid File:', textgrid_filepath)
        print('Tier names:', tier_names)
        print('\n')

        # Clips are numbered from 1 within each recording.
        count = 1
        # The third tier is used ('hu-wrd' in the logged runs).
        # NOTE(review): selected by position, not by name -- confirm that every
        # TextGrid keeps this tier order.
        for xmin, xmax, text in tg.tierDict[tier_names[2]].entryList:
            if text == 'sil':
                continue  # intervals labelled 'sil' are not exported

            # Interval bounds are multiplied by 1000 because the audio is
            # sliced in milliseconds.
            split_audio = audio[xmin*1000:xmax*1000]
            save_filepath = os.path.join(save_dir, speaker_id+'_'+str(count)+'.wav')
            split_audio.export(save_filepath, format='wav')

            # Normalise the transcription: punctuation -> space, then lowercase.
            text = re.sub(chars_to_ignore, ' ', text).lower()

            # NOTE(review): every clip stays in memory through this list, and
            # no visible cell reads it back -- consider dropping it if unused.
            extracted_audio.append(split_audio)
            extracted_transcription.append(text)
            extracted_speakerID.append(speaker_id)
            extracted_filepath.append(save_filepath)
            extracted_duration.append(xmax-xmin)

            count = count+1

Currently Processing:
TextGrid File: /srv/scratch/chacmod/checked/439/439_task1_kaldi.TextGrid
Tier names: ['Prompt', 'speaker_1-kaldi-words', 'hu-wrd', 'flag']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/checked/637/637_task1_kaldi.TextGrid
Tier names: ['Prompt', 'speaker_2-kaldi-words', 'hu-wrd', 'flag']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/checked/457/457_task1_kaldi.TextGrid
Tier names: ['Prompt', 'speaker_2-kaldi-words', 'hu-wrd', 'flag']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/checked/772/772_task1_kaldi.TextGrid
Tier names: ['Prompt', 'speaker_0-kaldi-words', 'hu-wrd', 'flag']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/checked/655/655_task1_kaldi.TextGrid
Tier names: ['Prompt', 'speaker_0-kaldi-words', 'hu-wrd', 'flag']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/checked/970/970_task1_kaldi.TextGrid
Tier names: ['Prompt', 'speaker_2-kaldi-words', 'hu-wrd', 'flag']


Currently Processing:


In [10]:
# Sanity check: print the length of each metadata list, one per line, in the
# order filepath, duration, speaker ID, transcription. All four should match.
print(
    *map(len, (extracted_filepath, extracted_duration, extracted_speakerID, extracted_transcription)),
    sep='\n',
)

25687
25687
25687
25687


In [11]:
print("\n------> Generating dataframe ... ------\n")
# One row per exported clip. The speaker ID column is cast to str so that
# leading zeros in the IDs are preserved.
df = pd.DataFrame(
    dict(
        filepath=extracted_filepath,
        duration=extracted_duration,
        speaker_id=extracted_speakerID,
        transcription=extracted_transcription,
    )
).astype({'speaker_id': str})

print("\n------> Saving dataframe to csv file... ------\n")
# index=False keeps the row index out of the CSV.
df.to_csv(save_df_fp, index=False)
print('Successfully saved dataframe to csv file at: ', save_df_fp)

# Summary statistics; the summed durations are divided by 3600 to report hours.
print("Total number of speakers:", df['speaker_id'].nunique())
print("Total hours:", df['duration'].sum() / 3600)


------> Generating dataframe ... ------


------> Saving dataframe to csv file... ------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dataframe_new_175_speakers.csv
Total number of speakers: 175
Total hours: 5.406107334492203
