In [None]:
# Based on "https://www.khiajohnson.com/post/2021/textgrid-sqlite-database/"

In [27]:
from os.path import join
from praatio import tgio
import praatio
from pydub import AudioSegment
import pandas as pd
import os
import re


In [33]:
print("\n------> Loading directories and files... ------\n")
tg_dir = '/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples'
print(f'TextGrid directory is at: {tg_dir}')

save_dir = '/srv/scratch/chacmod/auskidtalk_spontaneous'
print(f'Split .wav files are stored at: {save_dir}')

save_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_spontaneous_dataframe.csv'
print(f'Dataframe file is stored at: {save_df_fp}')


------> Loading directories and files... ------

TextGrid directory is at: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples
Split .wav files are stored at: /srv/scratch/chacmod/auskidtalk_spontaneous
Dataframe file is stored at: /srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_spontaneous_dataframe.csv


In [34]:
print("\n------> Loading TextGrid files... ------\n")
tg_file_names = [file for file in os.listdir(tg_dir) if file.endswith('TextGrid')]
tg_files = [tg_dir+'/'+file_name for file_name in tg_file_names]

print(f'The number of TextGrid files is {len(tg_files)}\n')
print('TextGrid files include:')
for tg_file in tg_files:
    print(tg_file)


------> Loading TextGrid files... ------

The number of TextGrid files is 8

TextGrid files include:
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/819_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1050_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/255_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/651_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/516_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/228_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1150_task3.TextGrid
/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1075_task3.TextGrid


In [49]:
print("\n------> Extracting speeches from TextGrid files... ------\n")
#count0 = 0
extracted_audio = []
extracted_transcription = []
extracted_speakerID = []
extracted_filepath = []
extracted_duration = []
for tg_file in tg_files:
# extract file_ID                                     '/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1075_task3.TextGrid'
    file_basename = os.path.basename(tg_file)             # 1075_task3.TextGrid 
    task_ID = os.path.splitext(file_basename)[0]          # 1075_task3
    split_by_underline = task_ID.split('_')               # ['1075','task3']
    speaker_ID = split_by_underline[0]                    # 1075

    # generate corresponding .wav.file
    # if TextGrid filename is '1075_task3.TextGrid', then wav filename should be '1075_task3.wav'
    wav_file = os.path.join(tg_dir, task_ID+'.wav')
    audio = AudioSegment.from_wav(wav_file)

    tg = tgio.openTextgrid(tg_file)  # Give it a file name, get back a Textgrid object
    tier_names = tg.tierNameList

    print('Currently Processing:')
    print('TextGrid File:', tg_file)
    print('Tier names:', tier_names)
    print('\n')
    
    #count1 = 0
    
    # tier_names[0] contains 'text'
    count = 1
    for xmin, xmax, text in tg.tierDict[tier_names[0]].entryList:
        if text != 'sil':
            split_audio = audio[xmin*1000:xmax*1000] # the unit is in milliseconds
            save_filepath = os.path.join(save_dir, speaker_ID+'_'+str(count)+'.wav')
            split_audio.export(save_filepath, format='wav')
            
            chars_to_ignore = '[\,\?\.\!\-\;\_\:\"]'
            text = re.sub(chars_to_ignore, ' ', text).lower()
            
            extracted_audio.append(split_audio)
            extracted_transcription.append(text)
            extracted_speakerID.append(speaker_ID)
            extracted_filepath.append(save_filepath)
            extracted_duration.append(xmax-xmin)
            
            count = count+1
'''
        count1 = count1 + 1
        if count1 == 20:
            break
    count0 = count0 + 1
    if count0 == 1:
        break
    
print('\n')
print(extracted_audio)
print(extracted_transcription)
print(extracted_speakerID)
print(extracted_filepath)
print(extracted_duration)
'''


------> Extracting speeches from TextGrid files... ------

Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/819_task3.TextGrid
Tier names: ['text']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1050_task3.TextGrid
Tier names: ['text']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/255_task3.TextGrid
Tier names: ['text']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/651_task3.TextGrid
Tier names: ['text']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/516_task3.TextGrid
Tier names: ['text']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/228_task3.TextGrid
Tier names: ['text']


Currently Processing:
TextGrid File: /srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1150_task3.TextGrid
Ti

"\n        count1 = count1 + 1\n        if count1 == 20:\n            break\n    count0 = count0 + 1\n    if count0 == 1:\n        break\n    \nprint('\n')\nprint(extracted_audio)\nprint(extracted_transcription)\nprint(extracted_speakerID)\nprint(extracted_filepath)\nprint(extracted_duration)\n"

In [48]:
print("\n------> Generating dataframe ... ------\n")
df = pd.DataFrame(
        {'filepath': extracted_filepath,
         'duration': extracted_duration,
         'speaker_id': extracted_speakerID,
         'transcription': extracted_transcription,
         })

print("\n------> Saving dataframe to csv file... ------\n")
df.to_csv(save_df_fp, index=False)
print('Successfully saved dataframe to csv file at: ', save_df_fp)
print("Total number of speakers:", len(tg_files))
print("Total hours:", df['duration'].sum()/(60*60))


------> Generating dataframe ... ------


------> Saving dataframe to csv file... ------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_spontaneous_dataframe.csv
Total number of speakers: 8
Total hours: 0.1021427761485315


In [47]:
tg = tgio.openTextgrid("/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1075_task3.TextGrid")
print(tg_file)
print(tg.tierNameList)
file_basename = os.path.basename(tg_file)
print(file_basename)
task_ID = os.path.splitext(file_basename)[0]
print(file_ID)
split_by_underline = file_ID.split('_') 
print(split_by_underline)
speaker_ID = split_by_underline[0]
print(speaker_ID)


/srv/scratch/chacmod/auskidtalk_spontaneous/task3_samples/1075_task3.TextGrid
['text']
1075_task3.TextGrid
1075_task3
['1075', 'task3']
1075


In [20]:
tier_names[0]

'text'