In [1]:
import os

import librosa
import pandas as pd

In [32]:
# Input corpus files. Entries in 'wav.scp' are joined onto root_dir to form
# the full audio filepaths.
corpus_dir = '/srv/scratch/speechdata/speech-corpora/AusTalk'
wav_data = corpus_dir + '/wav.scp'
text_data = corpus_dir + '/text'
root_dir = corpus_dir + '/speakers/'

# Output CSV files: the full dataframe and the one-row-per-transcription subset.
output_dir = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences'
save_df_fp = output_dir + '/AusTalk_story_sentences_full_dataframe.csv'
save_unique_df_fp = output_dir + '/AusTalk_story_sentences_unique_transcription_dataframe.csv'

In [3]:
# Read 'wav.scp' and map each audio ID (first whitespace-separated column)
# to its full filepath (root_dir + second column).
audio_paths = {}
with open(wav_data, 'r') as wav_file:
    for line in wav_file:
        sections = line.strip().split()
        # A blank line, or a line holding an ID but no path, would raise
        # IndexError on the lookups below; skip such lines.
        if len(sections) < 2:
            continue
        audio_id = sections[0]
        filepath = root_dir + sections[1]
        audio_paths[audio_id] = filepath

In [4]:
# Read 'text' and map each audio ID (first whitespace-separated column) to its
# transcription (all remaining columns re-joined with single spaces).
transcriptions = {}
with open(text_data, 'r') as text_file:
    for line in text_file:
        sections = line.strip().split()
        # A blank line has no ID column and would raise IndexError below; skip it.
        # A line with an ID but no words is kept, with an empty transcription.
        if not sections:
            continue
        audio_id = sections[0]
        transcription = ' '.join(sections[1:])
        transcriptions[audio_id] = transcription

In [5]:
def get_duration(wav_filepath):
    """Return the duration of an audio file in seconds.

    Parameters
    ----------
    wav_filepath : str
        Path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    float
        Number of decoded samples divided by the sampling rate.
    """
    # sr=None keeps the file's native sampling rate. The librosa default
    # (sr=22050) resamples every file, which is expensive over a whole corpus
    # and makes the duration depend on the rounded resampled length.
    signal, sample_rate = librosa.load(wav_filepath, sr=None)
    return len(signal) / sample_rate

In [15]:
# Keep the audio IDs whose filepath contains the substring 'story' or 'sentences'.
# (To select the story recordings only, test for 'story' alone.)
story_sentences_audio_ids = [
    audio_id
    for audio_id, path in audio_paths.items()
    if any(keyword in path for keyword in ('story', 'sentences'))
]

In [16]:
# Report how many recordings were selected.
n_selected = len(story_sentences_audio_ids)
print(f'There are {n_selected} files under the "story" and "sentences" directory')

There are 47212 files under the "story" and "sentences" directory


In [17]:
# Five independent, initially empty accumulators: one list per dataframe column.
(story_sentences_audio_paths,
 story_sentences_durations,
 story_sentences_speaker_ids,
 story_sentences_transcriptions,
 story_sentences_audio_ids_extract) = ([] for _ in range(5))

In [18]:
# Collect filepath, duration, speaker ID and transcription for every selected recording.

# Fail fast: an audio ID without a transcription would otherwise raise KeyError
# part-way through the loop, after the audio of earlier files has been decoded.
missing_ids = [audio_id for audio_id in story_sentences_audio_ids if audio_id not in transcriptions]
if missing_ids:
    raise KeyError(f'{len(missing_ids)} audio IDs have no transcription, e.g. {missing_ids[:5]}')

# Empty the accumulators first so that re-running this cell does not append
# every recording a second time.
for column in (story_sentences_audio_paths,
               story_sentences_durations,
               story_sentences_speaker_ids,
               story_sentences_transcriptions,
               story_sentences_audio_ids_extract):
    column.clear()

for audio_id in story_sentences_audio_ids:
    audio_path = audio_paths[audio_id]
    transcription = transcriptions[audio_id]
    duration = get_duration(audio_path)
    # The speaker ID is the part of the audio ID before the first '-':
    # for "1_982-1_982_2_16_004" the speaker ID is "1_982".
    speaker_id = audio_id.split('-')[0]

    story_sentences_audio_paths.append(audio_path)
    story_sentences_durations.append(duration)
    story_sentences_speaker_ids.append(speaker_id)
    story_sentences_transcriptions.append(transcription)
    story_sentences_audio_ids_extract.append(audio_id)
    

In [19]:
print("\n--------------------------> Generating dataframe ... -------------------------\n")
# One row per recording, one column per accumulator list.
frame_columns = {
    'filepath': story_sentences_audio_paths,
    'duration': story_sentences_durations,
    'speaker_id': story_sentences_speaker_ids,
    'transcription': story_sentences_transcriptions,
    'audio_id': story_sentences_audio_ids_extract,
}
# Cast speaker IDs to str so that leading zeros are preserved.
df = pd.DataFrame(data=frame_columns).astype({'speaker_id': str})


--------------------------> Generating dataframe ... -------------------------



In [25]:
# Preserve leading zeros: keep speaker IDs as strings.
df['speaker_id'] = df['speaker_id'].astype(str)

# When reading this CSV back, use
#   df = pd.read_csv(save_df_fp, dtype={'speaker_id': str})
# to avoid losing leading zeros in the speaker IDs.
print("\n---------------------------> Saving dataframe to csv file... ------------------\n")
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing ('.' covers a bare filename with no directory part).
os.makedirs(os.path.dirname(save_df_fp) or '.', exist_ok=True)
df.to_csv(save_df_fp, index=False)
print('Successfully saved dataframe to csv file at: ', save_df_fp)
# Summary statistics: distinct speakers and total audio length (durations are in seconds).
print("Total number of speakers:", len(set(df["speaker_id"])))
print("Total hours:", df['duration'].sum()/(60*60))


---------------------------> Saving dataframe to csv file... ------------------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/AusTalk_story_sentences_full_dataframe.csv
Total number of speakers: 777
Total hours: 114.78499428067525


In [61]:
# Keep unique transcriptions so that each transcription appears only once in the dataframe.
print("\n--------------------------> Generating unique transcription ... -------------------------\n")
# Shuffle all rows (frac=1 samples the entire dataframe) with a fixed seed
# (random_state=230), then keep the first row seen for each transcription.
# The recording kept for a transcription is therefore a reproducible random pick.
shuffled_df = df.sample(frac=1, random_state=230).reset_index(drop=True)
result_df = shuffled_df.drop_duplicates(subset='transcription', keep='first')

print("\n---------------------------> Saving dataframe to csv file... ------------------\n")
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing ('.' covers a bare filename with no directory part).
os.makedirs(os.path.dirname(save_unique_df_fp) or '.', exist_ok=True)
result_df.to_csv(save_unique_df_fp, index=False)
print('Successfully saved dataframe to csv file at: ', save_unique_df_fp)
# Summary statistics: distinct speakers and total audio length (durations are in seconds).
print("Total number of speakers:", len(set(result_df["speaker_id"])))
print("Total hours:", result_df['duration'].sum()/(60*60))


--------------------------> Generating unique transcription ... -------------------------


---------------------------> Saving dataframe to csv file... ------------------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/AusTalk_story_sentences_unique_transcription_dataframe.csv
Total number of speakers: 65
Total hours: 0.16506199294532628


In [60]:
# Display the de-duplicated dataframe (one row per unique transcription).
# Its index is the row position in the shuffled dataframe, so it is not contiguous.
result_df

Unnamed: 0,filepath,duration,speaker_id,transcription,audio_id
0,/srv/scratch/speechdata/speech-corpora/AusTalk...,5.479955,3_1027,THE FLAG CEREMONY OVERWHELMED ME AND I WAS MOV...,3_1027-3_1027_2_16_039
1,/srv/scratch/speechdata/speech-corpora/AusTalk...,6.687392,2_852,HELEN PICKED A GOOD SPOT NEAR THE WATER AND SP...,2_852-2_852_2_16_016
2,/srv/scratch/speechdata/speech-corpora/AusTalk...,5.294150,4_1328,HE GLIMPSED THE BADGE OF THE TRAFFIC COP OUT O...,4_1328-4_1328_2_16_042
3,/srv/scratch/speechdata/speech-corpora/AusTalk...,8.730703,4_1036,SHARON WATCHED THE HELICOPTER AS IT LIFTED OFF...,4_1036-4_1036_2_16_059
4,/srv/scratch/speechdata/speech-corpora/AusTalk...,4.458277,3_402,WAS IT THE BLUE GLOBE THAT BROKE WHEN HE SWITC...,3_402-3_402_2_16_040
...,...,...,...,...,...
191,/srv/scratch/speechdata/speech-corpora/AusTalk...,5.201270,3_846,ISN'T IT COMMON KNOWLEDGE THAT THE KANGAROO HA...,3_846-3_846_2_16_051
222,/srv/scratch/speechdata/speech-corpora/AusTalk...,48.390385,4_628,JUST THEN THE OLD RAT CAUGHT SIGHT OF YOUNG AR...,4_628-4_628_1_3_005
261,/srv/scratch/speechdata/speech-corpora/AusTalk...,41.331519,2_1078,ONCE UPON A TIME THERE WAS A YOUNG RAT NAMED A...,2_1078-2_1078_1_3_001
272,/srv/scratch/speechdata/speech-corpora/AusTalk...,4.829751,3_531,ALAN TOOK A HAM SANDWICH TO SCHOOL,3_531-3_531_2_16_012
