In [1]:
# Purpose: Extract and group the speakers from the datasets to ensure that speakers who appear in the train datasets
#          do not appear in the dev or test datasets.
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
# Based on https://github.com/monomest/Thesis/blob/3a15f747dfd934535ffb7a02bf3fee97d9c546cb/s5/wav2vec_projects/OGI_getSpkrs.py

In [2]:
# Progress banner so this step is identifiable in the notebook output.
print("\n------> Importing libraries... ------\n")

# pandas is used below to read the input CSV, group rows by speaker,
# and write the grouped result back to CSV.
import pandas as pd


------> Importing libraries... ------



In [3]:
print("\n------> Loading files... ------\n")

# Directory holding both the input dataframe and the grouped output.
# Defined once so the two paths below cannot drift apart when the
# dataset location or version changes.
DATA_DIR = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full_v4_select_speaker'

# Path where the dataframe is stored (input CSV)
df_fp = f'{DATA_DIR}/AusTalk_story_sentences_full_dataframe_v4.csv'

# Path to save the group-by-speaker dataframe (output CSV)
speaker_df_fp = f'{DATA_DIR}/AusTalk_story_sentences_full_speaker_dataframe_v4.csv'

print(f'Dataframe is stored at {df_fp}')
print(f'Group-by-speaker daraframe is stored at {speaker_df_fp}')



------> Loading files... ------

Dataframe is stored at /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full_v4_select_speaker/AusTalk_story_sentences_full_dataframe_v4.csv
Group-by-speaker daraframe is stored at /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full_v4_select_speaker/AusTalk_story_sentences_full_speaker_dataframe_v4.csv


In [4]:
# Read the input CSV. `speaker_id` is forced to string dtype
# (presumably so IDs are never parsed as numbers -- confirm against the data).
df = pd.read_csv(df_fp, dtype={'speaker_id': str})

print("\n------> Describing dataset... ------\n")

# Summary of the speaker_id column: row count, number of distinct speakers,
# and the most frequent speaker with its frequency.
speaker_id_summary = df['speaker_id'].describe()
print(speaker_id_summary)


------> Describing dataset... ------

count       201
unique      175
top       4_934
freq          3
Name: speaker_id, dtype: object


In [5]:
# Discard the columns that are not needed for the per-speaker totals.
# `df` is re-bound instead of mutated with `inplace=True`, and
# errors='ignore' tolerates columns that are already gone, so this cell can
# be re-run without raising KeyError on the second execution.
df = df.drop(columns=['filepath', 'transcription'], errors='ignore')


print("\n------> Grouping according to speakers... ------\n")
# Sum `duration` over all rows of each speaker. as_index=False keeps
# `speaker_id` as an ordinary column, giving one row per speaker with the
# columns `speaker_id` and `duration`.
speaker_df = df.groupby('speaker_id', as_index=False)['duration'].sum()
print(speaker_df)


------> Grouping according to speakers... ------

    speaker_id   duration
0       1_1018   7.801905
1       1_1021   9.102268
2       1_1028   6.780272
3       1_1097   5.108390
4       1_1111   5.294150
..         ...        ...
170      4_864   5.665714
171      4_887   5.108390
172      4_932   4.272517
173      4_934  14.210658
174      4_977   4.458277

[175 rows x 2 columns]


In [6]:
print("\n------> Saving dataframe to csv file... ------\n")

# Write the per-speaker table to disk; the row index is omitted so the file
# contains only the dataframe's own columns.
speaker_df.to_csv(
    speaker_df_fp,
    index=False,
)

print('Successfully saved group-by-speaker dataframe to csv file at:', speaker_df_fp)


------> Saving dataframe to csv file... ------

Successfully saved group-by-speaker dataframe to csv file at: /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full_v4_select_speaker/AusTalk_story_sentences_full_speaker_dataframe_v4.csv


In [7]:
# One row per speaker, so the row count is the number of speakers.
num_speakers = speaker_df.shape[0]
print('Total number of speakers:', num_speakers)

# The summed duration is divided by 60*60 to report hours
# (this assumes `duration` is in seconds -- confirm against the source data).
SECONDS_PER_HOUR = 60 * 60
total_duration = speaker_df['duration'].sum()
print("Total hours:", total_duration / SECONDS_PER_HOUR)

Total number of speakers: 175
Total hours: 0.49523126732174355
