In [1]:
# Purpose: Extract and group the speakers from the datasets to ensure that speakers who appear in the train datasets
#          do not appear in the dev or test datasets.
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
# Based on https://github.com/monomest/Thesis/blob/3a15f747dfd934535ffb7a02bf3fee97d9c546cb/s5/wav2vec_projects/OGI_getSpkrs.py

In [1]:
# Progress banner printed before the imports run.
print("\n------> Importing libraries... ------\n")

# pandas supplies the CSV reading/writing and the group-by aggregation used in this notebook.
import pandas as pd


------> Importing libraries... ------



In [2]:
# Progress banner for the path-configuration step.
print("\n------> Loading files... ------\n")

# Input: path of the CSV file that holds the dataframe.
df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_dataframe_v2.csv'
print(f'Dataframe is stored at {df_fp}')

# Output: path of the CSV file for the group-by-speaker dataframe.
speaker_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_speaker_dataframe_v2.csv'
print(f'Group-by-speaker daraframe is stored at {speaker_df_fp}')



------> Loading files... ------

Dataframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_dataframe_v2.csv
Group-by-speaker daraframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_speaker_dataframe_v2.csv


In [3]:
# Read the dataframe from disk. speaker_id is read as str so the IDs stay
# text labels instead of being parsed as numbers.
df = pd.read_csv(df_fp, dtype={'speaker_id': str})

print("\n------> Describing dataset... ------\n")

# Summary of the speaker_id column (count, unique, top, freq).
speaker_id_summary = df['speaker_id'].describe()
print(speaker_id_summary)


------> Describing dataset... ------

count     26523
unique      180
top         219
freq        178
Name: speaker_id, dtype: object


In [4]:
# Trim the columns that the per-speaker aggregation below does not use.
# A non-mutating drop with errors='ignore' replaces the previous inplace=True drop:
# the in-place version raised KeyError when this cell was run a second time,
# because the columns had already been removed from df by the first run.
df = df.drop(columns=['filepath', 'transcription'], errors='ignore')


print("\n------> Grouping according to speakers... ------\n")
# Sum the 'duration' of all rows that share a speaker_id. reset_index() turns the
# speaker_id group keys back into an ordinary column, giving one row per speaker
# with the columns ['speaker_id', 'duration'].
speaker_df = df.groupby('speaker_id')['duration'].sum().reset_index()
print(speaker_df)


------> Grouping according to speakers... ------

    speaker_id    duration
0         1000   94.662074
1         1001  131.205357
2         1032   97.057435
3         1050  163.756414
4         1075  154.897349
..         ...         ...
175        973   88.415174
176        975  107.345394
177        977  141.991467
178        978  119.092942
179        982  119.394804

[180 rows x 2 columns]


In [5]:
print("\n------> Saving dataframe to csv file... ------\n")

# Write the group-by-speaker dataframe to disk; index=False keeps the row
# index out of the CSV file.
speaker_df.to_csv(path_or_buf=speaker_df_fp, index=False)

print('Successfully saved group-by-speaker dataframe to csv file at:', speaker_df_fp)


------> Saving dataframe to csv file... ------

Successfully saved group-by-speaker dataframe to csv file at: /srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_speaker_dataframe_v2.csv


In [6]:
# Number of rows in the group-by-speaker dataframe.
n_speakers = len(speaker_df)
print('Total number of speakers:', n_speakers)

# Sum of the 'duration' column, divided by 60*60 to report hours
# (assumes duration is stored in seconds — TODO confirm).
total_duration = speaker_df['duration'].sum()
print("Total hours:", total_duration / (60 * 60))

Total number of speakers: 180
Total hours: 5.697343974007766
