In [35]:
# Purpose: partition datasets into training, development and testing sets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
#              group_speaker_id.py (ipynb)

In [36]:
# Banner so the notebook log shows which stage is running.
print("\n------> Importing libraries... ------\n")

import pandas as pd  # CSV loading, row filtering, concatenation and CSV export
from sklearn.model_selection import train_test_split  # random partitioning of the speaker-level dataframe


------> Importing libraries... ------



In [37]:
# Random seed passed to both train_test_split calls below.
# Per the original author's note it must stay at 230; a different value would
# reshuffle which speakers land in which split.
seed = 230

# Fraction of speakers assigned to each split.
# !!!!!! When changing the proportion, go to group_speaker_id.ipynb to change the parameter "prop_training"
train_num = 0.70
dev_num = 0.15
test_num = 0.15

# Only dev_num and test_num are handed to train_test_split; train_num is implied
# as "whatever is left". Validate here so an edit that leaves the three fractions
# inconsistent fails loudly instead of silently producing a different split.
_split_total = train_num + dev_num + test_num
if min(train_num, dev_num, test_num) <= 0 or abs(_split_total - 1.0) > 1e-9:
    raise ValueError(
        "train_num, dev_num and test_num must all be positive and sum to 1 "
        f"(got {train_num}, {dev_num}, {test_num}; sum = {_split_total})"
    )

In [38]:
print("\n------> Loading files... ------\n")

# --- Inputs: read by the cells further down ---------------------------------
# Utterance-level dataframe (read into `df`).
df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dataframe_new_180speakers.csv'
print(f'Dataframe is stored at {df_fp}')

# Speaker-level dataframe (read into `speaker_df` and split by speaker).
speaker_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_speaker_dataframe_new_180speakers.csv'
print(f'Group-by-speaker dataframe is stored at {speaker_df_fp}')

# --- Outputs: written by the train/dev/test cells ---------------------------
train_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_train_dataframe_new_180speakers.csv'
print(f'Training daraframe is stored at {train_df_fp}')

dev_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dev_dataframe_new_180speakers.csv'
print(f'Development daraframe is stored at {dev_df_fp}')

test_df_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_test_dataframe_new_180speakers.csv'
print(f'Test daraframe is stored at {test_df_fp}')


------> Loading files... ------

Dataframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dataframe_new_180speakers.csv
Group-by-speaker dataframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_speaker_dataframe_new_180speakers.csv
Training daraframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_train_dataframe_new_180speakers.csv
Development daraframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dev_dataframe_new_180speakers.csv
Test daraframe is stored at /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_test_dataframe_new_180speakers.csv


In [39]:
print("\n------> Splitting the group-by-speaker dataframe... ------\n")

# speaker_id is forced to str so IDs are kept as text rather than parsed as numbers.
# Presumably one row per speaker (the row counts are reported as speaker counts below).
speaker_df = pd.read_csv(speaker_df_fp, dtype={'speaker_id': str})

# Stage 1: hold out the combined dev+test share; what remains is the training split.
holdout_fraction = dev_num+test_num
train_speaker_set, dev_test_speaker_set = train_test_split(
    speaker_df,
    test_size=holdout_fraction,
    random_state=seed,
    shuffle=True,
)

# Stage 2: divide the held-out rows between dev and test. test_size here is
# relative to the held-out pool, hence the rescaling of test_num.
prop_test = test_num/holdout_fraction
dev_speaker_set, test_speaker_set = train_test_split(
    dev_test_speaker_set,
    test_size=prop_test,
    random_state=seed,
    shuffle=True,
)


------> Splitting the group-by-speaker dataframe... ------



In [40]:
# Summarise each split: row count of the speaker-level frame, and the summed
# 'duration' column divided by 60*60 to report hours.
train_hours = train_speaker_set['duration'].sum()/(60*60)
dev_hours = dev_speaker_set['duration'].sum()/(60*60)
test_hours = test_speaker_set['duration'].sum()/(60*60)

print(f"Training dataset includes {len(train_speaker_set)} speakers and {train_hours} hours")
print(f"Development dataset includes {len(dev_speaker_set)} speakers and {dev_hours} hours")
print(f"Testing dataset includes {len(test_speaker_set)} speakers and {test_hours} hours")

Training dataset includes 126 speakers and 3.859580136985175 hours
Development dataset includes 27 speakers and 0.8320461690562523 hours
Testing dataset includes 27 speakers and 0.8461664757909966 hours


In [41]:
# Load the utterance-level dataframe. NOTE: the train/dev/test cells below read
# `df`, so this cell has to run even though the lookup that follows is only a
# sanity check.
df = pd.read_csv(df_fp, dtype={'speaker_id': str})

# Spot check: print every row of `df` that belongs to the third speaker
# (position 2) of the training split.
id_to_match = train_speaker_set['speaker_id'].iloc[2]
matching_row = df.loc[df['speaker_id'] == id_to_match]
print(matching_row)

                                               filepath  duration speaker_id   
9013  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.48        208  \
9014  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.60        208   
9015  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.48        208   
9016  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.54        208   
9017  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.42        208   
...                                                 ...       ...        ...   
9149  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.99        208   
9150  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.84        208   
9151  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.90        208   
9152  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.93        208   
9153  /srv/scratch/chacmod/auskidtalk_scripted_audio...      0.96        208   

      transcription  
9013           du

In [42]:
print("\n------> Obtaining training dataframe... ------\n")

# Extract the filepath, duration, speaker_id and transcription rows that belong
# to the training speakers from the utterance-level dataframe `df`.
dataset_columns = ['filepath', 'duration', 'speaker_id', 'transcription']

# Collect one slice per training speaker (in split order) and concatenate ONCE.
# Growing a DataFrame with pd.concat inside the loop re-copies every row gathered
# so far on each iteration, and seeding it with an empty frame relies on pandas
# ignoring empty inputs when it works out the result dtypes.
train_frames = [
    df[df['speaker_id'] == speaker_id]
    for speaker_id in train_speaker_set['speaker_id'].values
]
if train_frames:
    # ignore_index=True renumbers the rows 0..n-1 in the combined frame
    train_dp = pd.concat(train_frames, ignore_index=True)
else:
    # No speakers in this split: pd.concat rejects an empty list, so fall back
    # to an empty frame that still carries df's columns.
    train_dp = df.iloc[0:0].copy()

# Keep the four expected columns first (a missing one becomes an all-NaN column)
# followed by any extra columns of df, i.e. the layout the empty seed frame used to give.
train_dp = train_dp.reindex(
    columns=dataset_columns + [col for col in train_dp.columns if col not in dataset_columns]
)

train_dp['speaker_id'] = train_dp['speaker_id'].astype(str)  # preserve leading zeros
train_dp.to_csv(train_df_fp, index=False)

# 'duration' is summed once and reported in three units.
train_total_seconds = train_dp['duration'].sum()
print("Number of speech files:", len(train_dp))
print("Number of speakers:", train_dp['speaker_id'].nunique())
print("Total hours:", train_total_seconds/(60*60))
print("Total minutes:", train_total_seconds/(60))
print("Total seconds:", train_total_seconds)
print('Successfully saved training dataframe to csv file at: ', train_df_fp)


------> Obtaining training dataframe... ------

Number of speech files: 18384
Number of speakers: 126
Total hours: 3.8595801369851754
Total minutes: 231.5748082191105
Total seconds: 13894.488493146631
Successfully saved training dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_train_dataframe_new_180speakers.csv


In [43]:
print("\n------> Obtaining development dataframe... ------\n")

# Extract the filepath, duration, speaker_id and transcription rows that belong
# to the development speakers from the utterance-level dataframe `df`.
dataset_columns = ['filepath', 'duration', 'speaker_id', 'transcription']

# Collect one slice per development speaker (in split order) and concatenate ONCE.
# Growing a DataFrame with pd.concat inside the loop re-copies every row gathered
# so far on each iteration, and seeding it with an empty frame relies on pandas
# ignoring empty inputs when it works out the result dtypes.
dev_frames = [
    df[df['speaker_id'] == speaker_id]
    for speaker_id in dev_speaker_set['speaker_id'].values
]
if dev_frames:
    # ignore_index=True renumbers the rows 0..n-1 in the combined frame
    dev_dp = pd.concat(dev_frames, ignore_index=True)
else:
    # No speakers in this split: pd.concat rejects an empty list, so fall back
    # to an empty frame that still carries df's columns.
    dev_dp = df.iloc[0:0].copy()

# Keep the four expected columns first (a missing one becomes an all-NaN column)
# followed by any extra columns of df, i.e. the layout the empty seed frame used to give.
dev_dp = dev_dp.reindex(
    columns=dataset_columns + [col for col in dev_dp.columns if col not in dataset_columns]
)

dev_dp['speaker_id'] = dev_dp['speaker_id'].astype(str)  # preserve leading zeros
dev_dp.to_csv(dev_df_fp, index=False)

# 'duration' is summed once and reported in three units.
dev_total_seconds = dev_dp['duration'].sum()
print("Number of speech files:", len(dev_dp))
print("Number of speakers:", dev_dp['speaker_id'].nunique())
print("Total hours:", dev_total_seconds/(60*60))
print("Total minutes:", dev_total_seconds/(60))
print("Total seconds:", dev_total_seconds)
print('Successfully saved development dataframe to csv file at: ', dev_df_fp)


------> Obtaining development dataframe... ------

Number of speech files: 4019
Number of speakers: 27
Total hours: 0.8320461690562523
Total minutes: 49.92277014337514
Total seconds: 2995.3662086025083
Successfully saved development dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dev_dataframe_new_180speakers.csv


In [44]:
print("\n------> Obtaining testing dataframe... ------\n")

# Extract the filepath, duration, speaker_id and transcription rows that belong
# to the testing speakers from the utterance-level dataframe `df`.
dataset_columns = ['filepath', 'duration', 'speaker_id', 'transcription']

# Collect one slice per testing speaker (in split order) and concatenate ONCE.
# Growing a DataFrame with pd.concat inside the loop re-copies every row gathered
# so far on each iteration, and seeding it with an empty frame relies on pandas
# ignoring empty inputs when it works out the result dtypes.
test_frames = [
    df[df['speaker_id'] == speaker_id]
    for speaker_id in test_speaker_set['speaker_id'].values
]
if test_frames:
    # ignore_index=True renumbers the rows 0..n-1 in the combined frame
    test_dp = pd.concat(test_frames, ignore_index=True)
else:
    # No speakers in this split: pd.concat rejects an empty list, so fall back
    # to an empty frame that still carries df's columns.
    test_dp = df.iloc[0:0].copy()

# Keep the four expected columns first (a missing one becomes an all-NaN column)
# followed by any extra columns of df, i.e. the layout the empty seed frame used to give.
test_dp = test_dp.reindex(
    columns=dataset_columns + [col for col in test_dp.columns if col not in dataset_columns]
)

test_dp['speaker_id'] = test_dp['speaker_id'].astype(str)  # preserve leading zeros
test_dp.to_csv(test_df_fp, index=False)

# 'duration' is summed once and reported in three units.
test_total_seconds = test_dp['duration'].sum()
print("Number of speech files:", len(test_dp))
print("Number of speakers:", test_dp['speaker_id'].nunique())
print("Total hours:", test_total_seconds/(60*60))
print("Total minutes:", test_total_seconds/(60))
print("Total seconds:", test_total_seconds)
print('Successfully saved testing dataset to csv file at: ', test_df_fp)


------> Obtaining testing dataframe... ------

Number of speech files: 3968
Number of speakers: 27
Total hours: 0.8461664757909966
Total minutes: 50.769988547459796
Total seconds: 3046.199312847588
Successfully saved testing dataset to csv file at:  /srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_test_dataframe_new_180speakers.csv
