In [None]:
# Purpose: shuffle the rows of the training, development and test set dataframes and save each shuffled set to a new CSV file
# Requirement: prepare_scripted_datasets.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [1]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [2]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# Fixed seed passed as random_state to every shuffle in this notebook,
# so the shuffled row order is reproducible between runs.
seed = 230
print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [7]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# Input CSV files, one per split (train / dev / test). According to the
# original note, these hold the dataframes generated by 'dataset_splitting.py'.
#
# Alternative inputs for the OGI scripted dataset (currently disabled;
# uncomment these and comment out the AusKidTalk paths below to switch):
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
#data_dev_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
#data_test_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'

# Active inputs: AusKidTalk train / dev / test dataframes.
data_train_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_train_dataframe.csv'
data_dev_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_dev_dataframe.csv'
data_test_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_test_dataframe.csv'

In [8]:
# Output CSV files: where the shuffled train / dev / test dataframes are saved.
# Each output sits next to its input and adds a '_shuffled' suffix to the name.
data_train_full_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_train_dataframe_shuffled.csv'
data_dev_full_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_dev_dataframe_shuffled.csv'
data_test_full_fp = '/srv/scratch/z5313567/thesis/AusKidTalk_local/AusKidTalk_test_dataframe_shuffled.csv'

In [9]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
# Read each split from its CSV file with every column typed as string,
# so that leading zeros in the speaker id are preserved.
data_train_df = pd.read_csv(data_train_fp, dtype=str)
data_dev_df = pd.read_csv(data_dev_fp, dtype=str)
data_test_df = pd.read_csv(data_test_fp, dtype=str)
# Convert the duration column from string back to a numeric dtype so it can
# be summed later. pd.to_numeric is applied to the whole column at once
# (vectorised) rather than once per row through Series.apply; the resulting
# dtype is inferred from the values (float64 when any value has a decimal part).
data_train_df["duration"] = pd.to_numeric(data_train_df["duration"])
data_dev_df["duration"] = pd.to_numeric(data_dev_df["duration"])
data_test_df["duration"] = pd.to_numeric(data_test_df["duration"])

In [14]:
# Shuffle the training split: frac=1.0 samples every row, random_state=seed
# fixes the order, and reset_index(drop=True) renumbers the rows from 0
# while discarding the pre-shuffle index.
shuffled_train = (
    data_train_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Save the shuffled split without writing the index as a column.
shuffled_train.to_csv(data_train_full_fp, index=False)
# Total of the duration column; dividing by 60*60 reports it in hours
# (i.e. the column is treated as seconds).
duration = shuffled_train['duration'].sum()
print(f'Total duration in train dataset is {duration/(60*60)} hours')
print(shuffled_train)

Total duration in train dataset is 0.38628393055555477 hours
                                               filepath  duration  speaker_id   
0     /srv/scratch/chacmod/auskidtalk_audio/181_task...      0.57   181_task1  \
1     /srv/scratch/chacmod/auskidtalk_audio/1050_tas...      0.84  1050_task1   
2     /srv/scratch/chacmod/auskidtalk_audio/172_task...      0.60   172_task1   
3     /srv/scratch/chacmod/auskidtalk_audio/172_task...      0.54   172_task1   
4     /srv/scratch/chacmod/auskidtalk_audio/181_task...      0.26   181_task1   
...                                                 ...       ...         ...   
2715  /srv/scratch/chacmod/auskidtalk_audio/176_task...      0.33   176_task1   
2716  /srv/scratch/chacmod/auskidtalk_audio/172_task...      0.84   172_task1   
2717  /srv/scratch/chacmod/auskidtalk_audio/1075_tas...      0.81  1075_task1   
2718  /srv/scratch/chacmod/auskidtalk_audio/1150_tas...      0.66  1150_task1   
2719  /srv/scratch/chacmod/auskidtalk_audio/176_

In [15]:
# Shuffle the development split: frac=1.0 samples every row,
# random_state=seed fixes the order, and reset_index(drop=True) renumbers
# the rows from 0 while discarding the pre-shuffle index.
shuffled_dev = (
    data_dev_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Save the shuffled split without writing the index as a column.
shuffled_dev.to_csv(data_dev_full_fp, index=False)
# Total of the duration column; dividing by 60*60 reports it in hours
# (i.e. the column is treated as seconds).
duration = shuffled_dev['duration'].sum()
print(f'Total duration in dev dataset is {duration/(60*60)} hours')
print(shuffled_dev)

Total duration in dev dataset is 0.08670000000000003 hours
                                              filepath  duration speaker_id   
0    /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.93  122_task1  \
1    /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.96  122_task1   
2    /srv/scratch/chacmod/auskidtalk_audio/173_task...      0.96  173_task1   
3    /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.99  122_task1   
4    /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.48  122_task1   
..                                                 ...       ...        ...   
456  /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.93  122_task1   
457  /srv/scratch/chacmod/auskidtalk_audio/173_task...      0.58  173_task1   
458  /srv/scratch/chacmod/auskidtalk_audio/173_task...      0.82  173_task1   
459  /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.69  122_task1   
460  /srv/scratch/chacmod/auskidtalk_audio/122_task...      0.79  122_ta

In [16]:
# Shuffle the test split: frac=1.0 samples every row, random_state=seed
# fixes the order, and reset_index(drop=True) renumbers the rows from 0
# while discarding the pre-shuffle index.
shuffled_test = (
    data_test_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Save the shuffled split without writing the index as a column.
shuffled_test.to_csv(data_test_full_fp, index=False)
# Total of the duration column; dividing by 60*60 reports it in hours
# (i.e. the column is treated as seconds).
duration = shuffled_test['duration'].sum()
print(f'Total duration in test dataset is {duration/(60*60)} hours')
print(shuffled_test)

Total duration in test dataset is 0.07303055555555564 hours
                                              filepath  duration speaker_id   
0    /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.51  135_task1  \
1    /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.81  135_task1   
2    /srv/scratch/chacmod/auskidtalk_audio/130_task...      0.72  130_task1   
3    /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.48  135_task1   
4    /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.37  135_task1   
..                                                 ...       ...        ...   
457  /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.78  135_task1   
458  /srv/scratch/chacmod/auskidtalk_audio/130_task...      0.20  130_task1   
459  /srv/scratch/chacmod/auskidtalk_audio/130_task...      0.53  130_task1   
460  /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.57  135_task1   
461  /srv/scratch/chacmod/auskidtalk_audio/135_task...      0.46  135_t

In [13]:
# Sanity check: raw total of the dev-set duration column, in the column's own
# units (the dev shuffle cell reports this same sum divided by 60*60 as hours).
dev_durations = shuffled_dev['duration']
dev_durations.sum()

312.1200000000001