In [6]:
# Purpose: shuffle the full dataset of training, development and testing sets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [7]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [8]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# Fixed seed, passed as random_state when shuffling, so that every run
# produces the same row order.
seed = 230
print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [9]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# Input CSV files: the train / dev / test dataframes written by
# 'dataset_splitting.py'.
#
# Alternative inputs used for other datasets (kept for reference):
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
#data_dev_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
#data_test_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_AusKidTalk_local/scripted_OGI_scripted_AusKidTalk_train_dataframe.csv'

# Active inputs: AusKidTalk scripted splits (180 speakers)
data_train_fp, data_dev_fp, data_test_fp = (
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_train_dataframe_new_180speakers.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dev_dataframe_new_180speakers.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_test_dataframe_new_180speakers.csv',
)

In [10]:
# Output CSV files: destinations for the shuffled train / dev / test dataframes
data_train_shuffled_fp, data_dev_shuffled_fp, data_test_shuffled_fp = (
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_train_dataframe_new_180speakers_shuffled.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_dev_dataframe_new_180speakers_shuffled.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/scripted_new/AusKidTalk_scripted_test_dataframe_new_180speakers_shuffled.csv',
)

In [11]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
# Read each split from its CSV file with every column parsed as str, so that
# speaker ids keep any leading zeros. (dtype=str is applied to all columns
# rather than to 'speaker_id' only.)
data_train_df = pd.read_csv(data_train_fp, dtype=str)
data_dev_df = pd.read_csv(data_dev_fp, dtype=str)
data_test_df = pd.read_csv(data_test_fp, dtype=str)

# 'duration' was read as str above; convert it back to a numeric dtype so it
# can be summed later. pd.to_numeric is called once on the whole column
# (vectorised) instead of once per element through Series.apply.
data_train_df["duration"] = pd.to_numeric(data_train_df["duration"])
data_dev_df["duration"] = pd.to_numeric(data_dev_df["duration"])
data_test_df["duration"] = pd.to_numeric(data_test_df["duration"])

In [12]:
# Shuffle all training rows (frac=1.0 keeps every row) using the fixed seed,
# then renumber the index from 0.
shuffled_train = (
    data_train_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Re-cast speaker ids to str before saving (intended to preserve leading zeros)
shuffled_train['speaker_id'] = shuffled_train['speaker_id'].astype(str)
shuffled_train.to_csv(data_train_shuffled_fp, index=False)

# Sum of the 'duration' column, divided by 3600 to report hours
duration = shuffled_train['duration'].sum()
print(f'Total duration in train dataset is {duration / 3600} hours')
print(shuffled_train)

Total duration in train dataset is 3.8595801369851754 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/auskidtalk_scripted_audio...  1.255165        485  \
1      /srv/scratch/chacmod/auskidtalk_scripted_audio...  1.040000        603   
2      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.540000        777   
3      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.480000        748   
4      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.690000        219   
...                                                  ...       ...        ...   
18379  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.680000        603   
18380  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.570000        130   
18381  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.450000       1464   
18382  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.744746        456   
18383  /srv/scratch/chacmod/auskidtalk_scripted_a

In [13]:
# Shuffle all development rows (frac=1.0 keeps every row) using the fixed seed,
# then renumber the index from 0.
shuffled_dev = (
    data_dev_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Re-cast speaker ids to str before saving (intended to preserve leading zeros)
shuffled_dev['speaker_id'] = shuffled_dev['speaker_id'].astype(str)
shuffled_dev.to_csv(data_dev_shuffled_fp, index=False)

# Sum of the 'duration' column, divided by 3600 to report hours
duration = shuffled_dev['duration'].sum()
print(f'Total duration in dev dataset is {duration / 3600} hours')
print(shuffled_dev)

Total duration in dev dataset is 0.8320461690562522 hours
                                               filepath  duration speaker_id   
0     /srv/scratch/chacmod/auskidtalk_scripted_audio...  1.188363        857  \
1     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.680000        222   
2     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.690000        229   
3     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.840000        547   
4     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.440000        318   
...                                                 ...       ...        ...   
4014  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.411644        547   
4015  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.655705        715   
4016  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.420000       1150   
4017  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.630000        218   
4018  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.941

In [14]:
# Shuffle all test rows (frac=1.0 keeps every row) using the fixed seed,
# then renumber the index from 0.
shuffled_test = (
    data_test_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Re-cast speaker ids to str before saving, as done for the train and dev splits
shuffled_test['speaker_id'] = shuffled_test['speaker_id'].astype(str)
shuffled_test.to_csv(data_test_shuffled_fp, index=False)

# Sum of the 'duration' column, divided by 3600 to report hours
duration = shuffled_test['duration'].sum()
print(f'Total duration in test dataset is {duration / 3600} hours')
print(shuffled_test)

Total duration in test dataset is 0.8461664757909966 hours
                                               filepath  duration speaker_id   
0     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.750000        203  \
1     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.960000        977   
2     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.840000        743   
3     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.661656        205   
4     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.208904        977   
...                                                 ...       ...        ...   
3963  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.780000        774   
3964  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.990000        973   
3965  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.630000        203   
3966  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.575779        620   
3967  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.81

In [15]:
# Undivided total of the dev 'duration' column, shown as the cell's output
shuffled_dev.loc[:, 'duration'].sum()

2995.366208602508