In [1]:
# Purpose: shuffle the rows of the training, development and testing dataframes and save each shuffled copy to a new CSV file
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [1]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [2]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# A fixed seed keeps every random shuffle in this notebook reproducible.
seed = 230

print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [3]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# Input CSVs: the train/dev/test dataframes generated by 'dataset_splitting.py'.
# Previously used dataset locations are kept below (commented out) for reference.
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
#data_dev_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
#data_test_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'

#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_AusKidTalk_local/scripted_OGI_scripted_AusKidTalk_train_dataframe.csv'

# Active inputs, in (train, dev, test) order.
data_train_fp, data_dev_fp, data_test_fp = (
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_train_dataframe_v2.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_dev_dataframe_v2.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_test_dataframe_v2.csv',
)

In [4]:
# Destination CSVs for the shuffled dataframes, in (train, dev, test) order.
data_train_shuffled_fp, data_dev_shuffled_fp, data_test_shuffled_fp = (
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_train_dataframe_shuffled_v2.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_dev_dataframe_shuffled_v2.csv',
    '/srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous_v2/AusKidTalk_combined_scripted_spontaneous_test_dataframe_shuffled_v2.csv',
)

In [5]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
def _read_split(csv_fp):
    """Read one split's CSV with string columns and a numeric 'duration'.

    Every column is read as a string so that values such as the speaker id
    keep any leading zeros (an earlier version of this cell forced only
    'speaker_id' to str). The 'duration' column is then converted to a
    numeric dtype so it can be summed later.

    Parameters
    ----------
    csv_fp : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents; all columns are strings except 'duration'.

    Raises
    ------
    KeyError
        If the file has no 'duration' column.
    ValueError
        If a 'duration' value cannot be parsed as a number.
    """
    split_df = pd.read_csv(csv_fp, dtype=str)
    # One vectorised call on the whole column, instead of calling
    # pd.to_numeric once per row through Series.apply.
    split_df["duration"] = pd.to_numeric(split_df["duration"])
    return split_df


data_train_df = _read_split(data_train_fp)
data_dev_df = _read_split(data_dev_fp)
data_test_df = _read_split(data_test_fp)

In [6]:
# Shuffle all rows of the training split (seeded, so reproducible),
# renumber the index from 0 and keep speaker ids as text so leading
# zeros are preserved.
shuffled_train = (
    data_train_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .astype({'speaker_id': str})
)
shuffled_train.to_csv(data_train_shuffled_fp, index=False)

# Summed 'duration' is divided by 3600 for reporting (assumes seconds).
duration = shuffled_train['duration'].sum()
print(f'Total duration in train dataset is {duration/(60*60)} hours')
print(shuffled_train)

Total duration in train dataset is 3.98029457949032 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.540000        228  \
1      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.510000        304   
2      /srv/scratch/chacmod/auskidtalk_scripted_audio...  1.062668        899   
3      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.396053        539   
4      /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.780000        682   
...                                                  ...       ...        ...   
18491  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.810000        603   
18492  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.810088        439   
18493  /srv/scratch/chacmod/auskidtalk_scripted_audio...  1.050000       1464   
18494  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.760174        456   
18495  /srv/scratch/chacmod/auskidtalk_scripted_aud

In [7]:
# Shuffle all rows of the development split (seeded, so reproducible),
# renumber the index from 0 and keep speaker ids as text so leading
# zeros are preserved.
shuffled_dev = (
    data_dev_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .astype({'speaker_id': str})
)
shuffled_dev.to_csv(data_dev_shuffled_fp, index=False)

# Summed 'duration' is divided by 3600 for reporting (assumes seconds).
duration = shuffled_dev['duration'].sum()
print(f'Total duration in dev dataset is {duration/(60*60)} hours')
print(shuffled_dev)

Total duration in dev dataset is 0.8430717370619673 hours
                                               filepath  duration speaker_id   
0     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.780000        222  \
1     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.930000        694   
2     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.420000       1032   
3     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.810000        671   
4     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.580211        431   
...                                                 ...       ...        ...   
4028  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.411644        547   
4029  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.630000        715   
4030  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.420000       1150   
4031  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.900000        218   
4032  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.195

In [8]:
# Shuffle all rows of the testing split (seeded, so reproducible),
# renumber the index from 0 and keep speaker ids as text so leading
# zeros are preserved.
shuffled_test = (
    data_test_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .astype({'speaker_id': str})
)
shuffled_test.to_csv(data_test_shuffled_fp, index=False)

# Summed 'duration' is divided by 3600 for reporting (assumes seconds).
duration = shuffled_test['duration'].sum()
print(f'Total duration in test dataset is {duration/(60*60)} hours')
print(shuffled_test)

Total duration in test dataset is 0.8739776574554781 hours
                                               filepath  duration speaker_id   
0     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.390000       1050  \
1     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.600000       1050   
2     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.530000        772   
3     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.300000        954   
4     /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.990000        651   
...                                                 ...       ...        ...   
3989  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.780000        774   
3990  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.540000        973   
3991  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.630000        203   
3992  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.575779        620   
3993  /srv/scratch/chacmod/auskidtalk_scripted_audio...  0.81

In [9]:
# Sanity check: raw total of the dev split's 'duration' column (not converted to hours).
shuffled_dev.loc[:, 'duration'].sum()

3035.058253423082