In [37]:
# Purpose: shuffle the rows of the full training, development and testing datasets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [38]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [39]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# Fixed random seed, defined once here so that any seeded random operation
# run later gives the same result on every execution.
seed = 230
print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [40]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# Input CSV files: the per-split dataframes generated from 'dataset_splitting.py'.
#
# Alternative inputs, kept for reference (currently disabled):
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
#data_dev_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
#data_test_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_AusKidTalk_local/scripted_OGI_scripted_AusKidTalk_train_dataframe.csv'

# Active inputs, in (train, dev, test) order.
data_train_fp, data_dev_fp, data_test_fp = (
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_train_dataframs_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_dev_dataframs_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_test_dataframs_15sec.csv',
)

In [41]:
# Output CSV files: destinations for the shuffled full datasets,
# in (train, dev, test) order.
data_train_full_fp, data_dev_full_fp, data_test_full_fp = (
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_train_15sec_shuffled.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_dev_15sec_shuffled.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_test_15sec_shuffled.csv',
)

In [42]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
def read_split_csv(csv_fp):
    """Read one dataset split from CSV and make its duration column numeric.

    Every column is first read as a string so that identifiers such as
    speaker_id keep their leading zeros; the "duration" column is then
    converted back to a numeric dtype.

    Parameters
    ----------
    csv_fp : str
        Path of the CSV file to read. It must contain a "duration" column.

    Returns
    -------
    pandas.DataFrame
        The file contents with string columns and a numeric "duration".

    Raises
    ------
    KeyError
        If the file has no "duration" column.
    ValueError
        If a "duration" value cannot be parsed as a number.
    """
    split_df = pd.read_csv(csv_fp, dtype=str)
    # Vectorised conversion: one call on the whole column rather than
    # Series.apply(pd.to_numeric), which invokes to_numeric once per row.
    split_df["duration"] = pd.to_numeric(split_df["duration"])
    return split_df


data_train_df = read_split_csv(data_train_fp)
data_dev_df = read_split_csv(data_dev_fp)
data_test_df = read_split_csv(data_test_fp)

In [43]:
# Shuffle all rows of the training split with the fixed seed, renumber the
# index from 0, and cast speaker_id to str (the column is meant to keep its
# leading zeros).
shuffled_train = (
    data_train_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .assign(speaker_id=lambda frame: frame['speaker_id'].astype(str))
)

# Save the shuffled split, leaving the index out of the file.
shuffled_train.to_csv(data_train_full_fp, index=False)

# Report the summed duration (converted to hours) and show the frame.
duration = shuffled_train['duration'].sum()
print(f'Total duration in train dataset is {duration/(60*60)} hours')
print(shuffled_train)

Total duration in train dataset is 49.866753917863434 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  6.185034      00152  \
1      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.170023      00095   
2      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.930068        111   
3      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.938141        110   
4      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.310023        117   
...                                                  ...       ...        ...   
54453  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.468163        008   
54454  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.100000      00023   
54455  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.500000      00002   
54456  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.210159        196   
54457  /srv/scratch/chacmod/CU_2/corpus/data/trai

In [44]:
# Shuffle all rows of the development split with the fixed seed, renumber the
# index from 0, and cast speaker_id to str (the column is meant to keep its
# leading zeros).
shuffled_dev = (
    data_dev_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .assign(speaker_id=lambda frame: frame['speaker_id'].astype(str))
)

# Save the shuffled split, leaving the index out of the file.
shuffled_dev.to_csv(data_dev_full_fp, index=False)

# Report the summed duration (converted to hours) and show the frame.
duration = shuffled_dev['duration'].sum()
print(f'Total duration in dev dataset is {duration/(60*60)} hours')
print(shuffled_dev)

Total duration in dev dataset is 10.17157481733434 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.030068        169  \
1      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  5.310068        285   
2      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  5.215011      02078   
3      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.205034      00081   
4      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.490023        288   
...                                                  ...       ...        ...   
10679  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.615011      00171   
10680  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.230023      01007   
10681  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.130159        219   
10682  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  0.740000      00120   
10683  /srv/scratch/chacmod/CU_2/corpus/data/train-p

In [45]:
# Shuffle all rows of the testing split with the fixed seed, renumber the
# index from 0, and cast speaker_id to str (the column is meant to keep its
# leading zeros).
shuffled_test = (
    data_test_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .assign(speaker_id=lambda frame: frame['speaker_id'].astype(str))
)

# Save the shuffled split, leaving the index out of the file.
shuffled_test.to_csv(data_test_full_fp, index=False)

# Report the summed duration (converted to hours) and show the frame.
duration = shuffled_test['duration'].sum()
print(f'Total duration in test dataset is {duration/(60*60)} hours')
print(shuffled_test)

Total duration in test dataset is 12.121990942302848 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  9.770023        158  \
1      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  6.209070        064   
2      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.760000        200   
3      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  7.500091        160   
4      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.560091        202   
...                                                  ...       ...        ...   
14887  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.070023      01070   
14888  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.690068        179   
14889  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  6.960091        321   
14890  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.155011      00122   
14891  /srv/scratch/chacmod/CU_2/corpus/data/train

In [46]:
# Raw sum of the dev split's duration column, in the column's own units
# (i.e. before the division by 60*60 used for the hours figure).
shuffled_dev.loc[:, 'duration'].sum()

36617.66934240363