In [4]:
# Purpose: shuffle the full dataset of training, development and testing sets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [5]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [6]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# A fixed seed is used so that the random shuffles are reproducible
# from one run to the next.
seed = 230

print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [7]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# Input CSVs: the train/dev/test dataframes generated by 'dataset_splitting.py'.
# Alternative dataset configurations are kept commented out for reference:
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
#data_dev_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
#data_test_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'

#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_AusKidTalk_local/scripted_OGI_scripted_AusKidTalk_train_dataframe.csv'

# Active configuration: the CU_local '15sec' dataframes (train, dev, test)
data_train_fp, data_dev_fp, data_test_fp = (
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_train_dataframe_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_dev_dataframe_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_test_dataframe_15sec.csv',
)

In [8]:
# Output CSVs: where the shuffled train/dev/test datasets are written
data_train_full_fp, data_dev_full_fp, data_test_full_fp = (
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_train_15sec_shuffled.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_dev_15sec_shuffled.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_test_15sec_shuffled.csv',
)

In [9]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
def _read_split(csv_fp):
    """Load one dataset split from CSV and parse its 'duration' column.

    Every column is read as str so that leading zeros in the speaker ids
    are preserved (an earlier version of this cell read only 'speaker_id'
    as str). The 'duration' column is then converted to a numeric dtype
    with a single vectorised ``pd.to_numeric`` call instead of calling
    ``pd.to_numeric`` once per element through ``Series.apply``.

    Parameters
    ----------
    csv_fp : str
        Path of the CSV file to read; it must contain a 'duration' column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, with all columns as str except the numeric
        'duration' column.

    Raises
    ------
    KeyError
        If the CSV has no 'duration' column.
    ValueError
        If a 'duration' value cannot be parsed as a number.
    """
    split_df = pd.read_csv(csv_fp, dtype=str)
    split_df["duration"] = pd.to_numeric(split_df["duration"])
    return split_df


# Load the three splits from the filepaths configured above
data_train_df = _read_split(data_train_fp)
data_dev_df = _read_split(data_dev_fp)
data_test_df = _read_split(data_test_fp)

In [10]:
# Shuffle every row of the training split (frac=1.0) with the fixed seed,
# renumber the rows from 0, and keep speaker ids as str so that their
# leading zeros are preserved.
shuffled_train = (
    data_train_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .assign(speaker_id=lambda frame: frame['speaker_id'].astype(str))
)

# Save the shuffled split without the index column
shuffled_train.to_csv(data_train_full_fp, index=False)

# Report the summed duration (divided by 60*60 for hours) and show the frame
duration = shuffled_train['duration'].sum()
print(f'Total duration in train dataset is {duration/(60*60)} hours')
print(shuffled_train)

Total duration in train dataset is 47.131229100529104 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.930068    0200236  \
1      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.780000    0200126   
2      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  0.860000    0300095   
3      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.120136    0200211   
4      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.075011    0100343   
...                                                  ...       ...        ...   
57324  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.520000    0100032   
57325  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  7.391066    0300037   
57326  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  3.955011    0200018   
57327  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  0.960000    0100042   
57328  /srv/scratch/chacmod/CU_2/corpus/data/trai

In [11]:
# Shuffle every row of the development split (frac=1.0) with the fixed seed,
# renumber the rows from 0, and keep speaker ids as str so that their
# leading zeros are preserved.
shuffled_dev = (
    data_dev_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .assign(speaker_id=lambda frame: frame['speaker_id'].astype(str))
)

# Save the shuffled split without the index column
shuffled_dev.to_csv(data_dev_full_fp, index=False)

# Report the summed duration (divided by 60*60 for hours) and show the frame
duration = shuffled_dev['duration'].sum()
print(f'Total duration in dev dataset is {duration/(60*60)} hours')
print(shuffled_dev)

Total duration in dev dataset is 8.942813466868229 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.645034    0200305  \
1      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.205034    0000056   
2      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.610068    0200151   
3      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.860000    0401006   
4      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.840000    0200185   
...                                                  ...       ...        ...   
11052  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.055011    0300097   
11053  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.205034    0100044   
11054  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  0.955011    0401006   
11055  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.860000    0402034   
11056  /srv/scratch/chacmod/CU_2/corpus/data/train-p

In [12]:
# Shuffle every row of the test split (frac=1.0) with the fixed seed,
# renumber the rows from 0, and cast the speaker ids to str.
shuffled_test = (
    data_test_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
    .assign(speaker_id=lambda frame: frame['speaker_id'].astype(str))
)

# Save the shuffled split without the index column
shuffled_test.to_csv(data_test_full_fp, index=False)

# Report the summed duration (divided by 60*60 for hours) and show the frame
duration = shuffled_test['duration'].sum()
print(f'Total duration in test dataset is {duration/(60*60)} hours')
print(shuffled_test)

Total duration in test dataset is 8.584900818846055 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.960091    0100301  \
1      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.320000    0300091   
2      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.270068    0200202   
3      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.012154    0500047   
4      /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.130023    0000011   
...                                                  ...       ...        ...   
10484  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.325034    0102207   
10485  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  5.570068    0100314   
10486  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.260000    0300098   
10487  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  3.940000    0200239   
10488  /srv/scratch/chacmod/CU_2/corpus/data/train-

In [13]:
# Raw sum of the dev split's 'duration' column (presumably seconds, since the
# dev cell divides this sum by 60*60 to report hours -- confirm upstream)
shuffled_dev.loc[:, 'duration'].sum()

32194.128480725623