In [2]:
# Purpose: shuffle the rows of the full training, development and testing datasets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [3]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [4]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# Fixed seed: passed as random_state to the shuffles so that the
# resulting row order is reproducible from run to run
seed = 230
print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [5]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# These file paths point to the dataframes generated by 'dataset_splitting.py'
#data_train_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
#data_dev_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
#data_test_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'

# Input CSVs for the three splits, assigned in train / dev / test order
data_train_fp, data_dev_fp, data_test_fp = (
    '/srv/scratch/z5313567/thesis/OGI_local/new_full_combined_scripted_spontaneous/full_OGI_combined_scripted_spontaneous_train_dataframe.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_full_combined_scripted_spontaneous/full_OGI_combined_scripted_spontaneous_dev_dataframe.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_full_combined_scripted_spontaneous/full_OGI_combined_scripted_spontaneous_test_dataframe.csv',
)

In [6]:
# Destination CSVs that the shuffled train / dev / test dataframes are written to
data_train_full_fp, data_dev_full_fp, data_test_full_fp = [
    '/srv/scratch/z5313567/thesis/OGI_local/new_full_combined_scripted_spontaneous/full_OGI_combined_scripted_spontaneous_train_shuffle.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_full_combined_scripted_spontaneous/full_OGI_combined_scripted_spontaneous_dev_shuffle.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_full_combined_scripted_spontaneous/full_OGI_combined_scripted_spontaneous_test_shuffle.csv',
]

In [7]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
# Read every column of each split as a string so that leading zeros
# in the speaker id are preserved
data_train_df = pd.read_csv(data_train_fp, dtype=str)
data_dev_df = pd.read_csv(data_dev_fp, dtype=str)
data_test_df = pd.read_csv(data_test_fp, dtype=str)
# Convert the duration column back to a numeric dtype so it can be summed.
# pd.to_numeric is called once on the whole column (vectorised) instead of
# once per element through Series.apply: same values, far fewer Python calls.
data_train_df["duration"] = pd.to_numeric(data_train_df["duration"])
data_dev_df["duration"] = pd.to_numeric(data_dev_df["duration"])
data_test_df["duration"] = pd.to_numeric(data_test_df["duration"])

In [8]:
# Draw all rows (frac=1.0) in a seeded random order, then renumber the
# index from 0 so it no longer reflects the pre-shuffle positions
shuffled_train = (
    data_train_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Save the shuffled training set without writing the index column
shuffled_train.to_csv(data_train_full_fp, index=False)
# Sum of the 'duration' column; the /(60*60) below presumes it is in seconds
duration = shuffled_train['duration'].sum()
print(f'Total duration in train dataset is {duration/(60*60)} hours')
print(shuffled_train)

Total duration in train dataset is 69.86795187704712 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/OGI/speech/scripted/05/0/...  3.156689      ks50a  \
1      /srv/scratch/chacmod/OGI/speech/scripted/05/2/...  1.302222      ksg40   
2      /srv/scratch/chacmod/OGI/speech/scripted/06/0/...  1.607574      ksh3s   
3      /srv/scratch/chacmod/OGI/speech/scripted/10/1/...  6.119002      ksa3n   
4      /srv/scratch/chacmod/OGI/speech/scripted/08/2/...  2.359501      ks83l   
...                                                  ...       ...        ...   
50949  /srv/scratch/chacmod/OGI/speech/scripted/09/3/...  1.987710      ks937   
50950  /srv/scratch/chacmod/OGI/speech/scripted/05/2/...  4.239138      ksg02   
50951  /srv/scratch/chacmod/OGI/speech/scripted/05/3/...  4.963220      ksg14   
50952  /srv/scratch/chacmod/OGI/speech/scripted/00/0/...  3.114649      ks03g   
50953  /srv/scratch/chacmod/OGI/speech/scripted/08

In [9]:
# Draw all rows (frac=1.0) in a seeded random order, then renumber the
# index from 0 so it no longer reflects the pre-shuffle positions
shuffled_dev = (
    data_dev_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Save the shuffled development set without writing the index column
shuffled_dev.to_csv(data_dev_full_fp, index=False)
# Sum of the 'duration' column; the /(60*60) below presumes it is in seconds
duration = shuffled_dev['duration'].sum()
print(f'Total duration in dev dataset is {duration/(60*60)} hours')
print(shuffled_dev)

Total duration in dev dataset is 15.218279768203578 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/OGI/speech/scripted/08/4/...  2.338957      ks80v  \
1      /srv/scratch/chacmod/OGI/speech/scripted/06/0/...  5.734512      ks63i   
2      /srv/scratch/chacmod/OGI/speech/scripted/09/3/...  2.719592      ks90f   
3      /srv/scratch/chacmod/OGI/speech/scripted/03/2/...  3.155646      ks30x   
4      /srv/scratch/chacmod/OGI/speech/scripted/06/2/...  2.339093      ks61i   
...                                                  ...       ...        ...   
11127  /srv/scratch/chacmod/OGI/speech/scripted/03/2/...  3.155964      ks31d   
11128  /srv/scratch/chacmod/OGI/speech/scripted/01/2/...  3.112698      ksc3y   
11129  /srv/scratch/chacmod/OGI/speech/scripted/07/0/...  2.338821      ksi0y   
11130  /srv/scratch/chacmod/OGI/speech/scripted/01/2/...  5.117143      ksc3y   
11131  /srv/scratch/chacmod/OGI/speech/scripted/03/

In [10]:
# Draw all rows (frac=1.0) in a seeded random order, then renumber the
# index from 0 so it no longer reflects the pre-shuffle positions
shuffled_test = (
    data_test_df
    .sample(frac=1.0, random_state=seed)
    .reset_index(drop=True)
)
# Save the shuffled test set without writing the index column
shuffled_test.to_csv(data_test_full_fp, index=False)
# Sum of the 'duration' column; the /(60*60) below presumes it is in seconds
duration = shuffled_test['duration'].sum()
print(f'Total duration in test dataset is {duration/(60*60)} hours')
print(shuffled_test)

Total duration in test dataset is 15.296677059712776 hours
                                                filepath  duration speaker_id   
0      /srv/scratch/chacmod/OGI/speech/scripted/08/0/...  1.407846      ks81c  \
1      /srv/scratch/chacmod/OGI/speech/scripted/01/3/...  3.591293      ks10d   
2      /srv/scratch/chacmod/OGI/speech/scripted/01/3/...  4.092154      ks10u   
3      /srv/scratch/chacmod/OGI/speech/scripted/05/0/...  4.952653      ksg4a   
4      /srv/scratch/chacmod/OGI/speech/scripted/05/0/...  1.588073      ks50c   
...                                                  ...       ...        ...   
11009  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  4.982766      ks92x   
11010  /srv/scratch/chacmod/OGI/speech/scripted/01/1/...  3.103537      ksc39   
11011  /srv/scratch/chacmod/OGI/speech/scripted/02/0/...  5.118639      ks23o   
11012  /srv/scratch/chacmod/OGI/speech/scripted/01/3/...  2.614694      ksc01   
11013  /srv/scratch/chacmod/OGI/speech/scripted/05

In [11]:
# Raw (unconverted) total of the dev set's duration column, shown as the cell output
shuffled_dev.loc[:, 'duration'].sum()

54785.80716553288