In [1]:
# Purpose: split training, development and testing datasets generated from dataset_splitting.py into 10min, 1hour, 5hour and 10hour subsets (later cells also produce 20hour and ~4hour subsets)
# Requirement: prepare_scripted_datasets.py (ipynb)
#              group_speaker_id.py (ipynb)
#              dataset_splitting.py (ipynb)

In [2]:
# ------------------------------------------
#        Importing libraries
# ------------------------------------------

# For dataframes
import pandas as pd
# For splitting data
from sklearn.model_selection import train_test_split
# For printing filepath
import os

In [3]:
# ------------------------------------------
#           Setting seed
# ------------------------------------------
# Fixed seed so that the random subset selection can be reproduced
seed = 230
print("\n------> Setting seed... ----------------------------------------------\n")
print("--> Setting seed as:", seed)


------> Setting seed... ----------------------------------------------

--> Setting seed as: 230


In [4]:
# -----------------------------------------------
#           Setting train, dev, test proportion
# -----------------------------------------------
# Train / dev / test proportions. The dev and test subset durations are later
# derived from the train subset duration using these ratios.
train_prop, dev_prop, test_prop = 0.7, 0.15, 0.15

In [5]:
# ------------------------------------------
#        Setting filepath
# ------------------------------------------
# CSV dataframes of the full train / dev / test splits, generated by 'dataset_splitting.py'
data_train_fp, data_dev_fp, data_test_fp = (
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_train_dataframe_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_dev_dataframe_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/CU_full_test_dataframe_15sec.csv',
)

In [7]:
# Where the 10min subsets are saved. Each split gets its own file: the dev and
# test paths previously repeated the train filename, so all three subsets were
# written to the same CSV and only the last write survived.
# NOTE(review): the files still live under the '5h' directory, as before — confirm that is intended.
data_train_10min_fp = '/srv/scratch/z5313567/thesis/CU_local/5h/CU_10min_train_dataframe_15sec.csv'
data_dev_10min_fp = '/srv/scratch/z5313567/thesis/CU_local/5h/CU_10min_dev_dataframe_15sec.csv'
data_test_10min_fp = '/srv/scratch/z5313567/thesis/CU_local/5h/CU_10min_test_dataframe_15sec.csv'

In [27]:
# Where the 1hour subsets are saved. Each split gets its own file: the dev and
# test paths previously repeated the train filename, so all three subsets were
# written to the same CSV and only the last write survived.
# NOTE(review): the files still live under the '5h' directory, as before — confirm that is intended.
data_train_1hour_fp = '/srv/scratch/z5313567/thesis/CU_local/5h/CU_1h_train_dataframe_15sec.csv'
data_dev_1hour_fp = '/srv/scratch/z5313567/thesis/CU_local/5h/CU_1h_dev_dataframe_15sec.csv'
data_test_1hour_fp = '/srv/scratch/z5313567/thesis/CU_local/5h/CU_1h_test_dataframe_15sec.csv'

In [26]:
# Output paths held in the '5hour' variables. The paths themselves point at the
# '4h' directory and 'CU_4h_*' files.
# NOTE(review): variable names say 5hour but the files are named 4h — confirm which is intended.
data_train_5hour_fp, data_dev_5hour_fp, data_test_5hour_fp = (
    '/srv/scratch/z5313567/thesis/CU_local/4h/CU_4h_train_dataframe_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/4h/CU_4h_dev_dataframe_15sec.csv',
    '/srv/scratch/z5313567/thesis/CU_local/4h//CU_4h_test_dataframe_15sec.csv',
)

In [29]:
# Output paths for the 10hour train / dev / test subsets.
# NOTE(review): these point under 'OGI_local' while the input dataframes are read from 'CU_local' — confirm.
data_train_10hour_fp, data_dev_10hour_fp, data_test_10hour_fp = (
    '/srv/scratch/z5313567/thesis/OGI_local/new_10hour_datasets/10hour_OGI_scripted_train_dataframe.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_10hour_datasets/10hour_OGI_scripted_dev_dataframe.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_10hour_datasets/10hour_OGI_scripted_test_dataframe.csv',
)

In [6]:
# Output paths for the 20hour train / dev / test subsets.
# NOTE(review): these point under 'OGI_local' while the input dataframes are read from 'CU_local' — confirm.
data_train_20hour_fp, data_dev_20hour_fp, data_test_20hour_fp = (
    '/srv/scratch/z5313567/thesis/OGI_local/new_20hour_datasets/20hour_OGI_scripted_train_dataframe.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_20hour_datasets/20hour_OGI_scripted_dev_dataframe.csv',
    '/srv/scratch/z5313567/thesis/OGI_local/new_20hour_datasets/20hour_OGI_scripted_test_dataframe.csv',
)

In [27]:
# ------------------------------------------
#        Reading in dataframe
# ------------------------------------------
# Read each split's dataframe from its CSV file with every column as string
# (to preserve leading zeros in speaker id), then make 'duration' numeric.
def _read_split_dataframe(csv_fp):
    """Load one split CSV as strings and return it with a numeric 'duration' column."""
    split_df = pd.read_csv(csv_fp, dtype=str)
    # Convert the whole column in one vectorised call; Series.apply(pd.to_numeric)
    # would invoke the converter once per row.
    split_df["duration"] = pd.to_numeric(split_df["duration"])
    return split_df


data_train_df = _read_split_dataframe(data_train_fp)
data_dev_df = _read_split_dataframe(data_dev_fp)
data_test_df = _read_split_dataframe(data_test_fp)

In [28]:
# ------------------------------------------
#        Obtain total duration of train df
# ------------------------------------------
# Sum the 'duration' column of each split (values are in seconds, as the printed units show)
total_duration_train_df = data_train_df['duration'].sum()
total_duration_dev_df = data_dev_df['duration'].sum()
total_duration_test_df = data_test_df['duration'].sum()

# Report each total in seconds, minutes and hours
print(f'Total duration in train dataframe is {total_duration_train_df} seconds, {total_duration_train_df/60} minutes, {total_duration_train_df/(60*60)} hours')
print(f'Total duration in dev dataframe is {total_duration_dev_df} seconds, {total_duration_dev_df/60} minutes, {total_duration_dev_df/(60*60)} hours')
print(f'Total duration in test dataframe is {total_duration_test_df} seconds, {total_duration_test_df/60} minutes, {total_duration_test_df/(60*60)} hours')

Total duration in train dataframe is 169672.42476190475 seconds, 2827.8737460317457 minutes, 47.1312291005291 hours
Total duration in dev dataframe is 32194.128480725623 seconds, 536.5688080120938 minutes, 8.942813466868229 hours
Total duration in test dataframe is 30905.642947845805 seconds, 515.0940491307634 minutes, 8.584900818846057 hours


In [29]:
def GetSubset(subset_secs, total_secs, data_df, subset_fp, seed):
    """Draw one random subset of ``data_df`` per target duration and save each to CSV.

    Each target duration is converted to a fraction of rows
    (``target / total_secs``) and that fraction is sampled with
    ``train_test_split``, so the duration actually obtained is only
    approximately the target. The achieved hours, the output path and the
    number of samples are printed for every subset.

    Args:
        subset_secs: Sequence of target subset durations, in seconds.
        total_secs: Total duration of ``data_df``, in seconds.
        data_df: DataFrame to sample from; its ``duration`` column is summed.
        subset_fp: Sequence of output CSV paths; ``subset_fp[i]`` receives the
            subset drawn for ``subset_secs[i]``.
        seed: ``random_state`` passed to ``train_test_split`` so the draw is
            reproducible.

    Raises:
        IndexError: If there are fewer output paths than target durations.
        ValueError: If a target is not strictly between 0 and ``total_secs``.
    """
    # Validate every request before touching the disk, so a bad entry cannot
    # abort the loop after some of the CSVs have already been written.
    if len(subset_fp) < len(subset_secs):
        raise IndexError(
            f'Got {len(subset_secs)} subset durations but only {len(subset_fp)} output filepaths'
        )
    # proportion of each subset compared to total
    proportions = [secs / total_secs for secs in subset_secs]
    for secs, prop_subset in zip(subset_secs, proportions):
        # train_test_split only accepts a float test_size inside (0, 1)
        if not 0 < prop_subset < 1:
            raise ValueError(
                f'Requested subset of {secs} seconds must be greater than 0 and '
                f'smaller than the total of {total_secs} seconds'
            )

    for i, prop_subset in enumerate(proportions):
        target_hours = subset_secs[i]/(60*60)
        # Only the "test" side of the split is kept; it can be a train, dev or test subset
        _, subset = train_test_split(data_df, test_size=prop_subset, random_state=seed, shuffle=True)
        print(f'For {target_hours}-hour dataset--> Hours in subset:', subset['duration'].sum()/(60*60))
        subset.to_csv(subset_fp[i], index=False)
        print(f'Successfully saved {target_hours}-hour dataset to {subset_fp[i]}')
        print('Samples:', len(subset))
        print('\n')

In [33]:
# Target train-subset durations in seconds: 10 minutes, then 1, 5 and 10 hours
train_subset_secs = [10*60] + [hours*60*60 for hours in (1, 5, 10)]
# Output paths, listed in the same order as the durations above
train_subset_fp = [
    data_train_10min_fp,
    data_train_1hour_fp,
    data_train_5hour_fp,
    data_train_10hour_fp,
]
GetSubset(
    subset_secs=train_subset_secs,
    total_secs=total_duration_train_df,
    data_df=data_train_df,
    subset_fp=train_subset_fp,
    seed=seed,
)

For 0.16666666666666666-hour dataset--> Hours in subset: 0.16514726631393298
Successfully saved 0.16666666666666666-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_10min_datasets/10min_OGI_scripted_train_dataframe.csv
Samples: 173


For 1.0-hour dataset--> Hours in subset: 1.0173574703955657
Successfully saved 1.0-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_1hour_datasets/1hour_OGI_scripted_train_dataframe.csv
Samples: 1036


For 5.0-hour dataset--> Hours in subset: 5.018006777525825
Successfully saved 5.0-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_5hour_datasets/5hour_OGI_scripted_train_dataframe.csv
Samples: 5177


For 10.0-hour dataset--> Hours in subset: 10.035672965482489
Successfully saved 10.0-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_10hour_datasets/10hour_OGI_scripted_train_dataframe.csv
Samples: 10354




In [34]:
# Scale every train target by dev_prop / train_prop so each dev subset keeps the
# dev:train ratio. Uses whichever train_subset_secs was assigned most recently.
dev_subset_secs = [(secs / train_prop) * dev_prop for secs in train_subset_secs]
# Output paths, listed in the same order as the durations above
dev_subset_fp = [
    data_dev_10min_fp,
    data_dev_1hour_fp,
    data_dev_5hour_fp,
    data_dev_10hour_fp,
]
GetSubset(
    subset_secs=dev_subset_secs,
    total_secs=total_duration_dev_df,
    data_df=data_dev_df,
    subset_fp=dev_subset_fp,
    seed=seed,
)

For 0.03571428571428572-hour dataset--> Hours in subset: 0.039786104812295275
Successfully saved 0.03571428571428572-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_10min_datasets/10min_OGI_scripted_dev_dataframe.csv
Samples: 37


For 0.2142857142857143-hour dataset--> Hours in subset: 0.2248028974552784
Successfully saved 0.2142857142857143-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_1hour_datasets/1hour_OGI_scripted_dev_dataframe.csv
Samples: 222


For 1.0714285714285714-hour dataset--> Hours in subset: 1.0851111740992694
Successfully saved 1.0714285714285714-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_5hour_datasets/5hour_OGI_scripted_dev_dataframe.csv
Samples: 1109


For 2.142857142857143-hour dataset--> Hours in subset: 2.136513492063492
Successfully saved 2.142857142857143-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_10hour_datasets/10hour_OGI_scripted_dev_dataframe.csv
Samples: 2217




In [35]:
# Scale every train target by test_prop / train_prop so each test subset keeps the
# test:train ratio. Uses whichever train_subset_secs was assigned most recently.
test_subset_secs = [(secs / train_prop) * test_prop for secs in train_subset_secs]
# Output paths, listed in the same order as the durations above
test_subset_fp = [
    data_test_10min_fp,
    data_test_1hour_fp,
    data_test_5hour_fp,
    data_test_10hour_fp,
]
GetSubset(
    subset_secs=test_subset_secs,
    total_secs=total_duration_test_df,
    data_df=data_test_df,
    subset_fp=test_subset_fp,
    seed=seed,
)

For 0.03571428571428572-hour dataset--> Hours in subset: 0.039383282942806755
Successfully saved 0.03571428571428572-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_10min_datasets/10min_OGI_scripted_test_dataframe.csv
Samples: 37


For 0.2142857142857143-hour dataset--> Hours in subset: 0.22082228521038044
Successfully saved 0.2142857142857143-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_1hour_datasets/1hour_OGI_scripted_test_dataframe.csv
Samples: 218


For 1.0714285714285714-hour dataset--> Hours in subset: 1.0918419249181155
Successfully saved 1.0714285714285714-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_5hour_datasets/5hour_OGI_scripted_test_dataframe.csv
Samples: 1086


For 2.142857142857143-hour dataset--> Hours in subset: 2.1632552280171327
Successfully saved 2.142857142857143-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_10hour_datasets/10hour_OGI_scripted_test_dataframe.csv
Samples: 2171




In [11]:
# 20-hour train subset. This reassigns train_subset_secs, which the dev/test
# cells that follow read to derive their own targets.
train_subset_secs = [
    20*60*60,
]
train_subset_fp = [
    data_train_20hour_fp,
]
GetSubset(
    subset_secs=train_subset_secs,
    total_secs=total_duration_train_df,
    data_df=data_train_df,
    subset_fp=train_subset_fp,
    seed=seed,
)

For 20.0-hour dataset--> Hours in subset: 19.959878584026203
Successfully saved 20.0-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_20hour_datasets/20hour_OGI_scripted_train_dataframe.csv
Samples: 20707




In [12]:
# Dev target derived from the current train_subset_secs (dev_prop / train_prop of it)
dev_subset_secs = [(secs / train_prop) * dev_prop for secs in train_subset_secs]
dev_subset_fp = [
    data_dev_20hour_fp,
]
GetSubset(
    subset_secs=dev_subset_secs,
    total_secs=total_duration_dev_df,
    data_df=data_dev_df,
    subset_fp=dev_subset_fp,
    seed=seed,
)

For 4.285714285714286-hour dataset--> Hours in subset: 4.2601931972789115
Successfully saved 4.285714285714286-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_20hour_datasets/20hour_OGI_scripted_dev_dataframe.csv
Samples: 4433




In [13]:
# Test target derived from the current train_subset_secs (test_prop / train_prop of it)
test_subset_secs = [(secs / train_prop) * test_prop for secs in train_subset_secs]
test_subset_fp = [
    data_test_20hour_fp,
]
GetSubset(
    subset_secs=test_subset_secs,
    total_secs=total_duration_test_df,
    data_df=data_test_df,
    subset_fp=test_subset_fp,
    seed=seed,
)

For 4.285714285714286-hour dataset--> Hours in subset: 4.3338754724111865
Successfully saved 4.285714285714286-hour dataset to /srv/scratch/z5313567/thesis/OGI_local/new_20hour_datasets/20hour_OGI_scripted_test_dataframe.csv
Samples: 4342




In [30]:
# 3.98-hour train subset, written to the path held in data_train_5hour_fp.
# This reassigns train_subset_secs, which the dev/test cells that follow read.
train_subset_secs = [
    3.98*60*60,
]
train_subset_fp = [
    data_train_5hour_fp,
]
GetSubset(
    subset_secs=train_subset_secs,
    total_secs=total_duration_train_df,
    data_df=data_train_df,
    subset_fp=train_subset_fp,
    seed=seed,
)

For 3.98-hour dataset--> Hours in subset: 4.069543121693122
Successfully saved 3.98-hour dataset to /srv/scratch/z5313567/thesis/CU_local/4h/CU_4h_train_dataframe_15sec.csv
Samples: 4842




In [31]:
# Dev target derived from the current train_subset_secs (dev_prop / train_prop of it),
# written to the path held in data_dev_5hour_fp
dev_subset_secs = [(secs / train_prop) * dev_prop for secs in train_subset_secs]
dev_subset_fp = [
    data_dev_5hour_fp,
]
GetSubset(
    subset_secs=dev_subset_secs,
    total_secs=total_duration_dev_df,
    data_df=data_dev_df,
    subset_fp=dev_subset_fp,
    seed=seed,
)

For 0.852857142857143-hour dataset--> Hours in subset: 0.8833593222474175
Successfully saved 0.852857142857143-hour dataset to /srv/scratch/z5313567/thesis/CU_local/4h/CU_4h_dev_dataframe_15sec.csv
Samples: 1055




In [32]:
# Test target derived from the current train_subset_secs (test_prop / train_prop of it),
# written to the path held in data_test_5hour_fp
test_subset_secs = [(secs / train_prop) * test_prop for secs in train_subset_secs]
test_subset_fp = [
    data_test_5hour_fp,
]
GetSubset(
    subset_secs=test_subset_secs,
    total_secs=total_duration_test_df,
    data_df=data_test_df,
    subset_fp=test_subset_fp,
    seed=seed,
)

For 0.852857142857143-hour dataset--> Hours in subset: 0.8609941672965482
Successfully saved 0.852857142857143-hour dataset to /srv/scratch/z5313567/thesis/CU_local/4h//CU_4h_test_dataframe_15sec.csv
Samples: 1043


