In [18]:
# Purpose: partition datasets into training, development and testing sets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              Check_duration_15sec.py (ipynb)
#              group_speaker_id.py (ipynb)

In [19]:
# Announce the import step in the notebook output.
print("\n------> Importing libraries... ------\n")

# pandas: reads the CSV dataframes and writes the train/dev/test CSV files.
import pandas as pd
# train_test_split: used twice to split the group-by-speaker dataframe.
from sklearn.model_selection import train_test_split


------> Importing libraries... ------



In [20]:
# Random seed passed to both train_test_split calls (cannot be changed): 230
seed = 230

# Proportions of the training / development / testing partitions.
# !!!!!! When changing the proportion, go to group_speaker_id.ipynb to change the parameter "prop_training"
train_num, dev_num, test_num = 0.70, 0.15, 0.15

In [21]:
print("\n------> Loading files... ------\n")

# Directory that holds the input dataframes and receives the split outputs.
# Defined once so the location can be changed in a single place.
OGI_data_dir = '/srv/scratch/z5313567/thesis/CU_local'

# Inputs: the full dataframe and its group-by-speaker version.
OGI_df_fp = f'{OGI_data_dir}/CU_full_dataframe_15sec.csv'
OGI_speaker_df_fp = f'{OGI_data_dir}/CU_full_speaker_dataframe_15sec.csv'

# Outputs: one CSV per partition.
OGI_train_df_fp = f'{OGI_data_dir}/CU_full_train_dataframe_15sec.csv'
OGI_dev_df_fp = f'{OGI_data_dir}/CU_full_dev_dataframe_15sec.csv'
OGI_test_df_fp = f'{OGI_data_dir}/CU_full_test_dataframe_15sec.csv'

# NOTE(review): the variables and messages say "OGI" while the paths point at
# CU files - presumably carried over from an OGI notebook; confirm the naming.
print(f'OGI dataframe is stored at {OGI_df_fp}')
print(f'OGI group-by-speaker dataframe is stored at {OGI_speaker_df_fp}')
print(f'OGI train dataframe is stored at {OGI_train_df_fp}')
print(f'OGI development dataframe is stored at {OGI_dev_df_fp}')
print(f'OGI test dataframe is stored at {OGI_test_df_fp}')


------> Loading files... ------

OFI dataframe is stored at /srv/scratch/z5313567/thesis/CU_local/CU_full_dataframe_15sec.csv
OGI group-by-speaker dataframe is stored at /srv/scratch/z5313567/thesis/CU_local/CU_full_speaker_dataframe_15sec.csv
OGI train daraframe is stored at /srv/scratch/z5313567/thesis/CU_local/CU_full_train_dataframe_15sec.csv
OGI development daraframe is stored at /srv/scratch/z5313567/thesis/CU_local/CU_full_dev_dataframe_15sec.csv
OGI test daraframe is stored at /srv/scratch/z5313567/thesis/CU_local/CU_full_test_dataframe_15sec.csv


In [22]:
print("\n------> Splitting the group-by-speaker dataframe... ------\n")

# speaker_id is read as text so that ids with leading zeros stay intact.
OGI_speaker_df = pd.read_csv(OGI_speaker_df_fp, dtype={'speaker_id': str})

# Share of the held-out speakers that goes to the testing set in the second split.
prop_test = test_num/(dev_num+test_num)

# Stage 1: hold out the combined dev+test share; the rest is the training set.
train_speaker_set, dev_test_speaker_set = train_test_split(
    OGI_speaker_df,
    test_size=(dev_num+test_num),
    random_state=seed,
    shuffle=True,
)

# Stage 2: divide the held-out speakers into development and testing sets.
dev_speaker_set, test_speaker_set = train_test_split(
    dev_test_speaker_set,
    test_size=prop_test,
    random_state=seed,
    shuffle=True,
)


------> Splitting the group-by-speaker dataframe... ------



In [23]:
# Report, for each partition, the number of speakers and the summed duration
# converted from seconds to hours.
for split_label, split_speakers in (
    ("Training", train_speaker_set),
    ("Development", dev_speaker_set),
    ("Testing", test_speaker_set),
):
    print(f"{split_label} dataset includes {len(split_speakers)} speakers and {split_speakers['duration'].sum()/(60*60)} hours")

Training dataset includes 641 speakers and 47.1312291005291 hours
Development dataset includes 138 speakers and 8.942813466868229 hours
Testing dataset includes 138 speakers and 8.584900818846059 hours


In [7]:
# Load the full dataframe (speaker_id kept as text). The cells that build the
# training / development / testing dataframes read OGI_df as well, so this
# cell has to run before them.
OGI_df = pd.read_csv(OGI_df_fp, dtype={'speaker_id': str})

# Testing only: show every row of the third speaker in the training speaker set.
id_to_match = train_speaker_set['speaker_id'].iloc[2]
is_matching_speaker = OGI_df['speaker_id'].eq(id_to_match)
matching_row = OGI_df.loc[is_matching_speaker]
print(matching_row)

                                                filepath  duration speaker_id   
67131  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  6.820091    0100302  \
67132  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.470068    0100302   
67133  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.300091    0100302   
67134  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  5.400091    0100302   
67135  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  8.720091    0100302   
...                                                  ...       ...        ...   
77559  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  5.844082    0100302   
77560  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.844127    0100302   
77561  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  2.250068    0100302   
77562  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  4.718095    0100302   
77563  /srv/scratch/chacmod/CU_2/corpus/data/train-pa...  1.781134    0100302   

                           

In [8]:
print("\n------> Obtaining training dataframe... ------\n")

# Extract filepath, duration, speaker_id and transcription of every speech file
# whose speaker belongs to the training speaker set.
#
# Column layout: the four expected columns first, then any further column of
# OGI_df - the same layout the earlier concat-onto-an-empty-template produced.
train_columns = ['filepath', 'duration', 'speaker_id', 'transcription']
train_columns += [col for col in OGI_df.columns if col not in train_columns]

# Position of each speaker inside the training speaker set; used below to keep
# the rows grouped by speaker in that order.
train_speaker_ids = train_speaker_set['speaker_id'].tolist()
train_speaker_rank = {speaker_id: rank for rank, speaker_id in enumerate(train_speaker_ids)}

# Single vectorised selection. Growing the dataframe with pd.concat once per
# speaker re-copies all accumulated rows on every iteration.
train_rows = OGI_df[OGI_df['speaker_id'].isin(train_speaker_ids)]

# Stable sort by speaker position: rows of one speaker keep their OGI_df order.
train_order = train_rows['speaker_id'].map(train_speaker_rank).to_numpy().argsort(kind='stable')
train_dp = (
    train_rows.iloc[train_order]
    .reindex(columns=train_columns)
    .reset_index(drop=True)
)

train_dp['speaker_id'] = train_dp['speaker_id'].astype(str)  # keep ids as text so leading zeros are preserved
train_dp.to_csv(OGI_train_df_fp, index=False)

train_total_seconds = train_dp['duration'].sum()
print("Number of speech files:", len(train_dp))
print("Total hours:", train_total_seconds/(60*60))
print("Total minutes:", train_total_seconds/(60))
print("Total seconds:", train_total_seconds)
print('Successfully saved training dataframe to csv file at: ', OGI_train_df_fp)


------> Obtaining training dataframe... ------

Number of speech files: 57329
Total hours: 47.1312291005291
Total minutes: 2827.8737460317457
Total seconds: 169672.42476190475
Successfully saved training dataframe to csv file at:  /srv/scratch/z5313567/thesis/CU_local/CU_full_train_dataframe_15sec.csv


In [9]:
print("\n------> Obtaining development dataframe... ------\n")

# Extract filepath, duration, speaker_id and transcription of every speech file
# whose speaker belongs to the development speaker set.
#
# Column layout: the four expected columns first, then any further column of
# OGI_df - the same layout the earlier concat-onto-an-empty-template produced.
dev_columns = ['filepath', 'duration', 'speaker_id', 'transcription']
dev_columns += [col for col in OGI_df.columns if col not in dev_columns]

# Position of each speaker inside the development speaker set; used below to
# keep the rows grouped by speaker in that order.
dev_speaker_ids = dev_speaker_set['speaker_id'].tolist()
dev_speaker_rank = {speaker_id: rank for rank, speaker_id in enumerate(dev_speaker_ids)}

# Single vectorised selection. Growing the dataframe with pd.concat once per
# speaker re-copies all accumulated rows on every iteration.
dev_rows = OGI_df[OGI_df['speaker_id'].isin(dev_speaker_ids)]

# Stable sort by speaker position: rows of one speaker keep their OGI_df order.
dev_order = dev_rows['speaker_id'].map(dev_speaker_rank).to_numpy().argsort(kind='stable')
dev_dp = (
    dev_rows.iloc[dev_order]
    .reindex(columns=dev_columns)
    .reset_index(drop=True)
)

dev_dp['speaker_id'] = dev_dp['speaker_id'].astype(str)  # keep ids as text so leading zeros are preserved
dev_dp.to_csv(OGI_dev_df_fp, index=False)

dev_total_seconds = dev_dp['duration'].sum()
print("Number of speech files:", len(dev_dp))
print("Total hours:", dev_total_seconds/(60*60))
print("Total minutes:", dev_total_seconds/(60))
print("Total seconds:", dev_total_seconds)
print('Successfully saved development dataframe to csv file at: ', OGI_dev_df_fp)


------> Obtaining development dataframe... ------

Number of speech files: 11057
Total hours: 8.942813466868229
Total minutes: 536.5688080120938
Total seconds: 32194.128480725623
Successfully saved development dataframe to csv file at:  /srv/scratch/z5313567/thesis/CU_local/CU_full_dev_dataframe_15sec.csv


In [10]:
print("\n------> Obtaining testing dataframe... ------\n")

# Extract filepath, duration, speaker_id and transcription of every speech file
# whose speaker belongs to the testing speaker set.
#
# Column layout: the four expected columns first, then any further column of
# OGI_df - the same layout the earlier concat-onto-an-empty-template produced.
test_columns = ['filepath', 'duration', 'speaker_id', 'transcription']
test_columns += [col for col in OGI_df.columns if col not in test_columns]

# Position of each speaker inside the testing speaker set; used below to keep
# the rows grouped by speaker in that order.
test_speaker_ids = test_speaker_set['speaker_id'].tolist()
test_speaker_rank = {speaker_id: rank for rank, speaker_id in enumerate(test_speaker_ids)}

# Single vectorised selection. Growing the dataframe with pd.concat once per
# speaker re-copies all accumulated rows on every iteration.
test_rows = OGI_df[OGI_df['speaker_id'].isin(test_speaker_ids)]

# Stable sort by speaker position: rows of one speaker keep their OGI_df order.
test_order = test_rows['speaker_id'].map(test_speaker_rank).to_numpy().argsort(kind='stable')
test_dp = (
    test_rows.iloc[test_order]
    .reindex(columns=test_columns)
    .reset_index(drop=True)
)

test_dp['speaker_id'] = test_dp['speaker_id'].astype(str)  # keep ids as text so leading zeros are preserved
test_dp.to_csv(OGI_test_df_fp, index=False)

test_total_seconds = test_dp['duration'].sum()
print("Number of speech files:", len(test_dp))
print("Total hours:", test_total_seconds/(60*60))
print("Total minutes:", test_total_seconds/(60))
print("Total seconds:", test_total_seconds)
print('Successfully saved testing dataset to csv file at: ', OGI_test_df_fp)


------> Obtaining testing dataframe... ------

Number of speech files: 10489
Total hours: 8.584900818846057
Total minutes: 515.0940491307634
Total seconds: 30905.642947845805
Successfully saved testing dataset to csv file at:  /srv/scratch/z5313567/thesis/CU_local/CU_full_test_dataframe_15sec.csv
