In [1]:
# Purpose: partition datasets into training, development and testing sets
# Requirement: prepare_scripted_datasets.py (ipynb)
#              group_speaker_id.py (ipynb)

In [2]:
print("\n------> Importing libraries... ------\n")

import pandas as pd
from sklearn.model_selection import train_test_split


------> Importing libraries... ------



In [3]:
# Random seed passed to both train_test_split calls.
# The original author marked this value as "cannot be changed" (presumably so the
# partitions stay reproducible across runs -- confirm before touching it).
seed = 230

# Fraction of speakers assigned to each partition.
# When changing the proportion, go to group_speaker_id.ipynb to change the parameter "prop_training"
train_num = 0.70
dev_num = 0.15
test_num = 0.15

# The split only reads dev_num and test_num; the training share is whatever is left over.
# Check that the three fractions agree so train_num cannot silently drift from the real split.
assert abs(train_num + dev_num + test_num - 1.0) < 1e-9, \
    "train_num + dev_num + test_num must sum to 1"

In [4]:
print("\n------> Loading files... ------\n")

# Input CSVs: the full OGI scripted dataframe and its group-by-speaker counterpart.
OGI_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv'
OGI_speaker_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_speaker_dataframe.csv'

# Output CSVs: one file per partition.
OGI_train_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv'
OGI_dev_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv'
OGI_test_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv'

# Echo every path so the run log records which files were used.
# The group-by-speaker line must report OGI_speaker_df_fp, not the training output path.
print(f'OGI dataframe is stored at {OGI_df_fp}')
print(f'OGI group-by-speaker dataframe is stored at {OGI_speaker_df_fp}')
print(f'OGI train dataframe is stored at {OGI_train_df_fp}')
print(f'OGI development dataframe is stored at {OGI_dev_df_fp}')
print(f'OGI test dataframe is stored at {OGI_test_df_fp}')


------> Loading files... ------

OFI dataframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv
OGI group-by-speaker dataframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv
OGI train daraframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv
OGI development daraframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv
OGI test daraframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv


In [5]:
print("\n------> Splitting the group-by-speaker dataframe... ------\n")

# Read the group-by-speaker table; the split is done on its rows
# (presumably one row per speaker -- verify against group_speaker_id.ipynb).
OGI_speaker_df = pd.read_csv(OGI_speaker_df_fp)

# Stage 1: set aside the combined dev+test share; the remaining rows form the training set.
train_speaker_set, dev_test_speaker_set = train_test_split(
    OGI_speaker_df,
    test_size=(dev_num+test_num),
    random_state=seed,
    shuffle=True,
)

# Stage 2: divide the held-out rows between dev and test.
# prop_test is the test share *within* the held-out portion, not within the whole dataset.
prop_test = test_num/(dev_num+test_num)
dev_speaker_set, test_speaker_set = train_test_split(
    dev_test_speaker_set,
    test_size=prop_test,
    random_state=seed,
    shuffle=True,
)


------> Splitting the group-by-speaker dataframe... ------



In [6]:
# Report, for each partition, the number of rows in its speaker table and the
# summed 'duration' column converted from seconds to hours (divide by 60*60).
for partition_label, speaker_subset in (
    ("Training", train_speaker_set),
    ("Development", dev_speaker_set),
    ("Testing", test_speaker_set),
):
    partition_hours = speaker_subset['duration'].sum()/(60*60)
    print(f"{partition_label} dataset includes {len(speaker_subset)} speakers and {partition_hours} hours")

Training dataset includes 782 speakers and 48.47577176870748 hours
Development dataset includes 168 speakers and 10.60130604686319 hours
Testing dataset includes 168 speakers and 10.706544583018394 hours


In [7]:
# Sanity check (testing purposes only): look up one training speaker in the full
# OGI dataframe and print every row recorded for that speaker.
OGI_df = pd.read_csv(OGI_df_fp)

# Take the third speaker (position 2) of the training split as the probe.
id_to_match = train_speaker_set['speaker_id'].iloc[2]
is_probe_speaker = OGI_df['speaker_id'] == id_to_match
matching_row = OGI_df.loc[is_probe_speaker]
print(matching_row)

                                                filepath  duration speaker_id   
44850  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  3.877188      ksk13  \
44851  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  1.967710      ksk13   
44852  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  1.968345      ksk13   
44853  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  4.599456      ksk13   
44854  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  3.156327      ksk13   
...                                                  ...       ...        ...   
44913  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  5.343628      ksk13   
44914  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  1.967574      ksk13   
44915  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  2.798095      ksk13   
44916  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  1.587664      ksk13   
44917  /srv/scratch/chacmod/OGI/speech/scripted/09/0/...  3.156916      ksk13   

                          t

In [8]:
# Load the full OGI dataframe from OGI_df_fp; the extraction cells below filter it by speaker_id.
# NOTE(review): the sanity-check cell above reads the same file -- presumably repeated here so
# the extraction cells still work when that optional cell is skipped; confirm.
OGI_df = pd.read_csv(OGI_df_fp)

In [17]:
print("\n------> Obtaining training dataframe... ------\n")

# Extract the rows (filepath, duration, speaker_id and transcription) of the original OGI
# dataframe that belong to the speakers of the training set.
# Rows are collected speaker by speaker, in the order the speakers appear in train_speaker_set.
train_speaker_ids = train_speaker_set['speaker_id'].values
train_parts = [OGI_df[OGI_df['speaker_id'] == speaker_id] for speaker_id in train_speaker_ids]

# Concatenate all per-speaker slices in one call rather than growing the dataframe inside a
# loop; ignore_index=True reindexes the result from 0.
# With no speakers at all, fall back to an empty dataframe that keeps OGI_df's columns.
if train_parts:
    train_dp = pd.concat(train_parts, ignore_index=True)
else:
    train_dp = OGI_df.iloc[0:0].copy()

# Write the CSV once, after the dataframe is complete; writing inside the loop would
# rewrite the whole file for every speaker.
train_dp.to_csv(OGI_train_df_fp, index=False)

print("Number of speech files:", len(train_dp))
print("Total hours:", train_dp['duration'].sum()/(60*60))
print('Successfully saved training dataframe to csv file at: ', OGI_train_df_fp)


------> Obtaining training dataframe... ------

Number of speech files: 50187
Total hours: 48.47577176870748
Successfully saved training dataframe to csv file at:  /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_train_dataframe.csv


In [15]:
print("\n------> Obtaining development dataframe... ------\n")

# Extract the rows (filepath, duration, speaker_id and transcription) of the original OGI
# dataframe that belong to the speakers of the development set.
# Rows are collected speaker by speaker, in the order the speakers appear in dev_speaker_set.
dev_speaker_ids = dev_speaker_set['speaker_id'].values
dev_parts = [OGI_df[OGI_df['speaker_id'] == speaker_id] for speaker_id in dev_speaker_ids]

# Concatenate all per-speaker slices in one call rather than growing the dataframe inside a
# loop; ignore_index=True reindexes the result from 0.
# With no speakers at all, fall back to an empty dataframe that keeps OGI_df's columns.
if dev_parts:
    dev_dp = pd.concat(dev_parts, ignore_index=True)
else:
    dev_dp = OGI_df.iloc[0:0].copy()

# Write the CSV once, after the dataframe is complete; writing inside the loop would
# rewrite the whole file for every speaker.
dev_dp.to_csv(OGI_dev_df_fp, index=False)

print("Number of speech files:", len(dev_dp))
print("Total hours:", dev_dp['duration'].sum()/(60*60))
print('Successfully saved development dataframe to csv file at: ', OGI_dev_df_fp)


------> Obtaining development dataframe... ------

Number of speech files: 10965
Total hours: 10.60130604686319
Successfully saved development dataframe to csv file at:  /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dev_dataframe.csv


In [16]:
print("\n------> Obtaining testing dataframe... ------\n")

# Extract the rows (filepath, duration, speaker_id and transcription) of the original OGI
# dataframe that belong to the speakers of the testing set.
# Rows are collected speaker by speaker, in the order the speakers appear in test_speaker_set.
test_speaker_ids = test_speaker_set['speaker_id'].values
test_parts = [OGI_df[OGI_df['speaker_id'] == speaker_id] for speaker_id in test_speaker_ids]

# Concatenate all per-speaker slices in one call rather than growing the dataframe inside a
# loop; ignore_index=True reindexes the result from 0.
# With no speakers at all, fall back to an empty dataframe that keeps OGI_df's columns.
if test_parts:
    test_dp = pd.concat(test_parts, ignore_index=True)
else:
    test_dp = OGI_df.iloc[0:0].copy()

# Write the CSV once, after the dataframe is complete; writing inside the loop would
# rewrite the whole file for every speaker.
test_dp.to_csv(OGI_test_df_fp, index=False)

print("Number of speech files:", len(test_dp))
print("Total hours:", test_dp['duration'].sum()/(60*60))
print('Successfully saved testing dataset to csv file at: ', OGI_test_df_fp)


------> Obtaining testing dataframe... ------

Number of speech files: 10847
Total hours: 10.706544583018392
Successfully saved testing dataset to csv file at:  /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_test_dataframe.csv
