In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [18]:
full_df_fp = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_dataframe.csv'
train_df_fp = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_train_dataframe.csv'
dev_df_fp = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_dev_dataframe.csv'
test_df_fp = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_test_dataframe.csv'

In [19]:
df = pd.read_csv(full_df_fp, dtype={'speaker_id': str})

In [20]:
# shuffle the dataframe
df = df.sample(frac=1, random_state=230)

In [21]:
unique_transcriptions = df['transcription'].unique()
train_transcriptions, valid_test_transcriptions = train_test_split(unique_transcriptions, test_size=0.4, random_state=42)
valid_transcriptions, test_transcriptions = train_test_split(valid_test_transcriptions, test_size=0.5, random_state=42)

train_set = df[df['transcription'].isin(train_transcriptions)]
valid_set = df[df['transcription'].isin(valid_transcriptions)]
test_set = df[df['transcription'].isin(test_transcriptions)]

In [22]:
def split_by_speakers(data):
    unique_speakers = data['speaker_id'].unique()
    train_speakers, valid_test_speakers = train_test_split(unique_speakers, test_size=0.4, random_state=42)
    valid_speakers, test_speakers = train_test_split(valid_test_speakers, test_size=0.5, random_state=42)

    train_data = data[data['speaker_id'].isin(train_speakers)]
    valid_data = data[data['speaker_id'].isin(valid_speakers)]
    test_data = data[data['speaker_id'].isin(test_speakers)]

    return train_data, valid_data, test_data

train_set, valid_set, test_set = split_by_speakers(train_set), split_by_speakers(valid_set), split_by_speakers(test_set)


In [23]:
train_df, valid_df, test_df = train_set[0], valid_set[0], test_set[0]

In [27]:
type(train_df)

pandas.core.frame.DataFrame

In [30]:
train_df['speaker_id'] = train_df['speaker_id'].astype(str)  # preverse leading zeros
train_df.to_csv(train_df_fp, index=False)

print("Number of speech files:", len(train_df))
print("Total hours:", train_df['duration'].sum()/(60*60))
print("Total minutes:", train_df['duration'].sum()/(60))
print("Total seconds:", train_df['duration'].sum())
print('Successfully saved training dataframe to csv file at: ', train_df_fp)

Number of speech files: 16728
Total hours: 42.24523934240362
Total minutes: 2534.7143605442175
Total seconds: 152082.86163265305
Successfully saved training dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_train_dataframe


In [31]:
valid_df['speaker_id'] = valid_df['speaker_id'].astype(str)  # preverse leading zeros
valid_df.to_csv(dev_df_fp, index=False)

print("Number of speech files:", len(valid_df))
print("Total hours:", valid_df['duration'].sum()/(60*60))
print("Total minutes:", valid_df['duration'].sum()/(60))
print("Total seconds:", valid_df['duration'].sum())
print('Successfully saved development dataframe to csv file at: ', dev_df_fp)

Number of speech files: 5455
Total hours: 9.858598437893676
Total minutes: 591.5159062736205
Total seconds: 35490.954376417234
Successfully saved development dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_dev_dataframe


In [32]:
test_df['speaker_id'] = test_df['speaker_id'].astype(str)  # preverse leading zeros
test_df.to_csv(test_df_fp, index=False)
    
print("Number of speech files:", len(test_df))
print("Total hours:", test_df['duration'].sum()/(60*60))
print("Total minutes:", test_df['duration'].sum()/(60))
print("Total seconds:", test_df['duration'].sum())
print('Successfully saved testing dataset to csv file at: ', test_df_fp)

Number of speech files: 5941
Total hours: 16.536351473922902
Total minutes: 992.1810884353741
Total seconds: 59530.86530612245
Successfully saved testing dataset to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_test_dataframe
