In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [28]:
# All four CSVs live in the same directory and share a file-name stem and
# suffix, so those pieces are defined once and the paths are derived from them.
data_dir = '/srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full'
file_stem = 'AusTalk_story_sentences_full'
file_suffix = 'dataframe_15sec.csv'

# Input: the complete utterance table that gets partitioned.
full_df_fp = f'{data_dir}/{file_stem}_{file_suffix}'
# Outputs: one CSV per split.
train_df_fp = f'{data_dir}/{file_stem}_train_{file_suffix}'
dev_df_fp = f'{data_dir}/{file_stem}_dev_{file_suffix}'
test_df_fp = f'{data_dir}/{file_stem}_test_{file_suffix}'

In [29]:
# Load the full utterance table. The speaker_id column is forced to str dtype
# instead of letting pandas infer a type for it.
df = pd.read_csv(
    full_df_fp,
    dtype={'speaker_id': str},
)

In [30]:
# Fixed random_state handed to every train_test_split call in this notebook,
# so the transcription and speaker partitions are reproducible across runs.
seed = 230

In [31]:
# Step 1: collapse the utterance table to one row per unique transcription,
# carrying the summed duration of all rows that share that transcription.
group_by_transcription = (
    df.groupby('transcription')['duration']
    .sum()
    .reset_index()
)

# Step 2: partition the unique transcriptions. 60% are held out of training
# and that held-out part is then halved, i.e. roughly 40% train / 30% dev /
# 30% test at the transcription level.
train_transcription, dev_test_transcription = train_test_split(
    group_by_transcription,
    test_size=0.6,
    random_state=seed,
    shuffle=True,
)
dev_transcription, test_transcription = train_test_split(
    dev_test_transcription,
    test_size=0.5,
    random_state=seed,
    shuffle=True,
)

In [32]:
# Step 3: collapse the utterance table to one row per speaker, carrying the
# summed duration of all rows recorded by that speaker.
group_by_speaker = (
    df.groupby('speaker_id')['duration']
    .sum()
    .reset_index()
)

# Step 4: partition the speakers. 30% are held out of training and that
# held-out part is then halved, i.e. roughly 70% train / 15% dev / 15% test
# at the speaker level.
train_speaker, dev_test_speaker = train_test_split(
    group_by_speaker,
    test_size=0.3,
    random_state=seed,
    shuffle=True,
)
dev_speaker, test_speaker = train_test_split(
    dev_test_speaker,
    test_size=0.5,
    random_state=seed,
    shuffle=True,
)

In [33]:
# combine speaker_ids and transcriptions in training, development and testing sets
def combine_sets(transcription_df, speaker_df, source_df=None):
    """Select the rows whose speaker AND transcription both belong to one split.

    Parameters
    ----------
    transcription_df : pandas.DataFrame
        Must contain a 'transcription' column listing the transcriptions
        assigned to the split.
    speaker_df : pandas.DataFrame
        Must contain a 'speaker_id' column listing the speakers assigned to
        the split.
    source_df : pandas.DataFrame, optional
        Table to filter; must contain 'speaker_id' and 'transcription'
        columns. When omitted, the notebook-level ``df`` is used, which keeps
        existing two-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The rows of the source table that match on both columns, with their
        original index labels. Rows matching on only one of the two columns
        are not returned.

    Side effects
    ------------
    Prints how many distinct speakers and how many distinct transcriptions of
    the source table matched (each counted independently of the other).
    """
    # Fall back to the global table only when no explicit one was supplied.
    frame = df if source_df is None else source_df

    # Build each membership mask once and reuse it for both the diagnostic
    # counts and the final selection.
    speaker_mask = frame['speaker_id'].isin(speaker_df['speaker_id'])
    transcription_mask = frame['transcription'].isin(transcription_df['transcription'])

    print('The number of matched speakers is: ', len(set(frame.loc[speaker_mask, 'speaker_id'])))
    print('The number of matched transcriptions is: ', len(set(frame.loc[transcription_mask, 'transcription'])))

    # Keep only rows that satisfy both conditions.
    matched_rows = frame[speaker_mask & transcription_mask]
    return matched_rows

# Build the three splits in train, dev, test order; each one keeps only rows
# whose transcription and speaker were both assigned to that split.
train_df, dev_df, test_df = (
    combine_sets(split_transcriptions, split_speakers)
    for split_transcriptions, split_speakers in (
        (train_transcription, train_speaker),
        (dev_transcription, dev_speaker),
        (test_transcription, test_speaker),
    )
)

The number of matched speakers is:  542
The number of matched transcriptions is:  26
The number of matched speakers is:  116
The number of matched transcriptions is:  19
The number of matched speakers is:  117
The number of matched transcriptions is:  20


In [34]:
# Write the training split to disk (without the index column), then report its size.
train_df.to_csv(train_df_fp, index=False)

# Total duration is summed once and converted for each unit below.
train_total_seconds = train_df['duration'].sum()
print("Number of speech files:", len(train_df))
print("Total hours:", train_total_seconds / 3600)
print("Total minutes:", train_total_seconds / 60)
print("Total seconds:", train_total_seconds)
print('Successfully saved training dataframe to csv file at: ', train_df_fp)

Number of speech files: 12260
Total hours: 20.48828230032754
Total minutes: 1229.2969380196523
Total seconds: 73757.81628117914
Successfully saved training dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_train_dataframe_15sec.csv


In [35]:
# Write the development split to disk (without the index column), then report its size.
dev_df.to_csv(dev_df_fp, index=False)

# Total duration is summed once and converted for each unit below.
dev_total_seconds = dev_df['duration'].sum()
print("Number of speech files:", len(dev_df))
print("Total hours:", dev_total_seconds / 3600)
print("Total minutes:", dev_total_seconds / 60)
print("Total seconds:", dev_total_seconds)
print('Successfully saved development dataframe to csv file at: ', dev_df_fp)

Number of speech files: 1830
Total hours: 2.775826820357773
Total minutes: 166.54960922146637
Total seconds: 9992.976553287983
Successfully saved development dataframe to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_dev_dataframe_15sec.csv


In [36]:
# Write the testing split to disk (without the index column), then report its size.
test_df.to_csv(test_df_fp, index=False)

# Total duration is summed once and converted for each unit below.
test_total_seconds = test_df['duration'].sum()
print("Number of speech files:", len(test_df))
print("Total hours:", test_total_seconds / 3600)
print("Total minutes:", test_total_seconds / 60)
print("Total seconds:", test_total_seconds)
print('Successfully saved testing dataset to csv file at: ', test_df_fp)

Number of speech files: 1895
Total hours: 3.06150382968002
Total minutes: 183.6902297808012
Total seconds: 11021.413786848072
Successfully saved testing dataset to csv file at:  /srv/scratch/z5313567/thesis/AusTalk_local/story_sentences/full/AusTalk_story_sentences_full_test_dataframe_15sec.csv
