In [5]:
import pandas as pd

In [6]:
# Single place to repoint the notebook when the dataset is moved; every path
# below is derived from it, so the resulting strings are unchanged.
AUSKIDTALK_DIR = '/srv/scratch/z5313567/thesis/AusKidTalk_local'
# Sub-directory that receives the combined (scripted + spontaneous) CSVs.
COMBINED_DIR = f'{AUSKIDTALK_DIR}/combined_scripted_spontaneous'

# Inputs: one scripted dataframe per split, plus one spontaneous dataframe.
scripted_train_fp = f'{AUSKIDTALK_DIR}/AusKidTalk_train_dataframe.csv'
scripted_dev_fp = f'{AUSKIDTALK_DIR}/AusKidTalk_dev_dataframe.csv'
scripted_test_fp = f'{AUSKIDTALK_DIR}/AusKidTalk_test_dataframe.csv'
spontaneous_fp = f'{AUSKIDTALK_DIR}/AusKidTalk_spontaneous_dataframe.csv'

# Outputs: one combined dataframe per split.
combined_train_fp = f'{COMBINED_DIR}/AusKidTalk_combined_scripted_spontaneous_train_dataframe.csv'
combined_dev_fp = f'{COMBINED_DIR}/AusKidTalk_combined_scripted_spontaneous_dev_dataframe.csv'
combined_test_fp = f'{COMBINED_DIR}/AusKidTalk_combined_scripted_spontaneous_test_dataframe.csv'

In [7]:
# Step-by-step walkthrough on the training split: read the spontaneous
# dataframe and the scripted training dataframe from their CSV files.
spontaneous_df = pd.read_csv(spontaneous_fp)
scripted_df = pd.read_csv(scripted_train_fp)

In [8]:
# Distinct speaker ids that occur in the scripted dataframe.
scripted_speaker_ids = scripted_df.loc[:, "speaker_id"].unique()

In [9]:
# Keep only the spontaneous rows whose speaker also appears in the scripted data.
in_scripted_mask = spontaneous_df['speaker_id'].isin(scripted_speaker_ids)
filtered_spontaneous_df = spontaneous_df.loc[in_scripted_mask]

In [10]:
# Stack scripted rows first, then the matching spontaneous rows, and renumber
# the index from 0 so the two sources do not share index labels.
combined_df = pd.concat(
    objs=[scripted_df, filtered_spontaneous_df],
    ignore_index=True,
)

In [11]:
def combine_scripted_spontaneous(scripted_fp, spontaneous_fp, combined_fp):
    """Append same-speaker spontaneous rows to a scripted split and save the result.

    Reads both CSVs, keeps only the spontaneous rows whose ``speaker_id`` also
    occurs in the scripted CSV, concatenates scripted rows followed by those
    spontaneous rows (index reset), writes the result to ``combined_fp`` without
    the index column, and prints summary statistics.

    Parameters
    ----------
    scripted_fp : CSV of the scripted split; must have a ``speaker_id`` column.
    spontaneous_fp : CSV of spontaneous speech; must have a ``speaker_id`` column.
    combined_fp : destination for the combined CSV. When it is a filesystem
        path, missing parent directories are created.

    Returns
    -------
    None. Results are written to ``combined_fp`` and reported on stdout.

    Raises
    ------
    KeyError
        If ``speaker_id`` is missing from either input, or ``duration`` is
        missing from the combined frame.
    """
    # Local import keeps this cell self-contained (the notebook only imports pandas).
    import os

    scripted_df = pd.read_csv(scripted_fp)
    spontaneous_df = pd.read_csv(spontaneous_fp)
    scripted_speaker_ids = scripted_df['speaker_id'].unique()
    spontaneous_same_spkr_id_df = spontaneous_df[spontaneous_df['speaker_id'].isin(scripted_speaker_ids)]
    combined_df = pd.concat([scripted_df, spontaneous_same_spkr_id_df], ignore_index=True)

    # The output directory may not exist on a fresh run; create it rather than
    # letting to_csv fail. Skipped for non-path targets such as file handles.
    if isinstance(combined_fp, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(combined_fp))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Report success only after the write has actually completed.
    combined_df.to_csv(combined_fp, index=False)
    print('------------------------------Saved Successfully-----------------------------\n')

    num_speakers = combined_df['speaker_id'].nunique()
    # 'duration' is presumably in seconds; dividing by 3600 reports hours.
    total_duration = combined_df['duration'].sum()/(60*60) # in hours

    print(f'For the combined dataframe: {combined_fp}\n')
    print(f'The total number of speakers is {num_speakers}\n')
    print(f'The total duration of audio files is {total_duration} hours.\n')
    print(f'The number of scripted speeches is {len(scripted_df)}\n')
    print(f'The number of spontaneous speeches is {len(spontaneous_same_spkr_id_df)}\n')

In [12]:
# Build and save the combined CSV for the training split.
print('------------------------------Processing training dataset-----------------------------\n')
combine_scripted_spontaneous(
    scripted_fp=scripted_train_fp,
    spontaneous_fp=spontaneous_fp,
    combined_fp=combined_train_fp,
)

------------------------------Processing training dataset-----------------------------

------------------------------Saved Successfully-----------------------------

For the combined dataframe: /srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous/AusKidTalk_combined_scripted_spontaneous_train_dataframe.csv

The total number of speakers is 55

The total duration of audio files is 1.6736812458128523 hours.

The number of scripted speeches is 8093

The number of spontaneous speeches is 60



In [13]:
# Build and save the combined CSV for the development split.
print('------------------------------Processing development dataset-----------------------------\n')
combine_scripted_spontaneous(
    scripted_fp=scripted_dev_fp,
    spontaneous_fp=spontaneous_fp,
    combined_fp=combined_dev_fp,
)

------------------------------Processing development dataset-----------------------------

------------------------------Saved Successfully-----------------------------

For the combined dataframe: /srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous/AusKidTalk_combined_scripted_spontaneous_dev_dataframe.csv

The total number of speakers is 12

The total duration of audio files is 0.40105846368583753 hours.

The number of scripted speeches is 1744

The number of spontaneous speeches is 27



In [14]:
# Build and save the combined CSV for the testing split.
print('------------------------------Processing testing dataset-----------------------------\n')
combine_scripted_spontaneous(
    scripted_fp=scripted_test_fp,
    spontaneous_fp=spontaneous_fp,
    combined_fp=combined_test_fp,
)

------------------------------Processing testing dataset-----------------------------

------------------------------Saved Successfully-----------------------------

For the combined dataframe: /srv/scratch/z5313567/thesis/AusKidTalk_local/combined_scripted_spontaneous/AusKidTalk_combined_scripted_spontaneous_test_dataframe.csv

The total number of speakers is 12

The total duration of audio files is 0.3783709397312221 hours.

The number of scripted speeches is 1774

The number of spontaneous speeches is 24



In [15]:
# Total audio duration (hours) of each combined split, copied from the
# outputs printed by the three processing cells.
train_hours = 1.6736812458128523
dev_hours = 0.40105846368583753
test_hours = 0.3783709397312221
# Summed train -> dev -> test so the floating-point total is the same value
# as when the sum is written inline in that order.
total_hours = train_hours + dev_hours + test_hours

print(f'The proportion of training set is {train_hours / total_hours}\n')
print(f'The proportion of development set is {dev_hours / total_hours}\n')
print(f'The proportion of testing set is {test_hours / total_hours}\n')


The proportion of training set is 0.6822689577163018

The proportion of development set is 0.1634897568977327

The proportion of testing set is 0.15424128538596557

