In [None]:
# Purpose: Extract and group the speakers from the datasets to ensure that the speakers appear in train datasets
#          do not appear in dev or test datasets.
#          (Optional) to extract speakers by the desired durations
# Requirement: prepare_scripted_datasets.py (ipynb)
# Based on https://github.com/monomest/Thesis/blob/3a15f747dfd934535ffb7a02bf3fee97d9c546cb/s5/wav2vec_projects/OGI_getSpkrs.py

In [1]:
print("\n------> Importing libraries... ------\n")

import pandas as pd


------> Importing libraries... ------



In [24]:
extract_by_duration = True
desired_training_duration = 60*60 # convert to seconds
prop_training = 0.7 
if extract_by_duration:
    print(f'The desired duration of the training dataset is {desired_training_duration} seconds, {desired_training_duration/60} minutes, and {desired_training_duration/(60*60)} hours')
    # Refer to data_splitting.ipynb, the entire dataset is split into training, development and testing sets by 0.7:0.15:0.15
    total_duration = desired_duration / prop_training
    print(f'The total duration including training, development and testing sets should be {total_duration}')

The desired duration of the training dataset is 3600 seconds, 60.0 minutes, and 1.0 hours
The total duration including training, development and testing sets should be 857.1428571428572


In [16]:
print("\n------> Loading files... ------\n")

# Path where the OGI dataframe are stored
OGI_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv'
#OGI_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_spontaneous_dataframe.csv'

# Path to save the OGI group-by-speaker daraframe 
OGI_speaker_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_speaker_dataframe.csv'
#OGI_speaker_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_spontaneous_speaker_dataframe.csv'

# Path to save the OGI group-by-speaker dataframe according to desired duration
OGI_speaker_duration_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/10min_datasets'

print(f'OGI dataframe is stored at {OGI_df_fp}')
print(f'OGI group-by-speaker daraframe is stored at {OGI_speaker_df_fp}')


------> Loading files... ------

OGI dataframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv
OGI group-by-speaker daraframe is stored at /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_speaker_dataframe.csv


In [7]:
OGI_df = pd.read_csv(OGI_df_fp)
print("\n------> Describing dataset... ------\n")
print(OGI_df.speaker_id.describe())


------> Describing dataset... ------

count     71999
unique     1118
top       ks737
freq         68
Name: speaker_id, dtype: object


In [8]:
OGI_df.drop(columns=['filepath', 'transcription'], inplace=True)

print("\n------> Grouping according to speakers... ------\n")
OGI_speaker_df = OGI_df.groupby('speaker_id')['duration'].sum()
OGI_speaker_df = OGI_speaker_df.reset_index()
print(OGI_speaker_df)


------> Grouping according to speakers... ------

     speaker_id    duration
0         ks000  218.426803
1         ks001  148.691202
2         ks003  133.172018
3         ks006  140.281134
4         ks008  146.327075
...         ...         ...
1113      ksl1g  214.528934
1114      ksl2t  203.015057
1115      ksl2v  222.057143
1116      ksl2w  220.096236
1117      ksl2x  208.330159

[1118 rows x 2 columns]


In [9]:
print("\n------> Saving dataframe to csv file... ------\n")
OGI_speaker_df.to_csv(OGI_speaker_df_fp, index = False)
print('Successfully saved group-by-speaker dataframe to csv file at:', OGI_speaker_df_fp)


------> Saving dataframe to csv file... ------

Successfully saved group-by-speaker dataframe to csv file at: /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_speaker_dataframe.csv


In [10]:
print('Total number of speakers:', len(OGI_speaker_df))
print("Total hours:", OGI_speaker_df['duration'].sum()/(60*60))

Total number of speakers: 1118
Total hours: 69.78362239858907


In [17]:
type(OGI_speaker_df)

pandas.core.frame.DataFrame

In [23]:
duration_count = 0
extracted_rows = []
if extract_by_duration:
    for index, row in OGI_speaker_df.iterrows():
        if duration_count <= total_duration:
            extracted_rows.append(row)
            duration_count += row['duration']
    extracted_df_by_duration = pd.DataFrame(extracted_rows)
    print('Total number of speakers:', len(extracted_df_by_duration))
    print("Total hours:", extracted_df_by_duration['duration'].sum()/(60*60))
    print("Total minutes:", extracted_df_by_duration['duration'].sum()/(60))
    print("Total seconds:", extracted_df_by_duration['duration'].sum())

Total number of speakers: 6
Total hours: 0.2555744142101285
Total minutes: 15.33446485260771
Total seconds: 920.0678911564626


In [21]:
print(extracted_df_by_duration)

  speaker_id    duration
0      ks000  218.426803
1      ks001  148.691202
2      ks003  133.172018
3      ks006  140.281134
4      ks008  146.327075
5      ks00b  133.169660
