In [10]:
# Purpose: prepare datasets for OGI scripted datasets

In [7]:
print("\n------> Importing libraries... ------\n")
import os
import csv
import re
import librosa
import pandas as pd


------> Importing libraries... ------



In [8]:
print("\n------> Loading files... ------\n")
map_fp = "/srv/scratch/z5313567/thesis/OGI_local/all.csv" 
print(f'Mapping file is stored at: {map_fp}')

dataset_speech_fp = "/srv/scratch/chacmod/OGI/speech/scripted"
scripted_trans_fp = "/srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv" 
print(f'Speech files are stored at: {dataset_speech_fp}')
print(f'OGI scripted dataframe is stored at: {scripted_trans_fp}')


------> Loading files... ------

Mapping file is stored at: /srv/scratch/z5313567/thesis/OGI_local/all.csv
Speech files are stored at: /srv/scratch/chacmod/OGI/speech/scripted
OGI scripted dataframe is stored at: /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv


In [9]:
print("\n------> Obtaining .wav filepath + 2-digit speech ID... ------\n")
print("\n------> Also obtaining durations and speaker ID... ------\n")
speech_filepath = []
key_speech_ids = []
durations = []
speaker_ids = []
for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_speech_fp)):
    if dirpath is not dataset_speech_fp:
        #print(dirpath, "     ", dirnames, "    ",  filenames)    
        # check if 'filenames' is empty and with 'wav' extension
        if filenames and (filename.endswith(".wav") for filename in filenames): 
            for file_name in filenames:               
                # obtain each .wav filepath
                speech_fp = os.path.join(dirpath, file_name)
                speech_filepath.append(speech_fp)
                
                # obtain speech id
                # for example, /srv/scratch/chacmod/OGI/speech/scripted/00/0/ks001/ks001000.wav, ks001000 is extracted
                # the 6th and 7th digits of ks001000, 00, map to the transcription 'advantage'
                speech_id = file_name.replace('.wav', '')
                key_speech_id = speech_id[5:7]
                key_speech_ids.append(key_speech_id.upper())
                
                # obtain the signal, sampling rate, and hence the duration of each .wav file
                signal, sample_rate = librosa.load(speech_fp)
                dur = len(signal)/sample_rate
                durations.append(dur)
                
                # obtain speaker id
                # for example, /srv/scratch/chacmod/OGI/speech/scripted/00/0/ks001/ks001000.wav, ks001 is extracted
                directories = speech_fp.split('/')
                speaker_id = directories[-2]
                speaker_ids.append(speaker_id)
                
print(f'The numbre of 2-digit speech IDs is {len(key_speech_ids)}, and the number of speech_filepath is {len(speech_filepath)}')


------> Obtaining .wav filepath + 2-digit speech ID... ------


------> Also obtaining durations and speaker ID... ------

The numbre of 2-digit speech IDs is 71999, and the number of speech_filepath is 71999


In [5]:
#id_fp = {'filepath': speech_filepath, '2-digit speech ID': key_speech_ids, 'duration': }
#id_fp_dp = pd.DataFrame(id_fp)
id_fp_dp = pd.DataFrame(
        {'filepath': speech_filepath,
         '2-digit speech ID': key_speech_ids,
         'duration': durations,
         'speaker_id': speaker_ids
         })

In [7]:
print("\n------> Creating 2-digit transcription ID + transcription dataframe... ------\n")
trans = []
trans_ids = []
pattern = r'\"([^]"+)\"'
i = 0
with open(map_fp, 'r') as map_file:
    reader = csv.reader(map_file)
    # each line is in the format: ['00 "advantage"']
    for line in reader:
        # check if the line is empty
        if len(line) != 0:
            # after split: line[0].split('"') = ['00 ', 'advantage', '']
            trans_id = line[0].split('"')[0]
            # remove the trailing space
            trans_id = trans_id.replace(' ', '')
            trans_ids.append(trans_id)
            
            keywords = line[0].split('"')[1]
            # remove unnecessary symbols
            chars_to_ignore = '[\,\?\.\!\-\;\:\"]'
            keywords = re.sub(chars_to_ignore, '', keywords).lower()
            trans.append(keywords)
print(f'The number of 2-digit transcription IDs is {len(trans_ids)} and the number of transcriptions is {len(trans)}')


------> Creating 2-digit transcription ID + transcription dataframe... ------

The number of 2-digit transcription IDs is 321 and the number of transcriptions is 321


In [8]:
#id_trans = {'2-digit transcription ID': trans_ids, 'transcription': trans}
#id_trans_df = pd.DataFrame(id_trans)
id_trans_df = pd.DataFrame(
        {'2-digit transcription ID': trans_ids,
         'transcription': trans
         })
#print(id_trans_df)

In [9]:
'''
# can delete
id_to_match = id_fp_dp['2-digit speech ID'].values[1]
matching_row = id_trans_df[id_trans_df['2-digit transcription ID']==id_to_match]
matching_trans = matching_row['transcription'].values[0]
print(type(matching_trans))
#matching_trans = id_trans_df
'''

"\n# can delete\nid_to_match = id_fp_dp['2-digit speech ID'].values[1]\nmatching_row = id_trans_df[id_trans_df['2-digit transcription ID']==id_to_match]\nmatching_trans = matching_row['transcription'].values[0]\nprint(type(matching_trans))\n#matching_trans = id_trans_df\n"

In [10]:
print("\n------> Matching 2-digit ID... ------\n")
extract_trans = []
for i in range(len(id_fp_dp)):
    id_to_match = id_fp_dp['2-digit speech ID'].values[i]
    matching_row = id_trans_df[id_trans_df['2-digit transcription ID']==id_to_match]
    matching_trans = matching_row['transcription'].values[0]
    extract_trans.append(matching_trans)


------> Matching 2-digit ID... ------



In [11]:
id_fp_dp['transcription'] = extract_trans

In [12]:
#id_fp_dp['2-digit speech ID'] = id_fp_dp['2-digit speech ID'].astype(str)

In [12]:
id_fp_dp.drop(columns='2-digit speech ID', inplace=True)

In [13]:
#| filepath | 2-digit speech ID | transcription |
print("\n------> Saving dataframe to csv file... ------\n")
id_fp_dp.to_csv(scripted_trans_fp, index=False)
print('Successfully saved dataframe to csv file at: ', scripted_trans_fp)
print("Total number of speakers:", len(id_fp_dp))
print("Total hours:", id_fp_dp['duration'].sum()/(60*60))


------> Saving dataframe to csv file... ------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/OGI_local/OGI_scripted_dataframe.csv
Total number of speakers: 71999
Total hours: 69.78362239858907


In [None]:
'''
# can delete


# obtain speech id
# for example, /srv/scratch/chacmod/OGI/speech/scripted/00/0/ks001/ks001000.wav, ks001000 is extracted
# the 6th and 7th digits of ks001000, 00, map to the transcription 'advantage'
for filepath in speech_filepath:
    directories = filepath.split('/')
    speech_id = directories[-1]
    speech_id = speech_id.replace('.wav', '')
    key_speech_id = speech_id 
'''

"\n# obtain speech id\n# for example, /srv/scratch/chacmod/OGI/speech/scripted/00/0/ks001/ks001000.wav, ks001000 is extracted\n# the 6th and 7th digits of ks001000, 00, map to the transcription 'advantage'\nfor filepath in speech_filepath:\n    directories = filepath.split('/')\n    speech_id = directories[-1]\n    speech_id = speech_id.replace('.wav', '')\n    key_speech_id = speech_id \n"