In [None]:
pip install librosa


In [None]:
pip install os

In [None]:
pip install pandas

In [1]:
# Purpose: prepare datasets for OGI spontaneous datasets
# Based on https://github.com/musikalkemist/Deep-Learning-Audio-Application-From-Design-to-Deployment/blob/master/2-%20Preparing%20the%20Dataset/prepare_dataset.py
# Based on https://github.com/monomest/Thesis/blob/3a15f747dfd934535ffb7a02bf3fee97d9c546cb/s5/wav2vec_projects/OGI_prep.py#L40
print("\n------> Importing libraries... ------\n")

import librosa
import os
import json
import re
import pandas as pd


------> Importing libraries... ------



In [2]:
print("\n------> Loading files... ------\n")

# Path where the OGI datasets are stored
dataset_transcription_fp = "/srv/scratch/chacmod/OGI/trans/spontaneous"
dataset_speech_fp = "/srv/scratch/chacmod/OGI/speech/spontaneous"
print(f'Transcription files are stored at: {dataset_transcription_fp}')
print(f'Speech files are stored at: {dataset_speech_fp}')

# Path to save OGI dataframe
OGI_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_spontaneous_dataframe.csv'
print(f'OGI spontaneous dataframe is stored at: {OGI_df_fp}')




------> Loading files... ------

Transcription files are stored at: /srv/scratch/chacmod/OGI/trans/spontaneous
Speech files are stored at: /srv/scratch/chacmod/OGI/speech/spontaneous
OGI spontaneous dataframe is stored at: /srv/scratch/z5313567/thesis/OGI_local/OGI_spontaneous_dataframe.csv


In [3]:
print("\n------> Obtaining speech tags... ------\n")

trans = [];
squ_brkt_tags = [] # []
ang_brkt_tags = [] # <>
rnd_brkt_tags = [] # ()

# pattern to remove: [words], <words>, (words)
pattern = r'\[([^]]+)\]|<([^>]+)>|\(([^)]+)\)'

for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_transcription_fp)):
    if dirpath is not dataset_transcription_fp:      
        if filenames and any(filename.endswith(".txt") for filename in filenames):
            trans_fp = os.path.join(dirpath, filenames[0])
            with open(trans_fp, "r") as f:
                lines = f.readlines()
                line = lines[0]
                '''
                if '<br. and maybe walk my dog maybe play with my cat <br>' in line:
                    print(trans_fp)
                    break
                '''
                matches = re.findall(pattern, line)
                for match in matches:
                    if match[0]: # []
                        squ_brkt_tags.append('[{}]'.format(match[0]))
                        squ_brkt_tags = list(set(squ_brkt_tags)) # use set function to remove repetitive patterns
                    elif match[1]: # <>
                        ang_brkt_tags.append('<{}>'.format(match[1]))
                        ang_brkt_tags = list(set(ang_brkt_tags)) # use set function to remove repetitive patterns
                    elif match[2]: #  ()
                        rnd_brkt_tags.append('({})'.format(match[2]))
                        rnd_brkt_tags = list(set(rnd_brkt_tags)) # use set function to remove repetitive patterns
            f.close()

ang_brkt_tags.remove('<bs: can you des*>')
ang_brkt_tags.remove('<br. and maybe walk my dog maybe play with my cat <br>')
print(f"[] includes {squ_brkt_tags}, the number is {len(squ_brkt_tags)} \n")
print(f"<> includes {ang_brkt_tags}, the number is {len(ang_brkt_tags)} \n")
print(f"() includes {rnd_brkt_tags}, the number is {len(rnd_brkt_tags)}")



------> Obtaining speech tags... ------

[] includes ['[b]', '[oaches]', '[sure]', '[nhattan]', '[st]', '[gon]', '[ow]', '[elius]', '[cation]', '[ike]', '[ies]', '[attle]', '[end]', '[min]', '[ple]', '[tead]', '[nna]', '[n]', '[ster]', '[ead]', '[ther]', '[ght]', '[py]', '[ean]', '[na]', '[ell]', '[f]', '[unner]', '[ly]', '[bn]', '[corting]', '[day]', "[dn't]", '[uy]', '[me]', '[lack]', '[tling]', '[puter]', '[ge]', '[ke]', '[a]', '[aw]', '[ent]', '[retary]', '[la]', '[abet]', '[ndiana]', '[uch]', '[son]', '[sh]', '[sk]', '[thers]', '[ia]', '[c]', '[bs]', '[ver]', '[ahu]', '[cient]', '[tan]', '[eo]', '[x]', '[t]', '[peak]', '[wh]', '[em]', '[kes]', '[ll]', '[ck]', '[est]', '[nesday]', '[and]', '[ff]', '[ter]', '[gold]', '[amp]', '[tch]', '[zzle]', '[nk]', '[ting]', '[ack]', '[ward]', '[ah]', '[ers]', '[ifier]', '[id]', '[ide]', '[y]', '[ters]', '[tion]', '[dwiches]', '[th]', '[teal]', '[ve]', '[om]', '[ife]', '[seum]', '[ccer]', '[w]', '[mputer]', '[at]', '[us]', '[land]', '[ee]', '[a

In [4]:
#annotations = squ_brkt_tags + ang_brkt_tags 
#print(annotations)

In [5]:
print("\n------> Obtaining transcriptions... ------\n")

trans = [];

tags = ang_brkt_tags + squ_brkt_tags + rnd_brkt_tags

for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_transcription_fp)):
    if dirpath is not dataset_transcription_fp:
        #print(dirpath, "     ", dirnames, "    ",  filenames)       
        if filenames and (filename.endswith(".txt") for filename in filenames):
            trans_fp = os.path.join(dirpath, filenames[0])
            with open(trans_fp, "r") as f:
                lines = f.readlines()
                # remove all speech tags from transcription
                # <sing>, <br>, <bn>, <bs>, <ln>, <pau>, <long>, <laugh>, <pron>, <ls>, <ns>, <uu>, <whisper>, <fp>, <sniff>, <tc>, [gold]...
                line = lines[0]
                for tag in tags:
                    line = line.replace(tag, '') 
                line = line.replace('<bs', '')
                line = line.replace('<br', '')
                line = line.replace(']', '')
                line = line.replace('>', '')
                # remove unnecessary symbols
                chars_to_ignore = '[\,\?\.\!\-\;\:\"\*]'
                line = re.sub(chars_to_ignore, '', line).lower()
                # remove extra whitespace between words
                line = re.sub(r'\s+', ' ', line)
                # remove leading and trailing whitespaces
                line = line.strip()
                trans.append(line)
            f.close()
print(len(trans))


------> Obtaining transcriptions... ------

1101


In [6]:
print("\n------> Obtaining .wav filepath, durations and speaker ID... ------\n")

speech_filepath = [];
durations = [];
speaker_ids = [];

for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_speech_fp)):
    if dirpath is not dataset_speech_fp:
        #print(dirpath, "     ", dirnames, "    ",  filenames)    
        # check if 'filenames' is empty and with 'wav' extension
        if filenames and (filename.endswith(".wav") for filename in filenames):
            # obtain each .wav filepath
            speech_fp = os.path.join(dirpath, filenames[0])
            speech_filepath.append(speech_fp)
            
            # obtain the signal, sampling rate, and hence the duration of each .wav file
            signal, sample_rate = librosa.load(speech_fp)
            dur = len(signal)/sample_rate
            durations.append(dur)
            
            # obtain speaker id
            # for example, /srv/scratch/chacmod/OGI/speech/spontaneous/00/0/ks001/ks001xx0, ks001 is extracted
            directories =  speech_fp.split('/')
            speaker_id = directories[-2]
            speaker_ids.append(speaker_id)                               
            
print(len(speech_filepath), len(durations), len(speaker_ids))


------> Obtaining .wav filepath, durations and speaker ID... ------

1101 1101 1101


In [19]:
#combine_trans_fp = {k: v for k, v in zip(trans, speech_filepath)}
#print(combine_trans_fp)

In [20]:
OGI = pd.DataFrame(
        {'filepath': speech_filepath,
         'duration': durations,
         'speaker_id': speaker_ids,
         'transcription': trans
         })

In [21]:
# Remove rows where there is no transcription
# i.e. no spoken words, only silence or speech tags
OGI = OGI[OGI.transcription != None]
OGI = OGI[OGI.transcription != ""]

In [29]:
print("\n------> Saving dataframe to csv file... ------\n")
#OGI_df_fp = '/srv/scratch/z5313567/thesis/OGI_local/OGI_dataframe.csv'
OGI.to_csv(OGI_df_fp,index = False)
print('Successfully saved dataframe to csv file at', OGI_df_fp)
print("Total number of speakers:", len(OGI))
print("Total hours:", OGI['duration'].sum()/(60*60))


------> Saving dataframe to csv file... ------

Successfully saved dataframe to csv file at /srv/scratch/z5313567/thesis/OGI_local/OGI_spontaneous_dataframe.csv
Total number of speakers: 1101
Total hours: 30.5992863063744
