In [1]:
import os
import csv
import librosa
import re
import pandas as pd

In [2]:
# Corpus layout: audio lives under <root>/data and the processed transcripts
# under <root>/text_proc; both trees use the same four sub-corpus folder names.
_corpus_root = "/srv/scratch/chacmod/CU_2/corpus"
_audio_root = f"{_corpus_root}/data"
_text_root = f"{_corpus_root}/text_proc"

# directories containing the speech dataset
data_directory1 = f"{_audio_root}/train-part1-cu"
data_directory2 = f"{_audio_root}/train-part2-cu-sentences"
data_directory3 = f"{_audio_root}/train-part3-cu-stories"
data_directory4 = f"{_audio_root}/train-part4-cu-summaries"

# directories containing the transcription dataset
transcription_directory1 = f"{_text_root}/train-part1-cu"
transcription_directory2 = f"{_text_root}/train-part2-cu-sentences"
transcription_directory3 = f"{_text_root}/train-part3-cu-stories"
transcription_directory4 = f"{_text_root}/train-part4-cu-summaries"


In [3]:
# Output CSV file: destination path handed to df.to_csv() in the final cell.
# The saved table has the columns filepath, duration, speaker_id and transcription.
save_df_fp = "/srv/scratch/z5313567/thesis/CU_local/CU_dataframs.csv"

In [4]:
# Parallel accumulators: the corpus-walking cells append to all four lists
# together, so index i across the lists describes a single utterance.
# Each name is bound to its own, independent empty list.
(
    extracted_filepath,
    extracted_duration,
    extracted_speakerID,
    extracted_transcription,
) = [], [], [], []

In [5]:
def get_duration(wav_filepath):
    """Return the duration of an audio file in seconds.

    Args:
        wav_filepath: Path of the audio file to measure.

    Returns:
        Number of decoded samples divided by the sampling rate, as a float.
    """
    # sr=None keeps the file's native sampling rate. Without it librosa
    # resamples every file to its default rate first, which is slow and makes
    # the sample count (and so the duration) only approximate.
    signal, sample_rate = librosa.load(wav_filepath, sr=None)
    return len(signal) / sample_rate

In [6]:
# Character class of punctuation/symbols stripped from transcriptions.
# Written as a raw string so the regex escapes (\-, \;, \:) reach the regex
# engine as-is instead of being parsed as invalid Python string escapes.
chars_to_ignore_regex = r'[,?.!*\-\;\:"“%‘”�—’…–]'
# Whitespace-delimited tokens dropped from transcriptions
# (presumably non-speech annotation tags -- confirm against the corpus docs).
ignore_tags = ['br', 'ga', 'sil']

def clean_transcription(transcription):
    """Normalise a raw transcription string.

    Steps, in order:
      1. split on whitespace and drop tokens that exactly match ``ignore_tags``;
      2. delete every character matched by ``chars_to_ignore_regex``;
      3. collapse runs of whitespace to a single space;
      4. lower-case and strip leading/trailing whitespace.

    Args:
        transcription: Raw transcription text.

    Returns:
        The cleaned transcription; an empty string if nothing remains.
    """
    # Tag matching is exact and happens before punctuation removal and
    # lower-casing, so tokens such as "BR" or "sil." are NOT treated as tags.
    kept_words = [word for word in transcription.split() if word not in ignore_tags]
    # Symbols are deleted (not replaced by a space), so "well-known" -> "wellknown".
    text = re.sub(chars_to_ignore_regex, "", ' '.join(kept_words))
    # A token made only of symbols leaves a double space behind; collapse it.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

In [7]:
# ---------------------------------------for the /srv/scratch/chacmod/CU_2/corpus/data/train-part1-cu---------------------------------------------------
# Layout: <transcription_directory1>/<speaker_folder>/<utterance>.txt with the
# matching audio at <data_directory1>/<speaker_folder>/<utterance>.wav.
# os.listdir returns entries in arbitrary order, so both levels are sorted to
# make the traversal (and therefore the CSV row order) reproducible.
for speaker_folder in sorted(os.listdir(transcription_directory1)):
    speaker_filepath = os.path.join(transcription_directory1, speaker_folder)
    if not os.path.isdir(speaker_filepath):  # only speaker folders are processed
        continue
    # extract the speaker id from the folder name. For example, CC-00-01-00001, the speaker_id is 00001
    # (str.split already yields a string, so leading zeros are preserved)
    speaker_id = speaker_folder.split("-")[3]

    for transcription_file in sorted(os.listdir(speaker_filepath)):
        if not transcription_file.endswith(".txt"):  # e.g. CC-00-01-00001-00001-01-0.txt
            continue
        # /srv/scratch/chacmod/CU_2/corpus/text_proc/train-part1-cu/CC-00-01-00001/CC-00-01-00001-00001-01-0.txt
        transcription_filepath = os.path.join(speaker_filepath, transcription_file)

        # the audio file shares the transcript's base name
        wav_filename = os.path.splitext(transcription_file)[0] + ".wav"  # CC-00-01-00001-00001-01-0.wav
        # /srv/scratch/chacmod/CU_2/corpus/data/train-part1-cu/CC-00-01-00001/CC-00-01-00001-00001-01-0.wav
        wav_filepath = os.path.join(data_directory1, speaker_folder, wav_filename)

        # skip transcripts whose audio is missing or empty
        if not os.path.exists(wav_filepath):
            print(f"Warning: .wav file not found for {transcription_filepath}. Skipping.")
            continue
        if os.path.getsize(wav_filepath) == 0:
            print(f"Warning: .wav file {wav_filepath} is empty. Skipping.")
            continue

        # explicit encoding: the transcripts contain non-ASCII punctuation, so
        # do not depend on the locale's default encoding
        with open(transcription_filepath, "r", encoding="utf-8") as f:
            transcription = f.read()
        cleaned_transcription = clean_transcription(transcription)
        if not cleaned_transcription:
            print(f"Warning: Cleaned transcription is empty for {transcription_filepath}. Skipping.")
            continue

        # decode the audio only once the row is known to be kept
        duration = get_duration(wav_filepath)

        extracted_filepath.append(wav_filepath)
        extracted_duration.append(duration)
        extracted_speakerID.append(speaker_id)
        extracted_transcription.append(cleaned_transcription)



In [8]:
# ---------------------------------------for the /srv/scratch/chacmod/CU_2/corpus/data/train-part2-cu-sentences---------------------------------------------------
# Layout: <transcription_directory2>/<speaker_folder>/<utterance>.txt with the
# matching audio at <data_directory2>/<speaker_folder>/<utterance>.wav.
# os.listdir returns entries in arbitrary order, so both levels are sorted to
# make the traversal (and therefore the CSV row order) reproducible.
for speaker_folder in sorted(os.listdir(transcription_directory2)):
    speaker_filepath = os.path.join(transcription_directory2, speaker_folder)
    if not os.path.isdir(speaker_filepath):  # only speaker folders are processed
        continue
    # extract the speaker id from the folder name. For example, spk-05-106-sent, the speaker_id is 106
    # (str.split already yields a string, so leading zeros are preserved)
    speaker_id = speaker_folder.split("-")[2]

    for transcription_file in sorted(os.listdir(speaker_filepath)):
        if not transcription_file.endswith(".txt"):  # e.g. 9.txt
            continue
        # /srv/scratch/chacmod/CU_2/corpus/text_proc/train-part2-cu-sentences/spk-05-106-sent/9.txt
        transcription_filepath = os.path.join(speaker_filepath, transcription_file)

        # the audio file shares the transcript's base name
        wav_filename = os.path.splitext(transcription_file)[0] + ".wav"  # 9.wav
        # /srv/scratch/chacmod/CU_2/corpus/data/train-part2-cu-sentences/spk-05-106-sent/9.wav
        wav_filepath = os.path.join(data_directory2, speaker_folder, wav_filename)

        # skip transcripts whose audio is missing or empty
        if not os.path.exists(wav_filepath):
            print(f"Warning: .wav file not found for {transcription_filepath}. Skipping.")
            continue
        if os.path.getsize(wav_filepath) == 0:
            print(f"Warning: .wav file {wav_filepath} is empty. Skipping.")
            continue

        # explicit encoding: the transcripts contain non-ASCII punctuation, so
        # do not depend on the locale's default encoding
        with open(transcription_filepath, "r", encoding="utf-8") as f:
            transcription = f.read()
        cleaned_transcription = clean_transcription(transcription)
        if not cleaned_transcription:
            print(f"Warning: Cleaned transcription is empty for {transcription_filepath}. Skipping.")
            continue

        # decode the audio only once the row is known to be kept
        duration = get_duration(wav_filepath)

        extracted_filepath.append(wav_filepath)
        extracted_duration.append(duration)
        extracted_speakerID.append(speaker_id)
        extracted_transcription.append(cleaned_transcription)



In [9]:
# ---------------------------------------for the /srv/scratch/chacmod/CU_2/corpus/data/train-part3-cu-stories---------------------------------------------------
# Layout: <transcription_directory3>/<series_folder>/<speaker_folder>/<utterance>.txt
# with the matching audio at the same relative path under data_directory3.
# os.listdir returns entries in arbitrary order, so every level is sorted to
# make the traversal (and therefore the CSV row order) reproducible.
for series_folder in sorted(os.listdir(transcription_directory3)):
    series_filepath = os.path.join(transcription_directory3, series_folder)
    if not os.path.isdir(series_filepath):
        # a plain file at this level would make os.listdir() below raise
        continue

    for speaker_folder in sorted(os.listdir(series_filepath)):
        speaker_filepath = os.path.join(series_filepath, speaker_folder)
        if not os.path.isdir(speaker_filepath):  # only speaker folders are processed
            continue
        # extract the speaker id from the folder name. For example, spk-03-031, the speaker_id is 031
        # (str.split already yields a string, so leading zeros are preserved)
        speaker_id = speaker_folder.split("-")[2]

        for transcription_file in sorted(os.listdir(speaker_filepath)):
            if not transcription_file.endswith(".txt"):  # e.g. spk-03-031-story-100.txt
                continue
            # /srv/scratch/chacmod/CU_2/corpus/text_proc/train-part3-cu-stories/03/spk-03-031/spk-03-031-story-100.txt
            transcription_filepath = os.path.join(speaker_filepath, transcription_file)

            # the audio file shares the transcript's base name
            wav_filename = os.path.splitext(transcription_file)[0] + ".wav"  # spk-03-031-story-100.wav
            # /srv/scratch/chacmod/CU_2/corpus/data/train-part3-cu-stories/03/spk-03-031/spk-03-031-story-100.wav
            wav_filepath = os.path.join(data_directory3, series_folder, speaker_folder, wav_filename)

            # skip transcripts whose audio is missing or empty
            if not os.path.exists(wav_filepath):
                print(f"Warning: .wav file not found for {transcription_filepath}. Skipping.")
                continue
            if os.path.getsize(wav_filepath) == 0:
                print(f"Warning: .wav file {wav_filepath} is empty. Skipping.")
                continue

            # explicit encoding: the transcripts contain non-ASCII punctuation,
            # so do not depend on the locale's default encoding
            with open(transcription_filepath, "r", encoding="utf-8") as f:
                transcription = f.read()
            cleaned_transcription = clean_transcription(transcription)
            if not cleaned_transcription:
                print(f"Warning: Cleaned transcription is empty for {transcription_filepath}. Skipping.")
                continue

            # decode the audio only once the row is known to be kept
            duration = get_duration(wav_filepath)

            extracted_filepath.append(wav_filepath)
            extracted_duration.append(duration)
            extracted_speakerID.append(speaker_id)
            extracted_transcription.append(cleaned_transcription)



In [10]:
# ---------------------------------------for the /srv/scratch/chacmod/CU_2/corpus/data/train-part4-cu-summaries---------------------------------------------------
# Layout: <transcription_directory4>/<series_folder>/<speaker_folder>/<utterance>.txt
# with the matching audio at the same relative path under data_directory4.
# os.listdir returns entries in arbitrary order, so every level is sorted to
# make the traversal (and therefore the CSV row order) reproducible.
for series_folder in sorted(os.listdir(transcription_directory4)):
    series_filepath = os.path.join(transcription_directory4, series_folder)
    if not os.path.isdir(series_filepath):
        # a plain file at this level would make os.listdir() below raise
        continue

    for speaker_folder in sorted(os.listdir(series_filepath)):
        speaker_filepath = os.path.join(series_filepath, speaker_folder)
        if not os.path.isdir(speaker_filepath):  # only speaker folders are processed
            continue
        # extract the speaker id from the folder name. For example, spk-01-257, the speaker_id is 257
        # (str.split already yields a string, so leading zeros are preserved)
        speaker_id = speaker_folder.split("-")[2]

        for transcription_file in sorted(os.listdir(speaker_filepath)):
            if not transcription_file.endswith(".txt"):  # e.g. 1083774655656-100.txt
                continue
            # /srv/scratch/chacmod/CU_2/corpus/text_proc/train-part4-cu-summaries/01/spk-01-257/1083774655656-100.txt
            transcription_filepath = os.path.join(speaker_filepath, transcription_file)

            # the audio file shares the transcript's base name
            wav_filename = os.path.splitext(transcription_file)[0] + ".wav"  # 1083774655656-100.wav
            # /srv/scratch/chacmod/CU_2/corpus/data/train-part4-cu-summaries/01/spk-01-257/1083774655656-100.wav
            wav_filepath = os.path.join(data_directory4, series_folder, speaker_folder, wav_filename)

            # skip transcripts whose audio is missing or empty
            if not os.path.exists(wav_filepath):
                print(f"Warning: .wav file not found for {transcription_filepath}. Skipping.")
                continue
            if os.path.getsize(wav_filepath) == 0:
                print(f"Warning: .wav file {wav_filepath} is empty. Skipping.")
                continue

            # explicit encoding: the transcripts contain non-ASCII punctuation,
            # so do not depend on the locale's default encoding
            with open(transcription_filepath, "r", encoding="utf-8") as f:
                transcription = f.read()
            cleaned_transcription = clean_transcription(transcription)
            if not cleaned_transcription:
                print(f"Warning: Cleaned transcription is empty for {transcription_filepath}. Skipping.")
                continue

            # decode the audio only once the row is known to be kept
            duration = get_duration(wav_filepath)

            extracted_filepath.append(wav_filepath)
            extracted_duration.append(duration)
            extracted_speakerID.append(speaker_id)
            extracted_transcription.append(cleaned_transcription)



In [13]:
# Assemble the four parallel lists into one table (one row per utterance),
# write it to save_df_fp and print summary statistics.
print("\n--------------------------> Generating dataframe ... -------------------------\n")
df = pd.DataFrame(
    {
        'filepath': extracted_filepath,
        'duration': extracted_duration,
        'speaker_id': extracted_speakerID,
        'transcription': extracted_transcription,
    }
)
# preserve leading zeros: keep speaker ids as strings
df['speaker_id'] = df['speaker_id'].astype(str)

# when reading the CSV back, use pd.read_csv(save_df_fp, dtype={'speaker_id': str})
# so that leading zeros in speaker_id are not lost
print("\n---------------------------> Saving dataframe to csv file... ------------------\n")
# make sure the destination folder exists, so the (expensive) extraction above
# is not lost to a missing-directory error; a bare filename has no folder part
output_dir = os.path.dirname(save_df_fp)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(save_df_fp, index=False)
print('Successfully saved dataframe to csv file at: ', save_df_fp)
print("Total number of speakers:", df["speaker_id"].nunique())
# durations are in seconds; 60*60 seconds per hour
print("Total hours:", df['duration'].sum() / (60 * 60))


--------------------------> Generating dataframe ... -------------------------


---------------------------> Saving dataframe to csv file... ------------------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/CU_local/CU_dataframs.csv
Total number of speakers: 843
Total hours: 73.60574982363316
