In [13]:
import os
import csv
import librosa
import re
import pandas as pd

In [14]:
# Root folder holding the audio (.wav) files of the corpus split.
data_directory = '/srv/scratch/chacmod/CU_2/corpus/data/train-part3-cu-stories'

# Root folder holding the matching transcription (.txt) files; the loop below
# expects the same <series>/<speaker> sub-folder layout under both roots.
transcription_directory = "/srv/scratch/chacmod/CU_2/corpus/text_proc/train-part3-cu-stories"

In [15]:
# Destination of the generated CSV (one row per utterance).
save_df_fp = '/srv/scratch/z5313567/thesis/CU_local/CU_stories_dataframe.csv'

In [16]:
# Parallel accumulators: index i of every list describes the same utterance.
# Four separate list objects are created (not one list shared by four names).
(extracted_filepath,
 extracted_duration,
 extracted_speakerID,
 extracted_transcription_clean) = [], [], [], []

In [17]:
def get_duration(wav_filepath):
    """Return the duration of an audio file in seconds.

    Args:
        wav_filepath: path of the audio file to measure.

    Returns:
        Number of decoded samples divided by the sampling rate.
    """
    # sr=None keeps the file's native sampling rate. Without it librosa
    # resamples every file to its default rate (22050 Hz) first, which is slow
    # and makes the sample count an approximation of the real length.
    signal, sample_rate = librosa.load(wav_filepath, sr=None)
    return len(signal) / sample_rate

In [18]:
# Character class of punctuation/symbols that are deleted from transcriptions.
# Raw string: the backslashes must reach the regex engine as written; in a
# normal string literal "\-", "\;" and "\:" are invalid Python escapes and
# trigger a DeprecationWarning/SyntaxWarning on current Python versions.
chars_to_ignore_regex = r'[,?.!*\-\;\:\"“%‘”�—’…–]'
# Whitespace-delimited tokens that are dropped when they match exactly.
ignore_tags = ['br', 'ga', 'SIL']

def clean_transcription(transcription, lowercase=False):
    """Normalise a raw transcription string.

    Steps, in order:
      1. split on whitespace and drop tokens listed in ``ignore_tags``
         (exact, case-sensitive match on the whole token);
      2. delete every character matched by ``chars_to_ignore_regex``;
      3. collapse runs of whitespace to one space and strip both ends;
      4. optionally lower-case the result.

    Tags are filtered before punctuation is removed, so a tag with
    punctuation attached (e.g. ``"SIL."``) is not treated as a tag.

    Args:
        transcription: raw transcription text.
        lowercase: when True, return the text lower-cased. Defaults to
            False, which keeps the original casing.

    Returns:
        The cleaned transcription; an empty string if nothing is left.
    """
    # split the transcription into individual words and drop unwanted tags
    kept_words = [word for word in transcription.split() if word not in ignore_tags]
    cleaned_transcription = ' '.join(kept_words)
    # remove unnecessary symbols
    cleaned_transcription = re.sub(chars_to_ignore_regex, "", cleaned_transcription)
    # deleting a stand-alone symbol leaves two adjacent spaces; collapse them
    cleaned_transcription = re.sub(r"\s+", " ", cleaned_transcription).strip()
    if lowercase:
        cleaned_transcription = cleaned_transcription.lower()

    return cleaned_transcription

In [19]:
# ---------------------------------------for the /srv/scratch/chacmod/CU_2/corpus/data/train-part3-cu-stories---------------------------------------------------
# Walk <transcription_directory>/<series>/<speaker>/*.txt, pair every transcript
# with the .wav file at the mirrored path under data_directory, and collect one
# entry per usable utterance.
# NOTE: entries are appended to the extracted_* lists created in an earlier
# cell; re-run that cell before re-running this one, otherwise rows are duplicated.
# Directory listings are sorted so the row order is the same on every run
# (os.listdir returns entries in arbitrary order).
for series_folder in sorted(os.listdir(transcription_directory)):
    series_filepath = os.path.join(transcription_directory, series_folder)
    if not os.path.isdir(series_filepath):
        # a stray file next to the series folders would make os.listdir raise
        continue

    for speaker_folder in sorted(os.listdir(series_filepath)):
        speaker_filepath = os.path.join(series_filepath, speaker_folder)  # /srv/scratch/chacmod/CU_2/corpus/text_proc/train-part3-cu-stories/03/spk-03-031
        if not os.path.isdir(speaker_filepath):  # only speaker folders are of interest
            continue

        # extract the speaker id from the folder name. For example, spk-05-106-sent,
        # the speaker_id is 0500106, with two 0s inserted between 05 and 106.
        # It is built as a string, so leading zeros are preserved.
        speaker_folder_parts = speaker_folder.split('-')
        if len(speaker_folder_parts) < 3:
            print(f"Warning: cannot derive a speaker id from folder {speaker_filepath}. Skipping.")
            continue
        speaker_id = speaker_folder_parts[1] + '00' + speaker_folder_parts[2]

        for transcription_file in sorted(os.listdir(speaker_filepath)):
            if not transcription_file.endswith(".txt"):
                continue
            transcription_filepath = os.path.join(speaker_filepath, transcription_file)  # .../03/spk-03-031/spk-03-031-story-100.txt

            # the wav file has the same stem and the same <series>/<speaker> sub-path
            wav_filename = os.path.splitext(transcription_file)[0] + ".wav"
            wav_filepath = os.path.join(data_directory, series_folder, speaker_folder, wav_filename)  # .../data/train-part3-cu-stories/03/spk-03-031/spk-03-031-story-100.wav

            # Check that the wav file exists and is not empty
            if not os.path.exists(wav_filepath):
                print(f"Warning: .wav file not found for {transcription_filepath}. Skipping.")
                continue
            if os.path.getsize(wav_filepath) == 0:
                print(f"Warning: .wav file {wav_filepath} is empty. Skipping.")
                continue

            # extract the transcription from the file
            # NOTE(review): assumes the transcripts are UTF-8 encoded - confirm
            with open(transcription_filepath, "r", encoding="utf-8") as f:
                transcription = f.read()
            cleaned_transcription = clean_transcription(transcription)
            # clean_transcription returns a string, so an emptiness check is sufficient
            if not cleaned_transcription:
                print(f"Warning: Cleaned transcription is empty for {transcription_filepath}. Skipping.")
                continue

            # Decode the audio only once the utterance is known to be usable:
            # this is by far the most expensive step of the loop.
            duration = get_duration(wav_filepath)

            extracted_filepath.append(wav_filepath)
            extracted_duration.append(duration)
            extracted_speakerID.append(speaker_id)
            extracted_transcription_clean.append(cleaned_transcription)



In [20]:
print("\n--------------------------> Generating dataframe ... -------------------------\n")
# One row per utterance; the keyword order fixes the column order of the CSV.
df = pd.DataFrame(dict(
    filepath=extracted_filepath,
    duration=extracted_duration,
    speaker_id=extracted_speakerID,
    transcription=extracted_transcription_clean,
))
# Keep speaker ids as strings so their leading zeros are preserved.
df['speaker_id'] = df['speaker_id'].astype(str)

# When loading the CSV again, use
#   df = pd.read_csv(save_df_fp, dtype={'speaker_id': str})
# otherwise pandas parses the ids as integers and drops the leading zeros.
print("\n---------------------------> Saving dataframe to csv file... ------------------\n")
df.to_csv(save_df_fp, index=False)
print('Successfully saved dataframe to csv file at: ', save_df_fp)

# Summary statistics of the collected data.
speaker_count = df["speaker_id"].nunique()
total_seconds = df['duration'].sum()
print("Total number of speakers:", speaker_count)
print("Total hours:", total_seconds / 3600)


--------------------------> Generating dataframe ... -------------------------


---------------------------> Saving dataframe to csv file... ------------------

Successfully saved dataframe to csv file at:  /srv/scratch/z5313567/thesis/CU_local/CU_stories_dataframe.csv
Total number of speakers: 318
Total hours: 34.67251913580247
