In [None]:
from pydub import AudioSegment
import pandas as pd
import re
from os import listdir
from os.path import isfile, join
import numpy as np
from scipy.io.wavfile import read

In [None]:
def process_utterance(utter, file_id, spk_id):
    """Format one annotation interval as a single space-separated record line.

    Args:
        utter: Three consecutive raw lines from the transcription file, in the
            order: start time, end time, double-quoted label.
        file_id: Identifier written as the first field of the record.
        spk_id: Speaker identifier written for intervals with a non-empty label.

    Returns:
        The sentinel string of two semicolons plus a newline when the third
        line does not start with a double quote (the three lines are not an
        interval). Otherwise a newline-terminated record of the form
        ``<file_id> 1 <speaker> <start> <end> <text>``; for an empty label
        (two consecutive quotes) the speaker field is ``inter_segment_gap``
        and no text field is written.
    """
    label_line = utter[2]
    if not label_line.startswith('"'):
        return ';;\n'

    # Lines read in text mode end in a single newline, so dropping a fixed two
    # characters would also cut the last digit of each timestamp. Strip only
    # the surrounding whitespace instead.
    start = utter[0].strip()
    end = utter[1].strip()
    label = label_line.rstrip()

    record = file_id + ' 1 '
    if label[1:2] == '"':
        # The label is just two quotes: an unlabelled gap between segments.
        return record + 'inter_segment_gap ' + start + ' ' + end + '\n'

    # Drop the opening quote and, when present, the closing quote.
    text = label[1:]
    if text.endswith('"'):
        text = text[:-1]
    return record + spk_id + ' ' + start + ' ' + end + ' ' + text + '\n'

In [None]:
def _export_chunk(audio, spans, transcript, audio_output_folder, transcription_output_folder, stem, index):
    """Write chunk number ``index``: the audio of ``spans`` back to back, plus its text.

    ``spans`` is a list of ``(start, end)`` pairs. The values are multiplied by
    1000 before slicing because pydub indexes audio in milliseconds.
    """
    chunk = AudioSegment.silent(duration=0)
    for span_start, span_end in spans:
        chunk += audio[span_start * 1000:span_end * 1000]

    chunk.export(f"{audio_output_folder}/{stem}-{index}.wav", format="wav")
    with open(f"{transcription_output_folder}/{stem}-{index}.txt", 'w', encoding='latin_1') as trans_file:
        trans_file.write(transcript)


def split_audio_with_transcription(file, audio_path, transcription_path, audio_output_folder, transcription_output_folder, chunk_limit=30):
    """Cut one recording into chunks of consecutive utterances and save audio and text.

    The transcription file is scanned for the first tier whose name line starts
    with ``"N0`` or ``"N1``. Its lines are consumed three at a time (start, end,
    label) and turned into records by ``process_utterance``. Utterances with
    non-empty text are accumulated until adding the next one would bring the
    summed utterance duration to ``chunk_limit`` or more; the accumulated
    utterances are then written as ``<stem>-<n>.wav`` and ``<stem>-<n>.txt``
    (``stem`` is ``file`` without its last four characters, ``n`` starts at 1).
    Scanning stops at the next ``"IntervalTier"`` / ``"TextTier"`` line, so
    only that first tier is used. Whatever is still accumulated at that point
    is written as the final chunk.

    Args:
        file: File name of the recording; used to name the output files.
        audio_path: Path of the WAV file. When it contains ``comp-p`` the file
            is loaded with scipy and only its first channel is kept.
        transcription_path: Path of the latin-1 encoded transcription file.
        audio_output_folder: Folder that receives the chunk WAV files.
        transcription_output_folder: Folder that receives the chunk text files.
        chunk_limit: Upper bound, in the same unit as the annotation times, on
            the summed utterance duration of a chunk. A single utterance that
            is longer than the limit still becomes a chunk of its own.

    Returns:
        None. The results are the files written to the two output folders.
    """
    if 'comp-p' in audio_path:
        rate, aud = read(audio_path)
        # scipy returns a 1-D array for mono files and (samples, channels)
        # otherwise; derive the channel count instead of assuming stereo.
        n_channels = 1 if aud.ndim == 1 else aud.shape[1]
        aud_segm = AudioSegment(aud.tobytes(), frame_rate=rate,
                                sample_width=aud.dtype.itemsize, channels=n_channels)
        audio = aud_segm.split_to_mono()[0]
    else:
        audio = AudioSegment.from_wav(audio_path)

    # Read everything up front so the handle is closed before any export.
    with open(transcription_path, encoding='latin_1') as transcription_file:
        lines = transcription_file.readlines()

    stem = file[:-4]
    id_spk = ''
    count = 1
    utter = []
    spans = []           # (start, end) of every utterance in the current chunk
    texts = []           # transcript of every utterance in the current chunk
    chunk_duration = 0

    for line in lines:
        if re.match(r'"N0|"N1', line):
            id_spk = line[1:len(line)-2]
            utter = []
            continue
        if id_spk == '':
            # Still in the header or in a tier that is not a speaker tier.
            continue
        if line[1:len(line)-2] == "IntervalTier" or line[1:len(line)-2] == "TextTier":
            break

        # Handle each triple as soon as it is complete, so the last interval of
        # the tier is not lost when the loop ends.
        utter.append(line)
        if len(utter) < 3:
            continue
        processed_utter = process_utterance(utter, audio_path, id_spk)
        utter = []
        if processed_utter == ';;\n':
            # Not an interval (e.g. the tier's own xmin / xmax / size lines).
            continue

        # The record starts with audio_path; cut it off before splitting so
        # that whitespace inside the path cannot shift the field positions.
        fields = processed_utter[len(audio_path):].split()
        start_time = float(fields[2])
        end_time = float(fields[3])
        text = ' '.join(fields[4:])
        if text == '':
            continue

        duration = end_time - start_time
        if chunk_duration + duration < chunk_limit:
            chunk_duration += duration
        else:
            # The current chunk is full: write it (unless nothing has been
            # collected yet) and start a new one with this utterance.
            if spans:
                _export_chunk(audio, spans, ' '.join(texts), audio_output_folder,
                              transcription_output_folder, stem, count)
                count += 1
            spans = []
            texts = []
            chunk_duration = duration
        spans.append((start_time, end_time))
        texts.append(text)

    # Write the utterances collected since the last export.
    if spans:
        _export_chunk(audio, spans, ' '.join(texts), audio_output_folder,
                      transcription_output_folder, stem, count)

In [None]:
# Chunk every recording of the comp-p and comp-q components.
# All folders are derived from one root so the two components cannot drift
# apart:
#   audio/wav/<component>/nl/         input WAV files
#   annot/text/ort/<component>/nl/    input transcriptions (<name>.ort)
#   audio/wav/<component>-chunked/    output audio chunks
#   annot/text/txt/<component>/       output chunk transcripts
DATA_ROOT = '/path/Data/data'


def _wav_files_in(folder):
    """Return the sorted names of the regular ``.wav`` files directly inside ``folder``."""
    return sorted(
        f for f in listdir(folder)
        if isfile(join(folder, f)) and f.lower().endswith('.wav')
    )


comp_p = _wav_files_in(f'{DATA_ROOT}/audio/wav/comp-p/nl/')
comp_q = _wav_files_in(f'{DATA_ROOT}/audio/wav/comp-q/nl/')

for component, wav_names in (('comp-p', comp_p), ('comp-q', comp_q)):
    audio_output_folder = f'{DATA_ROOT}/audio/wav/{component}-chunked/'
    transcription_output_folder = f'{DATA_ROOT}/annot/text/txt/{component}/'
    for file in wav_names:
        audio_path = f'{DATA_ROOT}/audio/wav/{component}/nl/' + file
        # The transcription has the same base name with an "ort" extension.
        transcription_path = f'{DATA_ROOT}/annot/text/ort/{component}/nl/' + file[:-3] + 'ort'
        split_audio_with_transcription(file, audio_path, transcription_path, audio_output_folder, transcription_output_folder)


In [None]:
import os
import csv

# Folders holding the chunked comp-p / comp-q WAV files and their transcripts,
# and the CSV that will list every (wav path, sentence) pair.
comp_p_wav_folder = '/path/Data/data/audio/wav/comp-p-chunked/'
comp_p_txt_folder = '/path/Data/data/annot/text/txt/comp-p/'
comp_q_wav_folder = '/path/Data/data/audio/wav/comp-q-chunked/'
comp_q_txt_folder = '/path/Data/data/annot/text/txt/comp-q/'
output_csv = '/path/Data/data/meta/csv/nl/recordings3.csv'

# Create a CSV file to write the data
with open(output_csv, 'w', newline='', encoding='utf-8-sig') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Path', 'Sentence'])

    def process_folder(wav_folder, txt_folder):
        """Append one CSV row per ``.wav`` file in ``wav_folder``.

        The sentence is read from the file with the same base name and a
        ``txt`` extension in ``txt_folder``. A WAV file whose transcript cannot
        be read is reported and skipped. Rows are written through the
        enclosing ``csvwriter``, so this only works while the CSV is open.
        """
        # os.listdir returns names in arbitrary order; sort them so the CSV is
        # identical from run to run.
        wav_files = sorted(name for name in os.listdir(wav_folder) if name.endswith('.wav'))

        for wav_file in wav_files:
            txt_file = wav_file[:-3] + "txt"
            try:
                # Read the content of the text file with "latin_1" encoding
                with open(os.path.join(txt_folder, txt_file), 'r', encoding='latin_1') as txt_handle:
                    sentence = txt_handle.read().strip()
            except OSError as e:
                # Only a missing or unreadable transcript is tolerated; any
                # other error is a real problem and is allowed to surface.
                print(f"Error processing files, Error: {e}")
                continue

            # Write the data to the CSV file
            csvwriter.writerow([os.path.join(wav_folder, wav_file), sentence])

    # Process comp-p folders
    process_folder(comp_p_wav_folder, comp_p_txt_folder)

    # Process comp-q folders
    process_folder(comp_q_wav_folder, comp_q_txt_folder)

print(f'Data has been saved to {output_csv}')


In [None]:
import pandas as pd

# Load the CSV of (Path, Sentence) rows
df = pd.read_csv('/path/Data/data/meta/csv/nl/recordings3.csv', encoding='utf-8-sig')

# Derive 'Root' (the name of the source recording) from the chunk path.
# Chunk files are named "<recording>-<chunk number>.wav", so take the file name
# (text after the last '/') and cut it at the LAST hyphen; cutting at the first
# hyphen would truncate recording names that contain a hyphen themselves.
df['Root'] = (
    df['Path']
    .str.rsplit('/', n=1).str[-1]
    .str.rsplit('-', n=1).str[0]
)

# Save the updated dataframe to a new CSV file
df.to_csv('/path/Data/data/meta/csv/nl/recordings4.csv', index=False, encoding='utf-8-sig')

In [None]:
import pandas as pd

# Load the chunk list (recordings4). It was written with a UTF-8 BOM, so it is
# read back with the same 'utf-8-sig' encoding as the other CSV files.
df_recordings4 = pd.read_csv('/path/Data/data/meta/csv/nl/recordings4.csv', encoding='utf-8-sig')

# Load the per-recording metadata (recordings2)
df_recordings2 = pd.read_csv('/path/Data/data/meta/csv/nl/recordings2.csv', encoding='utf-8-sig')

# Metadata columns to attach to every chunk row; 'Root' is the join key.
metadata_columns = ['Root', 'Age', 'Component', 'SpeakerID', 'Group', 'Gender',
                    'CEF', 'DialectRegion', 'Duration (seconds)', 'Duration (days)']

# Left merge: every chunk row is kept; chunks whose Root has no match in
# recordings2 get empty metadata.
merged_df = df_recordings4.merge(df_recordings2[metadata_columns], on='Root', how='left')

# Save the merged dataframe to a new CSV file
merged_df.to_csv('/path/Data/data/meta/csv/nl/recordings5.csv', index=False, encoding='utf-8-sig')