In [53]:
import gzip
import re
import os
import pandas as pd

In [54]:
'''
utt2spk file
file_id id

1272-128104-0000 1272
1272-128104-0002 1272
1272-128104-0003 1272
1272-128104-0005 1272
'''

def extract_utt2spk(file_path):
    # Extract the text part of the filename
    filename = os.path.basename(file_path)
    file_prefix = filename.split('.')[0]
    # Read the contents of the gzipped file
    with gzip.open(file_path, 'rt') as file:
        contents = file.read()
    # Regex pattern to match text within quotation marks, excluding specific terms
    pattern = r'"(.*?)"'
    matches = re.findall(pattern, contents)
    # Exclude matches that contain "TextGrid", "IntervalTier", "N02000", "ooTextFile short", or are empty
    exclude_terms = {"TextGrid", "IntervalTier", "ooTextFile short", ""}
    text_lines = []
    for match in matches:
        if not any(match == term for term in exclude_terms):
            text_lines.append(match)
    # Remove specific markers like ^ and |
    cleaned_lines = text_lines[0]
    
    result_text = file_prefix + " " + cleaned_lines
    # print(result_text)
    return result_text




In [55]:
def process_folder(folder_path, function, name):
    results = []
    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt') and not filename.endswith('.scp'):
            file_path = os.path.join(folder_path, filename)
            cleaned_text = function(file_path)
            results.append(cleaned_text)
    
    # Save results to transcripts.txt
    with open(os.path.join(folder_path, name), 'w') as outfile:
        for line in results:
            outfile.write(line + "\n")

# Example usage
folder_path = 'data/Transcripts_clean'
process_folder(folder_path, extract_utt2spk, 'utt2spk.txt')

In [56]:
def extract_wav(file_path):
    '''
    wav.scp file:
    file_id file_path 
    1272-128104-0000 data/libri_dev/wav/1272-128104-0000/1272-128104-0000.wav
    1272-128104-0002 data/libri_dev/wav/1272-128104-0002/1272-128104-0002.wav

    '''
    # Extract the text part of the filename
    file_path = file_path.replace('\\', '/')
    filename = os.path.basename(file_path)
    file_prefix = filename.split('.')[0]
    # Read the contents of the gzipped file
    
    result_text = file_prefix + " " + file_path
    # print(result_text)
    return result_text

folder_path = 'data/nl'
process_folder(folder_path, extract_wav, 'wav.scp')

In [57]:
def extract_spk2gender(file_path, xls_file, name):
    '''
    spk2gender:
    id gender
    1272 m
    1462 f
    1673 f
    '''
    # get list of all speakers
    speakers = []
    with open(file_path, 'r') as f:
        for line in f:
            _, speaker = line.split(' ', 1)  # Split on the first space
            speaker = speaker.strip()
            if speaker not in speakers:
                speakers.append(speaker)
    print(len(speakers))
    # Read xls file
    data = pd.read_excel(xls_file)
    # Extract the 'ID' and 'sex' columns
    extracted_data = data[['ID', 'sex']]
    
    #write speakers and gender in file
    with open(os.path.join(folder_path, name), 'w') as outfile:
        for _, row in extracted_data.iterrows(): 
            if row['ID'] in speakers:
                print(row['ID'], " is in ", speakers)
                text = ""
                if row['sex'] == 'sex1':
                    text = row['ID'] + " m"
                else:
                    text = row['ID'] + " f"
                outfile.write(text + "\n")
            
            
extract_spk2gender("data/Transcripts_clean/utt2spk.txt", "speakers.xls", "spk2gender.txt")
    
    

10
N02000  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02001  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02002  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02003  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02004  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02005  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02006  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02007  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N02005', 'N02006', 'N02007', 'N02008', 'N02009']
N02008  is in  ['N02000', 'N02001', 'N02002', 'N02003', 'N02004', 'N0

In [58]:
def extract_spk2utt(file_path, name):
    '''
    spk2utt
    id lijst_van_dingen_voor_dat_id
    '''
    #create dictionary
    speakers = {}
    with open(file_path, 'r') as f:
        for line in f:
            file, speaker = line.split(' ', 1)  # Split on the first space
            speaker = speaker.strip()
            if speaker in speakers:
                temp = speakers.get(speaker)
                temp.append(file)
                speakers[speaker] = temp
            else:
                speakers[speaker] = [file]

    #print dictionary
    with open(os.path.join(folder_path, name), 'w') as outfile:
        for speaker, files in speakers.items():
            text = speaker
            for file in files:
                text = text + " " + file
            outfile.write(text + "\n")

folder_path = 'files_for_harm'
extract_spk2utt('files_for_harm/utt2spk.txt', 'spk2utt.txt')

In [59]:
def extract_utt2dur(file_path, xls_file, name):
    '''
    utt2dur
    file_id length_of_file_in_seconds
    1272-128104-0000 5.855 
    1272-128104-0002 12.485 
    1272-128104-0003 9.9 
    1272-128104-0005 9.01 
    1272-128104-0006 5.64 
    1272-128104-0007 9.24 
    1272-128104-0010 5.6
    '''
    # get relevant ids
    ids = []
    with open(file_path, 'r') as f:
        for line in f:
            id, _ = line.split(' ', 1)  # Split on the first space
            id = id.strip()
            if id not in ids:
                ids.append(id)
    print(len(ids))
    # Read xls file
    data = pd.read_excel(xls_file)
    # Extract the 'ID' and 'sex' columns
    extracted_data = data[['recordingID', 'secCount']]
    
    #write speakers and gender in file
    with open(os.path.join(folder_path, name), 'w') as outfile:
        for _, row in extracted_data.iterrows(): 
            if row['recordingID'] in ids:
                text = str(row['recordingID']) + " "+ str(row['secCount'])
                outfile.write(text + "\n")
            
extract_utt2dur("data/Transcripts_clean/utt2spk.txt", "recordings.xls", "utt2dur.txt")

134


In [60]:
def trials(gender_file, utt_file, name_m, name_f):
    '''
    trials_m and trials_f
    id file_id target/nontarget
    '''
    
    # get list of male and female audios
    males = []
    females = []
    with open(gender_file, 'r') as f:
        for line in f:
            parts = line.split(' ')
            if parts[1].strip() == 'm':
                males.append(parts[0])
            else:
                females.append(parts[0])

    # get list of male and female audios
    male_audios = {}
    female_audios = {}
    with open(utt_file, 'r') as f:
        for line in f:
            parts = line.split(' ')
            if parts[0] in males:
                male_audios[parts[0]] = parts[1:]
            else:
                for file in parts[1:]:
                    female_audios[parts[0]] = parts[1:]
                    
                    
    # loop through males
    with open(os.path.join(folder_path, name_m), 'w') as outfile:
        for speaker in male_audios.keys():
            for key, files in male_audios.items():
                target = "nontarget"
                if speaker == key:
                    target = "target"
                for file in files:
                    text = speaker + " " + file.strip() + " " + target
                    outfile.write(text + "\n")
    
    with open(os.path.join(folder_path, name_f), 'w') as outfile:
        for speaker in female_audios.keys():
            for key, files in female_audios.items():
                target = "nontarget"
                if speaker == key:
                    target = "target"
                for file in files:
                    text = speaker + " " + file.strip()  + " " + target
                    outfile.write(text + "\n")

trials("files_for_harm/spk2gender.txt", "files_for_harm/spk2utt.txt", "trials_m.txt", "trials_f.txt")

In [61]:
def enrolls(file, name):
    ''' 
    file_ids
    1272-128104-0000
    1272-128104-0002
    1272-128104-0003
    ''' 
    
    with open(file, 'r') as f:
        with open(os.path.join(folder_path, name), 'w') as outfile:
            for line in f:
                parts = line.split(' ')
                outfile.write(parts[0] + "\n")

enrolls("files_for_harm/utt2spk.txt", "enrolls.txt")

In [78]:
def spk2gender_sub(gender, file, name):
    with open(file, 'r') as f:
        with open(os.path.join(folder_path, name), 'w') as outfile:
            for line in f:
                parts = line.split(' ')
                if parts[1].strip() == gender:  
                    outfile.write(line)
                

def spk2utt_sub(gender, file, name): 
    speakers = []
    with open(gender, 'r') as f:
        for line in f:   
            speakers.append(line.split(' ')[0]) 
                
    with open(file, 'r') as f:
        with open(os.path.join(folder_path, name), 'w') as outfile:
            for line in f:
                parts = line.split(' ')
                if parts[0].strip() in speakers:  
                    outfile.write(line)    

def text_sub(gender, file, name): 
    files = []
    with open(gender, 'r') as f:
        for line in f:   
            for utt in line.split(' ')[1:]:
                files.append(utt.strip())       
                
    with open(file, 'r') as f:
        with open(os.path.join(folder_path, name), 'w') as outfile:
            for line in f:
                parts = line.split(' ')
                if parts[0].strip() in files:  
                    outfile.write(line)  
                    
 

In [82]:
# Females
folder_path = "files_for_harm/trials_f"
spk2gender_sub("f", "files_for_harm/spk2gender.txt", "spk2gender.txt")
spk2utt_sub("files_for_harm/trials_f/spk2gender.txt", "files_for_harm/spk2utt.txt", "spk2utt.txt")
text_sub("files_for_harm/trials_f/spk2utt.txt", "files_for_harm/text.txt", "text.txt")
text_sub("files_for_harm/trials_f/spk2utt.txt", "files_for_harm/utt2dur.txt", "utt2dur.txt")
text_sub("files_for_harm/trials_f/spk2utt.txt", "files_for_harm/utt2spk.txt", "utt2spk.txt")
text_sub("files_for_harm/trials_f/spk2utt.txt", "files_for_harm/wav.scp", "wav.scp")

# Males
folder_path = "files_for_harm/trials_m"
spk2gender_sub("m", "files_for_harm/spk2gender.txt", "spk2gender.txt")
spk2utt_sub("files_for_harm/trials_m/spk2gender.txt", "files_for_harm/spk2utt.txt", "spk2utt.txt")
text_sub("files_for_harm/trials_m/spk2utt.txt", "files_for_harm/text.txt", "text.txt")
text_sub("files_for_harm/trials_m/spk2utt.txt", "files_for_harm/utt2dur.txt", "utt2dur.txt")
text_sub("files_for_harm/trials_m/spk2utt.txt", "files_for_harm/utt2spk.txt", "utt2spk.txt")
text_sub("files_for_harm/trials_m/spk2utt.txt", "files_for_harm/wav.scp", "wav.scp")
