In [2]:
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import time
import glob
from lxml.html import parse
from sphfile import SPHFile
import pydub
import audiosegment
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
import sys

## Pair sound files and annotation
# Combine the two cells to avoid overlap!!!

In [3]:
soundfiles = glob.glob('testfiles/**/*.sph', recursive = True)
#annofiles = glob.glob('./**/*.txt', recursive = True) + glob.glob('./**/*.sgml', recursive = True)
data = {}
for soundfile in soundfiles:
    name = soundfile.split('/')[-1].split('.')[0]
    annofile = list(glob.glob('testfiles/**/'+name+'.txt', recursive = True))+list(glob.glob('testfiles/**/'+name+'.sgml', recursive = True))
    annofile = annofile[0]
    data[soundfile] = annofile
start_load = time.time()
soundfiles = list(data.keys())
data

{'testfiles/files/e960510b.sph': 'testfiles/files/e960510b.txt',
 'testfiles/files/g960515_.sph': 'testfiles/files/g960515_.txt',
 'testfiles/files/h960514_.sph': 'testfiles/files/h960514_.txt',
 'testfiles/files/j960510_.sph': 'testfiles/files/j960510_.txt',
 'testfiles/files/e960513a.sph': 'testfiles/files/e960513a.txt',
 'testfiles/files/e960514a.sph': 'testfiles/files/e960514a.txt',
 'testfiles/files/e960510a.sph': 'testfiles/files/e960510a.txt',
 'testfiles/files/e960514b.sph': 'testfiles/files/e960514b.txt',
 'testfiles/files/e960515_.sph': 'testfiles/files/e960515_.txt',
 'testfiles/files/e960513b.sph': 'testfiles/files/e960513b.txt'}

## Figure out gender of each speaker

In [4]:
# gender = {}
# for annofile in data.values():
#     doc = parse(annofile)
#     for tag in doc.getroot().iter():
#         if tag.tag in ['segment', 'turn']:
#             if 'spkrtype' in tag.attrib:
#                 gender[tag.attrib['speaker']] = tag.attrib['spkrtype']
gender = np.load('gender.npy', allow_pickle = True).reshape(1)[0]

FileNotFoundError: [Errno 2] No such file or directory: 'gender.npy'

# Save Segments

In [5]:
def calculate_overlap(row1, row2):
    if row1['soundfile'] != row2['soundfile']: # can only overlap if in same file
        return 0
    if row1['end']<row2['start'] or row1['start']>row2['end']:
        return 0
    if row1['start']<row2['start']:
        return row1['end']-row2['start']
    return row2['end']-row1['start']
def getstart(segment):
    return float(segment['s_time']) if segment.has_attr('s_time') else float(segment['starttime'])
def getend(segment):
    return float(segment['e_time']) if segment.has_attr('e_time') else float(segment['endtime'])
end_df = pd.DataFrame()
start_df = pd.DataFrame()
for i, soundfile in enumerate(soundfiles):
    name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
    print(i)
    sph = SPHFile(soundfile)
    sound, sr = sph.content, sph.format['sample_rate']
    annofile = data[soundfile]
    with open(annofile) as file:
        soup = BeautifulSoup(file,'html.parser')
    tags = soup.find_all(['segment', 'turn'])
    for j, segment in enumerate(tags):
        save_start, save_end = True, True
        text = segment.content
        end = getend(segment)
        start = getstart(segment)
        speaker = segment['speaker'].lower()
        if speaker[:4] in ['male', 'fema', 'spkr']:
            speaker = name+'_'+speaker
        if speaker.lower().find('announcer') != -1:
            #print(speaker)
            continue
        if speaker in ['01janedoe', '01johndoe', '02janedoe', '02johndoe', '04johndoe', 'anchor1', 'unison']:
            continue
        if end > len(sound)/sr: # if end exceeds length, skip
            continue
        if start >= end - 2: # if segment too short
            continue
            
        
        # ending segments
        if j + 1 < len(tags) and getstart(tags[j+1]) < end - 0.1: # if someone interrupted talking
            #print('end interrupted')
            save_end = False
        end_idx = int(end*sr)
        start_idx = int((end-2)*sr) # truncate segment to 2s
        end_segment = sound[start_idx:end_idx]
        seg_object = audiosegment.from_numpy_array(end_segment, sr)
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences] # turn unit into samples
        if silences!= []:
            if silences[-1][1] == 2*sr: # if silence at end, remove that silence
                #print('readjusted end')
                end_idx = end_idx - 2*sr + silences[-1][0] # readjust silence end
                # reset start idx
                start_idx = int(start*sr)
                if start_idx >= end_idx - 2*sr: # if segment too short
                    continue
                start_idx = end_idx - 2*sr 
        # recrop
        end_segment = sound[start_idx:end_idx]
        seg_object = audiosegment.from_numpy_array(end_segment, sr)
        # recalculate silence
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences]
        length_silence = sum([b-a for [a, b] in silences])
        if length_silence/len(end_segment) > 0.3:
            save_end = False
        end_filename = 'testfiles/end_segments/'+name+'_seg'+str(j)+'.npy'
        end_row = {'soundfile': soundfile, 'segment_idx': j, 'start': start, 'end': end, 
               'silence_ratio': length_silence/len(end_segment), 'speaker':speaker, 'segfile':end_filename}
        
        
        # starting segments
        if j > 0 and getend(tags[j-1]) > start + 0.1: #if someone interrupted talking
            #print('start interrupted')
            save_start = False
        end_idx = int((start+2)*sr)
        start_idx = int(start*sr) # truncate segment to 2s
        start_segment = sound[start_idx:end_idx]
        seg_object = audiosegment.from_numpy_array(start_segment, sr)
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences] # turn unit into samples
        if silences!= []:
            if silences[0][0] == 0: # if silence at start, remove that silence # different
                #print('readjusted start')
                start_idx = start_idx + silences[0][1] - int(0.1*sr) # readjust silence start # different, include attack
                # reset end idx
                end_idx = int(end*sr)
                if start_idx >= end_idx - 2*sr: # if segment too short
                    continue
                end_idx = start_idx + 2*sr # different
        # recrop
        start_segment = sound[start_idx:end_idx]
        seg_object = audiosegment.from_numpy_array(start_segment, sr)
        # recalculate silence
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences]
        length_silence = sum([b-a for [a, b] in silences])
        if length_silence/len(start_segment) > 0.3:
            save_start = False
        start_filename = 'testfiles/start_segments/'+name+'_seg'+str(j)+'.npy'
        start_row = {'soundfile': soundfile, 'segment_idx': j, 'start': start, 'end': end, 
                   'silence_ratio': length_silence/len(start_segment), 'speaker':speaker, 'segfile':start_filename}
        if save_end and save_start and calculate_overlap(start_row, end_row)>sr: # if overlap, only take one
            if np.random.randn()>0:
                save_end = False
            else:
                save_start = False
        #print(save_end, save_start)
        if save_end:
            np.save(end_filename, end_segment)
            end_df = end_df.append(end_row, ignore_index = True)
        if save_start:
            np.save(start_filename, start_segment)
            start_df = start_df.append(start_row, ignore_index = True)
end_df.to_csv('testfiles/end.csv', index = False)
start_df.to_csv('testfiles/start.csv', index = False)

0
1
2
3
4
5
6
7
8
9


## Check for redundant speaker

In [6]:
segments_df = start_df.append(end_df, ignore_index = True)
#spkr_df = segments_df[segments_df['speaker'] == 'j960522b_johndoe001']
spkr_df = segments_df[segments_df['speaker'].str.contains('')]
idx = np.random.randint(len(spkr_df))
segfile = spkr_df.iloc[idx]['segfile']
seg = np.fromfile(segfile, dtype = np.int16)
print(spkr_df.iloc[idx]['soundfile'], spkr_df.iloc[idx]['speaker'], spkr_df.iloc[idx]['segfile'])
print(len(Counter(segments_df['speaker'])), ' speakers')
display(segments_df)
ipd.Audio(seg, rate = sr)
#rob_gifford, bill_clinton, jack_smith

testfiles/files/h960514_.sph joan_mcfarland testfiles/end_segments/h960514__seg89.npy
187  speakers


Unnamed: 0,end,segfile,segment_idx,silence_ratio,soundfile,speaker,start
0,16.639,testfiles/start_segments/e960510b_seg0.npy,0.0,0.000,testfiles/files/e960510b.sph,martin_savage,1.071
1,46.576,testfiles/start_segments/e960510b_seg1.npy,1.0,0.050,testfiles/files/e960510b.sph,lou_waters,16.639
2,62.153,testfiles/start_segments/e960510b_seg2.npy,2.0,0.050,testfiles/files/e960510b.sph,lou_waters,47.227
3,75.420,testfiles/start_segments/e960510b_seg3.npy,3.0,0.000,testfiles/files/e960510b.sph,martin_savage,63.025
4,98.866,testfiles/start_segments/e960510b_seg4.npy,4.0,0.080,testfiles/files/e960510b.sph,jamie_mcintyre,77.279
...,...,...,...,...,...,...,...
1452,1553.800,testfiles/end_segments/e960513b_seg90.npy,90.0,0.050,testfiles/files/e960513b.sph,mark_bernheimer,1547.015
1453,1563.705,testfiles/end_segments/e960513b_seg91.npy,91.0,0.215,testfiles/files/e960513b.sph,e960513b_f_us_016,1553.823
1454,1577.195,testfiles/end_segments/e960513b_seg92.npy,92.0,0.000,testfiles/files/e960513b.sph,mark_bernheimer,1563.705
1455,1583.737,testfiles/end_segments/e960513b_seg95.npy,95.0,0.000,testfiles/files/e960513b.sph,natalie_allen,1579.504


In [7]:
from bs4 import BeautifulSoup
total_segs = 0
soundfile = soundfiles[2]
name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
sph = SPHFile(soundfile)
sound, sr = sph.content, sph.format['sample_rate']
annofile = data[soundfile]
with open(annofile) as file:
    soup = BeautifulSoup(file,'html.parser')
soup.find_all(['segment', 'turn'])

[<segment e_time="124.975" fidelity="High" mode="Planned" s_time="118.042" speaker="Kathleen_Kennedy">
 <background level="Low" time="118.042" type="Other">
 the capital of liberia erupts in violence
 <sync time="120.802">
 once again rival factions are battling for supremacy in the city streets
 {breath}
 </sync></background></segment>,
 <segment e_time="129.866" fidelity="Medium" mode="Spontaneous" s_time="124.975" speaker="male_nonnative2">
 <background level="Low" time="124.975" type="Speech">
 i was extremely impressed with some of his answers in the (( )) just
 reaffirmed my commitment that he was innocent
 </background></segment>,
 <segment e_time="133.361" fidelity="Medium" mode="Spontaneous" s_time="129.866" speaker="female_nonnative1">
 okay so he wasn't proven guilty but he wasn't proven innocent either
 </segment>,
 <segment e_time="137.507" fidelity="Medium" mode="Spontaneous" s_time="133.681" speaker="male_nonnative3">
 i think that the vast majority will st- still think 

In [8]:
from bs4 import BeautifulSoup
total_segs = 0
for i, soundfile in enumerate(soundfiles):
    name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
    sph = SPHFile(soundfile)
    sound, sr = sph.content, sph.format['sample_rate']
    annofile = data[soundfile]
    try:
        with open(annofile) as file:
            soup = BeautifulSoup(file,'html.parser')
        print(i, annofile, len(soup.find_all(['segment', 'turn'])))
        total_segs+=len(soup.find_all(['segment', 'turn']))
        
    except:
        print('broken file ', annofile)
    if len(soup.find_all(['segment', 'turn'])) < 10:
        print(soup.find_all(['segment', 'turn']))
        print('error parsing ', annofile)
        break
total_segs

0 testfiles/files/e960510b.txt 116
1 testfiles/files/g960515_.txt 63
2 testfiles/files/h960514_.txt 177
3 testfiles/files/j960510_.txt 64
4 testfiles/files/e960513a.txt 82
5 testfiles/files/e960514a.txt 78
6 testfiles/files/e960510a.txt 77
7 testfiles/files/e960514b.txt 97
8 testfiles/files/e960515_.txt 82
9 testfiles/files/e960513b.txt 99


935

In [9]:
spkrfile = 'trainfiles/LDC97T22/hub4_eng_train_trans/spkrlist.sgml'
spkrlist = []
with open(spkrfile) as file:
    for line in file:
        soup = BeautifulSoup(line, 'html.parser')
        doc = soup.find_all('speaker')
        if len(doc):
            doc = doc[0]
            spkrlist.append(doc['name'])
len(spkrlist)

3116

In [10]:
print(len(spkrlist))
spkrlist = Counter(spkrlist)
print([speaker for speaker in spkrlist.keys() if spkrlist[speaker]>1])
print(len(spkrlist))

3116
['Alfonse_DAmato', 'Amanda_Greenleaf_Whelan', 'Ann_Lewis', 'Anne_McDermott', 'Bo_Gritz', 'Brian_Jenkins', 'Brent_Sadler', 'Kathy_Lohr', 'Oliver_Caman', 'Dan_Rutz', 'Don_Knapp', 'Reni_Vaughn', 'Lisa_Price', 'John_Ydstie', 'Steve_Inskeep', 'Jamie_McIntyre', 'Jim_Zarroli', 'Katharine_Barrett', 'Kent_Ninomiya', 'Lauch_Faircloth', 'Martha_Raddatz', 'Martin_Buser', 'Mary_Schiavo', 'Michael_Sivy', 'Philip_Boroff', 'Roger_Cossack', 'William_Perry', 'Sherry_Matteucci', 'William_Clinger']
3087


## Pool start/end segments

In [11]:
start_df = pd.read_csv('testfiles/start.csv')
end_df = pd.read_csv('testfiles/end.csv')
start_df = start_df.sample(frac=1).reset_index(drop=True)
end_df = end_df.sample(frac=1).reset_index(drop=True)
speakers = set(list(start_df['speaker'])+list(end_df['speaker']))
print(len(speakers), 'speakers')

187 speakers


## testset

In [12]:
# sample testing data
start_df_test = start_df[:]
end_df_test = end_df[:]
segments_df_test = start_df_test.append(end_df_test, ignore_index = True)
segments_by_speaker_test = {}
# for faster fetching of triplets negative speaker
for speaker in speakers:
    segments_by_speaker_test[speaker] = segments_df_test[segments_df_test['speaker'] == speaker]
# for faster fetching of pairs negative speaker
start_by_speaker_test = {}
for speaker in speakers:
    start_by_speaker_test[speaker] = start_df_test[start_df_test['speaker'] == speaker]
end_by_speaker_test = {}
for speaker in speakers:
    end_by_speaker_test[speaker] = end_df_test[end_df_test['speaker'] == speaker]

In [13]:
speakers_test = [speaker for speaker in speakers if len(segments_by_speaker_test[speaker])>0]
speakers_start_test = [speaker for speaker in speakers if len(start_by_speaker_test[speaker])>0]
speakers_end_test = [speaker for speaker in speakers if len(end_by_speaker_test[speaker])>0]
len(speakers_test), len(speakers_start_test), len(speakers_end_test)

(187, 180, 180)

## Make pairs

In [14]:
pairs = []
for _, anchor_speaker in enumerate(speakers_test):
    print(_)
    if anchor_speaker not in speakers_start_test or anchor_speaker not in speakers_end_test:
        continue # if speaker doesn't have both a start&end segment
    negative_speakers = [speaker for speaker in speakers_start_test if speaker != anchor_speaker]
    anchor_segments = end_by_speaker_test[anchor_speaker] # at end of segment
    positive_segments = start_by_speaker_test[anchor_speaker] # at start of segment
    anchor_indices = np.arange(len(anchor_segments))
    positive_indices = np.arange(len(positive_segments))
    np.random.shuffle(anchor_indices)# shuffle the anchor segments
    n_s = min(len(anchor_segments), 50)
    n_p = min(len(positive_segments), 50)
    for i in range(n_s): #iterate over anchors
        anchor_segment = anchor_segments.iloc[anchor_indices[i]]
        np.random.shuffle(positive_indices)
        for j in range(n_p): #iterate over compared segment
            # positive examples
            positive_segment = positive_segments.iloc[positive_indices[j]]
            pos_row = {'first_speaker': anchor_speaker, 'first_file': anchor_segment['segfile'], 
                       'second_speaker': anchor_speaker, 'second_file': positive_segment['segfile'],
                       'label': 0} # no speaker change
            # negative examples
            negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            negative_segments = start_by_speaker_test[negative_speaker] # segments for a specific random speaker
            negative_segment = negative_segments.iloc[np.random.randint(0, len(negative_segments))]
            neg_row = {'first_speaker': anchor_speaker, 'first_file': anchor_segment['segfile'], 
               'second_speaker': negative_speaker, 'second_file': negative_segment['segfile'],
               'label': 1} # has speaker change
            pairs.append(pos_row)
            pairs.append(neg_row)
pairs = pd.DataFrame(pairs)
pairs.to_csv('testfiles/test-pairs.csv')
pairs

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186


Unnamed: 0,first_speaker,first_file,second_speaker,second_file,label
0,h960514_m_nus_002,testfiles/end_segments/h960514__seg75.npy,h960514_m_nus_002,testfiles/start_segments/h960514__seg75.npy,0
1,h960514_m_nus_002,testfiles/end_segments/h960514__seg75.npy,noah_adams,testfiles/start_segments/j960510__seg41.npy,1
2,dave_hennon,testfiles/end_segments/e960514b_seg63.npy,dave_hennon,testfiles/start_segments/e960513a_seg56.npy,0
3,dave_hennon,testfiles/end_segments/e960514b_seg63.npy,e960510b_f_us_011,testfiles/start_segments/e960510b_seg75.npy,1
4,dave_hennon,testfiles/end_segments/e960514b_seg63.npy,dave_hennon,testfiles/start_segments/e960514b_seg63.npy,0
...,...,...,...,...,...
22325,mark_bernheimer,testfiles/end_segments/e960513b_seg80.npy,g960515__male_native2,testfiles/start_segments/g960515__seg61.npy,1
22326,mark_bernheimer,testfiles/end_segments/e960513b_seg80.npy,mark_bernheimer,testfiles/start_segments/e960513b_seg80.npy,0
22327,mark_bernheimer,testfiles/end_segments/e960513b_seg80.npy,h960514_m_us_010,testfiles/start_segments/h960514__seg87.npy,1
22328,mark_bernheimer,testfiles/end_segments/e960513b_seg80.npy,mark_bernheimer,testfiles/start_segments/e960513b_seg88.npy,0
