In [2]:
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import time
import glob
from lxml.html import parse
from sphfile import SPHFile
import pydub
import audiosegment
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
import sys

## Pair sound files and annotation
# Combine the two cells to avoid overlap!!!

In [4]:
soundfiles = glob.glob('trainfiles/**/*.sph', recursive = True)
#annofiles = glob.glob('./**/*.txt', recursive = True) + glob.glob('./**/*.sgml', recursive = True)
data = {}
for soundfile in soundfiles:
    name = soundfile.split('/')[-1].split('.')[0]
    annofile = list(glob.glob('trainfiles/**/'+name+'.txt', recursive = True))+list(glob.glob('trainfiles/**/'+name+'.sgml', recursive = True))
    annofile = annofile[0]
    data[soundfile] = annofile
start_load = time.time()
soundfiles = list(data.keys())
data

{'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960521_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960521_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960522_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960522_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960523_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960523_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960524_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960524_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960528_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960528_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960530_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960530_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960604_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960604_.txt',
 'trainfiles/LDC97S44/H4E96_01/h4eng_sp/a960605_.sph': 'trainfiles/LDC97T22/hub4_eng_train_trans/aABC_NLI/a960605_.txt',
 'trainfiles/LDC97S44/H4E96_01/h

## Figure out gender of each speaker

In [68]:
# gender = {}
# for annofile in data.values():
#     doc = parse(annofile)
#     for tag in doc.getroot().iter():
#         if tag.tag in ['segment', 'turn']:
#             if 'spkrtype' in tag.attrib:
#                 gender[tag.attrib['speaker']] = tag.attrib['spkrtype']
gender = np.load('gender.npy', allow_pickle = True).reshape(1)[0]

# Ending Segments

In [71]:
def getstart(segment):
    return float(segment['s_time']) if segment.has_attr('s_time') else float(segment['starttime'])
def getend(segment):
    return float(segment['e_time']) if segment.has_attr('e_time') else float(segment['endtime'])
end_df = pd.DataFrame()
start_df = pd.DataFrame()
for i, soundfile in enumerate(soundfiles):
    name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
    print(i)
    sph = SPHFile(soundfile)
    sound, sr = sph.content, sph.format['sample_rate']
    annofile = data[soundfile]
    with open(annofile) as file:
        soup = BeautifulSoup(file,'html.parser')
    tags = soup.find_all(['segment', 'turn'])
    for j, segment in enumerate(tags):
        text = segment.content
        end = getend(segment)
        start = getstart(segment)
        speaker = segment['speaker']
        if speaker[:4] in ['male', 'fema']:
            speaker = name+'_'+speaker
        if end > len(sound)/sr: # if end exceeds length, skip
            continue
        if start >= end - 2: # if segment too short
            continue
            
        
        # ending segments
        if j + 1 < len(tags) and getstart(tags[j+1]) < end: # if someone interrupted talking
            print('interrupt')
            save_end = False
        end_idx = int(end*sr)
        start_idx = int((end-2)*sr) # truncate segment to 2s
        end_segment = sound[start_idx:end_idx]
        seg_object = audiosegment.from_numpy_array(end_segment, sr)
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences] # turn unit into samples
        if silences!= []:
            if silences[-1][1] == 2*sr: # if silence at end, remove that silence
                print('readjusted end')
                end_idx = end_idx - 2*sr + silences[-1][0] # readjust silence end
                # reset start idx
                start_idx = start*sr
                if start_idx >= end_idx - 2*sr: # if segment too short
                    continue
                start_idx = end_idx - 2*sr 
        # recrop
        end_segment = sound[start_idx:end_idx]
        seg_object = audiosegment.from_numpy_array(end_segment, sr)
        # recalculate silence
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences]
        length_silence = sum([b-a for [a, b] in silences])
        if length_silence/len(sound_segment) > 0.3:
            save_end = False
        end_filename = 'trainfiles/end_segments/'+name+'_seg'+str(j)+'.npy'
        end_row = {'file': soundfile, 'segment': j, 'start': start, 'end': end, 
               'silence_ratio': length_silence/len(sound_segment), 'speaker':speaker, 'filename':end_filename}
        if save_end = True:
            np.save(end_filename, sound_segment)
            end_df = end_df.append(row, ignore_index = True)
        # starting segments
        if j > 0 and getend(tags[j-1]) > start: #if someone interrupted talking
            print('interrupt')
            save_start = False
        end_idx = int((start+2)*sr)
        start_idx = int(start*sr) # truncate segment to 2s
        start_segment = sound[start_idx:end+idx]
        seg_object = audiosegment.from_numpy_array(start_segment, sr)
        silences = pydub.silence.detect_silence(seg_object, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences]
        if silences!= []:
            if silences[0][0] == 0: # if silence at start, remove that silence # different
                start = start + silences[0][1] - int(0.1*sr) # readjust silence start # different, include attack
                if segment.tag == 'segment':
                    end = int(float(segment.attrib['e_time'])*sr) # different
                else:
                    end = int(float(segment.attrib['endtime'])*sr) # different
                if start >= end - 2*sr: # if segment too short
                    continue
                end = start + 2*sr # different
        # recrop
        sound_segment = sound[start:end]
        seg = audiosegment.from_numpy_array(sound_segment, sr)
        # recalculate silence
        silences = pydub.silence.detect_silence(seg, min_silence_len = 100, silence_thresh=-32, seek_step = 30)
        silences = [[int(a/1000*sr),int(b/1000*sr)] for [a, b] in silences]
        #if silences!=[]: # print out end of first silence, if any
        #    print(silences[0][1]) # different
        length_silence = sum([b-a for [a, b] in silences])
        if length_silence/len(sound_segment) <= 0.3:
            filename = 'trainfiles/start_segments/'+name+'seg'+str(j)+'.npy'
            np.save(filename, sound_segment)
            row = {'file': soundfile, 'segment': j, 'start': start, 'end': end, 
                   'silence_ratio': length_silence/len(sound_segment), 'speaker':speaker, 'filename':filename}
            start_df = start_df.append(row, ignore_index = True) # different
        
        #for [a, b] in silences:
        #    emptiness += list(sound_segment[a:b])           

0
1
2
3
4
5
6
7
8
9


In [72]:
end_df.to_csv('trainfiles/end.csv', index = False)
end_df

Unnamed: 0,end,file,filename,segment,silence_ratio,speaker,start
0,233805.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/end_segments/file1seg0.npy,0.0,0.150,Leon_Harris,201805.0
1,1822056.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/end_segments/file1seg1.npy,1.0,0.000,Steve_Hurst,1790056.0
2,2028872.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/end_segments/file1seg2.npy,2.0,0.160,Leon_Harris,1996872.0
3,3342331.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/end_segments/file1seg3.npy,3.0,0.215,Steve_Hurst,3310331.0
4,3451714.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/end_segments/file1seg4.npy,4.0,0.000,Leon_Harris,3419714.0
...,...,...,...,...,...,...,...
612,57561840.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/end_segments/o960710pseg19.npy,19.0,0.000,o960710p_janedoe004,57529840.0
613,57690976.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/end_segments/o960710pseg20.npy,20.0,0.000,o960710p_janedoe001,57658976.0
614,60651552.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/end_segments/o960710pseg21.npy,21.0,0.130,Martin_Wells,60619552.0
615,84049040.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/end_segments/o960710pseg22.npy,22.0,0.095,Bob_Edwards,84017040.0


## Starting segments

0
1
2
3
4
5
6
7
8
9


In [74]:
start_df.to_csv('trainfiles/start.csv', index = False)
start_df

Unnamed: 0,end,file,filename,segment,silence_ratio,speaker,start
0,42560.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/start_segments/file1seg0.npy,0.0,0.000,Leon_Harris,10560.0
1,269005.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/start_segments/file1seg1.npy,1.0,0.000,Steve_Hurst,237005.0
2,1855816.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/start_segments/file1seg2.npy,2.0,0.000,Leon_Harris,1823816.0
3,2060872.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/start_segments/file1seg3.npy,3.0,0.050,Steve_Hurst,2028872.0
4,3374331.0,trainfiles/LDC97S66/96_eval/evaldata/file1.sph,trainfiles/start_segments/file1seg4.npy,4.0,0.000,Leon_Harris,3342331.0
...,...,...,...,...,...,...,...
649,57599120.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/start_segments/o960710pseg20.npy,20.0,0.225,o960710p_janedoe001,57567120.0
650,59804640.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/start_segments/o960710pseg21.npy,21.0,0.065,Martin_Wells,59772640.0
651,83809696.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/start_segments/o960710pseg22.npy,22.0,0.065,Bob_Edwards,83777696.0
652,84092400.0,trainfiles/LDC97S66/96_dev/devdata/o960710p.sph,trainfiles/start_segments/o960710pseg23.npy,23.0,0.250,o960710p_janedoe005,84060400.0


In [51]:
from bs4 import BeautifulSoup
soundfile = soundfiles[0]
name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
sph = SPHFile(soundfile)
sound, sr = sph.content, sph.format['sample_rate']
annofile = data[soundfile]

end

'8.235'

In [8]:
from bs4 import BeautifulSoup
total_segs = 0
for i, soundfile in enumerate(soundfiles):
    name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
    sph = SPHFile(soundfile)
    sound, sr = sph.content, sph.format['sample_rate']
    annofile = data[soundfile]
    try:
        with open(annofile) as file:
            soup = BeautifulSoup(file,'html.parser')
        print(i, annofile, len(soup.find_all(['segment', 'turn'])))
        total_segs+=len(soup.find_all(['segment', 'turn']))
        
    except:
        print('broken file ', annofile)
    if len(soup.find_all(['segment', 'turn'])) < 10:
        print(soup.find_all(['segment', 'turn']))
        print('error parsing ', annofile)
        break
total_segs

0 trainfiles/LDC97T22/hub4_eng_train_trans/dCNN_EED/d960531a.txt 97
1 trainfiles/LDC97T22/hub4_eng_train_trans/dCNN_EED/d960604b.txt 69
2 trainfiles/LDC97T22/hub4_eng_train_trans/dCNN_EED/d960531b.txt 92
3 trainfiles/LDC97T22/hub4_eng_train_trans/dCNN_EED/d960604a.txt 92
4 trainfiles/LDC97T22/hub4_eng_train_trans/gCNN_PRN/g960523_.txt 92
5 trainfiles/LDC97T22/hub4_eng_train_trans/hCNN_TWT/h960516_.txt 157
6 trainfiles/LDC97T22/hub4_eng_train_trans/gCNN_PRN/g960529_.txt 70
7 trainfiles/LDC97T22/hub4_eng_train_trans/gCNN_PRN/g960611_.txt 89
8 trainfiles/LDC97T22/hub4_eng_train_trans/gCNN_PRN/g960607_.txt 88
9 trainfiles/LDC97T22/hub4_eng_train_trans/hCNN_TWT/h960517_.txt 150
10 trainfiles/LDC97T22/hub4_eng_train_trans/gCNN_PRN/g960524_.txt 75
11 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960531d.txt 78
12 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960607c.txt 79
13 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960613c.txt 82
14 trainfiles/LDC97T22/hub4_eng_train_tran

119 trainfiles/LDC97T22/hub4_eng_train_trans/iCSP_WAJ/i960606_.txt 427
120 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960614b.txt 48
121 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960614d.txt 95
122 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960617_.txt 333
123 trainfiles/LDC97T22/hub4_eng_train_trans/kNPR_MKP/k960604_.txt 54
124 trainfiles/LDC97T22/hub4_eng_train_trans/kNPR_MKP/k960605_.txt 55
125 trainfiles/LDC97T22/hub4_eng_train_trans/kNPR_MKP/k960606_.txt 53
126 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960614a.txt 83
127 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960614c.txt 77
128 trainfiles/LDC97T22/hub4_eng_train_trans/kNPR_MKP/k960607_.txt 75
129 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960618b.txt 71
130 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960618a.txt 96
131 trainfiles/LDC97T22/hub4_eng_train_trans/jNPR_ATC/j960618c.txt 90
132 trainfiles/LDC97T22/hub4_eng_train_trans/kNPR_MKP/k960614_.txt 80
133 trainfiles/LDC

238 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/ee970724.sgml 200
239 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/ee970723.sgml 187
240 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/ee970703.sgml 135
241 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970914.sgml 45
242 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/ee970702.sgml 166
243 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970920.sgml 72
244 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970924.sgml 67
245 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/eh971008.sgml 196
246 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970923.sgml 83
247 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970922.sgml 85
248 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970921.sgml 62
249 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970919.sgml 52
250 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/em970918.sgml 87
251 trainfiles/LDC98T28/hub4e97_trans_980217/transcrp/eo970911.sgml 142


33624

In [38]:
spkrfile = 'trainfiles/LDC97T22/hub4_eng_train_trans/spkrlist.sgml'
spkrlist = []
with open(spkrfile) as file:
    for line in file:
        soup = BeautifulSoup(line, 'html.parser')
        doc = soup.find_all('speaker')
        if len(doc):
            doc = doc[0]
            spkrlist.append(doc['name'])
len(spkrlist)

3116

In [40]:
print(len(spkrlist))
spkrlist = Counter(spkrlist)
print([speaker for speaker in spkrlist.keys() if spkrlist[speaker]>1])
print(len(spkrlist))

3116
['Alfonse_DAmato', 'Amanda_Greenleaf_Whelan', 'Ann_Lewis', 'Anne_McDermott', 'Bo_Gritz', 'Brian_Jenkins', 'Brent_Sadler', 'Kathy_Lohr', 'Oliver_Caman', 'Dan_Rutz', 'Don_Knapp', 'Reni_Vaughn', 'Lisa_Price', 'John_Ydstie', 'Steve_Inskeep', 'Jamie_McIntyre', 'Jim_Zarroli', 'Katharine_Barrett', 'Kent_Ninomiya', 'Lauch_Faircloth', 'Martha_Raddatz', 'Martin_Buser', 'Mary_Schiavo', 'Michael_Sivy', 'Philip_Boroff', 'Roger_Cossack', 'William_Perry', 'Sherry_Matteucci', 'William_Clinger']
3087


# Train&Validate Split, figure out speakers

In [75]:
start_df = pd.read_csv('trainfiles/start.csv')
end_df = pd.read_csv('trainfiles/end.csv')
segments_df = start_df.append(end_df, ignore_index = True)
speaker_count = Counter(list(segments_df['speaker']))
speakers = list(speaker_count)
speakers.sort()
print(len(speakers),'speakers')
start_df = start_df.sample(frac=1).reset_index(drop=True)
end_df = end_df.sample(frac=1).reset_index(drop=True)
print(speakers)

155 speakers
['ABC_PRT_announcer', 'Al_Gore', 'Alex_Taylor', 'Alexander_Novak', 'Amy_Cazlow', 'Barbara_Boxer', 'Bernard_Shaw', 'Betsy_Keefer', 'Bill_Clinton', 'Bill_Hubbard', 'Bill_Richardson', 'Bill_Straub', 'Bob_Dole', 'Bob_Edwards', 'Boris_Maximov', 'Bud_Collins', 'Byron_Miranda', 'Candy_Crowley', 'Charles_Scanlon', 'Christina_Zelaya', 'Claudia_Sloan', 'Cynthia_Engle', 'Cynthia_McFadden', 'David_Brancaccio', 'David_Funderberk', 'David_Johnson', 'David_Smith', 'Diane_Sawyer', 'Dianne_Tumee', 'Donald_Radicheque', 'Donna_Kelly', 'Eileen_Clark', "Eileen_O'Conner", 'Elena_PPD', 'Fiona_Foster', 'Fritz_Ferber', 'George_Lewensky', 'George_Lewinski', 'Hassan_Maradovitch', 'Ignacio_Besaudi', 'Joanne_Miles', 'John_Dimsdale', 'John_McEnroe', 'John_Parker', 'Judy_Woodruff', 'Karin_Henrikson', 'Kay_Bailey_Hutchison', 'Kimberly_Dozier', 'Lee_Zasloff', 'Leon_Harris', 'Lisa_Mullins', 'Marina_Bauten', 'Martin_Wells', 'Mary_Ambrose', 'Maureena_Colby', 'Mickey_Kantor', 'NPR_MKP_Announcer', 'Nenet_Sheve

In [53]:
start_df = pd.read_csv('trainfiles/start.csv')
end_df = pd.read_csv('trainfiles/end.csv')
segments_df = start_df.append(end_df, ignore_index = True)
speaker_count = Counter(list(segments_df['speaker']))
speakers = list(speaker_count)
speakers.sort()
print(len(speakers),'speakers')
start_df = start_df.sample(frac=1).reset_index(drop=True)
end_df = end_df.sample(frac=1).reset_index(drop=True)
print(speakers)

1156 speakers
['Aarati_Kasturiranjan', 'Aaron_Brown', 'Abdul_Ram_Al-Shelhah', 'Abner_Louima', 'Adam_Hochberg', 'Adam_Karol_Czartoryski-Borbon', 'Adela_Nabretta', 'Adrienne_Arsenault', 'Aiesha_Perry', 'Aileen_Pincus', 'Al-Mal_Jaisigher', 'Al_Gore', 'Al_Hinman', 'Al_Hunt', 'Alan_Arkin', 'Alan_Dow', 'Alan_Greenspan', 'Alan_Yurman', 'Alex_Chadwick', 'Alex_Hunter', 'Alex_Penelas', 'Alexandra_Doan', 'Alexei_Potbaryoskin', 'Alfonzo_Martinez_Sierra', 'Alice_Featherstone', 'Alin_Lang', 'Allan_Duke', 'Amy_Eddings', 'Amy_Levin', 'Anatoly_Chubais', 'Anderson_Cooper', 'Andrea_Dubrovsky', 'Andrea_Koppel', 'Andrew_Luck_Banker', 'Andrew_Morton', 'Andrew_Vos', 'Andy_Green', 'Angela_Astore', 'Anita_Pratap', 'Ann_Kellan', 'Ann_McDermott', 'Anna_Feraluzi', 'Anna_Smith', 'Anne_Cooper', 'Anne_Fadiman', 'Anne_Marie_Maloney', 'Anne_McDermott', 'Anne_Merkich', 'Annie_Hamp', 'Ansel_Martinez', 'Ansonn_Chan', 'Anthony_Collins', 'Anthony_Keith_James', 'Anthony_Mack', 'Antonio_Mora', 'Arlen_Specter', 'Art_Buchwald'

In [33]:
# count = 0
# for speaker in speakers:
#     if speaker not in gender:
#         print(speaker, end = '  ')
#         key = input()
#         while key not in ['1', '2']:
#             key = input()
#         if key == '1':
#             print('male')
#             gender[speaker] = 'male'
#         else:
#             print('female')
#             gender[speaker] = 'female'
#         np.save('gender.npy', gender) 

In [34]:
def calculate_overlap(row1, row2):
    if row1['file'] != row2['file']: # can only overlap if in same file
        return 0
    if row1['end']<row2['start'] or row1['start']>row2['end']:
        return 0
    if row1['start']<row2['start']:
        return row1['end']-row2['start']
    return row2['end']-row1['start']
# overlap_count = 0
# for _, row1 in segments_df.iterrows():
#     #print(row1)
#     candidates = segments_df[segments_df['file'] == row1['file']]
#     candidates = candidates[candidates['speaker'] == row1['speaker']]
#     for _, row2 in candidates.iterrows():
#         if calculate_overlap(row1, row2)>0:
#             overlap_count+=1
#             #print(row2)
#     overlap_count-=1 # every segment overlaps with itself
# overlap_count

# Train

## Pool start/end segments

In [41]:
# sample training data
start_df = start_df[:int(len(start_df)*0.9)]
end_df = end_df[:int(len(end_df)*0.9)]

segments_df = start_df.append(end_df, ignore_index = True)
segments_by_speaker = {}
# for faster fetching of triplets negative speaker
for speaker in speakers:
    segments_by_speaker[speaker] = segments_df[segments_df['speaker'] == speaker]
# for faster fetching of pairs negative speaker
start_segments_by_speaker = {}
for speaker in speakers:
    start_segments_by_speaker[speaker] = start_df[start_df['speaker'] == speaker]
for speaker in speakers:
    print(speaker)

Aaron_Brown
Al_Gore
Al_Hinman
Alan_Dow
Andrea_Arsenault
Andy_Field
Anne_Garrels
Anne_McDermott
Anthony_Keith_James
Antonio_Mora
Austin_Bay
Barry_Serafin
Bernard_Shaw
Bill_Blakemore
Bill_Clinton
Bill_Dorman
Bill_Hemmer
Bill_Redeker
Bob_Caine
Bob_Dole
Bob_Franken
Brian_Cabell
Brian_Jenkins
Brian_Lamb
Brian_Rooney
Brian_Ross
Britt_Hume
Bruce_Morton
Byron_Miranda
C.C._Connelly
Candy_Crowley
Carl_Rochelle
Charles_Zewe
Chitra_Ragavan
Chris_Beary
Chris_Buerry
Chris_Bury
Chris_Wallace
Christina_Zorich
Christine_Negroni
Chuck_Roberts
Cokie_Roberts
Corey_Flintoff
Craig_Wintom
Cynthia_McFadden
Dan_Ronan
Dan_Rutz
Dave_Marash
David_Brancaccio
David_Ensor
David_Fromm
David_Hinson
David_McIntosh
David_Welna
Dean_Reynolds
Deborah_Amos
Diane_Sawyer
Dick_Schaap
Dick_Wilson
Don_Harrison
Don_Knapp
Donna_Kelley
Ed_Garsten
Eddie_Mair
Elizabeth_Arnold
Elsa_Klensch
Erin_Hayes
Eugenia_Halsey
Flip_Spiceland
Forrest_Sawyer
Frank_Stasio
Gary_Hart
Gary_Robertson
Gary_Tuchman
George_Lewinski
George_Strait
Gloria_Hi

## Make triplets

In [42]:
triplets = []
for _, anchor_speaker in enumerate(speakers):
    print(_)
    negative_speakers = [speaker for speaker in speakers if speaker != anchor_speaker]
    anchor_segments = segments_df[segments_df['speaker'] == anchor_speaker]
    anchor_indices = np.arange(len(anchor_segments))
    np.random.shuffle(anchor_indices)# shuffle the anchor segments
    for i in range(min(len(anchor_segments),35)):
        anchor_segment = anchor_segments.iloc[anchor_indices[i]]
        positive_indices = np.arange(len(anchor_segments))
        np.random.shuffle(positive_indices)# shuffle the anchor segments
        for j in range(min(len(anchor_segments),35)):
            positive_segment = anchor_segments.iloc[positive_indices[j]]
            if calculate_overlap(anchor_segment, positive_segment)>sr: # if overlap greater than 1s
                continue
            negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            negative_segments = segments_by_speaker[negative_speaker] # segments for a specific random speaker
            negative_segment = negative_segments.iloc[np.random.randint(0, len(negative_segments))]
            row = {'anchor_speaker': anchor_speaker, 'anchor_file': anchor_segment['filename'], 
                   'positive_file': positive_segment['filename'], 'negative_speaker':negative_speaker,
                   'negative_file': negative_segment['filename']}
            triplets.append(row)
triplets = pd.DataFrame(triplets)

triplets.to_csv('trainfiles/train-triplets.csv')
triplets

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181


Unnamed: 0,anchor_speaker,anchor_file,positive_file,negative_speaker,negative_file
0,Aaron_Brown,trainfiles/start_segments/ea980110seg41.npy,trainfiles/start_segments/c960521_seg74.npy,Carl_Rochelle,trainfiles/end_segments/e960517aseg9.npy
1,Aaron_Brown,trainfiles/start_segments/ea980110seg41.npy,trainfiles/start_segments/ea980110seg30.npy,Gary_Tuchman,trainfiles/start_segments/ee970813seg135.npy
2,Aaron_Brown,trainfiles/start_segments/ea980110seg41.npy,trainfiles/end_segments/ea980110seg46.npy,Jackie_Judd,trainfiles/start_segments/ea980112seg60.npy
3,Aaron_Brown,trainfiles/start_segments/ea980110seg41.npy,trainfiles/end_segments/c960523_seg95.npy,Peter_Kenyon,trainfiles/end_segments/j960618bseg17.npy
4,Aaron_Brown,trainfiles/start_segments/ea980110seg41.npy,trainfiles/start_segments/ea980108seg28.npy,Anthony_Keith_James,trainfiles/start_segments/ed980106seg52.npy
...,...,...,...,...,...
213663,Walter_Rodgers,trainfiles/end_segments/eo970826seg47.npy,trainfiles/start_segments/d960530aseg13.npy,Cokie_Roberts,trainfiles/end_segments/a960610_seg8.npy
213664,Walter_Rodgers,trainfiles/end_segments/eo970826seg47.npy,trainfiles/start_segments/em970922seg62.npy,Ron_Elving,trainfiles/end_segments/i960610_seg114.npy
213665,Walter_Rodgers,trainfiles/end_segments/eo970826seg47.npy,trainfiles/end_segments/em970922seg62.npy,Anne_Garrels,trainfiles/end_segments/j960607bseg38.npy
213666,Walter_Rodgers,trainfiles/end_segments/eo970826seg47.npy,trainfiles/end_segments/eo970826seg55.npy,Joie_Chen,trainfiles/end_segments/em971231seg56.npy


## Make pairs

In [43]:
pairs = []
for _, anchor_speaker in enumerate(speakers):
    print(_)
    negative_speakers = [speaker for speaker in speakers if speaker != anchor_speaker]
    anchor_segments = end_df[end_df['speaker'] == anchor_speaker] # at end of segment
    positive_segments = start_df[start_df['speaker'] == anchor_speaker] # at start of segment
    anchor_indices = np.arange(len(anchor_segments))
    np.random.shuffle(anchor_indices)# shuffle the anchor segments
    for i in range(min(len(anchor_segments), 50)):
        anchor_segment = anchor_segments.iloc[anchor_indices[i]]
        positive_indices = np.arange(len(positive_segments))
        np.random.shuffle(positive_indices)# shuffle the anchor segments
        for j in range(min(len(positive_segments), 50)):
            # positive examples
            positive_segment = positive_segments.iloc[positive_indices[j]]
            if calculate_overlap(anchor_segment, positive_segment)>sr: # if overlap greater than 1s
                continue
            # swap 1st 2nd randomly
            pos_row = {'first_speaker': anchor_speaker, 'first_file': anchor_segment['filename'], 
                       'second_speaker': anchor_speaker, 'second_file': positive_segment['filename'],
                       'label': 0} # no speaker change
      
            # negative examples
            negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            negative_segments = start_segments_by_speaker[negative_speaker] # segments for a specific random speaker
            negative_segment = negative_segments.iloc[np.random.randint(0, len(negative_segments))]
            neg_row = {'first_speaker': anchor_speaker, 'first_file': anchor_segment['filename'], 
               'second_speaker': negative_speaker, 'second_file': negative_segment['filename'],
               'label': 1} # has speaker change
            pairs.append(pos_row)
            pairs.append(neg_row)
pairs = pd.DataFrame(pairs)
pairs.to_csv('trainfiles/train-pairs.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181


# Validate

## Pool start/end segments

In [44]:
# sample val data
start_df = start_df[int(len(start_df)*0.9):]
end_df = end_df[int(len(end_df)*0.9):]

segments_df = start_df.append(end_df, ignore_index = True)
segments_by_speaker = {}
# for faster fetching of triplets negative speaker
for speaker in speakers:
    segments_by_speaker[speaker] = segments_df[segments_df['speaker'] == speaker]
# for faster fetching of pairs negative speaker
start_segments_by_speaker = {}
for speaker in speakers:
    start_segments_by_speaker[speaker] = start_df[start_df['speaker'] == speaker]
for speaker in speakers:
    print(speaker)

Aaron_Brown
Al_Gore
Al_Hinman
Alan_Dow
Andrea_Arsenault
Andy_Field
Anne_Garrels
Anne_McDermott
Anthony_Keith_James
Antonio_Mora
Austin_Bay
Barry_Serafin
Bernard_Shaw
Bill_Blakemore
Bill_Clinton
Bill_Dorman
Bill_Hemmer
Bill_Redeker
Bob_Caine
Bob_Dole
Bob_Franken
Brian_Cabell
Brian_Jenkins
Brian_Lamb
Brian_Rooney
Brian_Ross
Britt_Hume
Bruce_Morton
Byron_Miranda
C.C._Connelly
Candy_Crowley
Carl_Rochelle
Charles_Zewe
Chitra_Ragavan
Chris_Beary
Chris_Buerry
Chris_Bury
Chris_Wallace
Christina_Zorich
Christine_Negroni
Chuck_Roberts
Cokie_Roberts
Corey_Flintoff
Craig_Wintom
Cynthia_McFadden
Dan_Ronan
Dan_Rutz
Dave_Marash
David_Brancaccio
David_Ensor
David_Fromm
David_Hinson
David_McIntosh
David_Welna
Dean_Reynolds
Deborah_Amos
Diane_Sawyer
Dick_Schaap
Dick_Wilson
Don_Harrison
Don_Knapp
Donna_Kelley
Ed_Garsten
Eddie_Mair
Elizabeth_Arnold
Elsa_Klensch
Erin_Hayes
Eugenia_Halsey
Flip_Spiceland
Forrest_Sawyer
Frank_Stasio
Gary_Hart
Gary_Robertson
Gary_Tuchman
George_Lewinski
George_Strait
Gloria_Hi

## Make triplets

In [45]:
triplets = []
for _, anchor_speaker in enumerate(speakers):
    print(_)
    negative_speakers = [speaker for speaker in speakers if speaker != anchor_speaker]
    anchor_segments = segments_df[segments_df['speaker'] == anchor_speaker]
    anchor_indices = np.arange(len(anchor_segments))
    np.random.shuffle(anchor_indices)# shuffle the anchor segments
    for i in range(min(len(anchor_segments),35)):
        anchor_segment = anchor_segments.iloc[anchor_indices[i]]
        positive_indices = np.arange(len(anchor_segments))
        np.random.shuffle(positive_indices)# shuffle the anchor segments
        for j in range(min(len(anchor_segments),35)):
            positive_segment = anchor_segments.iloc[positive_indices[j]]
            if calculate_overlap(anchor_segment, positive_segment)>sr: # if overlap greater than 1s
                continue
            negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            while len(start_segments_by_speaker[negative_speaker] == 0): #if we dropped someone totally
                negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            negative_segments = segments_by_speaker[negative_speaker] # segments for a specific random speaker
            negative_segment = negative_segments.iloc[np.random.randint(0, len(negative_segments))]
            row = {'anchor_speaker': anchor_speaker, 'anchor_file': anchor_segment['filename'], 
                   'positive_file': positive_segment['filename'], 'negative_speaker':negative_speaker,
                   'negative_file': negative_segment['filename']}
            triplets.append(row)
triplets = pd.DataFrame(triplets)

triplets.to_csv('trainfiles/val-triplets.csv')
triplets

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181


Unnamed: 0,anchor_speaker,anchor_file,positive_file,negative_speaker,negative_file
0,Aaron_Brown,trainfiles/start_segments/ea980110seg0.npy,trainfiles/start_segments/c960523_seg87.npy,George_Strait,trainfiles/start_segments/c960522_seg39.npy
1,Aaron_Brown,trainfiles/start_segments/ea980110seg0.npy,trainfiles/end_segments/ea980110seg45.npy,Flip_Spiceland,trainfiles/end_segments/ee970703seg63.npy
2,Aaron_Brown,trainfiles/start_segments/ea980110seg0.npy,trainfiles/end_segments/c960521_seg68.npy,Susan_Swain,trainfiles/end_segments/file2seg73.npy
3,Aaron_Brown,trainfiles/end_segments/c960521_seg68.npy,trainfiles/start_segments/ea980110seg0.npy,David_Welna,trainfiles/start_segments/j960531bseg27.npy
4,Aaron_Brown,trainfiles/end_segments/c960521_seg68.npy,trainfiles/end_segments/ea980110seg45.npy,Dan_Ronan,trainfiles/end_segments/em971001seg42.npy
...,...,...,...,...,...
31659,Walter_Rodgers,trainfiles/start_segments/em970918seg63.npy,trainfiles/end_segments/em970918seg61.npy,David_Ensor,trainfiles/end_segments/ea980130seg12.npy
31660,Walter_Rodgers,trainfiles/start_segments/eo970825seg60.npy,trainfiles/end_segments/eo970825seg57.npy,Bill_Clinton,trainfiles/start_segments/j960607cseg32.npy
31661,Walter_Rodgers,trainfiles/start_segments/eo970825seg60.npy,trainfiles/end_segments/em970922seg64.npy,Cokie_Roberts,trainfiles/end_segments/a960607_seg102.npy
31662,Walter_Rodgers,trainfiles/start_segments/eo970825seg60.npy,trainfiles/end_segments/em970918seg61.npy,Eddie_Mair,trainfiles/start_segments/eh971030seg117.npy


## Make pairs

In [49]:
pairs = []
for _, anchor_speaker in enumerate(speakers):
    print(_)
    negative_speakers = [speaker for speaker in speakers if speaker != anchor_speaker]
    anchor_segments = end_df[end_df['speaker'] == anchor_speaker] # at end of segment
    positive_segments = start_df[start_df['speaker'] == anchor_speaker] # at start of segment
    anchor_indices = np.arange(len(anchor_segments))
    np.random.shuffle(anchor_indices)# shuffle the anchor segments
    for i in range(min(len(anchor_segments), 50)):
        anchor_segment = anchor_segments.iloc[anchor_indices[i]]
        positive_indices = np.arange(len(positive_segments))
        np.random.shuffle(positive_indices)# shuffle the anchor segments
        for j in range(min(len(positive_segments), 50)):
            # positive examples
            positive_segment = positive_segments.iloc[positive_indices[j]]
            if calculate_overlap(anchor_segment, positive_segment)>sr: # if overlap greater than 1s
                continue
            # swap 1st 2nd randomly
            pos_row = {'first_speaker': anchor_speaker, 'first_file': anchor_segment['filename'], 
                       'second_speaker': anchor_speaker, 'second_file': positive_segment['filename'],
                       'label': 0} # no speaker change
      
            # negative examples
            negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            while len(start_segments_by_speaker[negative_speaker]) == 0: #if we dropped someone totally
                negative_speaker = negative_speakers[np.random.randint(0, len(negative_speakers))]
            negative_segments = start_segments_by_speaker[negative_speaker] # segments for a specific random speaker
            negative_segment = negative_segments.iloc[np.random.randint(0, len(negative_segments))]
            neg_row = {'first_speaker': anchor_speaker, 'first_file': anchor_segment['filename'], 
               'second_speaker': negative_speaker, 'second_file': negative_segment['filename'],
               'label': 1} # has speaker change
            pairs.append(pos_row)
            pairs.append(neg_row)
pairs = pd.DataFrame(pairs)
pairs.to_csv('trainfiles/val-pairs.csv')
pairs

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181


Unnamed: 0,first_speaker,first_file,second_speaker,second_file,label
0,Aaron_Brown,trainfiles/end_segments/ea980110seg45.npy,Aaron_Brown,trainfiles/start_segments/ea980110seg0.npy,0
1,Aaron_Brown,trainfiles/end_segments/ea980110seg45.npy,Dave_Marash,trainfiles/start_segments/a960605_seg10.npy,1
2,Aaron_Brown,trainfiles/end_segments/ea980110seg45.npy,Aaron_Brown,trainfiles/start_segments/c960523_seg87.npy,0
3,Aaron_Brown,trainfiles/end_segments/ea980110seg45.npy,Kathleen_Kennedy,trainfiles/start_segments/h960514_seg153.npy,1
4,Aaron_Brown,trainfiles/end_segments/c960521_seg68.npy,Aaron_Brown,trainfiles/start_segments/c960523_seg87.npy,0
...,...,...,...,...,...
33433,Walter_Rodgers,trainfiles/end_segments/eo970825seg57.npy,Michael_Gillen,trainfiles/start_segments/a960604_seg71.npy,1
33434,Walter_Rodgers,trainfiles/end_segments/em970922seg64.npy,Walter_Rodgers,trainfiles/start_segments/em970918seg63.npy,0
33435,Walter_Rodgers,trainfiles/end_segments/em970922seg64.npy,Louise_Schiavone,trainfiles/start_segments/em970920seg31.npy,1
33436,Walter_Rodgers,trainfiles/end_segments/em970922seg64.npy,Walter_Rodgers,trainfiles/start_segments/eo970825seg60.npy,0
