In [1]:
from IPython.display import Audio
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import time
import glob
from lxml.html import parse
from sphfile import SPHFile
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
from librosa.core import resample
import sys
from tqdm.notebook import tqdm
np.random.seed(12345)
new_sr = 8000

## Pair sound files and annotation
# Check if the data contains spanish characters

In [2]:
soundfiles = glob.glob('files/**/*.sph', recursive = True)
#annofiles = glob.glob('./**/*.txt', recursive = True) + glob.glob('./**/*.sgml', recursive = True)
data = {}
for soundfile in soundfiles:
    name = soundfile.split('/')[-1].split('.')[0]
    annofile = list(glob.glob('files/**/'+name+'.txt', recursive = True))+list(glob.glob('files/**/'+name+'.sgml', recursive = True))
    annofile = annofile[0]
    data[soundfile] = annofile
    with open(annofile) as file:
        try:
            soup = BeautifulSoup(file,'html.parser')
        except:
            print(annofile)
start_load = time.time()
soundfiles = list(data.keys())
len(soundfiles)

298

In [5]:
def getstart(segment):
    return float(segment['s_time']) if segment.has_attr('s_time') else float(segment['starttime'])
def getend(segment):
    return float(segment['e_time']) if segment.has_attr('e_time') else float(segment['endtime'])

def cut_segments(i):
    df = []
    soundfile = soundfiles[i]
    name = soundfile.split('/')[-1].split('.')[0] # name of soundfile
    sph = SPHFile(soundfile)
    sound, sr = sph.content, sph.format['sample_rate']
    annofile = data[soundfile]
    with open(annofile) as file:
        soup = BeautifulSoup(file,'html.parser')
    tags = soup.find_all(['segment', 'turn'])
    for j, segment in enumerate(tags):
        #text = segment.content
        end = getend(segment)
        start = getstart(segment)
        speaker = segment['speaker'].lower()
        if speaker[:4] in ['male', 'fema', 'spkr']:
            speaker = name+'_'+speaker
        if speaker.lower().find('announcer') != -1:
            #print(speaker)
            continue
        if speaker in ['01janedoe', '01johndoe', '02janedoe', '02johndoe', '04johndoe', 'anchor1', 'unison']:
            continue
        if end > len(sound)/sr: # if end exceeds length, skip
            continue
        if start >= end-2: # if segment too short
            continue
            
        k = 0
        while start<end-2:
            start_idx = int(start*sr)
            end_idx = start_idx+2*sr
            segment = np.array(sound[start_idx:end_idx], dtype = np.float32)/(2**15)
            segment = resample(segment, sr, new_sr)

            filename = 'files/segments/'+name+'_turn'+str(j)+'_seg'+str(k)+'.npy'
            row = {'speaker':speaker, 'segfile':filename}

            np.save(filename, segment)
            df.append(row)
            k+=1
            start+=2
    df = pd.DataFrame(df)
    df.to_csv('files/csv/'+name+'.csv', index = False)

# Save Segments

In [6]:
from multiprocessing import Pool
pool = Pool(processes=15)   
pool.map(cut_segments, range(0, len(soundfiles)))
print('Done!')

Done!


In [56]:
csv_list = list(glob.glob('files/csv/*'))
csv_list.sort()
df = pd.DataFrame()
for csv in csv_list:
    try:
        csv = pd.read_csv(csv)
    except:
        print(csv, 'is corrupted')
        continue
    df = df.append(csv, ignore_index = True)
df = df.sample(frac = 1, random_state = 12345)

files/csv/ep970818.csv is corrupted
files/csv/ep970820.csv is corrupted


In [58]:
hundredth = len(df)//100
df[:90*hundredth].to_csv('files/train-segments.csv', index = False)
df[90*hundredth:95*hundredth].to_csv('files/val-segments.csv', index = False)
df[95*hundredth:].to_csv('files/test-segments.csv', index = False)
