In [4]:
import pandas as pd
import os
import json
import librosa


In [7]:
datasets = [
    'ast_afrikaans_english_scripts', 
    'ast_black_english_scripts',
    'coraal_scripts',
    'hisp_eng_scripts',
    'librispeechtrain_scripts',
    'maheshchandra_scripts',
    'wsjcam0_scripts'
]

In [17]:
dataset2dialect = {
    'ast_afrikaans_english_scripts': 'afrikaan', 
    'ast_black_english_scripts': 'xhosa_zulu_sotho',
    'coraal_scripts': 'africa_american',
    'hisp_eng_scripts': 'spanish_central_south_america',
    'librispeechtrain_scripts': 'american',
    'maheshchandra_scripts': 'indian',
    'wsjcam0_scripts': 'british'
}

In [22]:
dialect2id = {d: i for i, d in enumerate(sorted(list(dataset2dialect.values())))}

In [23]:
dialect2id

{'africa_american': 0,
 'afrikaan': 1,
 'american': 2,
 'british': 3,
 'indian': 4,
 'spanish_central_south_america': 5,
 'xhosa_zulu_sotho': 6}

In [59]:
with open('labels.json', 'r') as f:
    labels = json.load(f)

In [61]:
label2int = {l: i for i, l in enumerate(labels)}

In [63]:
label2int

{'_': 0,
 "'": 1,
 'A': 2,
 'B': 3,
 'C': 4,
 'D': 5,
 'E': 6,
 'F': 7,
 'G': 8,
 'H': 9,
 'I': 10,
 'J': 11,
 'K': 12,
 'L': 13,
 'M': 14,
 'N': 15,
 'O': 16,
 'P': 17,
 'Q': 18,
 'R': 19,
 'S': 20,
 'T': 21,
 'U': 22,
 'V': 23,
 'W': 24,
 'X': 25,
 'Y': 26,
 'Z': 27,
 ' ': 28}

In [177]:
mode = 'train'
max_duration = 100
dfs = []
for dataset in datasets:
    data_dir = dataset.replace('_scripts', '')
    df = pd.read_csv(f'{dataset}/trans_{mode}.csv', index_col=0)
    print(dataset, df.columns)
    df['file'] = [f'{data_dir}/{f}' for f in df['file']]
    
    if 'start' not in df.columns:
        assert 'end' not in df.columns
        df['start'] = 0.0
        df['end'] = [librosa.get_duration(filename=f"../{f}") for f in df.file]
    df['duration'] = df['end'] - df['start']
    dialect = dataset2dialect[dataset]
    df['dialect'] = dialect
    df['dialect_int'] = dialect2id[dialect]
    df['trans_length'] = [len(trn) for trn in df.trans ]
    df['trans_int'] = [ ' '.join([str(label2int[l]) for l in trn]) for trn in df.trans ]
    df = df[df.duration<max_duration]
    dfs.append(df)

ast_afrikaans_english_scripts Index(['file', 'trans', 'speaker', 'start', 'end'], dtype='object')
ast_black_english_scripts Index(['file', 'trans', 'speaker', 'start', 'end'], dtype='object')
coraal_scripts Index(['file', 'trans', 'speaker', 'start', 'end'], dtype='object')
hisp_eng_scripts Index(['file', 'trans', 'speaker'], dtype='object')
librispeechtrain_scripts Index(['file', 'trans', 'speaker'], dtype='object')
maheshchandra_scripts Index(['file', 'trans', 'speaker'], dtype='object')
wsjcam0_scripts Index(['file', 'trans', 'speaker'], dtype='object')


In [178]:
df_merged = pd.concat(dfs)

In [179]:
df_merged.sample(10)

Unnamed: 0,file,trans,speaker,start,end,duration,dialect,dialect_int,trans_length,trans_int
24322,librispeechtrain/1737/142396/1737-142396-0004....,YET THEY WENT THERE REGULARLY OF THEIR OWN ACC...,libri1737,0.0,11.34,11.34,american,2,208,26 6 21 28 21 9 6 26 28 24 6 15 21 28 21 9 6 1...
1003,librispeechtrain/6454/107462/6454-107462-0024....,THERE WAS ONE OF THIS NATURE WHICH TROUBLED HI...,libri6454,0.0,14.015,14.015,american,2,182,21 9 6 19 6 28 24 2 20 28 16 15 6 28 16 7 28 2...
1396,wsjcam0/data/primary_microphone/si_tr/c1x/c1xc...,ART COLLECTORS AND DEALERS HEAVED A SIGH OF RE...,c1x,0.0,3.84375,3.84375,british,3,61,2 19 21 28 4 16 13 13 6 4 21 16 19 20 28 2 15 ...
4762,wsjcam0/data/primary_microphone/si_tr/c0r/c0rc...,IN SOME SECTORS THOUGH INTRIGUING OPTIONS ARE ...,c0r,0.0,4.65625,4.65625,british,3,58,10 15 28 20 16 14 6 28 20 6 4 21 16 19 20 28 2...
1787,coraal/DCA_audio_2018.10.06/DCA_se3_ag1_m_04_1...,WE GET OUT BEFORE THE PUBLIC SCHOOLS DO,DCA_se3_ag1_m_04,2074.3484,2076.1932,1.8448,africa_american,0,39,24 6 28 8 6 21 28 16 22 21 28 3 6 7 16 19 6 28...
99,maheshchandra/wav/sentence154.wav,WHAT TIME IS IT,i2,0.0,1.741437,1.741437,indian,4,15,24 9 2 21 28 21 10 14 6 28 10 20 28 10 21
786,wsjcam0/data/primary_microphone/si_tr/c1z/c1zc...,THE FOLLOWING QUESTIONS CAN PROVIDE A STARTING...,c1z,0.0,8.65625,8.65625,british,3,118,21 9 6 28 7 16 13 13 16 24 10 15 8 28 18 22 6 ...
27856,librispeechtrain/6209/34600/6209-34600-0022.flac,WHAT A MAN FEELS A CHILD FEELS STILL MORE THE ...,libri6209,0.0,13.01,13.01,american,2,176,24 9 2 21 28 2 28 14 2 15 28 7 6 6 13 20 28 2 ...
28035,librispeechtrain/5514/19192/5514-19192-0038.flac,A VERY LIVELY AND UTTERLY UNEXPECTED FUSILLADE...,libri5514,0.0,14.84,14.84,american,2,223,2 28 23 6 19 26 28 13 10 23 6 13 26 28 2 15 5 ...
26733,librispeechtrain/1235/135883/1235-135883-0022....,THE SULTAN WITHOUT WAITING FOR SCHEHERAZADE TO...,libri1235,0.0,12.51,12.51,american,2,189,21 9 6 28 20 22 13 21 2 15 28 24 10 21 9 16 22...


In [180]:
df_merged[df_merged.duration<100].duration.max()

20.0

In [182]:
len(df_merged)

47978

In [183]:
df_merged.to_csv(f'trans_{mode}.csv')

In [136]:
from collections import Counter
cnt = Counter(df_merged.dialect)

In [137]:
cnt

Counter({'afrikaan': 388,
         'xhosa_zulu_sotho': 369,
         'africa_american': 1806,
         'spanish_central_south_america': 35,
         'american': 2924,
         'indian': 20,
         'british': 1134})

In [138]:
len(labels)

29

In [None]:
df

In [139]:
df.trans_length.mean()

91.1305114638448

In [140]:
df.trans_length.std()

36.95903942405144

In [141]:
df.trans_length.max()

205

In [146]:
dfs[-1].trans_length.max

<bound method Series.max of 0        53
1        81
2       155
3        24
4        48
       ... 
1129     75
1130     68
1131     52
1132     77
1133    117
Name: trans_length, Length: 1134, dtype: int64>

In [148]:
dfs[-1].duration.max()

14.21875