In [18]:
import sys
import os
import yaml
import pandas as pd
import numpy as np
import librosa
import torch
from easydict import EasyDict as edict
from os.path import join
from tqdm.auto import tqdm
# sys.path.insert(0, '../..')
# Change the working directory two levels up exactly once per kernel session:
# the `cd` flag stored in globals() keeps a re-run of this cell from climbing
# another two levels.
# NOTE(review): presumably '../..' is the project root, so that `utils` and the
# relative config/data paths used later resolve — confirm.
if 'cd' not in globals():
    cd = True
    os.chdir('../..')
from utils import extract_feature

In [8]:
# Load the experiment configuration into an attribute-accessible EasyDict.
config_path = 'config/base_ctc_dialect.yml'
# Open the file in a context manager so the handle is closed deterministically
# instead of being left for the garbage collector.
with open(config_path) as config_file:
    config = edict(yaml.load(config_file, Loader=yaml.SafeLoader))

In [9]:
# Display the parsed configuration for inspection.
config

{'use_gpu': True,
 'model': {'name': 'RNNCTC',
  'encoder': {'name': 'GRUBNEncoder',
   'bidirectional': True,
   'dropout': 0.5,
   'hidden_size': 64,
   'num_layers': 4},
  'decoder': {'name': 'CTCDecoder',
   'dropout': 0.0,
   'hidden_size': 128,
   'num_layers': 4},
  'checkpoint': None,
  'save_dir': 'checkpoints/base'},
 'train': {'dataset': 'DialectTranscriptionDataset',
  'data_dir': 'data/dialect_transcription',
  'meta_data': 'config/dialect_transcription/data/trans_train.csv',
  'batch_size': 32,
  'lr': 0.001,
  'lr_step': 5,
  'num_workers': 8,
  'grad_clip': 5.0,
  'optimizer': 'adam',
  'weight_decay': 0.0,
  'mom': 0.9,
  'end_epoch': 300,
  'print_freq': 20},
 'dev': {'dataset': 'DialectTranscriptionDataset',
  'data_dir': 'data/dialect_transcription',
  'meta_data': 'config/dialect_transcription/data/trans_dev.csv',
  'print_freq': 500},
 'test': {'dataset': 'DialectTranscriptionDataset',
  'data_dir': 'data/dialect_transcription',
  'meta_data': 'config/dialect_tran

In [35]:
# STFT front-end settings read by parse_audio.
# sample_rate is multiplied by the two window settings below to obtain sample counts.
sample_rate = 16000
# int(sample_rate * window_size) is used as both n_fft and win_length.
window_size = 0.02
# int(sample_rate * window_stride) is used as hop_length.
window_stride = 0.01
# Passed to librosa.stft as its `window` argument.
window = 'hamming'
def load_audio(path):
    """Load an audio clip as a mono waveform at the notebook's ``sample_rate``.

    Args:
        path: Either a file path (``str`` or ``os.PathLike``), in which case the
            whole file is loaded, or a ``(path, start, duration)`` tuple, in
            which case ``start`` and ``duration`` are forwarded to
            ``librosa.load`` as ``offset`` and ``duration``.

    Returns:
        The waveform array returned by ``librosa.load``. ``mono=True`` is
        requested explicitly, so librosa down-mixes multi-channel audio itself.

    Raises:
        TypeError: If ``path`` is neither a path nor a 3-tuple. Previously this
            case fell through to an ``UnboundLocalError`` on ``sound``.
    """
    if isinstance(path, (str, os.PathLike)):
        segment = {}
    elif isinstance(path, tuple) and len(path) == 3:
        path, start, duration = path
        segment = {'offset': start, 'duration': duration}
    else:
        raise TypeError(
            'load_audio expects a file path or a (path, start, duration) tuple, '
            f'got {path!r}'
        )
    # Resample to the module-level `sample_rate` so the waveform agrees with the
    # rate parse_audio uses to size its STFT window. The returned rate is
    # discarded rather than rebound, which would shadow the global.
    sound, _ = librosa.load(path, sr=sample_rate, mono=True, **segment)
    return sound

def parse_audio(audio_path):
    """Compute a normalized log-magnitude STFT spectrogram for an audio clip.

    The waveform is loaded with ``load_audio``, transformed with
    ``librosa.stft`` using the module-level ``sample_rate``, ``window_size``,
    ``window_stride`` and ``window`` settings, compressed with ``log1p``, and
    standardized using the mean and standard deviation of the whole spectrogram.

    Args:
        audio_path: Any value accepted by ``load_audio`` (a file path or a
            ``(path, start, duration)`` tuple).

    Returns:
        A float32 ``numpy.ndarray`` holding the transpose of the STFT magnitude
        matrix. ``librosa.stft`` documents its output as (frequency bins,
        frames), so the result is laid out as (frames, frequency bins).
    """
    y = load_audio(audio_path)

    n_fft = int(sample_rate * window_size)
    hop_length = int(sample_rate * window_stride)
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=n_fft, window=window)
    # Only the magnitude is needed, so take |D| directly rather than also
    # computing the (unused) phase via librosa.magphase.
    spect = torch.FloatTensor(np.log1p(np.abs(D)))

    mean = spect.mean()
    std = spect.std()
    spect.add_(-mean)
    # A silent or constant segment has zero spread; dividing by it would fill
    # the feature with NaN/inf, so leave the (already centered) values as-is.
    if std > 0:
        spect.div_(std)

    return spect.numpy().T

In [36]:
# Output directory for the extracted features: the extraction loop joins each
# save name onto this path before calling np.save.
save_dir = 'data/dialect_transcription/stft'

In [43]:
# Extract a spectrogram for every row of the train/dev/test metadata CSVs and
# save each one as an array file under `save_dir`.
names = set()  # save names written so far, used to detect filename collisions
# Create the directory np.save writes into, so that the directory that is
# created and the directory that is written to cannot disagree.
os.makedirs(save_dir, exist_ok=True)
for mode in ['train', 'dev', 'test']:
    print('mode', mode)
    df = pd.read_csv(config[mode].meta_data)
    for i, sample in tqdm(df.iterrows(), total=len(df)):
        if i % 5000 == 0:
            print(f'{i}th sample')
        wave = sample['file']
        start_time, duration_time = sample.start, sample.duration
        input_file = join(config[mode].data_dir, wave)
        feature = parse_audio((input_file, start_time, duration_time))
        # Flatten the relative path and append the segment bounds so that each
        # (file, start, duration) row maps to its own output file.
        save_name = f"{wave.replace('/', '-')}-{start_time:.2f}-{duration_time:.2f}"
        # Fail loudly instead of silently overwriting an earlier feature file
        # (two rows can collide once the bounds are rounded to 2 decimals).
        if save_name in names:
            raise ValueError(f'duplicate feature file name: {save_name}')
        names.add(save_name)
        np.save(join(save_dir, save_name), feature)


mode train


HBox(children=(IntProgress(value=0, max=47978), HTML(value='')))

0th sample
5000th sample
10000th sample
15000th sample
20000th sample
25000th sample
30000th sample
35000th sample
40000th sample
45000th sample
mode dev


HBox(children=(IntProgress(value=0, max=6528), HTML(value='')))

0th sample
5000th sample
mode test


HBox(children=(IntProgress(value=0, max=6676), HTML(value='')))

0th sample
5000th sample


In [None]:
# One plus the floor of half the analysis window length in samples.
# NOTE(review): presumably the per-frame feature dimension produced by
# parse_audio — confirm.
import math
int(1 + math.floor(sample_rate * window_size / 2))