In [18]:
import io
from pathlib import Path
import numpy as np
import pandas as pd
import soundfile as sf
import yaml
import tensorboard as tb
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

from IPython.display import Audio, HTML
from IPython.core.display import display

# Sample rate passed to the IPython audio player when rendering a clip.
SAMPLE_RATE = 22050
# Directory scanned for experiment configuration files (entries with a ".yaml" suffix).
CONFIGS_PATH = Path("configs")
# Root directory under which each run's logs are looked up by its `checkpoint_name`.
LOGS_PATH = Path("logs")
# Number of validation audio tags ("Audio/Val/0" .. "Audio/Val/<N-1>") read per run.
LOGGED_AUDIOS_NUM = 10

In [19]:
def beep(data, rate=None):
    """Render an inline audio player for ``data`` in the notebook output.

    Args:
        data: Audio samples in any form accepted by ``IPython.display.Audio``.
        rate: Sample rate handed to the player. When omitted, the
            notebook-wide ``SAMPLE_RATE`` is used, so existing ``beep(data)``
            calls behave exactly as before.
    """
    # Resolve the default at call time rather than at definition time so that
    # re-running the config cell with a new SAMPLE_RATE takes effect here.
    if rate is None:
        rate = SAMPLE_RATE
    display(Audio(data, rate=rate))

In [20]:
# The flattened configs are very wide; show every column instead of eliding them.
pd.set_option('display.max_columns', None)

# iterdir() yields entries in arbitrary order; sorting keeps the report order
# stable across re-runs.
for config_path in sorted(CONFIGS_PATH.iterdir()):
    if config_path.suffix != ".yaml":
        continue

    with open(config_path, "r", encoding="utf-8") as stream:
        try:
            # Flatten the nested YAML mapping into a one-row table with
            # dotted column names.
            config = pd.json_normalize(yaml.safe_load(stream))
        except yaml.YAMLError as exc:
            print(exc)
            # Skip this file: otherwise `config` would be undefined (first
            # file) or still hold the previous file's values.
            continue

    if "checkpoint_name" not in config:
        print(f"{config_path}: no 'checkpoint_name' key, skipping")
        continue

    # Logs for a run are looked up in LOGS_PATH/<checkpoint_name>.
    chp_logs_path = LOGS_PATH / f"{config['checkpoint_name'].values[0]}"
    if not chp_logs_path.is_dir():
        # Config without logs: nothing to report.
        continue

    display(config)
    ea = EventAccumulator(chp_logs_path)
    ea.Reload()

    # Only request tags that were actually logged; ea.Audio() raises KeyError
    # for an unknown tag, which would abort the whole report.
    logged_audio_tags = set(ea.Tags()["audio"])
    for audio_idx in range(LOGGED_AUDIOS_NUM):
        tag = f"Audio/Val/{audio_idx}"
        if tag not in logged_audio_tags:
            print(f"{chp_logs_path}: tag '{tag}' was not logged, skipping")
            continue

        audio_events = ea.Audio(tag)
        if not audio_events:
            continue

        # Play the most recently logged clip for this tag.
        data, _ = sf.read(io.BytesIO(audio_events[-1].encoded_audio_string))
        beep(data)

Unnamed: 0,sample_rate,hop_size,f_min,f_max,win_size,n_fft,n_mels,checkpoint_name,seed,batch_size,grad_clip_thresh,log_steps,iters_per_checkpoint,epochs,test_size,device,hifi.dir_path,hifi.model_name,hifi.config_name,loss.mels_weight,loss.duration_weight,optimizer.learning_rate,optimizer.adam_beta1,optimizer.adam_beta2,optimizer.adam_epsilon,optimizer.reg_weight,scheduler.start_decay,scheduler.decay_steps,scheduler.decay_rate,scheduler.last_epoch,data.text_dir,data.mels_dir,model.n_frames_per_step,model.encoder_config.n_convolutions,model.encoder_config.kernel_size,model.encoder_config.conv_channel,model.encoder_config.lstm_layers,model.encoder_config.lstm_hidden,model.encoder_config.dropout,model.gst_config.ref_enc_filters,model.gst_config.emb_dim,model.gst_config.num_heads,model.gst_config.token_num,model.attention_config.duration_config.lstm_layers,model.attention_config.duration_config.lstm_hidden,model.attention_config.duration_config.dropout,model.attention_config.range_config.lstm_layers,model.attention_config.range_config.lstm_hidden,model.attention_config.range_config.dropout,model.attention_config.eps,model.attention_config.positional_dim,model.attention_config.teacher_forcing_ratio,model.attention_config.attention_dropout,model.attention_config.positional_dropout,model.decoder_config.prenet_layers,model.decoder_config.prenet_dropout,model.decoder_config.decoder_rnn_dim,model.decoder_config.decoder_num_layers,model.decoder_config.teacher_forcing_ratio,model.decoder_config.dropout,model.postnet_config.embedding_dim,model.postnet_config.n_convolutions,model.postnet_config.kernel_size,model.postnet_config.dropout,model.mask_padding,model.phonem_embedding_dim,model.speaker_embedding_dim
0,22050,256,0,8000,1024,1024,80,tacotron_vctk_default_20_dur,42,16,1.0,1000,30000,2500,0.2,cuda,hifi,generator_v1,config.json,1.0,2.0,0.001,0.9,0.999,1e-06,1e-06,4000,50000,0.5,400000,data/vctk/mfa_outputs,data/vctk/mels,1,3,5,512,1,256,0.1,"[32, 32, 64, 64, 128, 128]",256,8,10,2,256,0.5,2,256,0.5,1e-06,32,1.0,0.1,0.0,"[256, 256]",0.5,512,3,1.0,0.1,512,5,5,0.1,True,512,256


Unnamed: 0,sample_rate,hop_size,f_min,f_max,win_size,n_fft,n_mels,checkpoint_name,seed,batch_size,grad_clip_thresh,log_steps,iters_per_checkpoint,epochs,test_size,device,hifi.dir_path,hifi.model_name,hifi.config_name,loss.mels_weight,loss.duration_weight,optimizer.learning_rate,optimizer.adam_beta1,optimizer.adam_beta2,optimizer.adam_epsilon,optimizer.reg_weight,scheduler.start_decay,scheduler.decay_steps,scheduler.decay_rate,scheduler.last_epoch,data.text_dir,data.mels_dir,model.n_frames_per_step,model.encoder_config.n_convolutions,model.encoder_config.kernel_size,model.encoder_config.conv_channel,model.encoder_config.lstm_layers,model.encoder_config.lstm_hidden,model.encoder_config.dropout,model.gst_config.ref_enc_filters,model.gst_config.emb_dim,model.gst_config.num_heads,model.gst_config.token_num,model.attention_config.duration_config.lstm_layers,model.attention_config.duration_config.lstm_hidden,model.attention_config.duration_config.dropout,model.attention_config.range_config.lstm_layers,model.attention_config.range_config.lstm_hidden,model.attention_config.range_config.dropout,model.attention_config.eps,model.attention_config.positional_dim,model.attention_config.teacher_forcing_ratio,model.attention_config.attention_dropout,model.attention_config.positional_dropout,model.decoder_config.prenet_layers,model.decoder_config.prenet_dropout,model.decoder_config.decoder_rnn_dim,model.decoder_config.decoder_num_layers,model.decoder_config.teacher_forcing_ratio,model.decoder_config.dropout,model.postnet_config.embedding_dim,model.postnet_config.n_convolutions,model.postnet_config.kernel_size,model.postnet_config.dropout,model.mask_padding,model.phonem_embedding_dim,model.speaker_embedding_dim
0,22050,256,0,8000,1024,1024,80,tacotron_vctk_default_20_dur_3_frames_gst,42,16,1.0,100,15000,2500,0.2,cuda:1,hifi,generator_v1,config.json,1.0,2.0,0.001,0.9,0.999,1e-06,1e-06,4000,50000,0.5,400000,data/vctk/mfa_outputs,data/vctk/mels,3,3,5,512,1,256,0.1,"[32, 32, 64, 64, 128, 128]",256,6,10,2,256,0.5,2,256,0.5,1e-06,32,1.0,0.1,0.0,"[256, 256]",0.5,512,3,1.0,0.1,512,5,5,0.1,True,512,256


Unnamed: 0,sample_rate,hop_size,f_min,f_max,win_size,n_fft,n_mels,checkpoint_name,seed,batch_size,grad_clip_thresh,log_steps,iters_per_checkpoint,epochs,test_size,device,hifi.dir_path,hifi.model_name,hifi.config_name,loss.mels_weight,loss.duration_weight,optimizer.learning_rate,optimizer.adam_beta1,optimizer.adam_beta2,optimizer.adam_epsilon,optimizer.reg_weight,scheduler.start_decay,scheduler.decay_steps,scheduler.decay_rate,scheduler.last_epoch,data.text_dir,data.mels_dir,model.n_frames_per_step,model.encoder_config.n_convolutions,model.encoder_config.kernel_size,model.encoder_config.conv_channel,model.encoder_config.lstm_layers,model.encoder_config.lstm_hidden,model.encoder_config.dropout,model.gst_config.ref_enc_filters,model.gst_config.emb_dim,model.gst_config.num_heads,model.gst_config.token_num,model.attention_config.duration_config.lstm_layers,model.attention_config.duration_config.lstm_hidden,model.attention_config.duration_config.dropout,model.attention_config.range_config.lstm_layers,model.attention_config.range_config.lstm_hidden,model.attention_config.range_config.dropout,model.attention_config.eps,model.attention_config.positional_dim,model.attention_config.teacher_forcing_ratio,model.attention_config.attention_dropout,model.attention_config.positional_dropout,model.decoder_config.prenet_layers,model.decoder_config.prenet_dropout,model.decoder_config.decoder_rnn_dim,model.decoder_config.decoder_num_layers,model.decoder_config.teacher_forcing_ratio,model.decoder_config.dropout,model.postnet_config.embedding_dim,model.postnet_config.n_convolutions,model.postnet_config.kernel_size,model.postnet_config.dropout,model.mask_padding,model.phonem_embedding_dim,model.speaker_embedding_dim
0,22050,256,0,8000,1024,1024,80,tacotron_vctk_default_20_dur_1_frames_6_head_gst,42,16,1.0,1000,30000,2500,0.2,cuda:1,hifi,generator_v1,config.json,1.0,2.0,0.001,0.9,0.999,1e-06,1e-06,4000,50000,0.5,400000,data/full/mfa_outputs,data/full/mels,1,3,5,512,1,256,0.1,"[32, 32, 64, 64, 128, 128]",256,8,10,2,256,0.5,2,256,0.5,1e-06,32,1.0,0.1,0.0,"[256, 256]",0.5,512,3,1.0,0.1,512,5,5,0.1,True,512,128


Unnamed: 0,sample_rate,hop_size,f_min,f_max,win_size,n_fft,n_mels,checkpoint_name,seed,batch_size,grad_clip_thresh,log_steps,iters_per_checkpoint,epochs,test_size,device,hifi.dir_path,hifi.model_name,hifi.config_name,loss.mels_weight,loss.duration_weight,optimizer.learning_rate,optimizer.adam_beta1,optimizer.adam_beta2,optimizer.adam_epsilon,optimizer.reg_weight,scheduler.start_decay,scheduler.decay_steps,scheduler.decay_rate,scheduler.last_epoch,data.text_dir,data.mels_dir,model.n_frames_per_step,model.encoder_config.n_convolutions,model.encoder_config.kernel_size,model.encoder_config.conv_channel,model.encoder_config.lstm_layers,model.encoder_config.lstm_hidden,model.encoder_config.dropout,model.gst_config.ref_enc_filters,model.gst_config.emb_dim,model.gst_config.num_heads,model.gst_config.token_num,model.attention_config.duration_config.lstm_layers,model.attention_config.duration_config.lstm_hidden,model.attention_config.duration_config.dropout,model.attention_config.range_config.lstm_layers,model.attention_config.range_config.lstm_hidden,model.attention_config.range_config.dropout,model.attention_config.eps,model.attention_config.positional_dim,model.attention_config.teacher_forcing_ratio,model.attention_config.attention_dropout,model.attention_config.positional_dropout,model.decoder_config.prenet_layers,model.decoder_config.prenet_dropout,model.decoder_config.decoder_rnn_dim,model.decoder_config.decoder_num_layers,model.decoder_config.teacher_forcing_ratio,model.decoder_config.dropout,model.postnet_config.embedding_dim,model.postnet_config.n_convolutions,model.postnet_config.kernel_size,model.postnet_config.dropout,model.mask_padding,model.phonem_embedding_dim,model.speaker_embedding_dim
0,22050,256,0,8000,1024,1024,80,tacotron_vctk_default,42,16,1.0,1000,30000,2500,0.2,cuda,hifi,generator_v1,config.json,1.0,2.0,0.001,0.9,0.999,1e-06,1e-06,4000,50000,0.5,400000,data/vctk/mfa_outputs,data/vctk/mels,1,3,5,512,1,256,0.1,"[32, 32, 64, 64, 128, 128]",256,8,10,2,256,0.5,2,256,0.5,1e-06,32,1.0,0.1,0.0,"[256, 256]",0.5,512,3,1.0,0.1,512,5,5,0.1,True,512,256


Unnamed: 0,sample_rate,hop_size,f_min,f_max,win_size,n_fft,n_mels,checkpoint_name,seed,batch_size,grad_clip_thresh,log_steps,iters_per_checkpoint,epochs,test_size,device,hifi.dir_path,hifi.model_name,hifi.config_name,loss.mels_weight,loss.duration_weight,optimizer.learning_rate,optimizer.adam_beta1,optimizer.adam_beta2,optimizer.adam_epsilon,optimizer.reg_weight,scheduler.start_decay,scheduler.decay_steps,scheduler.decay_rate,scheduler.last_epoch,data.text_dir,data.mels_dir,model.n_frames_per_step,model.encoder_config.n_convolutions,model.encoder_config.kernel_size,model.encoder_config.conv_channel,model.encoder_config.lstm_layers,model.encoder_config.lstm_hidden,model.encoder_config.dropout,model.gst_config.ref_enc_filters,model.gst_config.emb_dim,model.gst_config.num_heads,model.gst_config.token_num,model.attention_config.duration_config.lstm_layers,model.attention_config.duration_config.lstm_hidden,model.attention_config.duration_config.dropout,model.attention_config.range_config.lstm_layers,model.attention_config.range_config.lstm_hidden,model.attention_config.range_config.dropout,model.attention_config.eps,model.attention_config.positional_dim,model.attention_config.teacher_forcing_ratio,model.attention_config.attention_dropout,model.attention_config.positional_dropout,model.decoder_config.prenet_layers,model.decoder_config.prenet_dropout,model.decoder_config.decoder_rnn_dim,model.decoder_config.decoder_num_layers,model.decoder_config.teacher_forcing_ratio,model.decoder_config.dropout,model.postnet_config.embedding_dim,model.postnet_config.n_convolutions,model.postnet_config.kernel_size,model.postnet_config.dropout,model.mask_padding,model.phonem_embedding_dim,model.speaker_embedding_dim
0,22050,256,0,8000,1024,1024,80,tacotron_vctk_default_20_dur_3_frames_6_head_gst,42,16,1.0,1000,30000,2500,0.2,cuda:0,hifi,generator_v1,config.json,1.0,2.0,0.001,0.9,0.999,1e-06,1e-06,4000,50000,0.5,400000,data/full/mfa_outputs,data/full/mels,3,3,5,512,1,256,0.1,"[32, 32, 64, 64, 128, 128]",256,8,10,2,256,0.5,2,256,0.5,1e-06,32,1.0,0.1,0.0,"[256, 256]",0.5,512,3,1.0,0.1,512,5,5,0.1,True,512,128


In [None]:
# To export this notebook to HTML with the code cells hidden, uncomment and run:
#!jupyter nbconvert report_for_logs.ipynb --no-input --to html --output report4logs