## Download LJSpeech

In [1]:
%pwd

'/home/jupyter/work/resources'

In [3]:
%ls

aligner.ipynb  [0m[01;34mdata[0m/     hw3.ipynb  [01;34mwaveglow[0m/
[01;34malignments[0m/    [01;34mdla_tts[0m/  [01;34mmodels[0m/    waveglow_256channels_universal_v5.pt


In [4]:
# !wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2

In [2]:
#!tar -xjf LJSpeech-1.1.tar.bz2

Unknown instance spec: tar

In [5]:
%pip install torch

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [6]:
%pip install librosa

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [8]:
%pip install torch==1.10.0+cu111 torchaudio==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html

Defaulting to user installation because normal site-packages is not writeable
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.10.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (2137.6 MB)
[K     |████████████████████████████████| 2137.6 MB 120 bytes/s 
[?25hCollecting torchaudio==0.10.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torchaudio-0.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 40.4 MB/s 
Installing collected packages: torch, torchaudio
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.7.0 requires torch==1.6.0, but you have torch 1.10.0+cu111 which is incompatible.
mmdet 2.3.0rc0+c6b5ca2 requires Pillow<=6.2.2, but you have pillow 8.4.0 which is incompatible.
mm

## Featurizer

In [83]:
from IPython import display
from dataclasses import dataclass

import torch
from torch import nn

import torchaudio

import librosa
from matplotlib import pyplot as plt


@dataclass
class MelSpectrogramConfig:
    sr: int = 22050
    win_length: int = 1024
    hop_length: int = 256
    n_fft: int = 1024
    f_min: int = 0
    f_max: int = 8000
    n_mels: int = 80
    power: float = 1.0

    # value of melspectrograms if we fed a silence into `MelSpectrogram`
    pad_value: float = -11.5129251


class MelSpectrogram(nn.Module):

    def __init__(self, config: MelSpectrogramConfig):
        super(MelSpectrogram, self).__init__()

        self.config = config

        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=config.sr,
            win_length=config.win_length,
            hop_length=config.hop_length,
            n_fft=config.n_fft,
            f_min=config.f_min,
            f_max=config.f_max,
            n_mels=config.n_mels
        )

        # The is no way to set power in constructor in 0.5.0 version.
        self.mel_spectrogram.spectrogram.power = config.power

        # Default `torchaudio` mel basis uses HTK formula. In order to be compatible with WaveGlow
        # we decided to use Slaney one instead (as well as `librosa` does by default).
        mel_basis = librosa.filters.mel(
            sr=config.sr,
            n_fft=config.n_fft,
            n_mels=config.n_mels,
            fmin=config.f_min,
            fmax=config.f_max
        ).T
        self.mel_spectrogram.mel_scale.fb.copy_(torch.tensor(mel_basis))

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        """
        :param audio: Expected shape is [B, T]
        :return: Shape is [B, n_mels, T']
        """

        mel = self.mel_spectrogram(audio) \
            .clamp_(min=1e-5) \
            .log_()

        return mel

OSError: /home/jupyter/.local/lib/python3.7/site-packages/torch/lib/libtorch_global_deps.so: cannot open shared object file: No such file or directory

In [None]:
featurizer = MelSpectrogram(MelSpectrogramConfig())

---

## Dataset

In [None]:
class LJSpeechDataset(torchaudio.datasets.LJSPEECH):

    def __init__(self, root):
        super().__init__(root=root)
        self._tokenizer = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.get_text_processor()

    def __getitem__(self, index: int):
        waveform, _, _, transcript = super().__getitem__(index)
        waveforn_length = torch.tensor([waveform.shape[-1]]).int()
        
        tokens, token_lengths = self._tokenizer(transcript)
        
        return waveform, waveforn_length, transcript, tokens, token_lengths
    
    def decode(self, tokens, lengths):
        result = []
        for tokens_, length in zip(tokens, lengths):
            text = "".join([
                self._tokenizer.tokens[token]
                for token in tokens_[:length]
            ])
            result.append(text)
        return result
                

In [None]:
dataset = LJSpeechDataset('.')

In [None]:
dataset[0]

In [None]:
from typing import Tuple, Dict, Optional, List, Union
from itertools import islice

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


@dataclass
class Batch:
    waveform: torch.Tensor
    waveforn_length: torch.Tensor
    transcript: List[str]
    tokens: torch.Tensor
    token_lengths: torch.Tensor
    durations: Optional[torch.Tensor] = None
        
    def to(self, device: torch.device) -> 'Batch':
        raise NotImplementedError


class LJSpeechCollator:

    def __call__(self, instances: List[Tuple]) -> Dict:
        waveform, waveforn_length, transcript, tokens, token_lengths = list(
            zip(*instances)
        )

        waveform = pad_sequence([
            waveform_[0] for waveform_ in waveform
        ]).transpose(0, 1)
        waveforn_length = torch.cat(waveforn_length)

        tokens = pad_sequence([
            tokens_[0] for tokens_ in tokens
        ]).transpose(0, 1)
        token_lengths = torch.cat(token_lengths)

        return Batch(waveform, waveforn_length, transcript, tokens, token_lengths)

In [None]:
dataloader = DataLoader(LJSpeechDataset('.'), batch_size=3, collate_fn=LJSpeechCollator())

In [None]:
dummy_batch = list(islice(dataloader, 1))[0]
dummy_batch

---

## Vocoder

In [18]:
# !git clone https://github.com/NVIDIA/waveglow.git
# %pip install googledrivedownloader

In [19]:
from google_drive_downloader import GoogleDriveDownloader as gdd

In [20]:
gdd.download_file_from_google_drive(
    file_id='1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF',
    dest_path='./waveglow_256channels_universal_v5.pt'
)

Downloading 1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF into ./waveglow_256channels_universal_v5.pt... Done.


In [21]:
import warnings
import sys
sys.path.append('waveglow/')

warnings.filterwarnings('ignore')


class Vocoder(nn.Module):

    def __init__(self):
        super(Vocoder, self).__init__()

        model = torch.load('waveglow_256channels_universal_v5.pt', map_location='cpu')[
            'model']
        self.net = model.remove_weightnorm(model)

    @torch.no_grad()
    def inference(self, spect: torch.Tensor):
        spect = self.net.upsample(spect)

        # trim the conv artifacts
        time_cutoff = self.net.upsample.kernel_size[0] - \
            self.net.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.net.n_group, self.net.n_group) \
            .permute(0, 2, 1, 3) \
            .contiguous() \
            .flatten(start_dim=2) \
            .transpose(-1, -2)

        # generate prior
        audio = torch.randn(spect.size(0), self.net.n_remaining_channels, spect.size(-1)) \
            .to(spect.device)

        for k in reversed(range(self.net.n_flows)):
            n_half = int(audio.size(1) / 2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = self.net.WN[k]((audio_0, spect))

            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b) / torch.exp(s)
            audio = torch.cat([audio_0, audio_1], 1)

            audio = self.net.convinv[k](audio, reverse=True)

            if k % self.net.n_early_every == 0 and k > 0:
                z = torch.randn(
                    spect.size(0), self.net.n_early_size, spect.size(2),
                    device=spect.device
                )
                audio = torch.cat((z, audio), 1)

        audio = audio.permute(0, 2, 1) \
            .contiguous() \
            .view(audio.size(0), -1)

        return audio

In [23]:
# vocoder = Vocoder().to('cuda').eval()

In [None]:
waveform = dummy_batch.waveform[:1]
mels = featurizer(waveform).cuda()

In [None]:
plt.imshow(mels[0].cpu())

In [None]:
reconstructed_wav = vocoder.inference(mels).cpu()

In [None]:
plt.plot(reconstructed_wav.squeeze(), label='reconstructed', alpha=.5)
plt.plot(waveform.squeeze(), label='GT', alpha=.5)
plt.grid()
plt.legend()
plt.show()

In [None]:
display.display(display.Audio(reconstructed_wav, rate=22050))
display.display(display.Audio(waveform, rate=22050))

---

## Grapheme Aligner

In [None]:
@dataclass
class Point:
    token_index: int
    time_index: int
    score: float


@dataclass
class Segment:
    label: str
    start: int
    end: int
    score: float

    def __repr__(self):
        return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"

    @property
    def length(self):
        return self.end - self.start


class GraphemeAligner(nn.Module):

    def __init__(self):
        super().__init__()

        self._wav2vec2 = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H.get_model()
        self._labels = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H.get_labels()
        self._char2index = {c: i for i, c in enumerate(self._labels)}
        self._unk_index = self._char2index['<unk>']
        self._resampler = torchaudio.transforms.Resample(
            orig_freq=MelSpectrogramConfig.sr, new_freq=16_000
        )

    def _decode_text(self, text):
        text = text.replace(' ', '|').upper()
        return torch.tensor([
            self._char2index.get(char, self._unk_index)
            for char in text
        ]).long()

    @torch.no_grad()
    def forward(
        self,
        wavs: torch.Tensor,
        wav_lengths: torch.Tensor,
        texts: Union[str, List[str]]
    ):
        if isinstance(texts, str):
            texts = [texts]
        batch_size = wavs.shape[0]

        durations = []
        for index in range(batch_size):
            current_wav = wavs[index, :wav_lengths[index]].unsqueeze(dim=0)
            current_wav = self._resampler(current_wav)
            emission, _ = self._wav2vec2(current_wav)
            emission = emission.log_softmax(dim=-1).squeeze(dim=0).cpu()

            tokens = self._decode_text(texts[index])

            trellis = self._get_trellis(emission, tokens)
            path = self._backtrack(trellis, emission, tokens)
            segments = self._merge_repeats(texts[index], path)

            num_frames = emission.shape[0]
            relative_durations = torch.tensor([
                segment.length / num_frames for segment in segments
            ])

            durations.append(relative_durations)
            
        durations = pad_sequence(durations).transpose(0, 1)
        return durations

    def _get_trellis(self, emission, tokens, blank_id=0):
        num_frame = emission.size(0)
        num_tokens = len(tokens)

        # Trellis has extra dimension for both time axis and tokens.
        # The extra dim for tokens represents <SoS> (start-of-sentence)
        # The extra dim for time axis is for simplification of the code.
        trellis = torch.full((num_frame + 1, num_tokens + 1), -float('inf'))
        trellis[:, 0] = 0
        for t in range(num_frame):
            trellis[t + 1, 1:] = torch.maximum(
                # Score for staying at the same token
                trellis[t, 1:] + emission[t, blank_id],

                # Score for changing to the next token
                trellis[t, :-1] + emission[t, tokens],
            )
        return trellis

    def _backtrack(self, trellis, emission, tokens, blank_id=0):
        # Note:
        # j and t are indices for trellis, which has extra dimensions
        # for time and tokens at the beginning.
        # When refering to time frame index `T` in trellis,
        # the corresponding index in emission is `T-1`.
        # Similarly, when refering to token index `J` in trellis,
        # the corresponding index in transcript is `J-1`.
        j = trellis.size(1) - 1
        t_start = torch.argmax(trellis[:, j]).item()

        path = []
        for t in range(t_start, 0, -1):
            # 1. Figure out if the current position was stay or change
            # Note (again):
            # `emission[J-1]` is the emission at time frame `J` of trellis dimension.
            # Score for token staying the same from time frame J-1 to T.
            stayed = trellis[t - 1, j] + emission[t - 1, blank_id]
            # Score for token changing from C-1 at T-1 to J at T.
            changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]

            # 2. Store the path with frame-wise probability.
            prob = emission[t - 1, tokens[j - 1]
                            if changed > stayed else 0].exp().item()
            # Return token index and time index in non-trellis coordinate.
            path.append(Point(j - 1, t - 1, prob))

            # 3. Update the token
            if changed > stayed:
                j -= 1
                if j == 0:
                    break

        else:
            raise ValueError('Failed to align')

        return path[::-1]

    def _merge_repeats(self, text, path):
        i1, i2 = 0, 0
        segments = []
        while i1 < len(path):
            while i2 < len(path) and path[i1].token_index == path[i2].token_index:
                i2 += 1
            score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
            segments.append(
                Segment(
                    text[path[i1].token_index],
                    path[i1].time_index,
                    path[i2 - 1].time_index + 1,
                    score
                )
            )
            i1 = i2

        return segments

    @staticmethod
    def plot_trellis_with_path(trellis, path):
        # to plot trellis with path, we take advantage of 'nan' value
        trellis_with_path = trellis.clone()
        for i, p in enumerate(path):
            trellis_with_path[p.time_index, p.token_index] = float('nan')
        plt.imshow(trellis_with_path[1:, 1:].T, origin='lower')

In [None]:
device = torch.device('cuda:0')
aligner = GraphemeAligner().to(device)

In [None]:
dummy_batch

In [None]:
dummy_batch.durations = aligner(
    dummy_batch.waveform.to(device), dummy_batch.waveforn_length, dummy_batch.transcript
)

In [None]:
dummy_batch

## Visualize

In [None]:
index = 0

waveform = dummy_batch.waveform[index][:dummy_batch.waveforn_length[index]]
durations = dummy_batch.durations[index][:dummy_batch.token_lengths[index]]

# scale by waveform domain
durations = durations * dummy_batch.waveforn_length[index]
durations = durations.cumsum(dim=0).int()

print(dummy_batch.transcript[index])
left = 0
for right, char in zip(durations[:10], dummy_batch.transcript[index]):
    print(char)
    display.display(display.Audio(waveform[left:right], rate=22050))
    left = right
    print('-' * 99)

In [None]:
dummy_batch

In [None]:
!git clone https://github.com/MatyashDare/dla_tts
    

In [11]:
#!:bash
#pragma dataset init LJSpeech-1.1 --size 7Gb

set -e
cd /home/jupyter/mnt/datasets/LJSpeech-1.1
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar -xvf LJSpeech-1.1.tar.bz2
rm -rf LJSpeech-1.1.tar.bz2

In [13]:
#%pip install wandb

In [84]:
#!g1.1
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torchaudio
import os
import librosa
import wandb
from tqdm import tqdm
from dataclasses import dataclass
from data.data_preprocessing import LJSpeechDataset, LJSpeechCollator
from models.model import FastSpeechModel

In [85]:
#!g1.1
import warnings
import sys
sys.path.append('waveglow/')

warnings.filterwarnings('ignore')


class Vocoder(nn.Module):

    def __init__(self):
        super(Vocoder, self).__init__()

        model = torch.load('waveglow_256channels_universal_v5.pt', map_location='cpu')[
            'model']
        self.net = model.remove_weightnorm(model)

    @torch.no_grad()
    def inference(self, spect: torch.Tensor):
        spect = self.net.upsample(spect)

        
        
        # trim the conv artifacts
        time_cutoff = self.net.upsample.kernel_size[0] - \
            self.net.upsample.stride[0]
        spect = spect[:, :, :-time_cutoff]

        spect = spect.unfold(2, self.net.n_group, self.net.n_group) \
            .permute(0, 2, 1, 3) \
            .contiguous() \
            .flatten(start_dim=2) \
            .transpose(-1, -2)

        # generate prior
        audio = torch.randn(spect.size(0), self.net.n_remaining_channels, spect.size(-1)) \
            .to(spect.device)

        for k in reversed(range(self.net.n_flows)):
            n_half = int(audio.size(1) / 2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = self.net.WN[k]((audio_0, spect))

            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b) / torch.exp(s)
            audio = torch.cat([audio_0, audio_1], 1)

            audio = self.net.convinv[k](audio, reverse=True)

            if k % self.net.n_early_every == 0 and k > 0:
                z = torch.randn(
                    spect.size(0), self.net.n_early_size, spect.size(2),
                    device=spect.device
                )
                audio = torch.cat((z, audio), 1)

        audio = audio.permute(0, 2, 1) \
            .contiguous() \
            .view(audio.size(0), -1)

        return audio


In [86]:
#!g1.1
vocoder = Vocoder().to('cuda').eval()

In [87]:
#!g1.1
def train(run, epoch, train_dataloader, model, optimizer, scheduler, log_loss_every, log_audio_every):
    model.train()
    i = 0
    for batch in tqdm(train_dataloader):
        pred_mel, pred_len = model(batch)

        mask = (torch.arange(pred_len.shape[1])[None, :].to(device)  <= batch['token_lengths'][:, None]).float()
        loss_len = criterion(pred_len * mask,  torch.log1p(batch["duration_multipliers"]) * mask)

        mask = (torch.arange(pred_mel.shape[1])[None, :].to(device)  <= batch['melspec_length'][:, None]).float()
        loss_mel = criterion(pred_mel * mask[:, :, None], batch['melspec'] * mask[:, :, None])
        loss = loss_mel + loss_len
        if i % log_loss_every == 0 and i != 0:
            run.log({"Total loss" : loss}, step=epoch * len(train_dataloader) + i)
            run.log({"Melspec Loss" : loss_mel}, step=epoch * len(train_dataloader) + i)
            run.log({"Duration Loss" : loss_len}, step=epoch * len(train_dataloader) + i)

        if i % log_audio_every == 0 and i != 0:
            mel_to_log = pred_mel[0]
            melspec_to_log  = pred_mel[0][:, :batch['melspec_length'][0]].unsqueeze(0)
            reconstructed_wav = vocoder.inference(melspec_to_log).squeeze().detach().cpu().numpy()
            run.log({"Audio Train" : wandb.Audio(reconstructed_wav, 22050)}, step=epoch * len(train_dataloader) + i)
            d = (torch.exp(pred_len[0]) - 1).round().int()
            d[d < 1] = 1
            d1 = d.cumsum(0)
            maxlen = d.sum().item()
            mask1 = torch.arange(maxlen)[None, :].to(device) < (d1[:, None])
            mask2 = torch.arange(maxlen)[None, :].to(device) >= (d1 - d)[:, None]
            mask = (mask2 * mask1).float()
            run.log({"`Durations predicted" : wandb.Image(mask.detach().cpu().numpy())}, step=epoch * len(train_dataloader) + i)
            d = batch['duration_multipliers'][0]
            d1 = d.cumsum(0)
            maxlen = d.sum().item()
            mask1 = torch.arange(maxlen)[None, :].to(device) < (d1[:, None])
            mask2 = torch.arange(maxlen)[None, :].to(device) >= (d1 - d)[:, None]
            mask = (mask2 * mask1).float()
            run.log({"Durations true" : wandb.Image(mask.detach().cpu().numpy())}, step=epoch * len(train_dataloader) + i)
        optimizer.zero_grad()
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        scheduler.step()
        i += 1


def validation(run, iteration, model):
    model.eval()
    tokenizer  = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.get_text_processor()
    sentences = ['A defibrillator is a device that gives a high energy electric shock to the heart of someone who is in cardiac arrest',
                 'Massachusetts Institute of Technology may be best known for its math, science and engineering education',
                 'Wasserstein distance or Kantorovich Rubinstein metric is a distance function defined between probability distributions on a given metric space']
    print("Validation:")
    audios = []
    durations = []
    for k, sentence in enumerate(sentences):
        tokens, length = tokenizer(sentence.lower())
        batch = {}
        batch['tokens'], batch['token_lengths'] = tokens.to(device), length.to(device)
        pred_mel, pred_len = model(batch, False)
        reconstructed_wav = vocoder.inference(pred_mel).squeeze().detach().cpu().numpy()
        d = (torch.exp(pred_len[0]) - 1).round().int()
        d[d < 1] = 1
        d1 = d.cumsum(0)
        maxlen = d.sum().item()
        mask1 = torch.arange(maxlen)[None, :].to(device) < (d1[:, None])
        mask2 = torch.arange(maxlen)[None, :].to(device) >= (d1 - d)[:, None]
        mask = (mask2 * mask1).float()
        audios.append(wandb.Audio(reconstructed_wav, 22050, caption=sentence))
        durations.append(wandb.Image(mask.detach().cpu().numpy(), caption=sentence))
    run.log({"Audio validation" : audios}, step=iteration)


if __name__ == '__main__':
    project_name = 'tts_total'
    name = 'FastSpeechAlignments'
    log_audio_every = 100
    log_loss_every = 5
    n_epochs = 100
    batch_size = 16
    device = 'cuda'
    train_dataloader = DataLoader(LJSpeechDataset('/home/jupyter/mnt/datasets/LJSpeech-1.1/'),
                                  batch_size=batch_size,
                                  collate_fn=LJSpeechCollator(device),
                                  shuffle=True)
    model = FastSpeechModel(vocab_size=38,
                            max_len=10000,
                            n_layers=6,
                            output_size=80,
                            model_size=256,
                            inter_size=1024,
                            inter_kernel_size=(9, 1),
                            head_num=2,
                            size_head=128,
                            p=0.1,
                            device=device).to(device)
    vocoder = Vocoder().eval().to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-6)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              1e-4,
                                              total_steps=n_epochs * len(train_dataloader),
                                              div_factor=1e+4,
                                              pct_start=0.05,
                                              anneal_strategy='linear')

    with wandb.init(project=project_name, name=name) as run:
        for i in range(n_epochs):
            print(f'Start Epoch {i}')
            train(run, i, train_dataloader, model, optimizer, scheduler, log_loss_every, log_audio_every)
            validation(run, (i + 1) * len(train_dataloader), model)



























































































































































































































































































 49%|████▉     | 405/819 [01:46<01:49,  3.79it/s]


VBox(children=(Label(value=' 273.36MB of 273.36MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.9999939…

0,1
Duration Loss,█▆▆▅▅▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
Melspec Loss,█▄▄▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Total loss,█▅▄▃▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Duration Loss,0.01938
Melspec Loss,0.28288
Total loss,0.30226


OSError: [Errno 28] No space left on device

In [None]:
#!g1.1
