In [40]:
import os
import re

import torch
import IPython
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from g2pk import G2p
from tqdm.notebook import tqdm

from models.transformer_tts import TransformerTTS
from datasets import get_datasets


g2p = G2p()

In [None]:
get_datasets(
    [
        "/home/nsw0311/nas_storage/datasets/tts/processed_data_24000/ine/"
    ],
    data_length: int,
    hop_length: Union[int, str],
    train_ratio: Optional[float] = 0.8,
    seed: Optional[int] = 42,
)

In [25]:
import glob


dirs = [
    "/home/nsw0311/nas_storage/datasets/tts/processed_data_24000/ine/",
    "/home/nsw0311/nas_storage/datasets/tts/processed_data_24000/kaist_female/",
    "/home/nsw0311/nas_storage/datasets/tts/processed_data_24000/kaist_male/"
]


file_list = []

for path in dirs:
    file_list += glob.glob(os.path.join(path, "*.pt"))

In [46]:
p = re.compile(r"[^ ㄱ-ㅎㅏ-ㅣ가-힣a-z0-9…~!?,.]")
tokens = set()


for path in tqdm(file_list):
    file = torch.load(path)
    
    text = file['text'].replace("...", "…")
    text = p.sub("", text).strip()

    text = g2p(text, descriptive=True)
    tokens.update(set(text))

  0%|          | 0/58826 [00:00<?, ?it/s]

In [47]:
tokens = sorted(tokens)
tokens

[' ',
 '!',
 ',',
 '.',
 '?',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '~',
 '…',
 'ㅇ',
 'ㅋ',
 'ㅠ',
 '가',
 '각',
 '간',
 '갇',
 '갈',
 '감',
 '갑',
 '강',
 '개',
 '객',
 '갣',
 '갭',
 '갱',
 '갸',
 '갹',
 '걈',
 '걍',
 '걔',
 '거',
 '걱',
 '건',
 '걷',
 '걸',
 '검',
 '겁',
 '겅',
 '게',
 '겍',
 '겐',
 '겓',
 '겔',
 '겜',
 '겟',
 '겨',
 '격',
 '견',
 '겯',
 '결',
 '겸',
 '겹',
 '경',
 '계',
 '곈',
 '고',
 '곡',
 '곤',
 '곧',
 '골',
 '곰',
 '곱',
 '공',
 '과',
 '관',
 '괄',
 '광',
 '괘',
 '괜',
 '괴',
 '괵',
 '굉',
 '교',
 '굑',
 '굥',
 '구',
 '국',
 '군',
 '굳',
 '굴',
 '굼',
 '굽',
 '궁',
 '궈',
 '권',
 '궐',
 '궤',
 '귀',
 '귄',
 '귇',
 '귈',
 '규',
 '귝',
 '균',
 '그',
 '극',
 '근',
 '귿',
 '글',
 '금',
 '급',
 '긍',
 '긔',
 '기',
 '긱',
 '긴',
 '긷',
 '길',
 '김',
 '깁',
 '깅',
 '까',
 '깍',
 '깐',
 '깓',
 '깔',
 '깜',
 '깝',
 '깡',
 '깨',
 '깬',
 '깯',
 '깽',
 '꺄',
 '꺅',
 '꺼',
 '꺽',
 '껀',
 '껃',
 '껄',
 '껌',
 '껍',
 '껑',
 '께',
 '껜',
 '껟',
 '껴',
 '껵',
 '껸',
 '껻',
 '껼',
 '꼉',
 '꼬'

In [52]:
# tokens = ["[PAD]", "[UNK]"] + tokens
token_dict = {t: i for i, t in enumerate(tokens)}

In [55]:
import json


with open("/home/nsw0311/nas_storage/etc/transformer_tts_tokenizer.json", "w", encoding="utf-8") as f:
    json.dump(token_dict, f)
token_dict

{'[PAD]': 2,
 '[UNK]': 3,
 ' ': 4,
 '!': 5,
 ',': 6,
 '.': 7,
 '?': 8,
 'a': 9,
 'b': 10,
 'c': 11,
 'd': 12,
 'e': 13,
 'f': 14,
 'g': 15,
 'h': 16,
 'i': 17,
 'j': 18,
 'k': 19,
 'l': 20,
 'm': 21,
 'n': 22,
 'o': 23,
 'p': 24,
 'q': 25,
 'r': 26,
 's': 27,
 't': 28,
 'u': 29,
 'v': 30,
 'w': 31,
 'x': 32,
 'y': 33,
 'z': 34,
 '~': 35,
 '…': 36,
 'ㅇ': 37,
 'ㅋ': 38,
 'ㅠ': 39,
 '가': 40,
 '각': 41,
 '간': 42,
 '갇': 43,
 '갈': 44,
 '감': 45,
 '갑': 46,
 '강': 47,
 '개': 48,
 '객': 49,
 '갣': 50,
 '갭': 51,
 '갱': 52,
 '갸': 53,
 '갹': 54,
 '걈': 55,
 '걍': 56,
 '걔': 57,
 '거': 58,
 '걱': 59,
 '건': 60,
 '걷': 61,
 '걸': 62,
 '검': 63,
 '겁': 64,
 '겅': 65,
 '게': 66,
 '겍': 67,
 '겐': 68,
 '겓': 69,
 '겔': 70,
 '겜': 71,
 '겟': 72,
 '겨': 73,
 '격': 74,
 '견': 75,
 '겯': 76,
 '결': 77,
 '겸': 78,
 '겹': 79,
 '경': 80,
 '계': 81,
 '곈': 82,
 '고': 83,
 '곡': 84,
 '곤': 85,
 '곧': 86,
 '골': 87,
 '곰': 88,
 '곱': 89,
 '공': 90,
 '과': 91,
 '관': 92,
 '괄': 93,
 '광': 94,
 '괘': 95,
 '괜': 96,
 '괴': 97,
 '괵': 98,
 '굉': 99,
 '교': 100,
 '굑': 101

In [2]:
os.chdir("/home/nsw0311/nas_storage/datasets/tts/processed_data/ine/")
file_list = sorted(os.listdir())
file_size = [os.path.getsize(f) for f in file_list]
longest_file = file_list[np.argmax(file_size)]

In [13]:
path = longest_file
sample = torch.load(path)

wave = sample["waveform"]
spectrogram = sample["mel_spectrogram"]
text = sample['text']

In [20]:
text

'제가, 예~전에, 어... 구입을 한, 헤드폰이 있는데 이게 저랑 너무 잘맞아가지고 굉장히 오래 쓰고있거든요?'

In [14]:
spectrogram.shape

torch.Size([80, 626])

In [10]:
IPython.display.Audio(wave, rate=16000)

In [4]:
model = TransformerTTS(
    vocab_size=3200
)

In [5]:
max_ids_length = 128
max_spectrogram_length = 512

In [6]:
input_ids = torch.randint(0, 3200, (1, len(text)))
spectrogram = spectrogram.unsqueeze(0)

In [7]:
input_ids = torch.nn.functional.pad(input_ids, (0, max_ids_length - input_ids.size(-1)))
spectrogram = torch.nn.functional.pad(spectrogram, (0, max_spectrogram_length - spectrogram.size(-1)))
input_ids.shape, spectrogram.shape

(torch.Size([1, 128]), torch.Size([1, 80, 512]))

In [8]:
spectrogram, end = model(input_ids, spectrogram)

In [9]:
spectrogram.shape, end.shape

(torch.Size([1, 512, 80]), torch.Size([1, 512, 1]))