In [1]:
import torch

# Fetch the pretrained Tacotron2 model (fp16 variant) from PyTorch Hub and
# place it on the GPU in one chained expression.
tacotron2 = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_tacotron2',
    model_math='fp16',
).to('cuda')
# Switch to evaluation (inference) mode; as the last expression of the cell
# this also displays the module structure.
tacotron2.eval()

Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" to /root/.cache/torch/hub/torchhub.zip
Downloading checkpoint from https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_pyt_ckpt_amp/versions/19.09.0/files/nvidia_tacotron2pyt_fp16_20190427


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0-2): 3 x Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (linear_layer): Linear(in_features=80, out_features=256, bias=False)
        )
        (1): LinearNorm(
          (linear_layer): Linear(in_features=256, out_features=256, bias=False)
        )
      )
    )
    (attention_rnn): LSTMCell(768, 1024)
    (attention_layer): Attention(
      (query_layer): LinearNorm(
        (linear_layer): Linear(in_features=1024, out_features=128, bias=False)
      )
      (memory_layer): LinearNorm(
        (linear_layer): Linear(in_fea

In [2]:
# Fetch NVIDIA's pretrained WaveGlow model (fp16 variant) from PyTorch Hub.
waveglow = torch.hub.load(
    'NVIDIA/DeepLearningExamples:torchhub',
    'nvidia_waveglow',
    model_math='fp16',
)
# Strip weight normalization, then move the resulting model onto the GPU.
waveglow = waveglow.remove_weightnorm(waveglow).to('cuda')
# Put the model in evaluation mode (also shown as the cell output).
waveglow.eval()

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Downloading checkpoint from https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_amp/versions/19.09.0/files/nvidia_waveglowpyt_fp16_20190427
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)


WaveGlow(
  (upsample): ConvTranspose1d(80, 80, kernel_size=(1024,), stride=(256,))
  (WN): ModuleList(
    (0-3): 4 x WN(
      (in_layers): ModuleList(
        (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
        (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(4,))
        (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(8,))
        (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(16,), dilation=(16,))
        (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(32,), dilation=(32,))
        (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(64,), dilation=(64,))
        (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(128,), dilation=(128,))
      )
      (res_skip_layers): ModuleList(
        (0-6): 7 x Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
        (7

# 🧐 코드 체크
## 체크 코드 1
```python
sequences, lengths = utils.prepare_input_sequence([text])
```
- 텍스트를 모델이 이해할 수 있는 숫자 시퀀스로 변환
- 각 문자를 고유한 숫자(인덱스)로 매핑

```python
text = "Hello"
# 변환 과정 (아래 숫자는 이해를 돕기 위한 예시이며, 실제 ID는 모델이 사용하는 심볼 테이블에 따라 달라짐):
# 'H' → 72, 'e' → 101, 'l' → 108, 'l' → 108, 'o' → 111
# sequences = [72, 101, 108, 108, 111]
# lengths = [5]  # 문자열 길이
```

## 체크 코드 2
```python
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths)
    audio = waveglow.infer(mel)
audio_numpy = audio[0].data.cpu().numpy()
```
### 각 부분 설명 :
```python
with torch.no_grad():
```
- 그래디언트 계산을 비활성화
- 훈련이 아닌 추론시에는 그래디언트가 필요 없음
- 메모리 절약 및 속도 향상

```python
mel, _, _ = tacotron2.infer(sequences, lengths)
```
- **입력** : 숫자로 변환된 텍스트 시퀀스
- **출력** :
  - `mel` : mel-spectrogram (음성의 주파수 특성을 시각화한 데이터)
  - `_` , `_` : 사용하지 않는 다른 출력들 (attention weights 등)

```python
audio = waveglow.infer(mel)
```
- **입력** : mel-spectrogram
- **출력** : 실제 오디오 파형(waveform)

```python
audio_numpy = audio[0].data.cpu().numpy()
```
**단계별 변환 :**  
`audio[0]` - 배치에서 첫 번째 오디오 선택  
`.data` - 텐서에서 순수 데이터만 추출  
`.cpu()` - GPU 메모리에서 CPU 메모리로 이동  
`.numpy()` - PyTorch 텐서를 NumPy 배열로 변환

In [4]:
from IPython.display import Audio

# End-to-end pipeline:
# text → Tacotron2 → mel-spectrogram → WaveGlow → audio waveform → playback

rate = 22050  # sampling rate in Hz (22.05 kHz)
text = 'Clean thoroughly. Cleanliness is more important than your life.'

# Load the TTS utility functions and turn the text into model input.
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
sequences, lengths = utils.prepare_input_sequence([text])

# Speech generation, with gradient tracking disabled.
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths)  # text → mel-spectrogram
    audio = waveglow.infer(mel)  # mel-spectrogram → audio waveform

# First item of the batch, moved to the CPU as a NumPy array.
audio_numpy = audio[0].data.cpu().numpy()

Audio(data=audio_numpy, rate=rate)

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [5]:
# 텍스트를 음성으로 변환하고 WAV 파일로 저장

# WAV 파일 저장을 위한 라이브러리
from scipy.io.wavfile import write

# Sentence to synthesize.
text = """
If our sacrifices can push the next generation forward, then what are we willing to give up?
"""

# Load the TTS utility functions and turn the text into model input.
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
sequences, lengths = utils.prepare_input_sequence([text])

# Inference only: no gradients are needed.
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths)  # text → mel-spectrogram
    audio = waveglow.infer(mel)  # mel-spectrogram → audio waveform

# Cast to float32 before converting to NumPy. Both models are loaded with
# model_math='fp16', so the waveform is presumably half precision; 32-bit
# float is a sample format documented by scipy.io.wavfile.write, whereas
# 16-bit float is not and would yield a non-standard WAV file. The cast is a
# no-op if the tensor is already float32.
audio_numpy = audio[0].data.cpu().float().numpy()
rate = 22050  # sampling rate in Hz

write("ErwinSmith.wav", rate, audio_numpy)  # save the waveform as a WAV file

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [6]:
import os

# Check that 'ErwinSmith.wav' shows up in the directory listing.
os.listdir(path='/content')

['.config', 'ErwinSmith.wav', 'sample_data']

In [7]:
from google.colab import files
# Hand the generated WAV file to Colab's download helper.
files.download(filename="ErwinSmith.wav")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>