<a href="https://colab.research.google.com/github/LEANHDUC2005/Neural-Network---ANN/blob/main/Fastspeech%202%20Hifigan%20Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# FastSpeech 2 + HiFi-GAN: Training from scratch (LJSpeech)
# Compatible with Google Colab Free Tier (T4 GPU)

# Step 1: Install dependencies
!pip install torch torchaudio numpy matplotlib scipy tensorboard
!pip install phonemizer unidecode librosa


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [7]:
!apt-get install -y espeak-ng

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 34 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-1

In [2]:
# Step 2: Clone HiFi-GAN for waveform decoder
!git clone https://github.com/jik876/hifi-gan.git
%cd hifi-gan
!pip install -r requirements.txt
%cd ..

Cloning into 'hifi-gan'...
remote: Enumerating objects: 48, done.[K
remote: Total 48 (delta 0), reused 0 (delta 0), pack-reused 48 (from 1)[K
Receiving objects: 100% (48/48), 620.94 KiB | 11.94 MiB/s, done.
Resolving deltas: 100% (20/20), done.
/content/hifi-gan
[31mERROR: Could not find a version that satisfies the requirement torch==1.4.0 (from versions: 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.4.0[0m[31m
[0m/content


In [3]:
# Step 3: Download and extract LJSpeech dataset
!wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!tar -xjf LJSpeech-1.1.tar.bz2

--2025-05-06 17:51:46--  https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 138.199.36.7, 2400:52e0:1e00::863:1
Connecting to data.keithito.com (data.keithito.com)|138.199.36.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [text/plain]
Saving to: ‘LJSpeech-1.1.tar.bz2’


2025-05-06 17:52:04 (147 MB/s) - ‘LJSpeech-1.1.tar.bz2’ saved [2748572632/2748572632]



In [28]:
# Step 4: Define FastSpeech 2 components
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class VariancePredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv1d(input_dim, input_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.norm1 = nn.LayerNorm(input_dim)

        self.conv2 = nn.Sequential(
            nn.Conv1d(input_dim, input_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.norm2 = nn.LayerNorm(input_dim)

        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        x = x.transpose(1, 2)       # [B, C, T]
        x = self.conv1(x)           # [B, C, T]
        x = x.transpose(1, 2)       # [B, T, C]
        x = self.norm1(x)

        x = x.transpose(1, 2)       # [B, C, T]
        x = self.conv2(x)
        x = x.transpose(1, 2)
        x = self.norm2(x)

        return self.linear(x).squeeze(-1)

class VarianceAdaptor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.duration_predictor = VariancePredictor(input_dim)
        self.pitch_predictor = VariancePredictor(input_dim)
        self.energy_predictor = VariancePredictor(input_dim)

    def forward(self, x):
        duration = self.duration_predictor(x)
        pitch = self.pitch_predictor(x)
        energy = self.energy_predictor(x)
        return x, duration, pitch, energy

class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.norm = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        x = self.conv(x.transpose(1, 2)).transpose(1, 2)
        x = self.norm(x)
        return x

class Decoder(nn.Module):
    def __init__(self, hidden_dim, mel_dim):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.5),
        )
        self.norm = nn.LayerNorm(hidden_dim)
        self.linear = nn.Linear(hidden_dim, mel_dim)

    def forward(self, x):
        x = self.conv(x.transpose(1, 2)).transpose(1, 2)
        x = self.norm(x)
        return self.linear(x)

class FastSpeech2(nn.Module):
    def __init__(self, vocab_size=300, d_model=256, mel_dim=80):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder = Encoder(d_model, d_model)
        self.variance_adaptor = VarianceAdaptor(d_model)
        self.decoder = Decoder(d_model, mel_dim)

    def forward(self, phoneme_ids):
        x = self.embedding(phoneme_ids)
        x = self.pos_enc(x)
        x = self.encoder(x)
        x, dur, pitch, energy = self.variance_adaptor(x)
        mel = self.decoder(x)
        return mel, dur, pitch, energy

class FastSpeech2(nn.Module):
    def __init__(self, vocab_size=300, d_model=256, mel_dim=80):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.encoder = Encoder(d_model, d_model)
        self.variance_adaptor = VarianceAdaptor(d_model)
        self.decoder = Decoder(d_model, mel_dim)

    def forward(self, phoneme_ids):
        x = self.embedding(phoneme_ids)
        x = self.pos_enc(x)
        x = self.encoder(x)
        x, dur, pitch, energy = self.variance_adaptor(x)
        mel = self.decoder(x)
        return mel, dur, pitch, energy

In [10]:
pip install --upgrade phonemizer




In [12]:
# Step 5: Preprocess LJSpeech to extract phonemes and mel-spectrograms
import os
import librosa
import numpy as np
from phonemizer import phonemize
from phonemizer.separator import Separator
from unidecode import unidecode
from tqdm import tqdm
import torchaudio
from phonemizer.backend import EspeakBackend
# Config
LJ_PATH = "LJSpeech-1.1"
SAMPLING_RATE = 22050
N_MELS = 80
HOP_LENGTH = 256

# Phoneme conversion
from phonemizer import phonemize
backend = EspeakBackend(language='en-us', preserve_punctuation=True)
def text_to_phonemes(text):
    return backend.phonemize([text], strip=True)[0]
    text = unidecode(text.strip())
    phones = phonemize(text, language='en-us', backend='espeak', separator=Separator(phone=' ', syllable=''))
    return phones

# Create mel-spectrogram
def wav_to_mel(wav_path):
    y, sr = librosa.load(wav_path, sr=SAMPLING_RATE)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=HOP_LENGTH, n_mels=N_MELS)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    return mel_db.T  # [T, 80]

# Prepare dataset (subset for quick demo)
metadata_path = os.path.join(LJ_PATH, "metadata.csv")
with open(metadata_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Pick a small subset for demo
data = []
for line in tqdm(lines[:100]):
    parts = line.strip().split("|")
    wav_path = os.path.join(LJ_PATH, "wavs", parts[0] + ".wav")
    text = parts[2]
    phones = text_to_phonemes(text)
    mel = wav_to_mel(wav_path)
    data.append((phones, mel))

#Save to disk (optional):
torch.save(data, "train_subset.pt")


100%|██████████| 100/100 [00:01<00:00, 62.19it/s]


In [14]:
mv hifi_gan hifi-gan


In [22]:
# Step 6: Use pretrained HiFi-GAN (SpeechBrain) to synthesize .wav from mel
!pip install speechbrain --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/864.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m864.1/864.1 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.7/117.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/739.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.1/739.1 kB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [23]:

import torchaudio
from speechbrain.pretrained import HIFIGAN

hifi_model = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="pretrained-hifigan")

# Function to synthesize waveform from mel
# Input shape: (n_mels=80, T)
def mel_to_wav(mel):
    with torch.no_grad():
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)
        wav = hifi_model.decode_batch(mel).squeeze().cpu().numpy()
    return wav

# Step 7: Check compatibility
# Ensure FastSpeech 2 mel output is compatible with HiFi-GAN

def check_mel_specs(mel):
    assert mel.shape[1] > 0, "Mel-spectrogram has no time steps"
    assert mel.shape[2] == 80, f"Expected 80 mel channels, got {mel.shape[2]}"
    print("✔ Mel has 80 channels")

    sample_rate = 22050
    hop_length = 256
    print(f"✔ Target sample_rate = {sample_rate}, hop_length = {hop_length}")

# Example test
example_mel = torch.randn(1, 100, 80)  # [B, T, n_mels]
check_mel_specs(example_mel)

# Test synthesis
waveform = mel_to_wav(example_mel[0].transpose(0, 1))
print(f"Generated waveform shape: {waveform.shape}")

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
  from speechbrain.pretrained import HIFIGAN
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/tts-hifigan-ljspeech' if not cached


hyperparams.yaml:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--tts-hifigan-ljspeech/snapshots/17fbdc3aae35b81e1554111fa54eab5f2b70cedb/hyperparams.yaml' -> '/content/pretrained-hifigan/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/tts-hifigan-ljspeech' if not cached
  WeightNorm.apply(module, name, dim)
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained-hifigan.
INFO:speechbrain.utils.fetching:Fetch generator.ckpt: Fetching from HuggingFace Hub 'speechbrain/tts-hifigan-ljspeech' if not cached


generator.ckpt:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--tts-hifigan-ljspeech/snapshots/17fbdc3aae35b81e1554111fa54eab5f2b70cedb/generator.ckpt' -> '/content/pretrained-hifigan/generator.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["generator"] = /content/pretrained-hifigan/generator.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: generator
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): generator -> /content/pretrained-hifigan/generator.ckpt


✔ Mel has 80 channels
✔ Target sample_rate = 22050, hop_length = 256
Generated waveform shape: (28160,)


In [29]:
import os
import random
import librosa
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

# --- Configuration ---
LJ_PATH = "LJSpeech-1.1"
SAMPLE_RATE = 22050
N_MELS = 80
HOP_LENGTH = 256

# --- Feature extraction ---
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    win_length=1024,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS
)

def wav_to_mel(wav_path):
    wav, _ = librosa.load(wav_path, sr=SAMPLE_RATE)
    wav_tensor = torch.tensor(wav).unsqueeze(0)
    mel = mel_transform(wav_tensor).squeeze(0).transpose(0, 1)  # [T, n_mels]
    return mel

# --- Dummy phoneme tokenizer ---
def text_to_dummy_phonemes(text):
    return [ord(c) % 256 for c in text if c.isalnum()]

# --- Dataset and Dataloader ---
class LJSubset(Dataset):
    def __init__(self, max_samples=100):
        self.data = []
        with open(os.path.join(LJ_PATH, "metadata.csv"), "r", encoding="utf-8") as f:
            for line in f.readlines()[:max_samples]:
                parts = line.strip().split("|")
                wav_path = os.path.join(LJ_PATH, "wavs", parts[0] + ".wav")
                text = parts[2]
                phoneme_ids = text_to_dummy_phonemes(text)
                mel = wav_to_mel(wav_path)
                self.data.append((phoneme_ids, mel))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        phones, mel = self.data[idx]
        phones = torch.tensor(phones, dtype=torch.long)
        return phones, mel

def collate_fn(batch):
    phones_batch, mel_batch = zip(*batch)
    phones_padded = nn.utils.rnn.pad_sequence(phones_batch, batch_first=True, padding_value=0)
    mel_padded = nn.utils.rnn.pad_sequence(mel_batch, batch_first=True)
    return phones_padded, mel_padded

train_loader = DataLoader(LJSubset(), batch_size=4, shuffle=True, collate_fn=collate_fn)

# --- Train a few steps ---
model = FastSpeech2()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()




In [32]:
print(mel_pred.shape)  # e.g., torch.Size([batch_size, 131])
print(mel_target.shape)  # e.g., torch.Size([batch_size, 833])


torch.Size([4, 131, 80])
torch.Size([4, 833, 80])


In [30]:
model.train()
for epoch in range(1):
    for i, (phones, mel_target) in enumerate(train_loader):
        mel_pred, _, _, _ = model(phones)
        loss = loss_fn(mel_pred, mel_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 5 == 0:
            print(f"Epoch {epoch}, Step {i}, Loss = {loss.item():.4f}")
        if i == 20:
            break

# --- Test generation ---
model.eval()
test_input = torch.tensor([text_to_dummy_phonemes("hello world")], dtype=torch.long)
with torch.no_grad():
    mel_out, _, _, _ = model(test_input)
    print("Test mel shape:", mel_out.shape)
    check_mel_specs(mel_out)
    wav = mel_to_wav(mel_out[0].transpose(0, 1))

# --- Plot waveform ---
plt.figure(figsize=(12, 3))
plt.plot(wav)
plt.title("Generated Waveform")
plt.show()

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (131) must match the size of tensor b (833) at non-singleton dimension 1