In [1]:
!pip install transformers datasets soundfile accelerate speechbrain==0.5.16 librosa

Collecting speechbrain==0.5.16
  Downloading speechbrain-0.5.16-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain==0.5.16)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9->speechbrain==0.5.16)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.9->speechbrain==0.5.16)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.9->speechbrain==0.5.16)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.9->speechbrain==0.5.16)
  Downloading nvidia_curand_cu12-10

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable, and fall back to an interactive (non-echoing) prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)
print("✅ HF login ok")

✅ HF login ok


In [3]:
from datasets import load_dataset, VerificationMode

# Load one parquet shard (train-00018-of-00027) of the VietSpeech dataset directly
# from its Hub URL and expose it as the "train" split.
# NOTE(review): VerificationMode.NO_CHECKS presumably disables the loader's
# integrity checks for the downloaded file — confirm against the datasets docs.
dataset = load_dataset(
    "parquet",
    data_files="https://huggingface.co/datasets/NhutP/VietSpeech/resolve/main/data/train-00018-of-00027.parquet",
    split="train",
    verification_mode=VerificationMode.NO_CHECKS,
)

# Show the dataset summary (features and row count).
print(dataset)

data/train-00018-of-00027.parquet:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 38002
})


In [4]:
# NOTE: despite its name, `half_size` holds one QUARTER of the dataset length
# (integer division by 4), not one half.
half_size = len(dataset) // 4

# Keep only the first quarter of the rows (indices 0 .. half_size - 1).
# Re-running this cell shrinks `dataset` again, because it overwrites its own input.
dataset = dataset.select(range(half_size))

print(dataset)

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 9500
})


In [5]:
from datasets import Audio
# Turn off automatic decoding of the "audio" column; the preprocessing function
# defined later in this notebook reads the raw "bytes" / "path" entries itself.
dataset = dataset.cast_column("audio", Audio(decode=False))


In [None]:
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
from functools import partial

# Banner (Vietnamese): "STEP 3: LOAD PROCESSOR FROM CHECKPOINT 900".
print("=== BƯỚC 3: TẢI PROCESSOR TỪ CHECKPOINT 900 ===")
# Hub repository to load from, and the revision (commit id prefix) to pin to.
base_repo_path = "oopssuper96/speecht5_finetuned_emirhan_tr"
checkpoint_revision = "9367721" 

print(f"Loading processor from {base_repo_path} (commit {checkpoint_revision})...")
# Load the SpeechT5 processor at exactly the pinned revision.
processor = SpeechT5Processor.from_pretrained(
    base_repo_path, 
    revision=checkpoint_revision
)


2025-11-05 09:47:09.404325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762336029.592246      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762336029.643108      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


=== BƯỚC 3: TẢI PROCESSOR TỪ CHECKPOINT 900 ===
Loading processor from oopssuper96/speecht5_finetuned_emirhan_tr (commit 9367721)...


preprocessor_config.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [8]:
# Convenience alias for the tokenizer held by the processor; used below to
# compare its vocabulary with the characters found in the dataset.
tokenizer = processor.tokenizer


In [9]:
def extract_all_chars(batch, text_column="transcription"):
    """Collect the set of distinct characters used in one batch of texts.

    Args:
        batch: Mapping whose ``text_column`` entry is a list of strings.
        text_column: Name of the column to read the strings from. Defaults to
            ``"transcription"``, so existing ``dataset.map(extract_all_chars, ...)``
            calls keep working unchanged.

    Returns:
        A dict with two single-element lists:
        ``"vocab"`` holds the sorted list of distinct characters, and
        ``"all_text"`` holds all texts joined by a single space.
    """
    all_text = " ".join(batch[text_column])
    # Sort so the result is deterministic instead of following set iteration order.
    vocab = sorted(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

# Run the character extraction over the whole dataset as one single batch
# (batch_size=-1) and keep only the columns the function returns.
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    keep_in_memory=True,
    batched=True,
    batch_size=-1,
)

# Characters seen in the dataset vs. tokens known to the tokenizer.
tokenizer_vocab = set(tokenizer.get_vocab())
dataset_vocab = set(vocabs["vocab"][0])

print(f"Characters in dataset not in tokenizer: {dataset_vocab - tokenizer_vocab}")

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

Characters in dataset not in tokenizer: {'à', 'ờ', 'ỷ', 'ẵ', 'ò', 'ễ', 'ỡ', 'ữ', 'ằ', 'ầ', '3', 'ộ', 'í', 'ã', 'è', 'ở', 'ể', 'ỏ', 'ứ', 'ỉ', 'ẩ', 'ớ', 'ị', 'â', 'ụ', 'ơ', 'ấ', 'ặ', 'ô', 'ũ', 'ỗ', 'ẳ', 'ạ', 'ẻ', 'ú', 'ắ', 'ẫ', 'ả', 'ẽ', 'ỹ', 'ĩ', 'ề', 'ử', 'ợ', 'ă', 'ý', 'ố', 'ọ', 'ì', 'ự', 'á', 'ủ', 'ừ', 'ồ', 'ẹ', 'ư', 'đ', 'ổ', 'ậ', 'ỳ', ' ', 'ó', 'ỵ', 'ù', 'ế', 'ệ', 'õ'}


In [None]:
import re

def normalize_text(text):
    """Normalize a transcription for vocabulary analysis and tokenization.

    Steps, in order:
      1. Compose the text to Unicode NFC.
      2. Lowercase.
      3. Remove every character that is not a word character, whitespace or
         an apostrophe.
      4. Collapse runs of whitespace to single spaces and strip both ends.

    Args:
        text: Raw transcription string.

    Returns:
        The normalized string.
    """
    # Local import keeps this cell self-contained (stdlib only).
    import unicodedata

    # Compose decomposed sequences (base letter + combining marks) into single
    # code points first. Combining marks are not matched by ``\w``, so without
    # this step the punctuation filter below would silently strip Vietnamese
    # tone/diacritic marks from NFD-encoded input.
    text = unicodedata.normalize("NFC", text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation (except apostrophes)
    text = re.sub(r'[^\w\s\']', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

# Define a function to add the normalized_text column
def add_normalized_text(example):
    """Attach a ``normalized_text`` entry derived from ``transcription``.

    The example is updated in place and the same object is returned.
    """
    raw_transcription = example['transcription']
    example['normalized_text'] = normalize_text(raw_transcription)
    return example

# Apply add_normalized_text to every example (one example per call); the result
# replaces `dataset`, which now carries the extra "normalized_text" column.
print("Adding normalized_text column...")
dataset = dataset.map(add_normalized_text)


def extract_all_chars(batch, text_column="normalized_text"):
    """Collect the set of distinct characters used in one batch of texts.

    Args:
        batch: Mapping whose ``text_column`` entry is a list of strings.
        text_column: Name of the column to read the strings from. Defaults to
            ``"normalized_text"``, so existing ``dataset.map(extract_all_chars, ...)``
            calls keep working unchanged.

    Returns:
        A dict with two single-element lists:
        ``"vocab"`` holds the sorted list of distinct characters, and
        ``"all_text"`` holds all texts joined by a single space.
    """
    all_text = " ".join(batch[text_column])
    # Sort so the result is deterministic instead of following set iteration order.
    vocab = sorted(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

Adding normalized_text column...


Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

In [11]:
# Repeat the character extraction, again as one single batch (batch_size=-1),
# keeping only the columns the function returns.
vocabs = dataset.map(
    extract_all_chars,
    remove_columns=dataset.column_names,
    keep_in_memory=True,
    batched=True,
    batch_size=-1,
)

# Characters seen in the dataset vs. tokens known to the tokenizer.
tokenizer_vocab = set(tokenizer.get_vocab())
dataset_vocab = set(vocabs["vocab"][0])

print(f"Characters after normalization not in tokenizer: {dataset_vocab - tokenizer_vocab}")

Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

Characters after normalization not in tokenizer: {'à', 'ờ', 'ỷ', 'ẵ', 'ò', 'ễ', 'ỡ', 'ữ', 'ằ', 'ầ', '3', 'ộ', 'í', 'ã', 'è', 'ở', 'ể', 'ỏ', 'ứ', 'ỉ', 'ẩ', 'ớ', 'ị', 'â', 'ụ', 'ơ', 'ấ', 'ặ', 'ô', 'ũ', 'ỗ', 'ẳ', 'ạ', 'ẻ', 'ú', 'ắ', 'ẫ', 'ả', 'ẽ', 'ỹ', 'ĩ', 'ề', 'ử', 'ợ', 'ă', 'ý', 'ố', 'ọ', 'ì', 'ự', 'á', 'ủ', 'ừ', 'ồ', 'ẹ', 'ư', 'đ', 'ổ', 'ậ', 'ỳ', ' ', 'ó', 'ỵ', 'ù', 'ế', 'ệ', 'õ'}


In [None]:
# Ordered (source, target) rewrite rules that transliterate Vietnamese text into
# plain ASCII letters. cleanup_text applies them ONE AFTER ANOTHER with
# str.replace, so the order of this list matters: the output of an earlier rule
# is visible to (and may be rewritten by) every later rule.
#
# Vowel rules: each accented vowel becomes its base letter(s) followed by a
# suffix letter (f, s, r, x or j) that encodes the tone mark.
#
# NOTE(review): because of the sequential application, several rules interact:
#   * "á" -> "as" -> (s rule) "ash", and "ã" -> "ax" -> (x rule) "as" -> "ash":
#     both tones end up as the same string.
#   * "đ" -> "d" -> (d rule) "z": "đ" and "d" become indistinguishable.
#   * "x" -> "s" -> (s rule) "sh": "x" and "s" become indistinguishable.
#   * The "r" rule also rewrites the tone suffix "r" (e.g. "ả" -> "ar" -> "azh").
#   * ("tr","chr") can never match: every "r" was already turned into "zh".
#   * ("ch","ch"), ("th","th"), ("kh","kh"), ("nh","nh"), ("ng","ng") are no-ops.
# Confirm whether these effects are intended before changing anything; any model
# already trained on text produced by this table depends on its exact output.
replacements = [
    ("à","af"),("á","as"),("ả","ar"),("ã","ax"),("ạ","aj"),
    
    ("ă","ah"),("ằ","ahf"),("ắ","ahs"),("ẳ","ahr"),("ẵ","ahx"),("ặ","ahj"),
    
    ("â","ay"),("ầ","ayf"),("ấ","ays"),("ẩ","ayr"),("ẫ","ayx"),("ậ","ayj"),
    
    ("è","ef"),("é","es"),("ẻ","er"),("ẽ","ex"),("ẹ","ej"),
    
    ("ê","ee"),("ề","eef"),("ế","ees"),("ể","eer"),("ễ","eex"),("ệ","eej"),
    
    ("ì","if"),("í","is"),("ỉ","ir"),("ĩ","ix"),("ị","ij"),
    
    ("ò","of"),("ó","os"),("ỏ","or"),("õ","ox"),("ọ","oj"),
    
    ("ô","oh"),("ồ","ohf"),("ố","ohs"),("ổ","ohr"),("ỗ","ohx"),("ộ","ohj"),

    ("ư","uw"),("ừ","uwf"),("ứ","uws"),("ử","uwr"),("ữ","uwx"),("ự","uwj"),

    ("ơ","ow"),("ờ","owf"),("ớ","ows"),("ở","owr"),("ỡ","owx"),("ợ","owj"),
    
    ("ù","uf"),("ú","us"),("ủ","ur"),("ũ","ux"),("ụ","uj"),
    
    ("ỳ","yf"),("ý","ys"),("ỷ","yr"),("ỹ","yx"),("ỵ","yj"),
    
    # Consonants
    ("đ","d"),
    ("gi","z"),
    ("d","z"), 
    ("r","zh"),
    ("x","s"),
    ("s","sh"),
    ("tr","chr"),
    ("ch","ch"),
    ("th","th"),
    ("ph","f"),
    ("kh","kh"),
    ("nh","nh"),
    ("ng","ng"),
    ("gh","g"),
]  
def cleanup_text(inputs):
    """Rewrite ``inputs["normalized_text"]`` using the ``replacements`` table.

    Every (source, target) pair is applied in list order with ``str.replace``,
    each one operating on the result of the previous pair. The example is
    updated in place and returned.
    """
    rewritten = inputs["normalized_text"]
    for old, new in replacements:
        rewritten = rewritten.replace(old, new)
    inputs["normalized_text"] = rewritten
    return inputs

# Apply cleanup_text to every example; "normalized_text" is overwritten with the
# transliterated text, so re-running this cell applies the rules a second time.
print("Cleaning up text...")
dataset = dataset.map(cleanup_text)

Cleaning up text...


Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

In [13]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Hub identifier of the pretrained SpeechBrain speaker-embedding model.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the speaker encoder onto `device`; its files are stored under
# /tmp/<spk_model_name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)

def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    The waveform is converted to a tensor, encoded with the module-level
    ``speaker_model``, L2-normalized along dimension 2, squeezed, and returned
    as a NumPy array on the CPU. Gradients are not tracked.
    """
    with torch.no_grad():
        wav_tensor = torch.tensor(waveform)
        encoded = speaker_model.encode_batch(wav_tensor)
        unit_norm = torch.nn.functional.normalize(encoded, dim=2)
        embedding = unit_norm.squeeze().cpu().numpy()
    return embedding


Using device: cuda


hyperparams.yaml: 0.00B [00:00, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

label_encoder.txt: 0.00B [00:00, ?B/s]

In [None]:
import soundfile as sf
import librosa
import io
def prepare_dataset(example):
    """Convert one raw example into the features used for SpeechT5 training.

    The audio is read from the embedded bytes when present, otherwise from the
    file path. Multi-channel audio is down-mixed to mono, the signal is
    resampled to 16 kHz when needed, and the processor is run on the
    normalized text (input) and the waveform (target). A speaker embedding
    computed from the same waveform is attached.

    Args:
        example: Mapping with an ``"audio"`` dict (keys ``"bytes"`` and/or
            ``"path"``) and a ``"normalized_text"`` string.

    Returns:
        The processor output, with ``"labels"`` reduced to its first (only)
        element and ``"speaker_embeddings"`` added.

    Raises:
        ValueError: If the audio has neither bytes nor a path, or if the file
            at the given path cannot be read.
    """
    target_sampling_rate = 16000
    audio_data = example["audio"]

    # Prefer the embedded bytes; fall back to the file path.
    if audio_data.get("bytes") is not None:
        speech_array, sampling_rate = sf.read(io.BytesIO(audio_data["bytes"]))
    elif audio_data.get("path") is not None:
        try:
            speech_array, sampling_rate = sf.read(audio_data["path"])
        except Exception as err:
            # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate, and chain the original error for debugging.
            raise ValueError(
                f"Cannot read audio from path: {audio_data['path']}"
            ) from err
    else:
        raise ValueError(f"Audio data does not contain 'bytes' or valid 'path': {audio_data.keys()}")

    # soundfile returns a (frames, channels) array for multi-channel files;
    # everything downstream expects a 1-D mono signal, so average the channels.
    if speech_array.ndim > 1:
        speech_array = speech_array.mean(axis=1)

    # Resample to 16 kHz if needed.
    if sampling_rate != target_sampling_rate:
        speech_array = librosa.resample(
            speech_array,
            orig_sr=sampling_rate,
            target_sr=target_sampling_rate,
        )
        sampling_rate = target_sampling_rate

    # Tokenize the text and turn the waveform into the training target.
    processed = processor(
        text=example["normalized_text"],
        audio_target=speech_array,
        sampling_rate=sampling_rate,
        return_attention_mask=False,
    )

    # The processor returns a batch of one; keep just that single item.
    processed["labels"] = processed["labels"][0]
    processed["speaker_embeddings"] = create_speaker_embedding(speech_array)

    return processed
# Run prepare_dataset on every example and drop all original columns, so the
# resulting dataset contains only what prepare_dataset returns.
print("Processing dataset (this may take a while)...")
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)



Processing dataset (this may take a while)...


Map:   0%|          | 0/9500 [00:00<?, ? examples/s]

In [15]:
import numpy as np
# Sanity check: inspect the shape of the speaker embedding stored for the
# first processed example.
speaker_emb = np.array(dataset[0]['speaker_embeddings'])
print(f"Speaker embeddings shape: {speaker_emb.shape}")

Speaker embeddings shape: (512,)


In [16]:
def is_not_too_long(input_ids, max_length=200):
    """Return True when a token sequence is strictly shorter than ``max_length``.

    Args:
        input_ids: Sequence of token ids for one example.
        max_length: Exclusive upper bound on the number of tokens. Defaults to
            200, the limit previously hard-coded here.

    Returns:
        bool: ``len(input_ids) < max_length``.
    """
    return len(input_ids) < max_length

# Drop examples whose token sequence is too long for training.
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
print(f"Dataset size after filtering: {len(dataset)}")

# Hold out 10% for evaluation. The seed is fixed so that the same examples land
# in train/test on every run (Restart & Run All reproduces the split).
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(f"Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")

Filter:   0%|          | 0/9500 [00:00<?, ? examples/s]

Dataset size after filtering: 9428
Train: 8485, Test: 943


In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class TTSDataCollatorWithPadding:
    """Pad a list of preprocessed TTS examples into one training batch.

    Each feature is expected to provide ``"input_ids"``, ``"labels"`` (the
    target frames) and ``"speaker_embeddings"``.

    The collator:
      * pads inputs and targets with ``self.processor.pad``;
      * replaces padded target positions with -100;
      * removes ``decoder_attention_mask`` from the batch;
      * when the reduction factor is greater than 1, truncates the targets to
        the longest per-example length rounded down to a multiple of it;
      * stacks the speaker embeddings into a tensor.
    """

    # Processor whose ``pad`` method builds the padded batch.
    processor: Any
    # Optional explicit reduction factor. When left as None it is read from the
    # module-level ``model`` (``model.config.reduction_factor``) at call time,
    # which is what this collator always did before this field existed.
    reduction_factor: Union[int, None] = None

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Use the processor given to this instance, not a same-named global.
        batch = self.processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # Mark padded target frames with -100.
        batch["labels"] = batch["labels"].masked_fill(
            batch["decoder_attention_mask"].unsqueeze(-1).ne(1), -100
        )

        del batch["decoder_attention_mask"]

        reduction_factor = self.reduction_factor
        if reduction_factor is None:
            # Backward-compatible fallback: relies on a global ``model`` existing
            # by the time the collator is called.
            reduction_factor = model.config.reduction_factor

        if reduction_factor > 1:
            # Round every target length down to a multiple of the reduction
            # factor and cut the batch to the longest rounded length. Plain ints
            # are used so the slice bound is not a 0-d tensor.
            max_length = max(
                length - length % reduction_factor
                for length in (len(item["input_values"]) for item in label_features)
            )
            batch["labels"] = batch["labels"][:, :max_length]

        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

# Collator instance handed to the trainer further down in the notebook.
data_collator = TTSDataCollatorWithPadding(processor=processor)


In [18]:

# Banner (Vietnamese): "LOAD MODEL FROM CHECKPOINT 900".
print("TẢI MODEL TỪ CHECKPOINT 900")
print(f"Loading model from {base_repo_path} (commit {checkpoint_revision})...")
# Load the model weights from the same repository and pinned revision as the processor.
model = SpeechT5ForTextToSpeech.from_pretrained(
    base_repo_path, 
    revision=checkpoint_revision
)
# Turn the cache off in the model config ...
# NOTE(review): presumably because caching conflicts with the gradient
# checkpointing enabled in the training arguments — confirm.
model.config.use_cache = False
# ... but re-wrap generate() so that generation calls still pass use_cache=True.
model.generate = partial(model.generate, use_cache=True)
print("Model loaded successfully.")

TẢI MODEL TỪ CHECKPOINT 900
Loading model from oopssuper96/speecht5_finetuned_emirhan_tr (commit 9367721)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/578M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Model loaded successfully.


In [None]:

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Training configuration.
# Each optimizer step sees 2 examples x 16 accumulation steps = 32 examples per device.
# Evaluation and checkpoint saving both happen every 500 steps; because
# load_best_model_at_end=True and greater_is_better=False, the checkpoint with
# the lowest tracked metric is restored when training finishes.
# push_to_hub=True uploads results to the Hub, so a valid login is required.
training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_emirhan_tr",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    warmup_steps=100,
    max_steps=4000, 
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}, 
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=1,
    save_steps=500, 
    eval_steps=500,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=True, 
)
# Wire the model, the train/test splits, the padding collator and the processor
# into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    processing_class=processor, 
)

print("Starting training...")
# Message (Vietnamese): "NOTE: the progress bar restarts from 0, but the model
# being trained is STILL the 900-step model."
print("LƯU Ý: Thanh progress bar sẽ đếm lại từ 0, nhưng model đang train VẪN LÀ model 900 steps.")
trainer.train()

# Message (Vietnamese): "TRAINING COMPLETE!"
print("=== HOÀN TẤT HUẤN LUYỆN! ===")

Starting training...
LƯU Ý: Thanh progress bar sẽ đếm lại từ 0, nhưng model đang train VẪN LÀ model 900 steps.


Step,Training Loss,Validation Loss
500,0.5375,0.507986
1000,0.5232,0.50311
1500,0.5141,0.500485
2000,0.4986,0.500243
2500,0.4924,0.499753
3000,0.4834,0.498179
3500,0.4827,0.50479
4000,0.4564,0.501308


=== HOÀN TẤT HUẤN LUYỆN! ===
