In [1]:
!pip install huggingface_hub
!pip install hf_transfer

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m12.3 MB/s[0m  [33m0:00:00[0m3.8 MB/s[0m eta [36m0:00:01[0m01[0m
[?25hInstalling collected packages: hf_transfer
Successfully installed hf_transfer-0.1.9


In [None]:
import pandas as pd
import os

# Đường dẫn tới metadata gốc
metadata_path = "/projects/MedTrivita/SpeechModule/SyntheticData/phoaudiobook_metadata.parquet"

# Đọc metadata
df = pd.read_parquet(metadata_path)

# Tính thời lượng theo giờ và giây
df["duration_hr"] = df["duration"] / 3600.0

# ⚙️ Bước 1: Lọc dữ liệu — chỉ giữ audio trong khoảng 1s–20s
before_len = len(df)
df = df[(df["duration"] >= 1.0) & (df["duration"] <= 20.0)].reset_index(drop=True)
after_len = len(df)
print(f"🧹 Đã lọc {before_len - after_len} mẫu (duration <1s hoặc >20s). Còn lại {after_len} mẫu hợp lệ.")

# ⚙️ Bước 2: Đếm số dòng theo speaker
speaker_counts = df["speaker"].value_counts()

# Hiển thị top 10 speaker nhiều dòng nhất
print("\n🔝 Top 10 speaker có nhiều dòng nhất sau khi lọc:")
print(speaker_counts.head(10))

# ⚙️ Bước 3: Lấy speaker theo thứ tự giảm dần số dòng
sorted_speakers = speaker_counts.index.tolist()

# ⚙️ Bước 4: Chọn speaker cho đến khi tổng thời lượng đạt 25 giờ
selected_speakers = []
total_hours = 0.0
for spk in sorted_speakers:
    hours = df[df["speaker"] == spk]["duration_hr"].sum()
    selected_speakers.append(spk)
    total_hours += hours
    if total_hours >= 25:
        break

print(f"\n✅ Đã chọn {len(selected_speakers)} speaker, tổng thời lượng khoảng {total_hours:.2f} giờ.")

# ⚙️ Bước 5: Lọc dữ liệu chỉ gồm các speaker đã chọn
subset_df = df[df["speaker"].isin(selected_speakers)].copy()
subset_df = subset_df.sample(frac=1, random_state=42).reset_index(drop=True)

# ⚙️ Bước 6: Tính tổng thời lượng tích lũy để chia train/test
subset_df["cumsum_hr"] = subset_df["duration_hr"].cumsum()

# 24h train, 1h test
train_df = subset_df[subset_df["cumsum_hr"] <= 24]
test_df = subset_df[(subset_df["cumsum_hr"] > 24) & (subset_df["cumsum_hr"] <= 25)]

print(f"\n🎧 Train: {train_df['duration_hr'].sum():.2f} giờ ({len(train_df)} mẫu)")
print(f"🎧 Test:  {test_df['duration_hr'].sum():.2f} giờ ({len(test_df)} mẫu)")

# ⚙️ Bước 7: Ghi file train/test theo định dạng yêu cầu
def to_filename(path):
    return os.path.basename(path)

train_lines = [
    f"{to_filename(row.audio_path)}|{row.text}|{row.speaker_id}"
    for _, row in train_df.iterrows()
]
test_lines = [
    f"{to_filename(row.audio_path)}|{row.text}|{row.speaker_id}"
    for _, row in test_df.iterrows()
]

with open("train_stts2.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(train_lines))

with open("test_stts2.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(test_lines))

print("\n✅ Hoàn tất! Đã tạo train_stts2.txt và test_stts2.txt theo định dạng filename.wav|text|speaker_id")

In [None]:
from dotenv import load_dotenv
import os
load_dotenv()
huggingface_api = os.getenv('HUGGINGFACE_API')

In [2]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="namkuner2402/Fonos_Dataset", filename="Data/epoch_2nd_00030.pth",token=huggingface_api, local_dir="", repo_type ="model")
hf_hub_download(repo_id="namkuner2402/Fonos_Dataset", filename="Modules/PLBERT/step_108000.t7",token=huggingface_api, local_dir="",repo_type ="model")
hf_hub_download(repo_id="namkuner2402/Fonos_Dataset", filename="Modules/ASR/epoch_00080.pth",token=huggingface_api, local_dir="",repo_type ="model")
hf_hub_download(repo_id="namkuner2402/Fonos_Dataset", filename="Modules/JDC/bst.t7",token=huggingface_api, local_dir="",repo_type ="model")

epoch_2nd_00008.pth:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

step_108000.t7:   0%|          | 0.00/473M [00:00<?, ?B/s]

epoch_00080.pth:   0%|          | 0.00/94.6M [00:00<?, ?B/s]

bst.t7:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

'Modules/JDC/bst.t7'

In [9]:
!pip install -r requirements.txt

Collecting git+https://github.com/resemble-ai/monotonic_align.git (from -r requirements.txt (line 22))
  Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-cgv9z5mz
  Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-cgv9z5mz
  Resolved https://github.com/resemble-ai/monotonic_align.git to commit 78b985be210a03d08bc3acc01c4df0442105366f
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting SoundFile (from -r requirements.txt (line 1))
  Obtaining dependency information for SoundFile from https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata
  Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting munch

In [7]:
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.1.0
  Downloading https://download.pytorch.org/whl/cu121/torch-2.1.0%2Bcu121-cp311-cp311-linux_x86_64.whl (2200.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 GB[0m [31m659.0 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.0
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.16.0%2Bcu121-cp311-cp311-linux_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.1.0
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.1.0%2Bcu121-cp311-cp311-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.1.0 (from torch==2.1.0)
  Downloading https://download.pytorch.org/whl/triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64

In [1]:
!python train_accelerate.py --config_path config.yml

bert loaded
bert_encoder loaded
predictor loaded
decoder loaded
text_encoder loaded
predictor_encoder loaded
style_encoder loaded
diffusion loaded
text_aligner loaded
pitch_extractor loaded
mpd loaded
msd loaded
wd loaded
BERT AdamW (
Parameter Group 0
    amsgrad: False
    base_momentum: 0.85
    betas: (0.9, 0.99)
    capturable: False
    differentiable: False
    eps: 1e-09
    foreach: None
    fused: None
    initial_lr: 0.0001
    lr: 0.0001
    max_lr: 0.0002
    max_momentum: 0.95
    maximize: False
    min_lr: 0
    weight_decay: 0.01
)
decoder AdamW (
Parameter Group 0
    amsgrad: False
    base_momentum: 0.85
    betas: (0.0, 0.99)
    capturable: False
    differentiable: False
    eps: 1e-09
    foreach: None
    fused: None
    initial_lr: 0.0001
    lr: 0.0001
    max_lr: 0.0002
    max_momentum: 0.95
    maximize: False
    min_lr: 0
    weight_decay: 0.0001
)
Epoch [9/30], Step [10/4693], Loss: 0.30988, Disc Loss: 3.78375, Dur Loss: 0.56769, CE Loss: 0.02530, Norm 

In [None]:
from huggingface_hub import HfApi
api = HfApi(token = huggingface_api)
api.upload_file(
    path_or_fileobj="Data/epoch_2nd_00015.pth",
    path_in_repo="Data/epoch_2nd_00015.pth",
    repo_id="namkuner2402/Fonos_Dataset",
    repo_type="model",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


epoch_2nd_00015.pth:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/namkuner2402/Fonos_Dataset/commit/3fb99b08a13e7003c7f20b6a49bfadf4934aeda7', commit_message='Upload Data/epoch_2nd_00015.pth with huggingface_hub', commit_description='', oid='3fb99b08a13e7003c7f20b6a49bfadf4934aeda7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/namkuner2402/Fonos_Dataset', endpoint='https://huggingface.co', repo_type='model', repo_id='namkuner2402/Fonos_Dataset'), pr_revision=None, pr_num=None)