In [None]:
# ============================================================
# 🚀 Custom Voice Label Tool
# Author: Mr.Jack (https://github.com/Mr-Jack-Tung)
# Date: 2025-11-02
# Description: A tool for recording and labeling custom voice samples to build a custom dataset, with a web interface built on Gradio
# ============================================================

In [1]:
!pip install gradio soundfile datasets --quiet

In [2]:
import gradio as gr
import os, json, time, uuid, soundfile as sf
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 🔧 Data storage directory
DATA_DIR = "voice_label_data"
META_FILE = os.path.join(DATA_DIR, "metadata.json")

# Idempotent setup: make sure the folder exists, then seed the metadata index
# with an empty JSON list only when the file is not there yet, so re-running
# this cell never overwrites labels that were already saved.
os.makedirs(DATA_DIR, exist_ok=True)
if not os.path.exists(META_FILE):
    with open(META_FILE, "w") as meta_fh:
        meta_fh.write(json.dumps([]))

# 📦 Save audio + label
def save_label(audio, transcript):
    """Persist one recording together with its transcription.

    The audio is written as a uniquely named WAV file under ``DATA_DIR`` and a
    ``{"audio": <wav path>, "transcription": <text>}`` record is appended to the
    JSON list stored in ``META_FILE``.

    Args:
        audio: Value delivered by the audio input. Accepted forms:
            ``None`` (nothing recorded or uploaded), a file path string, an
            object exposing the path as ``.name``, a dict carrying ``"data"``
            and ``"sampling_rate"``, or a ``(sample_rate, samples)`` tuple.
        transcript: Text label to store alongside the audio.

    Returns:
        A ``(status_message, transcript_value)`` pair, in the order of the two
        outputs wired to the save button. On success the second item is an
        update that empties the transcript box; when no audio was supplied it
        is the unchanged ``transcript`` so the typed text is kept.
    """
    if audio is None:
        # Echo the transcript back: returning None here would blank the textbox
        # and discard whatever the user had already typed.
        return "‚ö†Ô∏è B·∫°n ch∆∞a ghi √¢m ho·∫∑c upload audio n√†o.", transcript

    # Normalise every supported input form to (samples, sample rate).
    if isinstance(audio, dict) and audio.get("data") is not None:
        sr, wav = audio["sampling_rate"], audio["data"]
    elif isinstance(audio, tuple) and len(audio) == 2:
        # In-memory form: (sample_rate, samples).
        sr, wav = audio
    else:
        # Fallback: a path string or a file object exposing the path as `.name`.
        audio_path = audio if isinstance(audio, str) else audio.name
        wav, sr = sf.read(audio_path)

    # Write a fresh WAV; timestamp + random suffix keeps file names unique.
    os.makedirs(DATA_DIR, exist_ok=True)  # tolerate the folder being removed mid-session
    filename = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}.wav"
    wav_path = os.path.join(DATA_DIR, filename)
    sf.write(wav_path, wav, sr)

    # Append the record to the metadata index. The file is written with
    # ensure_ascii=False, so it must be opened as UTF-8 explicitly: relying on
    # the platform default encoding fails for non-ASCII transcriptions on
    # locales whose default is not UTF-8.
    entry = {"audio": wav_path, "transcription": transcript}
    if os.path.exists(META_FILE):
        with open(META_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = []
    data.append(entry)
    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    return f"‚úÖ ƒê√£ l∆∞u nh√£n cho file {filename}", gr.update(value="")

# 📜 View the list of labeled data
def list_labels():
    """Return a numbered, human-readable listing of every saved label.

    Reads the JSON list stored in ``META_FILE`` and renders one line per record
    as ``"<n>. <wav file name> — <transcription>"`` (1-based numbering).

    Returns:
        The listing joined with newlines, or a fixed "no data yet" message when
        the index is empty or the metadata file does not exist.
    """
    records = []
    if os.path.exists(META_FILE):
        # The index is stored with non-ASCII text left unescaped, so decode it
        # as UTF-8 explicitly instead of using the platform default encoding.
        with open(META_FILE, "r", encoding="utf-8") as f:
            records = json.load(f)
    if not records:
        return "Ch∆∞a c√≥ d·ªØ li·ªáu n√†o ƒë∆∞·ª£c g√°n nh√£n."
    return "\n".join(
        f"{idx}. {os.path.basename(rec['audio'])} ‚Äî {rec['transcription']}"
        for idx, rec in enumerate(records, start=1)
    )

# 🧩 Gradio interface
with gr.Blocks() as demo:
    # Page header shown above both columns.
    gr.Markdown("# üéôÔ∏è Voice Label Tool (for Whisper Fine-tuning)\nT·∫°o d·ªØ li·ªáu `audio + transcription` d·ªÖ d√†ng trong Colab!")

    with gr.Row():
        # Left column: capture or upload a clip, type its transcription, save the pair.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="üé§ Ghi √¢m ho·∫∑c t·∫£i audio",
            )
            transcript = gr.Textbox(label="‚úçÔ∏è Nh·∫≠p ho·∫∑c ch·ªânh s·ª≠a vƒÉn b·∫£n", lines=2)
            save_btn = gr.Button("üíæ L∆∞u d·ªØ li·ªáu")
            msg = gr.Textbox(label="Th√¥ng b√°o", interactive=False)
            # save_label returns (status message, transcript value), in this output order.
            save_btn.click(
                save_label,
                inputs=[audio_input, transcript],
                outputs=[msg, transcript],
            )

        # Right column: on-demand listing of everything labeled so far.
        with gr.Column():
            gr.Markdown("## üßæ D·ªØ li·ªáu ƒë√£ ghi nh√£n")
            refresh_btn = gr.Button("üîÑ L√†m m·ªõi danh s√°ch")
            label_list = gr.Textbox(label="Danh s√°ch nh√£n", lines=12, interactive=False)
            refresh_btn.click(list_labels, outputs=label_list)

    # Show where the data is actually stored. DATA_DIR is relative to the working
    # directory, so resolve it instead of hard-coding the Colab location; with
    # /content as the working directory this renders `/content/voice_label_data/`.
    data_dir_display = os.path.join(os.path.abspath(DATA_DIR), "")
    gr.Markdown(f"üëâ T·∫•t c·∫£ d·ªØ li·ªáu ƒë∆∞·ª£c l∆∞u t·∫°i `{data_dir_display}`.\nB·∫°n c√≥ th·ªÉ t·∫£i xu·ªëng v√† d√πng ƒë·ªÉ fine-tune Whisper!")

# share=True also publishes a temporary public URL next to the local one;
# with debug=True the cell keeps running until it is interrupted.
demo.launch(share=True, debug=True)

# Note: to record audio when running locally, open the link in a browser.


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://fe61ebb3451521d986.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://fe61ebb3451521d986.gradio.live




In [4]:
# 💾 After labeling is finished

# All of your data will be saved in:
# /content/voice_label_data/
# ├── audio_001.wav
# ├── audio_002.wav
# └── metadata.json


# The metadata.json file follows the standard Hugging Face structure:
# [
#   {"audio": "voice_label_data/audio_001.wav", "transcription": "xin chào bạn"},
#   {"audio": "voice_label_data/audio_002.wav", "transcription": "tôi là trợ lý AI"}
# ]


# You can load it back very easily:
# Reload the labeled records through the generic JSON loader of `datasets`.
from datasets import load_dataset

metadata_json = "voice_label_data/metadata.json"
dataset = load_dataset(path="json", data_files=metadata_json)
dataset  # last expression of the cell: shows the DatasetDict summary

Generating train split: 3 examples [00:00, 111.31 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 3
    })
})