In [None]:
# EmoKnob Demo - Modified for Japanese Text Support

**MODIFIED**: Added Japanese text normalization testing and enhanced audio generation  
**Date**: 2025-07-13  
**Purpose**: Test Japanese character processing (hiragana, katakana, kanji) for emotion-controlled TTS  
**Changes**: 
- Added Cell 11: Japanese text processing and emotion control testing
- Modified emotion control to work with Japanese text input
- Tested normalize_text function with Japanese characters
- Enhanced "Generate samples" cell (Cell 9) with improved error handling and path management
- Added detailed file existence checks and enhanced audio processing workflow
- Improved audio enhancement pipeline with better output path handling

---


# Load Model

In [1]:
import sys
import os
from pathlib import Path

# Move into the MetaVoice source tree so that later relative paths resolve and
# the `fam` package can be imported from the current directory.
METAVOICE_SRC_DIR = Path("src") / "metavoice-src-main"

# The `%cd` magic reports failures by printing, not by raising, so wrapping it
# in try/except never reached the except branch. os.chdir raises OSError, which
# makes the warning below reachable. The guard keeps the cell re-runnable: once
# we are already inside the target directory, a second relative chdir would
# look for a nested src/metavoice-src-main that does not exist.
if Path.cwd().parts[-2:] == METAVOICE_SRC_DIR.parts:
    print(f"Already inside {METAVOICE_SRC_DIR}; not changing directory.")
else:
    try:
        os.chdir(METAVOICE_SRC_DIR)
    except OSError as e:
        print(f"Warning: Could not change directory to src/metavoice-src-main. Error: {e}")
        print("Please ensure this directory exists and is correctly specified relative to your notebook's starting directory.")

# Working directory after the (attempted) change.
current_working_directory = os.getcwd()
print(f"Current working directory after %cd: {current_working_directory}")

# Make the current directory importable so `import fam...` in later cells works.
if current_working_directory not in sys.path:
    sys.path.append(current_working_directory)
    print(f"Added {current_working_directory} to sys.path.")
else:
    print(f"{current_working_directory} is already in sys.path.")

C:\Users\anzua\Documents\emoknob\src\metavoice-src-main
Current working directory after %cd: C:\Users\anzua\Documents\emoknob\src\metavoice-src-main
Added C:\Users\anzua\Documents\emoknob\src\metavoice-src-main to sys.path.


In [2]:
#%cd src/metavoice-src-main

import os

import shutil
import tempfile
import time
from pathlib import Path

import librosa
import torch
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.decoders import EncodecDecoder
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
    EncodecDecoder,
    InferenceConfig,
    Model,
    TiltedEncodec,
    TrainedBPETokeniser,
    get_cached_embedding,
    get_cached_file,    
    get_enhancer,
)
from fam.llm.utils import (
    check_audio_file,
    get_default_dtype,
    get_device,
    normalize_text,
)

# --- Run configuration -------------------------------------------------------
model_name = "metavoiceio/metavoice-1B-v0.1"
seed = 1337
output_dir = "outputs"
_dtype = get_default_dtype()
_device = 'cuda:0'

# Fetch the model snapshot and keep the returned value as the directory that
# the checkpoint paths below are built from.
_model_dir = snapshot_download(repo_id=model_name)
first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
# (The original re-assigned output_dir to itself here; that no-op is removed.)
os.makedirs(output_dir, exist_ok=True)

# --- Second stage ------------------------------------------------------------
second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
config_second_stage = InferenceConfig(
    ckpt_path=second_stage_ckpt_path,
    num_samples=1,
    seed=seed,
    device=_device,
    dtype=_dtype,
    compile=False,
    init_from="resume",
    output_dir=output_dir,
)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
llm_second_stage = Model(
    config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
)
enhancer = get_enhancer("df")

# --- First stage and speaker encoder -----------------------------------------
# Only these two dtype names are mapped; any other value of _dtype raises
# KeyError on this lookup.
precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
model, tokenizer, smodel, model_size = build_model(
    precision=precision,
    checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
    spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
    device=_device,
    compile=False,
    compile_prefill=False,
)

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


CUDA available: True
PyTorch's CUDA version: 11.8
Number of CUDA devices: 1
CUDA device name: NVIDIA GeForce RTX 4060 Ti
using dtype=bfloat16
using dtype=bfloat16


  from torchaudio.backend.common import AudioMetaData


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

number of parameters: 14.07M
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on torch 2.1.0+cu118[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on host cotofogu[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at C:\Users\anzua\AppData\Local\DeepFilterNet\DeepFilterNet\Cache\DeepFilterNet3[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint C:\Users\anzua\AppData\Local\DeepFilterNet\DeepFilterNet\Cache\DeepFilterNet3\checkpoints\model_120.ckpt.best with epoch 120[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2025-07-13 13:56:25[0m | [1mINFO    [0m | [36mDF[0m

100%|████████████████████████████████████████████████████████████████████████████████| 199/199 [00:48<00:00,  4.09it/s]


Compilation time: 50.16 seconds


# Obtain emotion direction

Option 1: provide your own audio files as (neutral, empathetic) pairs

In [None]:
# Supply your own recordings as (neutral, emotional) pairs of the same speaker.
audio_pairs = [
    ('/proj/afosr/metavoice/misc_audio_files/neutral_oprah.wav', '/proj/afosr/metavoice/misc_audio_files/oprah_empathetic_concatenated.wav'),
    ('/proj/afosr/metavoice/misc_audio_files/neutral_vt2NjqXKzyA.wav', '/proj/afosr/metavoice/misc_audio_files/vt2NjqXKzyA_empathetic_concatenated.wav')
]

# Embedding of the speaker whose voice will be used for synthesis.
source_speaker_audio_path = "audiopathref"
source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)

# One (neutral_embedding, emotional_embedding) tuple per audio pair.
speaker_pair_embs = [
    tuple(
        get_cached_embedding(pair_member_path, smodel).to(device=_device, dtype=precision)
        for pair_member_path in pair_paths
    )
    for pair_paths in audio_pairs
]

# Per-pair direction (emotional minus neutral), scaled to unit length.
emo_dirs = [
    delta / torch.linalg.norm(delta, dim=-1, keepdim=True)
    for delta in (emotional - neutral for neutral, emotional in speaker_pair_embs)
]

# Average the unit directions over all pairs and re-normalize the mean.
emo_dir = sum(emo_dirs) / len(emo_dirs)
emo_dir = emo_dir / torch.linalg.norm(emo_dir, dim=-1, keepdim=True)

Option 2: use our pre-computed emotion directions

In [3]:
import pickle

# SECURITY: unpickling can execute arbitrary code. Only load this file if it
# comes from a source you trust.
# The `with` block closes the file handle; the original left it open.
with open('../all_emo_dirs.pkl', 'rb') as emo_dirs_file:
    emo_dirs = pickle.load(emo_dirs_file)
print(f"available emotions: {emo_dirs.keys()}")
emo_dir = emo_dirs['surprise']

available emotions: dict_keys(['charisma', 'empathetic', 'angry', 'contempt', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'desire', 'doubt', 'empathic pain', 'envy', 'joy', 'neutral', 'romance', 'sarcasm', 'tiredness', 'triump'])


# Generate samples

In [None]:
#1
# Generate one emotion-steered sample: shift the reference speaker embedding
# along `emo_dir`, run both synthesis stages, then enhance and play the result.
from IPython.display import Audio, display

strength = 0.3  # set strength of emotion control
text = "I can't believe it.You really caught me off guard."

# NOTE(review): "audiopathref" looks like a placeholder for the path of a
# reference recording of the target speaker -- confirm and replace before running.
source_speaker_audio_path = "audiopathref"
source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)
# as_tensor accepts array-likes and existing tensors alike; torch.tensor() on a
# tensor (what the "Option 1" cell produces) emits a copy-construct warning.
edited_emb = source_emb + strength * torch.as_tensor(emo_dir, device=_device, dtype=precision)

top_p = 0.95
guidance_scale = 3.0
temperature = 1.0
text = normalize_text(text)

# first stage LLM
tokens = main(
    model=model,
    tokenizer=tokenizer,
    model_size=model_size,
    prompt=text,
    spk_emb=edited_emb,
    top_p=torch.tensor(top_p, device=_device, dtype=precision),
    guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
    temperature=torch.tensor(temperature, device=_device, dtype=precision),
)
text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])

b_speaker_embs = edited_emb.unsqueeze(0)

# second stage LLM + multi-band diffusion model
wav_files = llm_second_stage(
    texts=[text],
    encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
    speaker_embs=b_speaker_embs,
    batch_size=1,
    guidance_scale=None,
    top_p=None,
    top_k=200,
    temperature=1.0,
    max_new_tokens=None,
)
print(f"wav_files: {wav_files}")
wav_file = wav_files[0]
# The returned path has no extension. Append ".wav" by string concatenation
# because the generated name itself contains dots.
generated_raw_audio_path = str(wav_file) + ".wav"

# Write the enhanced audio next to the raw file as "<stem>_enhanced.wav".
enhanced_output_filename = Path(generated_raw_audio_path).stem + "_enhanced.wav"
final_enhanced_path = Path(generated_raw_audio_path).parent / enhanced_output_filename

print(f"Checking if raw audio exists: {generated_raw_audio_path}")
if not os.path.exists(generated_raw_audio_path):
    print(f"Error: Raw audio file for enhancement not found at {generated_raw_audio_path}")
    print("Please check if the first stage LLM correctly generated and saved the file.")
else:
    print(f"Raw audio found. Enhancing and saving to {final_enhanced_path}")
    try:
        # The enhancer is called as (input_path, output_path) and is expected
        # to write the enhanced file itself.
        enhancer(generated_raw_audio_path, str(final_enhanced_path))

        print(f"\nSaved enhanced audio to {final_enhanced_path}")
        output_path = str(final_enhanced_path)

        # Display the generated audio
        display(Audio(output_path))

    except Exception as e:
        print(f"Error during enhancement and saving: {e}")
        # Extra hints for the two failure messages seen most often.
        if "System error" in str(e) and os.path.exists(generated_raw_audio_path):
            print("This suggests an issue with writing the *enhanced* file, not reading the raw one.")
            print("Possible causes: permissions for the output directory, disk space, or a problem with the soundfile/libsndfile library itself trying to create the file.")
        elif "No such file or directory" in str(e):
             print(f"The input file '{generated_raw_audio_path}' might not actually exist yet or something else is wrong.")


 27%|█████████████████████                                                          | 540/2021 [02:09<05:54,  4.17it/s]


Time for 1st stage LLM inference: 129.76 sec total, 4.17 tokens/sec
Bandwidth achieved: 10.41 GB/s
Memory used: 8.24 GB



Non-causal batching: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 26.72it/s]


wav_files: [WindowsPath("C:/Users/anzua/Documents/emoknob/src/metavoice-src-main/outputs/synth_I_can't_believe_it.You_re_e2f2ada2-b875-4930-b049-473935c3483b")]
Checking if raw audio exists: C:\Users\anzua\Documents\emoknob\src\metavoice-src-main\outputs\synth_I_can't_believe_it.You_re_e2f2ada2-b875-4930-b049-473935c3483b.wav
Raw audio found. Enhancing and saving to C:\Users\anzua\Documents\emoknob\src\metavoice-src-main\outputs\synth_I_can't_believe_it.You_re_e2f2ada2-b875-4930-b049-473935c3483b_enhanced.wav

Saved enhanced audio to C:\Users\anzua\Documents\emoknob\src\metavoice-src-main\outputs\synth_I_can't_believe_it.You_re_e2f2ada2-b875-4930-b049-473935c3483b_enhanced.wav


In [None]:
#2
# Generate samples - English Version (GPU Optimized)
from IPython.display import Audio, display

print("=== English Audio Generation with GPU Optimization ===")

# Report whether a GPU is in use and which one.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Current GPU device: {torch.cuda.current_device()}")
    print(f"GPU name: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

try:
    # English input text and emotion strength.
    strength = 0.3  # set strength of emotion control
    text = "I can't believe it.You really caught me off guard."

    print(f"Text to synthesize: {text}")
    print(f"Emotion strength: {strength}")

    # Reference recording for the source speaker embedding.
    source_speaker_audio_path = "audiopathref"

    # Fall back to a bundled sample when the reference recording is missing.
    if not os.path.exists(source_speaker_audio_path):
        print(f"Warning: Source audio file not found: {source_speaker_audio_path}")
        alt_dir = "C:/Users/anzua/Documents/emoknob/docs/audios/simple_emotion_emotext/"
        # Sorted so the fallback choice is deterministic (os.listdir order is
        # arbitrary). A separate name avoids reusing `wav_files`, which holds
        # the generated outputs further down.
        candidate_wavs = (
            sorted(f for f in os.listdir(alt_dir) if f.endswith('.wav'))
            if os.path.isdir(alt_dir)
            else []
        )
        if not candidate_wavs:
            # Fail here with a clear message instead of handing a nonexistent
            # path to the embedding step.
            raise FileNotFoundError(
                f"No reference audio available: '{source_speaker_audio_path}' is missing "
                f"and no .wav file was found in '{alt_dir}'"
            )
        source_speaker_audio_path = os.path.join(alt_dir, candidate_wavs[0])
        print(f"Using alternative file: {source_speaker_audio_path}")

    # Speaker embedding, moved to the configured device and dtype.
    print("Calculating speaker embedding on GPU...")
    source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)

    # Apply emotion control. as_tensor accepts array-likes and tensors alike,
    # whereas torch.tensor() warns when given an existing tensor.
    print("Applying emotion control on GPU...")
    edited_emb = source_emb + strength * torch.as_tensor(emo_dir, device=_device, dtype=precision)

    # Sampling parameters.
    top_p = 0.95
    guidance_scale = 3.0
    temperature = 1.0

    # Text normalization.
    text = normalize_text(text)
    print(f"Normalized text: {text}")

    # Sampling parameters as tensors on the configured device.
    print("Setting up GPU tensors...")
    top_p_tensor = torch.tensor(top_p, device=_device, dtype=precision)
    guidance_scale_tensor = torch.tensor(guidance_scale, device=_device, dtype=precision)
    temperature_tensor = torch.tensor(temperature, device=_device, dtype=precision)

    print("Starting first stage LLM inference on GPU...")
    start = time.time()

    # first stage LLM
    tokens = main(
        model=model,
        tokenizer=tokenizer,
        model_size=model_size,
        prompt=text,
        spk_emb=edited_emb,
        top_p=top_p_tensor,
        guidance_scale=guidance_scale_tensor,
        temperature=temperature_tensor,
    )

    first_stage_time = time.time() - start
    print(f"First stage completed in {first_stage_time:.2f} seconds")

    # Decode the first-stage tokens into text ids and audio ids.
    text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])
    b_speaker_embs = edited_emb.unsqueeze(0)

    print("Starting second stage LLM + diffusion model on GPU...")
    second_stage_start = time.time()

    # second stage LLM + multi-band diffusion model
    wav_files = llm_second_stage(
        texts=[text],
        encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
        speaker_embs=b_speaker_embs,
        batch_size=1,
        guidance_scale=None,
        top_p=None,
        top_k=200,
        temperature=1.0,
        max_new_tokens=None,
    )

    second_stage_time = time.time() - second_stage_start
    total_time = time.time() - start

    print(f"Second stage completed in {second_stage_time:.2f} seconds")
    print(f"Total generation time: {total_time:.2f} seconds")
    print(f"Generated files: {wav_files}")

    # Locate the raw output; the returned path has no ".wav" extension.
    wav_file = wav_files[0]
    generated_raw_audio_path = str(wav_file) + ".wav"

    # The enhanced file goes next to the raw one as "<stem>_enhanced.wav".
    enhanced_output_filename = Path(generated_raw_audio_path).stem + "_enhanced.wav"
    final_enhanced_path = Path(generated_raw_audio_path).parent / enhanced_output_filename

    print(f"Checking raw audio file: {generated_raw_audio_path}")
    if not os.path.exists(generated_raw_audio_path):
        print(f"Error: Raw audio file not found at {generated_raw_audio_path}")
    else:
        print(f"Raw audio found. Enhancing and saving to {final_enhanced_path}")
        try:
            # Enhance the raw audio into the new file.
            enhancer(generated_raw_audio_path, str(final_enhanced_path))
            print(f"✓ Enhanced audio saved to {final_enhanced_path}")

            # Play the enhanced audio.
            display(Audio(str(final_enhanced_path)))

            # Report GPU memory usage.
            if torch.cuda.is_available():
                print(f"\nGPU Memory Usage:")
                print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
                print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

        except Exception as e:
            print(f"Error during enhancement: {e}")
            # Fall back to playing the unenhanced file.
            if os.path.exists(generated_raw_audio_path):
                print("Displaying unenhanced audio...")
                display(Audio(generated_raw_audio_path))

    print("\n=== English Audio Generation Complete ===")

except Exception as e:
    print(f"Error in English audio generation: {e}")
    import traceback
    traceback.print_exc()

    # Release cached GPU memory after a failure.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("GPU cache cleared due to error")


In [None]:
import torch

# Only query the allocator when CUDA is usable; on a CPU-only kernel the
# summary call needs a CUDA device and would fail the cell.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())
else:
    print("CUDA is not available; no GPU memory summary to show.")

In [None]:
# Japanese text check: normalization plus emotion-control preparation.
# No audio is generated in this cell.
print("=== 日本語音声合成 + 感情制御テスト ===")

# Japanese input text.
japanese_test_text = "これはテストです。この音声は怒りの感情を持っています。予測によると、2秒より長い音声を作成することは難しいかもしれません。"

try:
    # Step 1: run the text through normalize_text.
    normalized_text = normalize_text(japanese_test_text)
    print(f"Step 1: 日本語テキスト正規化成功")
    print(f"   元のテキスト: {japanese_test_text}")
    print(f"   正規化後: {normalized_text}")
    print()

    # Step 2: pick the emotion direction.
    print("Step 2: 感情制御の準備")
    strength = 0.3
    emotion = 'angry'  # use the "angry" direction
    # Keep the lookup in a cell-local name. Assigning it to the notebook-wide
    # `emo_dir` would silently switch every later generation cell from the
    # direction chosen in the "Obtain emotion direction" section to 'angry'.
    japanese_emo_dir = emo_dirs[emotion]
    print(f"   感情: {emotion}")
    print(f"   強度: {strength}")
    print()

    # Step 3: speaker embedding with the emotion offset applied.
    print("Step 3: スピーカーエンベッディングと感情制御")
    source_speaker_audio_path = "C:/Users/anzua/Documents/emoknob/docs/audios/simple_emotion_emotext/angry_0.0_1_emotext0_MSP.wav"
    source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)
    # as_tensor accepts array-likes and existing tensors alike.
    edited_emb = source_emb + strength * torch.as_tensor(japanese_emo_dir, device=_device, dtype=precision)
    print(f"   ソースエンベッディング形状: {source_emb.shape}")
    print(f"   感情制御適用後の形状: {edited_emb.shape}")
    print()

    print("日本語テキスト処理と感情制御の準備が完了しました！")
    print("注意: 実際の音声生成には、BPEトークナイザーの日本語語彙対応が必要です。")
    print("   テキスト正規化と感情制御機能は正常に動作しています。")

except Exception as e:
    print(f"エラーが発生しました: {e}")
    import traceback
    traceback.print_exc()


In [None]:
#3
# Same generation pipeline as cell #1, with GPU device and memory reporting.
from IPython.display import Audio, display

strength = 0.3  # set strength of emotion control
text = "I can't believe it.You really caught me off guard."

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Current GPU device: {torch.cuda.current_device()}")
    print(f"GPU name: {torch.cuda.get_device_name()}")

# NOTE(review): "audiopathref" looks like a placeholder for a real reference
# recording path -- confirm and replace before running.
source_speaker_audio_path = "audiopathref"
source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)
# as_tensor accepts array-likes and existing tensors alike; torch.tensor() on a
# tensor emits a copy-construct warning.
edited_emb = source_emb + strength * torch.as_tensor(emo_dir, device=_device, dtype=precision)

top_p = 0.95
guidance_scale = 3.0
temperature = 1.0
text = normalize_text(text)

# first stage LLM
tokens = main(
    model=model,
    tokenizer=tokenizer,
    model_size=model_size,
    prompt=text,
    spk_emb=edited_emb,
    top_p=torch.tensor(top_p, device=_device, dtype=precision),
    guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
    temperature=torch.tensor(temperature, device=_device, dtype=precision),
)
text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])

b_speaker_embs = edited_emb.unsqueeze(0)

# second stage LLM + multi-band diffusion model
wav_files = llm_second_stage(
    texts=[text],
    encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
    speaker_embs=b_speaker_embs,
    batch_size=1,
    guidance_scale=None,
    top_p=None,
    top_k=200,
    temperature=1.0,
    max_new_tokens=None,
)
print(f"wav_files: {wav_files}")
wav_file = wav_files[0]
# The returned path has no extension; append ".wav" by concatenation because
# the generated name itself contains dots.
generated_raw_audio_path = str(wav_file) + ".wav"

# The enhanced file goes next to the raw one as "<stem>_enhanced.wav".
enhanced_output_filename = Path(generated_raw_audio_path).stem + "_enhanced.wav"
final_enhanced_path = Path(generated_raw_audio_path).parent / enhanced_output_filename

print(f"Checking if raw audio exists: {generated_raw_audio_path}")
if not os.path.exists(generated_raw_audio_path):
    print(f"Error: Raw audio file for enhancement not found at {generated_raw_audio_path}")
    print("Please check if the first stage LLM correctly generated and saved the file.")
else:
    print(f"Raw audio found. Enhancing and saving to {final_enhanced_path}")
    try:
        enhancer(generated_raw_audio_path, str(final_enhanced_path))
        print(f"Saved enhanced audio to {final_enhanced_path}")
        output_path = str(final_enhanced_path)

        # Display the generated audio
        display(Audio(output_path))

        # GPU memory usage
        if torch.cuda.is_available():
            print(f"GPU Memory - Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"GPU Memory - Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

    except Exception as e:
        print(f"Error during enhancement and saving: {e}")
        # Extra hints for the two failure messages seen most often.
        if "System error" in str(e) and os.path.exists(generated_raw_audio_path):
            print("This suggests an issue with writing the enhanced file, not reading the raw one.")
            print("Possible causes: permissions for the output directory, disk space, or a problem with the soundfile/libsndfile library itself trying to create the file.")
        elif "No such file or directory" in str(e):
             print(f"The input file '{generated_raw_audio_path}' might not actually exist yet or something else is wrong.")


In [None]:
# High-quality Japanese TTS: extract voice characteristics from a reference recording
print("=== 高品質日本語TTS - 声質クローニング対応 ===")

import subprocess
import sys
import os
import torch
import numpy as np
import tempfile
import io
import librosa
from pathlib import Path

# Install a required library on demand.
def install_and_import(package_name, import_name=None):
    """Make sure a module is importable, pip-installing its package if needed.

    Args:
        package_name: Name passed to ``pip install``.
        import_name: Module name to import; defaults to ``package_name``.

    Returns:
        None. The module is only checked for importability; callers still
        write their own ``import`` statement.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` fails.
        ImportError: If the module still cannot be imported after installing.
    """
    import importlib

    if import_name is None:
        import_name = package_name

    try:
        importlib.import_module(import_name)
        print(f"{package_name} 既にインストール済み")
    except ImportError:
        print(f"{package_name} をインストール中...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # The import system caches directory listings, so a package installed
        # while the kernel is running may stay invisible until the caches are
        # refreshed.
        importlib.invalidate_caches()
        # Confirm the install produced an importable module, so a wrong
        # import_name fails here and not at a later import statement.
        importlib.import_module(import_name)
        print(f"{package_name} インストール完了")

# Install (if missing) and import the libraries this cell needs.
# Order matters: the install calls must run before the imports below.
install_and_import("gtts")
install_and_import("pydub")
install_and_import("librosa")

from gtts import gTTS
from pydub import AudioSegment

# Import the MetaVoice speaker encoder. METAVOICE_AVAILABLE records whether the
# import worked so later code can skip the speaker-embedding features.
try:
    from fam.quantiser.audio.speaker_encoder.model import SpeakerEncoder
    from fam.llm.inference import get_cached_embedding
    print("MetaVoice SpeakerEncoder読み込み成功")
    METAVOICE_AVAILABLE = True
except ImportError as e:
    print(f"MetaVoice SpeakerEncoder読み込み失敗: {e}")
    print("基本的な音響パラメータ調整のみ使用します")
    METAVOICE_AVAILABLE = False

class AdvancedJapaneseEmotionTTS:
    """Japanese TTS with rule-based emotion shaping and simple reference-voice matching.

    Speech is synthesized with gTTS and then post-processed with pydub: pitch,
    speed and volume are adjusted according to a per-emotion parameter table.
    When a speaker encoder is available, the mean and standard deviation of a
    reference recording's speaker embedding drive an extra pitch shift and
    low-pass filter.
    """

    def __init__(self, device: str = "auto"):
        """Select the device, obtain a speaker encoder and define emotion parameters.

        Args:
            device: ``"auto"`` picks ``"cuda"`` when available and ``"cpu"``
                otherwise; any other value is used as given.
        """
        if device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        print(f"高品質日本語感情制御TTSシステム初期化 (デバイス: {self.device})")

        # None when MetaVoice is unavailable or no encoder could be loaded.
        self.speaker_encoder = self._load_speaker_encoder()

        # Per-emotion adjustments at strength 1.0. pitch_shift is divided by 12
        # to obtain octaves, so it is in semitones. speed_change and
        # volume_change are multiplicative factors. formant_shift is defined
        # here but not applied anywhere in this class.
        self.emotion_params = {
            "happy": {
                "pitch_shift": 2.0,
                "speed_change": 1.1,
                "volume_change": 1.2,
                "formant_shift": 0.05,
                "description": "明るく元気な感情"
            },
            "sad": {
                "pitch_shift": -1.5,
                "speed_change": 1.0,
                "volume_change": 0.9,
                "formant_shift": -0.03,
                "description": "悲しい感情"
            },
            "angry": {
                "pitch_shift": 1.5,
                "speed_change": 1.3,
                "volume_change": 1.5,
                "formant_shift": 0.02,
                "description": "怒りの感情"
            },
            "neutral": {
                "pitch_shift": 0.0,
                "speed_change": 1.0,
                "volume_change": 1.0,
                "formant_shift": 0.0,
                "description": "中立的な感情"
            }
        }

        print("感情パラメータ設定完了")

    def _load_speaker_encoder(self):
        """Return a speaker encoder to use, or None when none can be obtained."""
        if not METAVOICE_AVAILABLE:
            return None

        try:
            # Reuse the encoder loaded earlier in the notebook when present.
            if 'smodel' in globals():
                print("既存のスピーカーエンコーダーを使用")
                return globals()['smodel']

            # Otherwise look for a checkpoint file near the working directory.
            candidate_paths = [
                "speaker_encoder.pt",
                "../speaker_encoder.pt",
                "../../speaker_encoder.pt"
            ]
            speaker_encoder_path = next(
                (path for path in candidate_paths if os.path.exists(path)), None
            )
            if speaker_encoder_path is None:
                print("スピーカーエンコーダーファイルが見つかりません")
                return None

            encoder = SpeakerEncoder(
                weights_fpath=speaker_encoder_path,
                device=self.device,
                eval=True,
                verbose=False
            )
            print(f"スピーカーエンコーダー読み込み成功: {speaker_encoder_path}")
            return encoder

        except Exception as e:
            print(f"スピーカーエンコーダー初期化エラー: {e}")
            return None

    @staticmethod
    def _shift_pitch(audio, semitones: float):
        """Shift pitch by re-tagging the samples at a scaled rate, then resampling back.

        The original frame rate is captured before the segment is re-tagged.
        Calling ``set_frame_rate`` with the re-tagged segment's own rate would
        request no change and leave the audio at a non-standard sample rate.
        """
        original_frame_rate = audio.frame_rate
        shifted_frame_rate = int(original_frame_rate * (2.0 ** (semitones / 12.0)))
        shifted = audio._spawn(audio.raw_data, overrides={"frame_rate": shifted_frame_rate})
        return shifted.set_frame_rate(original_frame_rate)

    def extract_speaker_features(self, reference_audio_path: str):
        """Extract a speaker embedding from a reference recording.

        Args:
            reference_audio_path: Path to the reference audio file.

        Returns:
            The embedding returned by the speaker encoder, or None when no
            encoder is available, the file is missing, or extraction fails.
        """
        if self.speaker_encoder is None:
            print("スピーカーエンコーダーが利用できません")
            return None

        try:
            # Check that the file exists.
            if not os.path.exists(reference_audio_path):
                print(f"参照音声ファイルが見つかりません: {reference_audio_path}")
                return None

            print(f"参照音声から声質特徴を抽出中: {reference_audio_path}")

            # Load at 16 kHz.
            audio, sr = librosa.load(reference_audio_path, sr=16000)

            # Trim leading and trailing silence.
            audio, _ = librosa.effects.trim(audio, top_db=20)

            # Compute the speaker embedding.
            speaker_embedding = self.speaker_encoder.embed_utterance(audio, numpy=True)

            print(f"声質特徴抽出完了 (次元: {speaker_embedding.shape})")
            return speaker_embedding

        except Exception as e:
            print(f"声質特徴抽出エラー: {e}")
            return None

    def apply_voice_characteristics(self, audio: "AudioSegment", speaker_features: np.ndarray = None):
        """Apply a simplified voice adjustment derived from a speaker embedding.

        Args:
            audio: Segment to adjust.
            speaker_features: Speaker embedding; when None the audio is
                returned unchanged.

        Returns:
            The adjusted segment. If an adjustment step raises, the segment as
            it was at that point is returned.
        """
        if speaker_features is None:
            return audio

        try:
            # Heuristic only: summary statistics of the embedding are mapped to
            # acoustic parameters; this is not real voice conversion.
            feature_mean = np.mean(speaker_features)
            feature_std = np.std(speaker_features)

            # Pitch offset in semitones.
            # NOTE(review): this lands in [-2, 2] only if the embedding mean
            # lies in [0, 1] -- not verified here.
            pitch_adjustment = (feature_mean - 0.5) * 4.0
            if abs(pitch_adjustment) > 0.1:
                audio = self._shift_pitch(audio, pitch_adjustment)
                print(f"声質ベースピッチ調整: {pitch_adjustment:.2f}")

            # Timbre: low-pass filter whose cutoff grows with the embedding spread.
            timbre_adjustment = feature_std * 2.0
            if timbre_adjustment > 0.1:
                audio = audio.low_pass_filter(int(8000 + timbre_adjustment * 2000))
                print(f"声質ベース音色調整: {timbre_adjustment:.2f}")

            return audio

        except Exception as e:
            print(f"声質適用エラー: {e}")
            return audio

    def generate_japanese_speech_with_voice_cloning(
        self,
        text: str,
        emotion: str = "neutral",
        strength: float = 1.0,
        reference_audio_path: str = None,
        output_path: str = None
    ):
        """Synthesize Japanese speech with voice matching and emotion control.

        Args:
            text: Japanese text to synthesize.
            emotion: Key into ``self.emotion_params``; unknown values fall back
                to ``"neutral"``.
            strength: Scale applied to the emotion parameters.
            reference_audio_path: Optional reference recording used for voice
                matching.
            output_path: Destination WAV path; a name under ``outputs/`` is
                generated when omitted.

        Returns:
            The output file path, or None when synthesis fails.
        """
        print(f"高品質日本語音声合成開始:")
        print(f"テキスト: {text}")
        print(f"感情: {emotion} (強度: {strength})")
        if reference_audio_path:
            print(f"参照音声: {reference_audio_path}")

        # Extract voice features from the reference recording, if one is given.
        speaker_features = None
        if reference_audio_path:
            speaker_features = self.extract_speaker_features(reference_audio_path)

        # Text normalization (whitespace trimming only).
        normalized_text = text.strip()
        print(f"正規化後: {normalized_text}")

        # Look up the emotion parameters.
        if emotion not in self.emotion_params:
            print(f"不明な感情'{emotion}'。neutralを使用します")
            emotion = "neutral"

        emotion_param = self.emotion_params[emotion]

        # Scale each parameter's deviation from "no change" by the strength.
        pitch_shift = emotion_param["pitch_shift"] * strength
        speed_change = 1.0 + (emotion_param["speed_change"] - 1.0) * strength
        volume_change = 1.0 + (emotion_param["volume_change"] - 1.0) * strength

        temp_path = None
        try:
            # 1. Base speech from gTTS.
            print("基本音声生成中...")
            tts = gTTS(text=normalized_text, lang='ja', slow=False)

            # Reserve a temp file name and close the handle before gTTS writes
            # to it by name.
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
                temp_path = temp_file.name
            tts.save(temp_path)

            # 2. Load the audio for post-processing.
            print("音響パラメータ調整中...")
            audio = AudioSegment.from_mp3(temp_path)

            # 3. Voice matching.
            if speaker_features is not None:
                print("声質特徴適用中...")
                audio = self.apply_voice_characteristics(audio, speaker_features)

            # 4. Emotion control.
            print("感情制御適用中...")

            # Pitch.
            if abs(pitch_shift) > 0.1:
                audio = self._shift_pitch(audio, pitch_shift)

            # Speed.
            if abs(speed_change - 1.0) > 0.05:
                if speed_change > 1.0:
                    audio = audio.speedup(playback_speed=speed_change)
                else:
                    # Slow down by re-tagging at a lower rate, then resample
                    # back to the original rate.
                    original_frame_rate = audio.frame_rate
                    new_frame_rate = int(original_frame_rate * speed_change)
                    audio = audio._spawn(audio.raw_data, overrides={"frame_rate": new_frame_rate})
                    audio = audio.set_frame_rate(original_frame_rate)

            # Volume. A non-positive factor has no finite dB value, so skip it.
            if volume_change > 0 and abs(volume_change - 1.0) > 0.05:
                volume_db = 20 * np.log10(volume_change)
                audio = audio + volume_db

            # 5. Output file.
            if output_path is None:
                voice_suffix = "_cloned" if speaker_features is not None else ""
                output_path = f"outputs/jp_advanced_{emotion}_{strength:.1f}{voice_suffix}.wav"

            # A bare file name has an empty dirname, and os.makedirs("") raises,
            # so only create the directory when there is one.
            output_directory = os.path.dirname(output_path)
            if output_directory:
                os.makedirs(output_directory, exist_ok=True)

            # Export as WAV.
            audio.export(output_path, format="wav")

            print(f"高品質日本語音声合成完了: {output_path}")
            return output_path

        except Exception as e:
            print(f"音声合成エラー: {e}")
            import traceback
            traceback.print_exc()
            return None

        finally:
            # Remove the temporary MP3 on failure paths too.
            if temp_path is not None and os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except OSError as cleanup_error:
                    print(f"Warning: could not remove temporary file {temp_path}: {cleanup_error}")

# Instantiate the high-quality Japanese TTS system used by the cells below.
# Any failure while constructing it is reported (message + traceback) and
# `japanese_tts_advanced` is set to None so later cells can detect the problem.
try:
    japanese_tts_advanced = AdvancedJapaneseEmotionTTS()
    print("高品質日本語TTS統合完了！")
except Exception as init_error:
    import traceback

    print(f"高品質日本語TTS初期化エラー: {init_error}")
    traceback.print_exc()
    japanese_tts_advanced = None

In [None]:
# Voice-cloning Japanese TTS test.
#
# Runs three scenarios against `japanese_tts_advanced`:
#   1. plain synthesis (no reference audio),
#   2. synthesis with a reference audio file (voice cloning),
#   3. one cloned synthesis per emotion.
# Every scenario goes through `run_tts_case`, so each result variable is always
# bound (to a path or to None) even when the synthesis call raises.
import os
import traceback


def run_tts_case(tts, label, **synthesis_kwargs):
    """Run one synthesis call and report the outcome without raising.

    Args:
        tts: Object exposing ``generate_japanese_speech_with_voice_cloning``.
        label: Prefix used for the printed success / failure / error message.
        **synthesis_kwargs: Keyword arguments forwarded unchanged to
            ``generate_japanese_speech_with_voice_cloning``.

    Returns:
        The value returned by the synthesis method when it is truthy;
        ``None`` when the method returned a falsy value or raised.
    """
    try:
        result_path = tts.generate_japanese_speech_with_voice_cloning(**synthesis_kwargs)
    except Exception as error:
        print(f"{label}エラー: {error}")
        traceback.print_exc()
        return None

    if result_path:
        print(f"{label}成功: {result_path}")
        return result_path

    print(f"{label}失敗")
    return None


print("=== 声質クローニング付き日本語TTSテスト ===")

if 'japanese_tts_advanced' in globals() and japanese_tts_advanced is not None:
    # Text to synthesise
    test_text = "こんにちは、世界！これは声質クローニング機能付きの日本語音声合成テストです。"

    # Reference audio used for voice cloning.
    # NOTE: machine-specific absolute path; adjust it for your environment.
    reference_audio_path = "C:/Users/anzua/Documents/docs_audios_simple_emotion_emotext_surprise_0.0_1_emotext0_MSP.wav"

    print(f"テストテキスト: {test_text}")
    print(f"参照音声: {reference_audio_path}")

    # Surface a missing reference file up front instead of leaving it to the
    # synthesis call to fail in a less obvious way.
    if not os.path.exists(reference_audio_path):
        print(f"警告: 参照音声ファイルが見つかりません: {reference_audio_path}")

    # 1. Plain synthesis (no reference audio)
    print("\n--- 通常の音声合成 ---")
    result_path_normal = run_tts_case(
        japanese_tts_advanced,
        "通常音声合成",
        text=test_text,
        emotion="sad",
        strength=0.8,
        reference_audio_path=None,
        output_path="outputs/test_normal_japanese.wav",
    )

    # 2. Synthesis with voice cloning
    print("\n--- 声質クローニング付き音声合成 ---")
    result_path_cloned = run_tts_case(
        japanese_tts_advanced,
        "声質クローニング音声合成",
        text=test_text,
        emotion="sad",
        strength=0.8,
        reference_audio_path=reference_audio_path,
        output_path="outputs/test_cloned_japanese.wav",
    )

    if result_path_cloned:
        # `result_path_normal` is None when step 1 failed, so this check is safe.
        normal_available = bool(result_path_normal) and os.path.exists(result_path_normal)

        # File size comparison (only when both files exist)
        if normal_available and os.path.exists(result_path_cloned):
            try:
                size_normal = os.path.getsize(result_path_normal)
                size_cloned = os.path.getsize(result_path_cloned)
                print(f"ファイルサイズ比較:")
                print(f"通常版: {size_normal:,} bytes")
                print(f"クローン版: {size_cloned:,} bytes")
            except OSError as size_error:
                print(f"ファイルサイズ比較エラー: {size_error}")

        # Audio playback; IPython is imported lazily so a missing or broken
        # display backend only skips playback instead of failing the cell.
        try:
            from IPython.display import Audio, display
            print("\n生成された音声を再生:")

            if normal_available:
                print("通常版:")
                display(Audio(result_path_normal))

            print("声質クローニング版:")
            display(Audio(result_path_cloned))

        except Exception as playback_error:
            print(f"音声再生エラー: {playback_error}")

    # 3. One cloned synthesis per emotion
    print("\n--- 複数感情テスト ---")
    emotions_to_test = ["happy", "sad", "angry", "neutral"]

    for emotion in emotions_to_test:
        output_path = f"outputs/test_emotion_{emotion}_cloned.wav"
        result_path = run_tts_case(
            japanese_tts_advanced,
            f"{emotion} 感情テスト",
            text=f"これは{emotion}の感情での音声合成テストです。",
            emotion=emotion,
            strength=0.8,
            reference_audio_path=reference_audio_path,
            output_path=output_path,
        )

    print("\n声質クローニング付き日本語TTSテスト完了！")

else:
    print("高品質日本語TTSシステムが初期化されていません。")
    print("先にセル17（高品質TTS統合セル）を実行してください。")