In [None]:
# EmoKnob Demo - Modified for Japanese Text Support

**MODIFIED**: Added Japanese text normalization testing and enhanced audio generation  
**Date**: 2025-07-13  
**Purpose**: Test Japanese character processing (hiragana, katakana, kanji) for emotion-controlled TTS  
**Changes**: 
- Added Cell 11: Japanese text processing and emotion control testing
- Modified emotion control to work with Japanese text input
- Tested normalize_text function with Japanese characters
- Enhanced "Generate samples" cell (Cell 9) with improved error handling and path management
- Added detailed file existence checks and enhanced audio processing workflow
- Improved audio enhancement pipeline with better output path handling

---


# Load Model

In [1]:
import sys
import os
from pathlib import Path

# Change the working directory to 'src/metavoice-src-main' so that the relative
# paths used by later cells resolve from there.
#
# os.chdir is used instead of the `%cd` magic: os.chdir raises OSError on failure,
# so the warning branch below is actually reachable (the magic only prints its
# error). The "already there" guard keeps the cell idempotent when it is re-run.
repo_subdir = Path("src") / "metavoice-src-main"

if Path.cwd().parts[-len(repo_subdir.parts):] == repo_subdir.parts:
    print(f"Already inside {repo_subdir.as_posix()}; not changing directory.")
else:
    try:
        os.chdir(repo_subdir)
    except OSError as e:
        print(f"Warning: Could not change directory to src/metavoice-src-main. Error: {e}")
        print("Please ensure this directory exists and is correctly specified relative to your notebook's starting directory.")


# Read back the working directory (after the change attempt above).
current_working_directory = os.getcwd()

# Show it for confirmation.
print(f"Current working directory after %cd: {current_working_directory}")

# current_working_directory is expected to be the parent directory of the `fam`
# package, so add it to sys.path to make `import fam...` work.
if current_working_directory not in sys.path:
    sys.path.append(current_working_directory)
    print(f"Added {current_working_directory} to sys.path.")
else:
    print(f"{current_working_directory} is already in sys.path.")

# Optional debugging aid: inspect the contents of sys.path.
# print("\nUpdated sys.path:")
# for p in sys.path:
#    print(p)

C:\Users\anzua\Documents\emoknob\src\metavoice-src-main
Current working directory after %cd: C:\Users\anzua\Documents\emoknob\src\metavoice-src-main
Added C:\Users\anzua\Documents\emoknob\src\metavoice-src-main to sys.path.


In [None]:
#%cd src/metavoice-src-main

import os

import shutil
import tempfile
import time
from pathlib import Path

import librosa
import torch
from huggingface_hub import snapshot_download

from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.decoders import EncodecDecoder
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
    EncodecDecoder,
    InferenceConfig,
    Model,
    TiltedEncodec,
    TrainedBPETokeniser,
    get_cached_embedding,
    get_cached_file,    
    get_enhancer,
)
from fam.llm.utils import (
    check_audio_file,
    get_default_dtype,
    get_device,
    normalize_text,
)

# --- Run configuration --------------------------------------------------------
model_name = "metavoiceio/metavoice-1B-v0.1"
seed = 1337
output_dir = "outputs"
_device = 'cuda:0'
# Both stage adapters below are constructed with this end-of-audio token id.
END_OF_AUDIO_TOKEN = 1024

# --- Checkpoints and first-stage adapter --------------------------------------
_dtype = get_default_dtype()
_model_dir = snapshot_download(repo_id=model_name)
first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=END_OF_AUDIO_TOKEN)
os.makedirs(output_dir, exist_ok=True)

# --- Second-stage model and enhancer ------------------------------------------
second_stage_ckpt_path = "{}/second_stage.pt".format(_model_dir)
second_stage_settings = dict(
    ckpt_path=second_stage_ckpt_path,
    num_samples=1,
    seed=seed,
    device=_device,
    dtype=_dtype,
    compile=False,
    init_from="resume",
    output_dir=output_dir,
)
config_second_stage = InferenceConfig(**second_stage_settings)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=END_OF_AUDIO_TOKEN)
llm_second_stage = Model(
    config_second_stage,
    TrainedBPETokeniser,
    EncodecDecoder,
    data_adapter_fn=data_adapter_second_stage.decode,
)
enhancer = get_enhancer("df")

# --- First-stage model and speaker encoder ------------------------------------
# Map the dtype name to the torch dtype; any other name raises KeyError here.
torch_dtypes_by_name = {"float16": torch.float16, "bfloat16": torch.bfloat16}
precision = torch_dtypes_by_name[_dtype]
checkpoint_root = Path(_model_dir)
model, tokenizer, smodel, model_size = build_model(
    precision=precision,
    checkpoint_path=checkpoint_root / "first_stage.pt",
    spk_emb_ckpt_path=checkpoint_root / "speaker_encoder.pt",
    device=_device,
    compile=False,
    compile_prefill=False,
)

# Obtain emotion direction

Option 1: provide your own audio files as (neutral, empathetic) pairs

In [None]:
# Put in your own audio files as (neutral, empathetic) pairs
audio_pairs = [
    ('/proj/afosr/metavoice/misc_audio_files/neutral_oprah.wav', '/proj/afosr/metavoice/misc_audio_files/oprah_empathetic_concatenated.wav'),
    ('/proj/afosr/metavoice/misc_audio_files/neutral_vt2NjqXKzyA.wav', '/proj/afosr/metavoice/misc_audio_files/vt2NjqXKzyA_empathetic_concatenated.wav')
]

# TODO: plug in audio file for voice cloning.
# An empty placeholder plus an explicit check keeps the cell parseable and fails
# with a clear message instead of a SyntaxError.
source_speaker_audio_path = ""
if not source_speaker_audio_path:
    raise ValueError("Set source_speaker_audio_path to the audio file used for voice cloning before running this cell.")


def _load_embedding(audio_path):
    """Return the cached speaker embedding for `audio_path`, moved to `_device` with dtype `precision`."""
    return get_cached_embedding(audio_path, smodel).to(device=_device, dtype=precision)


source_emb = _load_embedding(source_speaker_audio_path)

# One (neutral embedding, emotional embedding) tuple per audio pair.
speaker_pair_embs = [
    (_load_embedding(neutral_audio_path), _load_embedding(emotional_audio_path))
    for neutral_audio_path, emotional_audio_path in audio_pairs
]

# Per-pair direction = emotional - neutral, scaled to unit norm along the last dim.
emo_dirs = [emotional_emb - neutral_emb for neutral_emb, emotional_emb in speaker_pair_embs]
emo_dirs = [emo_dir / torch.linalg.norm(emo_dir, dim=-1, keepdim=True) for emo_dir in emo_dirs]

# Average the unit directions, then renormalize the mean to unit norm.
emo_dir = sum(emo_dirs) / len(emo_dirs)
emo_dir = emo_dir / torch.linalg.norm(emo_dir, dim=-1, keepdim=True)

Option 2: use our pre-computed emotion directions

In [3]:
import pickle

# Load the pre-computed emotion directions. The `with` block closes the file
# handle as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use a
# pickle file obtained from a trusted source.
with open('../all_emo_dirs.pkl', 'rb') as emo_dirs_file:
    emo_dirs = pickle.load(emo_dirs_file)
print(f"available emotions: {emo_dirs.keys()}")
# Select the direction used by the generation cell.
emo_dir = emo_dirs['sad']

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/proj/afosr/metavoice/emoknob/src/metavoice-src-main
available emotions: dict_keys(['charisma', 'empathetic', 'angry', 'contempt', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'desire', 'doubt', 'empathic pain', 'envy', 'joy', 'neutral', 'romance', 'sarcasm', 'tiredness', 'triump'])


# Generate samples

In [None]:
strength = 0.3  # set strength of emotion control
# Original English text
#text = "This is a test.This voice carries the emotion of anger.It may be that it is not possible to create audio longer than two seconds according to the prediction."

# Japanese text
text = "これはテストです。この音声は怒りの感情を持っています。予測によると、2秒より長い音声を作成することは難しいかもしれません。"

# TODO: set this to the audio file used for voice cloning.
# An empty placeholder plus an explicit check keeps the cell parseable and fails
# with a clear message instead of a SyntaxError.
source_speaker_audio_path = ""
if not source_speaker_audio_path:
    raise ValueError("Set source_speaker_audio_path to the audio file used for voice cloning before running this cell.")
source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)
# torch.as_tensor handles both cases: emo_dir is already a tensor when it comes
# from the audio-pair option (torch.tensor would warn about copy-constructing),
# and presumably array-like data when loaded from the pickle -- TODO confirm.
edited_emb = source_emb + strength * torch.as_tensor(emo_dir, device=_device, dtype=precision)


top_p = 0.95
guidance_scale = 3.0
temperature = 1.0
text = normalize_text(text)

# Timestamp taken before generation; not read again in this cell.
start = time.time()
# first stage LLM
tokens = main(
    model=model,
    tokenizer=tokenizer,
    model_size=model_size,
    prompt=text,
    spk_emb=edited_emb,
    top_p=torch.tensor(top_p, device=_device, dtype=precision),
    guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
    temperature=torch.tensor(temperature, device=_device, dtype=precision),
)
text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])

b_speaker_embs = edited_emb.unsqueeze(0)

# second stage LLM + multi-band diffusion model
wav_files = llm_second_stage(
    texts=[text],
    encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
    speaker_embs=b_speaker_embs,
    batch_size=1,
    guidance_scale=None,
    top_p=None,
    top_k=200,
    temperature=1.0,
    max_new_tokens=None,
)
print(f"wav_files: {wav_files}")
wav_file = wav_files[0]
# The returned entry has no extension; the file checked below is "<entry>.wav".
generated_raw_audio_path = str(wav_file) + ".wav"

# Output path for the enhanced audio: same directory as the raw file, with
# "_enhanced.wav" appended to the raw file's stem.
enhanced_output_filename = Path(generated_raw_audio_path).stem + "_enhanced.wav"
final_enhanced_path = Path(generated_raw_audio_path).parent / enhanced_output_filename


print(f"Checking if raw audio exists: {generated_raw_audio_path}")
if not os.path.exists(generated_raw_audio_path):
    print(f"Error: Raw audio file for enhancement not found at {generated_raw_audio_path}")
    # The wav file is produced by the llm_second_stage call above, so point the
    # user at the second stage (the message used to blame the first stage).
    print("Please check if the second stage LLM correctly generated and saved the file.")
else:
    print(f"Raw audio found. Enhancing and saving to {final_enhanced_path}")
    try:
        # Assumes the enhancer interface is (input_path, output_path) and that it
        # writes the enhanced audio directly to output_path -- TODO confirm.
        enhancer(generated_raw_audio_path, str(final_enhanced_path))

        print(f"\nSaved enhanced audio to {final_enhanced_path}")
        output_path = str(final_enhanced_path)  # path used for display

        # Display the generated audio
        from IPython.display import Audio, display
        display(Audio(output_path))

    except Exception as e:
        print(f"Error during enhancement and saving: {e}")
        # Extra debugging hints based on the error text.
        if "System error" in str(e) and os.path.exists(generated_raw_audio_path):
            print("This suggests an issue with writing the *enhanced* file, not reading the raw one.")
            print("Possible causes: permissions for the output directory, disk space, or a problem with the soundfile/libsndfile library itself trying to create the file.")
        elif "No such file or directory" in str(e):
             print(f"The input file '{generated_raw_audio_path}' might not actually exist yet or something else is wrong.")


In [None]:
import torch
# Fetch PyTorch's CUDA memory summary text, then print it.
cuda_memory_report = torch.cuda.memory_summary()
print(cuda_memory_report)

In [None]:
# Japanese text-to-speech preparation test (with emotion control).
# Note: this cell reassigns strength, emo_dir, source_speaker_audio_path,
# source_emb and edited_emb, which the "Generate samples" cell also uses.
print("=== 日本語音声合成 + 感情制御テスト ===")

# Japanese test text
japanese_test_text = "これはテストです。この音声は怒りの感情を持っています。予測によると、2秒より長い音声を作成することは難しいかもしれません。"

try:
    # Step 1: text normalization test
    normalized_text = normalize_text(japanese_test_text)
    print("Step 1: 日本語テキスト正規化成功")
    print(f"   元のテキスト: {japanese_test_text}")
    print(f"   正規化後: {normalized_text}")
    print()

    # Step 2: prepare emotion control
    print("Step 2: 感情制御の準備")
    strength = 0.3
    emotion = 'angry'  # use the "angry" emotion direction
    # Indexing by emotion name requires emo_dirs to be the dict loaded from the
    # pre-computed pickle (Option 2), not the list built in Option 1.
    emo_dir = emo_dirs[emotion]
    print(f"   感情: {emotion}")
    print(f"   強度: {strength}")
    print()

    # Step 3: speaker embedding setup
    print("Step 3: スピーカーエンベッディングと感情制御")
    # Path relative to the working directory set by the first cell
    # (<repo>/src/metavoice-src-main) instead of a per-user absolute path, so the
    # notebook is not tied to one machine.
    source_speaker_audio_path = "../../docs/audios/simple_emotion_emotext/angry_0.0_1_emotext0_MSP.wav"
    source_emb = get_cached_embedding(source_speaker_audio_path, smodel).to(device=_device, dtype=precision)
    # torch.as_tensor accepts an existing tensor as well as array-like data,
    # without the copy-construct warning torch.tensor emits for tensor input.
    edited_emb = source_emb + strength * torch.as_tensor(emo_dir, device=_device, dtype=precision)
    print(f"   ソースエンベッディング形状: {source_emb.shape}")
    print(f"   感情制御適用後の形状: {edited_emb.shape}")
    print()

    print("日本語テキスト処理と感情制御の準備が完了しました！")
    print("注意: 実際の音声生成には、BPEトークナイザーの日本語語彙対応が必要です。")
    print("   テキスト正規化と感情制御機能は正常に動作しています。")

except Exception as e:
    # Report the failure with a full traceback rather than stopping the notebook.
    print(f"エラーが発生しました: {e}")
    import traceback
    traceback.print_exc()
