#### 使用edgetts模擬錄音輸入

In [3]:
import random

import edge_tts
from edge_tts import VoicesManager


async def dynamic_voice_selection(gender="Male", language="zh"):
    voice_manager = await VoicesManager.create()
    voices = voice_manager.find(Gender=gender, Language=language)
    voice = random.choice(voices)["Name"]
    voice = 'zh-TW-YunJheNeural'
    return voice

async def lang_voice_sellection(lang):
    voice_dict = {
        'zh' : 'zh-TW-HsiaoChenNeural',
        'en' : 'en-US-AvaNeural', 
        'ja' : 'ja-JP-KeitaNeural', 
        'ko' : 'ko-KR-HyunsuNeural'
    }
    if lang not in voice_dict:
        print('lang not support at this time')
        return False
    
    return voice_dict[lang]
    

async def edgetts_generate(text, voice, output): 
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output)

text = "晚上繼續寫程式"
language = "zh"
output = "test.wav"
voice = await dynamic_voice_selection(language=language)

await edgetts_generate(text, voice, output)


In [6]:
voice_manager = await VoicesManager.create()
voices = voice_manager.find()
lang_code = {}
for voice in voices:
    if voice['Language'] not in lang_code:
        lang_code[voice['Language']] = 1

lang_code.keys()

dict_keys(['af', 'sq', 'am', 'ar', 'az', 'bn', 'bs', 'bg', 'my', 'ca', 'zh', 'hr', 'cs', 'da', 'nl', 'en', 'et', 'fil', 'fi', 'fr', 'gl', 'ka', 'de', 'el', 'gu', 'he', 'hi', 'hu', 'is', 'id', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ko', 'lo', 'lv', 'lt', 'mk', 'ms', 'ml', 'mt', 'mr', 'mn', 'ne', 'nb', 'ps', 'fa', 'pl', 'pt', 'ro', 'ru', 'sr', 'si', 'sk', 'sl', 'so', 'es', 'su', 'sw', 'sv', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'uz', 'vi', 'cy', 'zu'])

#### 對音檔進行whisper translate

- whisper can only do X -> X (transcribe) and X -> English (translate)
- 所以whisper訓練只有其他語言翻譯英文，並沒有其他語言翻譯其他語言的功能

In [1]:
from faster_whisper import WhisperModel


model_size = "large-v3"


# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")

# or run on GPU with INT8
model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

segments, info = model.transcribe("test.mp3", beam_size=5, task='translate')

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))


  from .autonotebook import tqdm as notebook_tqdm


ValueError: Invalid model size 'large-v3', expected one of: tiny.en, tiny, base.en, base, small.en, small, medium.en, medium, large-v1, large-v2, large

#### llm translate
- llama3 

In [5]:
import time

from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    AIMessagePromptTemplate,
)

def llm_translate(source_language, target_language, source_sentence):

    multilingual_prompt_dict = {
        "en" : 'How is the weather today', 
        'ja' : '今日の天気はどうですか', 
        'zh' : '今天的天氣如何', 
        'ko' : '방법 날씨가 오늘', 
    }

    if source_language not in multilingual_prompt_dict:
        print('source_language not support at this time')
        return False
    if target_language not in multilingual_prompt_dict:
        print('target_language not support at this time')
        return False

    chat_model = ChatOllama(
        base_url='http://localhost:11434',
        model='llama3',
        temperature=0,
    )
    prompt_texts = [
        """You are a helpful translator and only output the result in json format.\nEvery word should be carefully translated.\nTranslate this from <{source_language}> to <{target_language}>\n""",
        """<{source_language}>:{source_sentence_example}\n""", 
        """<{target_language}>:{target_sentence_example}\n""", 
        """<{source_language}>:{source_sentence}\n""", 
    ]

    prompt_templates = []
    for i, text in enumerate(prompt_texts):
        if i == 0:
            prompt_templates.append(SystemMessagePromptTemplate.from_template(text))
        else:
            case_number = int((i + 1) / 2)
            if i % 2 == 1:
                prompt_templates.append(
                    HumanMessagePromptTemplate.from_template(
                        f"{text}"
                    )
                )
            else:
                prompt_templates.append(
                    AIMessagePromptTemplate.from_template(
                        f"{text}"
                    )
                )

    chat_template = ChatPromptTemplate.from_messages(prompt_templates)

    prompt_messages = []

    source_sentence_example = multilingual_prompt_dict['zh']
    target_sentence_example = multilingual_prompt_dict['ja']
    prompt_message = chat_template.format_prompt(
        source_language=source_language, 
        target_language=target_language, 
        source_sentence_example=source_sentence_example, 
        target_sentence_example=target_sentence_example, 
        source_sentence=source_sentence
    )

    prompt_messages.append(prompt_message.to_messages())

    print(prompt_messages)

    start_time = time.perf_counter()
    generation = chat_model.generate(prompt_messages)
    end_time = time.perf_counter()

    print(f'time spend: {end_time - start_time}')

    single_generation = generation.generations[0]
    print(single_generation[0].text)
    print('='*20)

    return single_generation[0].text

In [6]:
source_language = 'zh'
target_language = 'ja'
source_sentence = '今天午餐要吃甚麼'
output = llm_translate(source_language, target_language, source_sentence)
translate_text = output.split(':')[1]
translate_text

[[SystemMessage(content='You are a helpful translator and only output the result in json format.\nEvery word should be carefully translated.\nTranslate this from <zh> to <ja>\n'), HumanMessage(content='<zh>:今天的天氣如何\n'), AIMessage(content='<ja>:今日の天気はどうですか\n'), HumanMessage(content='<zh>:今天午餐要吃甚麼\n')]]
time spend: 3.2050712070195004
<ja>:今日の昼食は何を食べますか


'今日の昼食は何を食べますか'

In [10]:
voice = await lang_voice_sellection(target_language)
output_file = 'test_translate.wav'
await edgetts_generate(translate_text, voice, output_file)

In [11]:
## 試一下 openvoice的TTS

#### translate wav voice conversion to source voice

In [12]:
import sys
sys.path.append("/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/OpenVoice")
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

In [13]:
ckpt_converter = '../OpenVoice/checkpoints_v2/converter'
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')



Loaded checkpoint '../OpenVoice/checkpoints_v2/converter/checkpoint.pth'
missing/unexpected keys: [] []


In [14]:
source_wav = 'test_translate.wav'
target_wav = 'test.wav'

source_se, source_audio_name = se_extractor.get_se(source_wav, tone_color_converter, vad=False)
target_se, target_audio_name = se_extractor.get_se(target_wav, tone_color_converter, vad=False)

save_path = 'test_translate_conversion.wav'

encode_message = "@MyShell"
tone_color_converter.convert(
    audio_src_path=source_wav, 
    src_se=source_se, 
    tgt_se=target_se, 
    output_path=save_path,
    message=encode_message)

Estimating duration from bitrate, this may be inaccurate


OpenVoice version: v2


Estimating duration from bitrate, this may be inaccurate


OpenVoice version: v2


  return F.conv1d(input, weight, bias, self.stride,


In [4]:
## pyanonotate audio segment

# instantiate the model
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
model = Model.from_pretrained(
        "pyannote/segmentation-3.0", 
        use_auth_token="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")


pipeline = VoiceActivityDetection(segmentation=model)

torchvision is not available - cannot save figures


In [5]:
HYPER_PARAMETERS = {
  # remove speech regions shorter than that many seconds.
  "min_duration_on": 0.0,
  # fill non-speech regions shorter than that many seconds.
  "min_duration_off": 0.0
}
pipeline.instantiate(HYPER_PARAMETERS)
vad = pipeline("test_translate_conversion.wav")

In [16]:
list(vad.get_timeline().support())

[<Segment(0.132219, 2.14034)>]

In [18]:
vad.get_timeline().support()[0].start

for timeline in vad.get_timeline().support():
    print(timeline.start, timeline.end, timeline.word)

AttributeError: 'Segment' object has no attribute 'word'