In [1]:
import numpy as np
import torch
import pandas as pd

In [2]:
import argparse
import os
from espnet2.utils.types import str2bool

In [3]:
!ls /jiawei/LDV/data/3分段/boy5/LDV	| head

boy5_plastic_LDV_001.wav
boy5_plastic_LDV_002.wav
boy5_plastic_LDV_003.wav
boy5_plastic_LDV_004.wav
boy5_plastic_LDV_005.wav
boy5_plastic_LDV_006.wav
boy5_plastic_LDV_007.wav
boy5_plastic_LDV_008.wav
boy5_plastic_LDV_009.wav
boy5_plastic_LDV_010.wav


In [4]:
# Build a script-style argument parser so the rest of the notebook can use `args.<name>`.
parser = argparse.ArgumentParser()
parser.add_argument("--src", type=str)
parser.add_argument("--dest", type=str)
parser.add_argument("--external_g2p", type=str2bool, default=True)

# Parse an explicit, empty argument list so only the declared defaults are used.
# parse_args expects a sequence of argument strings; the previous '' only worked
# because an empty string happens to iterate as zero arguments.
args = parser.parse_args([])

In [5]:
# Fill in the source and destination directories by hand
# (neither option declares a default, so both are None after parsing).
args.src = '/jiawei/LDV/data/3分段/boy5/'
args.dest = '/jiawei/LDV/data/kaldi/train'

In [6]:
!mkdir -p '/jiawei/LDV/data/kaldi/train'

In [7]:
# Input wav directory (<src>/LDV) and two output files opened for writing under <dest>.
wav_dir = os.path.join(args.src, "LDV")
# NOTE(review): both handles are opened with mode "w" (which truncates existing files) and are
# never written to or closed in the cells visible here; `text` is later rebound to a
# transcription string, which drops its handle. Consider opening them with `with open(...)`
# at the point where they are actually written — TODO confirm the intended use.
utt2spk = open(os.path.join(args.dest, "utt2spk"), "w", encoding="utf-8")
text = open(os.path.join(args.dest, "text"), "w", encoding="utf-8")

In [8]:
from pypinyin import pinyin, lazy_pinyin, Style

In [9]:
pinyin('中心')

[['zhōng'], ['xīn']]

In [10]:
pinyin('但他認為很少台灣人意識到自己離準備應對該挑戰有多遠')

[['dàn'],
 ['tā'],
 ['rèn'],
 ['wèi'],
 ['hěn'],
 ['shǎo'],
 ['tái'],
 ['wān'],
 ['rén'],
 ['yì'],
 ['shí'],
 ['dào'],
 ['zì'],
 ['jǐ'],
 ['lí'],
 ['zhǔn'],
 ['bèi'],
 ['yīng'],
 ['duì'],
 ['gāi'],
 ['tiāo'],
 ['zhàn'],
 ['yǒu'],
 ['duō'],
 ['yuǎn']]

In [11]:
pinyin('但他認為很少台灣人意識到自己離準備應對該挑戰有多遠', style=Style.TONE3)

[['dan4'],
 ['ta1'],
 ['ren4'],
 ['wei4'],
 ['hen3'],
 ['shao3'],
 ['tai2'],
 ['wan1'],
 ['ren2'],
 ['yi4'],
 ['shi2'],
 ['dao4'],
 ['zi4'],
 ['ji3'],
 ['li2'],
 ['zhun3'],
 ['bei4'],
 ['ying1'],
 ['dui4'],
 ['gai1'],
 ['tiao1'],
 ['zhan4'],
 ['you3'],
 ['duo1'],
 ['yuan3']]

In [12]:
import whisper


In [13]:
# Load the Whisper "large" checkpoint; `model` is used by the transcription cells below.
model = whisper.load_model("large")

In [14]:
# Alternative input kept for reference (a file from the boy5 "target" folder):
# audio = whisper.load_audio('/jiawei/LDV/data/3分段/boy5/target/boy5_pl_clean_001.wav')
# Load one AISHELL-3 training utterance as a waveform array.
audio = whisper.load_audio('/jiawei/espnet/egs2/aishell3/tts1/downloads/train/wav/SSB0005/SSB00050001.wav')
# Pad or trim the waveform to Whisper's fixed input length (the next cell reports shape (480000,)).
# NOTE(review): `audio` is not consumed by any later cell visible here; transcription is run on file paths.
audio = whisper.pad_or_trim(audio)

In [15]:
audio.shape

(480000,)

In [19]:
# Transcribe a file from the boy5 "target" folder with the language set to Chinese,
# then show the recognised text.
result = model.transcribe('/jiawei/LDV/data/3分段/boy5/target/boy5_pl_clean_001.wav', language='Chinese')
print(result['text'])

这学期学校有书法比赛


In [39]:
# Transcribe the first LDV recording of boy5 with the language set to Chinese.
file_name = '/jiawei/LDV/data/3分段/boy5/LDV/boy5_plastic_LDV_001.wav'
result = model.transcribe(file_name, language='Chinese')
# NOTE(review): `text` was bound to an open file handle in an earlier cell; rebinding it
# here drops that handle without closing it.
text = result['text'] 
print(text)

这学期学校有书法比赛


In [40]:
# Convert the transcription to tone-numbered pinyin (Style.TONE3).
result2 = pinyin(result['text'], style=Style.TONE3)
# NOTE(review): leftover debug line; `result` is indexed like a dict above, so joining it
# would not print the pinyin — presumably `result2` was meant.
# print(' '.join(result))

In [41]:
result2

[['zhe4'],
 ['xue2'],
 ['qi1'],
 ['xue2'],
 ['xiao4'],
 ['you3'],
 ['shu1'],
 ['fa3'],
 ['bi3'],
 ['sai4']]

In [45]:
wav = os.path.basename(file_name)

# Build "<wav> <char> <pinyin> <char> <pinyin> ..." while keeping characters and syllables
# aligned. pypinyin returns a run of consecutive non-Chinese characters (punctuation, Latin
# letters, spaces, ...) unchanged as ONE item, so pairing characters and items one-to-one
# with zip() shifts every pair after such a run. Pass-through runs are recognised by the
# item being identical to the text at the current position; they are skipped because they
# carry no pinyin.
pairs = []
pos = 0
for pinyin_list in result2:
    if pos >= len(text):
        break
    syllable = ''.join(pinyin_list)
    if syllable and text.startswith(syllable, pos):
        # Non-Chinese run copied through verbatim: consume it without emitting a pair.
        pos += len(syllable)
        continue
    pairs.append(f"{text[pos]} {syllable}")
    pos += 1
combined = ' '.join([wav, *pairs]).strip()

In [46]:
combined

'boy5_plastic_LDV_001.wav 这 zhe4 学 xue2 期 qi1 学 xue2 校 xiao4 有 you3 书 shu1 法 fa3 比 bi3 赛 sai4'

In [51]:
# Import os for file and directory operations
import os
# Import shutil for copying files
import shutil

# Root directory of the original data
source_root = '/jiawei/LDV/data/3分段'

# Root directory of the generated dataset
target_root = '/jiawei/dataset'

# Fraction of each sub-directory's wav files that goes to the training set
train_ratio = 0.8

# Walk the sub-directories of the source root (e.g. boy3, boy5, ...).
# os.listdir returns entries in arbitrary order, so both listings are sorted to make the
# train/test membership reproducible from one run to the next.
for sub_dir in sorted(os.listdir(source_root)):
    # Only the LDV folder of each sub-directory is used
    sub_path = os.path.join(source_root, sub_dir, 'LDV')
    if not os.path.isdir(sub_path):
        continue

    wav_files = sorted(f for f in os.listdir(sub_path) if f.endswith('.wav'))

    # The first `train_ratio` share (rounded down) is the training set, the rest is the test set
    train_num = int(len(wav_files) * train_ratio)
    splits = {
        'train': wav_files[:train_num],
        'test': wav_files[train_num:],
    }

    for split_name, split_files in splits.items():
        # Destination layout: <target_root>/<split>/wav/<sub_dir>/<file>.wav
        split_dir = os.path.join(target_root, split_name, 'wav', sub_dir)
        os.makedirs(split_dir, exist_ok=True)
        for wav_name in split_files:
            shutil.copy(os.path.join(sub_path, wav_name), os.path.join(split_dir, wav_name))


In [1]:
# 導入os模塊，用於操作文件和目錄
import os
# 導入whisper模塊，用於語音轉文字
import whisper
# 導入pypinyin模塊，用於漢字轉拼音
from pypinyin import pinyin, Style

# Root directory of the wav dataset (its sub-directories, e.g. train and test, are walked below)
wav_root = '/jiawei/dataset'
# Whisper large-v2 model, loaded once and shared by every transcription call
whisper_model = whisper.load_model('large-v2')

# Transcribe a wav file and return its text together with the matching pinyin
def transcribe_wav(wav_file_path, language='Chinese'):
    """Transcribe one wav file with Whisper and convert the text to pinyin.

    Args:
        wav_file_path: Path of the wav file to transcribe.
        language: Language passed to Whisper. Defaults to 'Chinese' because the
            transcription is always converted to pinyin afterwards; pass None to
            let Whisper detect the language itself.

    Returns:
        A tuple (text, pinyin_list): the transcription string from Whisper's result and
        the output of pypinyin's `pinyin` for it in Style.TONE3 (tone-numbered) form.
    """
    # Use the module-level Whisper model (large-v2) with an explicit language
    result = whisper_model.transcribe(wav_file_path, language=language)
    text = result['text']
    # Tone-numbered pinyin, e.g. 'zhe4'
    pinyin_list = pinyin(text, style=Style.TONE3)
    return text, pinyin_list

# Combine the text and its pinyin into one formatted line
def combine_text_pinyin(wav_file, text, pinyin_list):
    """Format one line as "<wav_file> <char> <pinyin> <char> <pinyin> ...".

    Args:
        wav_file: Name written at the start of the line.
        text: The transcription string.
        pinyin_list: Items produced by pypinyin for `text`; each item is a list of
            strings that is joined into one syllable.

    Returns:
        The combined line with surrounding whitespace stripped.

    pypinyin returns a run of consecutive non-Chinese characters (punctuation, Latin
    letters, digits, spaces) unchanged as a single item. Pairing characters and items
    one-to-one therefore shifts every pair after such a run. Pass-through runs are
    recognised by the item being identical to the text at the current position, and
    they are skipped because they have no pinyin. For text without such runs the
    output is the same as a plain character-by-character pairing.
    """
    pairs = []
    pos = 0
    for pinyin_item in pinyin_list:
        if pos >= len(text):
            break
        syllable = ''.join(pinyin_item)
        if syllable and text.startswith(syllable, pos):
            # Non-Chinese run copied through verbatim: consume it without emitting a pair
            pos += len(syllable)
            continue
        pairs.append(f"{text[pos]} {syllable}")
        pos += 1
    return ' '.join([wav_file, *pairs]).strip()

# Walk the sub-directories of a wav folder and write every transcription to the content file
def process_wav_dir(wav_dir, content_file):
    """Transcribe every .wav file one level below `wav_dir` and write one line per file.

    Args:
        wav_dir: Directory whose sub-directories (e.g. boy3, boy5, ...) contain wav files.
        content_file: Writable text file object; each result line is written to it
            followed by a newline.

    Directory listings are sorted because os.listdir returns entries in arbitrary
    order; sorting keeps the line order of the output stable between runs.
    """
    for wav_sub_dir in sorted(os.listdir(wav_dir)):
        wav_sub_path = os.path.join(wav_dir, wav_sub_dir)
        # Plain files directly inside wav_dir are ignored
        if not os.path.isdir(wav_sub_path):
            continue
        wav_files = sorted(f for f in os.listdir(wav_sub_path) if f.endswith('.wav'))
        for wav_file in wav_files:
            wav_file_path = os.path.join(wav_sub_path, wav_file)
            # Transcribe the file, then format it with its bare file name as the line prefix
            text, pinyin_list = transcribe_wav(wav_file_path)
            combined = combine_text_pinyin(wav_file, text, pinyin_list)
            content_file.write(combined + '\n')

# Walk the sub-directories of the dataset root (e.g. train, test, ...) and write one
# content.txt per sub-directory.
for sub_dir in sorted(os.listdir(wav_root)):
    sub_path = os.path.join(wav_root, sub_dir)
    wav_path = os.path.join(sub_path, 'wav')
    # Check for the wav folder BEFORE opening content.txt: opening with mode 'w' truncates
    # the file, so an entry without a wav folder would otherwise lose its existing
    # content.txt and then fail when the missing folder is listed.
    if not os.path.isdir(wav_path):
        continue
    content_path = os.path.join(sub_path, 'content.txt')
    with open(content_path, 'w', encoding='utf-8') as f:
        # Transcribe everything below <sub_dir>/wav and write the lines to content.txt
        process_wav_dir(wav_path, f)


<_io.TextIOWrapper name='/jiawei/dataset/train/content.txt' mode='w' encoding='utf-8'>
