This notebook takes the cleaned .json transcripts and the segmented .wav files and generates, for each episode, a single .txt file with one `[.wav name]<TAB>[transcript]` line per audio segment, which is the transcript format required by Azure Custom Speech.

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in the cells
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# test on 1 episode
import os
import json

# Paths for the WAV files and JSON transcript
wav_folder = "/content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/wav_segments/IWantToBeATowkay/“I Want To Be A Towkay” Episode 1《亲家，冤家做头家》第一集"  # Folder containing WAV files
json_file_path = "/content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/ocr_cleaned/IWantToBeATowkay/“I Want To Be A Towkay” Episode 1《亲家，冤家做头家》第一集_ocr_cleaned.json"  # JSON transcript file
output_txt_file = "/content/transcript.txt"  # Output TXT file path


def normalize_transcript(text):
    """Collapse every run of whitespace in *text* to a single space.

    The output file is line-based and tab-separated, so a tab or newline
    inside a transcript would split or corrupt its record. ``None`` and
    other falsy values are treated as an empty transcript.
    """
    return " ".join(str(text or "").split())


def write_azure_transcript(transcript_data, wav_dir, txt_path):
    """Write one ``<WAV file>\\t<Transcript>`` line per usable segment.

    Args:
        transcript_data: iterable of dicts, each with a ``frame_name`` key
            (a ``.jpg`` frame name) and a ``transcript_zh`` key.
        wav_dir: folder that holds the segmented WAV files.
        txt_path: path of the TXT file to create (overwritten if present).

    Returns:
        A ``(written, missing_wavs, empty_transcripts)`` tuple: the number
        of lines written, the WAV names that were not found in *wav_dir*,
        and the WAV names skipped because their transcript was blank.
    """
    written = 0
    missing_wavs = []
    empty_transcripts = []

    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        for segment in transcript_data:
            # Each frame image shares its base name with its audio segment.
            wav_file_name = segment['frame_name'].replace('.jpg', '.wav')

            if not os.path.exists(os.path.join(wav_dir, wav_file_name)):
                missing_wavs.append(wav_file_name)
                continue

            transcript = normalize_transcript(segment.get('transcript_zh'))
            if not transcript:
                # A record with no text carries no label; leave it out.
                empty_transcripts.append(wav_file_name)
                continue

            txt_file.write(f"{wav_file_name}\t{transcript}\n")
            written += 1

    return written, missing_wavs, empty_transcripts


# The guard is true when the cell is executed in the notebook, so the cell
# still runs end to end; it only keeps the file I/O from firing on import.
if __name__ == "__main__":
    # Ensure the output directory exists
    os.makedirs(os.path.dirname(output_txt_file), exist_ok=True)

    # Load the JSON file
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        transcript_data = json.load(json_file)

    # Write the Azure-compatible TXT file
    _, missing_wavs, empty_transcripts = write_azure_transcript(
        transcript_data, wav_folder, output_txt_file
    )

    for wav_file_name in missing_wavs:
        print(f"Missing WAV file: {wav_file_name}")
    for wav_file_name in empty_transcripts:
        print(f"Empty transcript, skipped: {wav_file_name}")

    print(f"Generated TXT file: {output_txt_file}")

Generated TXT file: /content/transcript.txt


In [None]:
import os
import json

# Root directories
json_root_directory = "/content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/ocr_cleaned"
wav_root_directory = "/content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/wav_segments"

# Marker appended to an episode name to form its cleaned-OCR JSON file name.
_JSON_SUFFIX = "_ocr_cleaned"


def _clean_transcript(text):
    """Collapse whitespace runs to one space so a record stays on one line."""
    return " ".join(str(text or "").split())


def export_episode_transcript(json_file_path, wav_episode_path, output_txt_file):
    """Write the ``<WAV file>\\t<Transcript>`` TXT file for one episode.

    Segments whose WAV file is absent from *wav_episode_path*, or whose
    transcript is blank after whitespace normalization, are reported on
    stdout and left out of the file.

    Args:
        json_file_path: path of the episode's JSON transcript; expected to
            hold a list of dicts with ``frame_name`` and ``transcript_zh``.
        wav_episode_path: folder with the episode's segmented WAV files.
        output_txt_file: path of the TXT file to create (overwritten).
    """
    with open(json_file_path, 'r', encoding='utf-8') as json_handle:
        transcript_data = json.load(json_handle)

    with open(output_txt_file, 'w', encoding='utf-8') as txt_file:
        for segment in transcript_data:
            # Each frame image shares its base name with its audio segment.
            wav_file_name = segment['frame_name'].replace('.jpg', '.wav')

            if not os.path.exists(os.path.join(wav_episode_path, wav_file_name)):
                print(f"    Missing WAV file: {wav_file_name}")
                continue

            transcript = _clean_transcript(segment.get('transcript_zh'))
            if not transcript:
                print(f"    Empty transcript, skipped: {wav_file_name}")
                continue

            txt_file.write(f"{wav_file_name}\t{transcript}\n")


def export_all_dramas(json_root, wav_root):
    """Generate one TXT transcript per episode for every drama folder.

    A drama is processed only when a folder of the same name exists under
    both roots; an episode only when a WAV folder named after it exists.
    Each TXT file is written inside the episode's WAV folder.

    Args:
        json_root: root folder holding ``<drama>/<episode>_ocr_cleaned.json``.
        wav_root: root folder holding ``<drama>/<episode>/`` WAV folders.

    Returns:
        The list of TXT file paths that were generated, in processing order.
    """
    generated = []

    # Sorted so that repeated runs log and process in the same order.
    for drama_folder in sorted(os.listdir(json_root)):
        json_drama_path = os.path.join(json_root, drama_folder)
        wav_drama_path = os.path.join(wav_root, drama_folder)

        if not (os.path.isdir(json_drama_path) and os.path.isdir(wav_drama_path)):
            print(f"Skipping non-matching folder: {drama_folder}")
            continue

        print(f"Processing Drama: {drama_folder}")

        for json_name in sorted(os.listdir(json_drama_path)):
            if not json_name.endswith(".json"):
                continue

            # Strip the marker only as a trailing suffix, so an episode title
            # that happens to contain it elsewhere is left intact.
            episode_name = os.path.splitext(json_name)[0]
            if episode_name.endswith(_JSON_SUFFIX):
                episode_name = episode_name[:-len(_JSON_SUFFIX)]

            wav_episode_path = os.path.join(wav_drama_path, episode_name)
            if not os.path.isdir(wav_episode_path):
                print(f"  Missing WAV folder for Episode: {episode_name}")
                continue

            print(f"  Processing Episode: {episode_name}")

            output_txt_file = os.path.join(wav_episode_path, f"{episode_name}.txt")
            export_episode_transcript(
                os.path.join(json_drama_path, json_name),
                wav_episode_path,
                output_txt_file,
            )
            print(f"    TXT file generated: {output_txt_file}")
            generated.append(output_txt_file)

    return generated


# The guard is true when the cell is executed in the notebook, so the cell
# still runs end to end; it only keeps the batch job from firing on import.
if __name__ == "__main__":
    export_all_dramas(json_root_directory, wav_root_directory)
    print("Batch processing completed!")

Processing Drama: EatAlready
  Processing Episode: 《吃饱没？3》第九集 - “Eat Already？ 3” Episode 9
    TXT file generated: /content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/wav_segments/EatAlready/《吃饱没？3》第九集 - “Eat Already？ 3” Episode 9/《吃饱没？3》第九集 - “Eat Already？ 3” Episode 9.txt
  Processing Episode: 《吃饱没？4》 第二集 ＂Eat Already？ 4＂ Episode 2
    TXT file generated: /content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/wav_segments/EatAlready/《吃饱没？4》 第二集 ＂Eat Already？ 4＂ Episode 2/《吃饱没？4》 第二集 ＂Eat Already？ 4＂ Episode 2.txt
  Processing Episode: 《吃饱没？4》 第八集 ＂Eat Already？ 4＂ Episode 8
    TXT file generated: /content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/wav_segments/EatAlready/《吃饱没？4》 第八集 ＂Eat Already？ 4＂ Episode 8/《吃饱没？4》 第八集 ＂Eat Already？ 4＂ Episode 8.txt
  Processing Episode: 《吃饱没？3》第一集 - “Eat Already？ 3” Episode 1
    TXT file generated: /content/drive/MyDrive/NYP/Year2/AI Engineering Project/AIEP [shared]/wav_segments/EatAlready/《吃饱没？