## 路徑設定及安裝

In [None]:
# !pip install git+https://github.com/openai/whisper.git

In [None]:
import os
from pathlib import Path
from groq import Groq
from pathlib import Path
import whisperx
import torch
import json
from tqdm import tqdm  # ← 這樣匯入的是函數，而非整個模組
import difflib
import re
from pydub import AudioSegment
import noisereduce as nr
import librosa
import soundfile as sf
import numpy as np
import whisper
import re
import random
import shutil

# Root folder of the project data; the placeholder must be replaced with a real path.
base_path = Path(r"your_path")

# Source audio folder plus the two folders that each receive one half of it.
WAV_Dataset, WAV1_Dataset, WAV2_Dataset = (
    base_path / folder for folder in ("WAV", "WAV1", "WAV2")
)

# Transcript files written by the two transcription cells.
submission_task1_answer_S = base_path / "submission" / "task1_answer_S.txt"
submission_task1_answer_L = base_path / "submission" / "task1_answer_L.txt"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"device: {device}")

## 將 EN 音檔平均分成兩個資料夾（WAV1、WAV2）

In [None]:
# Create both destination folders (their parent folder must already exist).
for target_dir in (WAV1_Dataset, WAV2_Dataset):
    target_dir.mkdir(exist_ok=True)

# Collect every .wav file of the source folder in sorted order.
wav_files = sorted(WAV_Dataset.glob("*.wav"))

# Cut the sorted list in the middle; with an odd count the second half is one longer.
half = len(wav_files) // 2
wav1_files, wav2_files = wav_files[:half], wav_files[half:]

# Copy each half into its own folder, keeping the original file names.
for destination, group in ((WAV1_Dataset, wav1_files), (WAV2_Dataset, wav2_files)):
    for src in group:
        shutil.copy(str(src), destination / src.name)

# Sanity check: no file name may appear in both halves.
wav1_names = {path.name for path in wav1_files}
wav2_names = {path.name for path in wav2_files}
duplicate_files = wav1_names.intersection(wav2_names)

print(f"共分割 {len(wav_files)} 個檔案，WAV1: {len(wav1_files)}，WAV2: {len(wav2_files)}")

if not duplicate_files:
    print("✅ 無重複分類檔案")
else:
    print(f"⚠️ 發現 {len(duplicate_files)} 個重複檔案：")
    for name in sorted(duplicate_files):
        print(f"  - {name}")

## task1_answer_EN_WAV1

In [None]:
# === Load the whisperx model (forced to English) ===
# Transcribes every .wav file in WAV1_Dataset and writes one "<file id>\t<text>"
# line per successfully transcribed file to submission_task1_answer_S.
batch_size = 1

# Drop any reference to a model left over from an earlier cell so that it can be
# released before the new one is loaded, instead of keeping two copies alive.
model = None
model = whisperx.load_model(
    "large-v3",
    device=device,
    # float16 is the whisperx default and is kept on the GPU; the whisperx
    # README recommends int8 when running on the CPU.
    compute_type="float16" if device == "cuda" else "int8",
    language="en",               # force English recognition
    # NOTE(review): meant to disable VAD - confirm that the installed whisperx
    # version actually honours a "vad" key in vad_options.
    vad_options={"vad": False},
)

# Special tokens that are stripped from the transcript if they show up in it.
special_tokens = ("<|startoftranscript|>", "<|en|>", "<|transcribe|>", "<|notimestamps|>")

# Ids of the files whose transcription raised; reported once the loop is done.
failed_ids = []

# === Process the WAV files ===
# Make sure the output folder exists so that open() cannot fail on a fresh checkout.
submission_task1_answer_S.parent.mkdir(parents=True, exist_ok=True)
with open(submission_task1_answer_S, "w", encoding="utf-8") as fout:
    wav_files = sorted(WAV1_Dataset.glob("*.wav"))

    # Letting tqdm drive the loop counts every file, including the ones that
    # fail; miniters=10 keeps the "refresh every 10 files" display cadence.
    for file in tqdm(wav_files, desc="Transcribing WAV files", miniters=10):
        file_id = file.stem
        try:
            # Speech to English text
            result = model.transcribe(str(file), batch_size=batch_size, language="en")
            segments = result.get("segments", [])
            full_text = " ".join(seg["text"].strip() for seg in segments).strip()

            # Remove special tokens and the "Ġ" marker
            for token in special_tokens:
                full_text = full_text.replace(token, "")
            full_text = full_text.replace("Ġ", "").strip()

            # Write the result and flush so finished lines survive an interruption
            fout.write(f"{file_id}\t{full_text}\n")
            fout.flush()

        except Exception as e:
            # Best effort: report the failure, remember the id, go on with the next file.
            # tqdm.write prints the message without breaking the progress bar.
            tqdm.write(f"[ERROR] 處理 {file_id} 失敗：{e}")
            failed_ids.append(file_id)

print(f"\n✅ 完成轉錄：共處理 {len(wav_files)} 筆，結果已儲存至：{submission_task1_answer_S}")
if failed_ids:
    # Failed files have no line in the output file, so list them explicitly.
    print(f"⚠️ 其中 {len(failed_ids)} 筆轉錄失敗，未寫入結果：{', '.join(failed_ids)}")


## task1_answer_EN_WAV2

In [None]:
# === Load the whisperx model (forced to English) ===
# Transcribes every .wav file in WAV2_Dataset and writes one "<file id>\t<text>"
# line per successfully transcribed file to submission_task1_answer_L.
batch_size = 1

# Drop any reference to a model left over from an earlier cell so that it can be
# released before the new one is loaded, instead of keeping two copies alive.
model = None
model = whisperx.load_model(
    "large-v3",
    device=device,
    # float16 is the whisperx default and is kept on the GPU; the whisperx
    # README recommends int8 when running on the CPU.
    compute_type="float16" if device == "cuda" else "int8",
    language="en",               # force English recognition
    # NOTE(review): meant to disable VAD - confirm that the installed whisperx
    # version actually honours a "vad" key in vad_options.
    vad_options={"vad": False},
)

# Special tokens that are stripped from the transcript if they show up in it.
special_tokens = ("<|startoftranscript|>", "<|en|>", "<|transcribe|>", "<|notimestamps|>")

# Ids of the files whose transcription raised; reported once the loop is done.
failed_ids = []

# === Process the WAV files ===
# Make sure the output folder exists so that open() cannot fail on a fresh checkout.
submission_task1_answer_L.parent.mkdir(parents=True, exist_ok=True)
with open(submission_task1_answer_L, "w", encoding="utf-8") as fout:
    wav_files = sorted(WAV2_Dataset.glob("*.wav"))

    # Letting tqdm drive the loop counts every file, including the ones that
    # fail; miniters=10 keeps the "refresh every 10 files" display cadence.
    for file in tqdm(wav_files, desc="Transcribing WAV files", miniters=10):
        file_id = file.stem
        try:
            # Speech to English text
            result = model.transcribe(str(file), batch_size=batch_size, language="en")
            segments = result.get("segments", [])
            full_text = " ".join(seg["text"].strip() for seg in segments).strip()

            # Remove special tokens and the "Ġ" marker
            for token in special_tokens:
                full_text = full_text.replace(token, "")
            full_text = full_text.replace("Ġ", "").strip()

            # Write the result and flush so finished lines survive an interruption
            fout.write(f"{file_id}\t{full_text}\n")
            fout.flush()

        except Exception as e:
            # Best effort: report the failure, remember the id, go on with the next file.
            # tqdm.write prints the message without breaking the progress bar.
            tqdm.write(f"[ERROR] 處理 {file_id} 失敗：{e}")
            failed_ids.append(file_id)

print(f"\n✅ 完成轉錄：共處理 {len(wav_files)} 筆，結果已儲存至：{submission_task1_answer_L}")
if failed_ids:
    # Failed files have no line in the output file, so list them explicitly.
    print(f"⚠️ 其中 {len(failed_ids)} 筆轉錄失敗，未寫入結果：{', '.join(failed_ids)}")

## 調整task1_answer_S.txt順序

In [None]:
import csv
from pathlib import Path
import pandas as pd

# Re-orders submission_task1_answer_S by file id and rewrites it in place.

# Read the tab-separated file as raw strings:
#   - dtype=str keeps ids exactly as written (no loss of leading zeros),
#   - keep_default_na=False keeps transcripts such as "None" or "NA" as text,
#   - QUOTE_NONE treats double quotes inside a transcript as ordinary characters.
df = pd.read_csv(
    submission_task1_answer_S,
    sep="\t",
    header=None,
    names=["fid", "text"],
    dtype=str,
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
    encoding="utf-8",
)

# Report the row count and any file id that occurs more than once
print(f"總筆數：{len(df)}")
duplicates = df[df.duplicated(subset="fid", keep=False)]
if not duplicates.empty:
    print("發現重複的 fid：")
    print(duplicates)
else:
    print("無重複 fid")

# Sort by fid and write the file back.
# Ids made only of digits are compared as numbers (the order pandas' type
# inference produced before); any other ids are compared as plain strings.
fid_is_numeric = df["fid"].str.fullmatch(r"[0-9]+").all()
sort_key = pd.to_numeric if fid_is_numeric else None
df_sorted = df.sort_values(by="fid", key=sort_key)

# QUOTE_NONE writes every field verbatim so the transcripts are not re-quoted.
df_sorted.to_csv(
    submission_task1_answer_S,
    sep="\t",
    index=False,
    header=False,
    encoding="utf-8",
    quoting=csv.QUOTE_NONE,
)
print(f"\n已重新依 fid 排序並儲存至：{submission_task1_answer_S}")

## 調整task1_answer_L.txt順序

In [None]:
import csv
from pathlib import Path
import pandas as pd

# Re-orders submission_task1_answer_L by file id and rewrites it in place.

# Read the tab-separated file as raw strings:
#   - dtype=str keeps ids exactly as written (no loss of leading zeros),
#   - keep_default_na=False keeps transcripts such as "None" or "NA" as text,
#   - QUOTE_NONE treats double quotes inside a transcript as ordinary characters.
df = pd.read_csv(
    submission_task1_answer_L,
    sep="\t",
    header=None,
    names=["fid", "text"],
    dtype=str,
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
    encoding="utf-8",
)

# Report the row count and any file id that occurs more than once
print(f"總筆數：{len(df)}")
duplicates = df[df.duplicated(subset="fid", keep=False)]
if not duplicates.empty:
    print("發現重複的 fid：")
    print(duplicates)
else:
    print("無重複 fid")

# Sort by fid and write the file back.
# Ids made only of digits are compared as numbers (the order pandas' type
# inference produced before); any other ids are compared as plain strings.
fid_is_numeric = df["fid"].str.fullmatch(r"[0-9]+").all()
sort_key = pd.to_numeric if fid_is_numeric else None
df_sorted = df.sort_values(by="fid", key=sort_key)

# QUOTE_NONE writes every field verbatim so the transcripts are not re-quoted.
df_sorted.to_csv(
    submission_task1_answer_L,
    sep="\t",
    index=False,
    header=False,
    encoding="utf-8",
    quoting=csv.QUOTE_NONE,
)
print(f"\n已重新依 fid 排序並儲存至：{submission_task1_answer_L}")