In [10]:
from pathlib import Path
import json

def create_metadata(data_root, split):
    metadata = []
    
    audio_dir = Path(data_root) / f"{split}_audio"
    json_dir = Path(data_root) / f"{split}_json"

    for audio_file in audio_dir.glob("*.wav"):
        json_path = json_dir / f"{audio_file.stem}.json"
        
        if json_path.exists():
            metadata.append({
                "audio_path": str(audio_file.relative_to(data_root)),
                "json_path": str(json_path.relative_to(data_root))
            })
    
    # metadata.jsonl 저장
    with open(f"{data_root}/{split}_metadata.jsonl", "w") as f:
        for item in metadata:
            f.write(json.dumps(item) + "\n")

# 실행 예시
data_root = "data"
create_metadata(data_root, "train")
create_metadata(data_root, "val")


In [29]:
import json
from datasets import load_dataset, Audio

def preprocess(example):
    # 절대 경로 생성
    base_dir = "data"  # 데이터 루트 디렉토리 지정
    audio_path = os.path.join(base_dir, example["audio_path"])
    json_path = os.path.join(base_dir, example["json_path"])
    
    # 오디오 로드 (16kHz)
    example["audio"] = Audio(sampling_rate=16000).decode_example({"path": audio_path, "bytes": None})
    
    # JSON 로드 및 텍스트 추출
    with open(json_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)
        
    # 실제 학습용 텍스트(정답) 필드 선택
    example["text"] = metadata["transcription"]["AnswerLabelText"]
    return example

dataset = load_dataset(
    "json",
    data_files={
        "train": "data/train_metadata.jsonl",
        "validation": "data/val_metadata.jsonl"
    }
)
# 전처리 적용
dataset = dataset.map(preprocess)


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1430/1430 [00:13<00:00, 106.38 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1402/1402 [00:10<00:00, 136.44 examples/s]


In [31]:

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

model_id = "ghost613/whisper-large-v3-turbo-korean"
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
).to(device)

processor = AutoProcessor.from_pretrained(model_id)


In [33]:
def prepare_features(batch):
    # 오디오 특징 추출
    audio = batch["audio"]
    sentence = batch["text"]
    inputs = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=sentence,
        return_tensors="pt"
    )
    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = inputs.labels[0]
    return batch

dataset = dataset.map(prepare_features)


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1430/1430 [04:00<00:00,  5.96 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1402/1402 [03:14<00:00,  7.20 examples/s]


In [34]:
from transformers import DataCollatorSpeechSeq2SeqWithPadding
from evaluate import load

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

wer_metric = load("wer")
cer_metric = load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}



ImportError: cannot import name 'DataCollatorSpeechSeq2SeqWithPadding' from 'transformers' (/usr/local/lib/python3.8/dist-packages/transformers/__init__.py)

In [None]:
from evaluate import load

wer_metric = load("wer")
cer_metric = load("cer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # 외국인 발화 특화 정규화
    pred_str = [s.replace("+", " ").strip() for s in pred_str]
    label_str = [s.replace("+", " ").strip() for s in label_str]

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    
    return {"wer": wer, "cer": cer}


SyntaxError: invalid syntax (1724963975.py, line 1)

In [None]:
trainer.save_model("./whisper-finetuned-ktf")

In [None]:
import torch

# Load your fine-tuned model
model = WhisperForConditionalGeneration.from_pretrained("./whisper-finetuned-ktf")
processor = WhisperProcessor.from_pretrained(model_name)

# Load an audio file
audio_input = processor.feature_extractor("path/to/audio.wav", return_tensors="pt").input_features

# Generate transcription
with torch.no_grad():
    predicted_ids = model.generate(audio_input)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(transcription)