# 1. Environment

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
!pip install --upgrade datasets[audio] accelerate evaluate jiwer tensorboard gradio

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting datasets[audio]
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets[audio])
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets[audio])
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting rapidfuzz<4,>=3 (from j

In [None]:
!pip install transformers==4.45.2

Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.45.2


In [None]:
# Print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Sat Nov 16 03:06:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import multiprocessing

cores = multiprocessing.cpu_count()  # number of CPU cores reported for this machine
cores

2

In [None]:
import torch

# Check whether PyTorch can use CUDA in this runtime.
torch.cuda.is_available()

True

In [None]:
# Directory that the 'file' column of the transcript CSVs is resolved against;
# the preprocessed dataset cache is also stored under it.
_DATASETS_DIR = 'datasets/'
# Directory holding the per-split transcript CSVs (<split>.csv).
_TRANSCRIPT_DIR = 'transcript/'
# Root directory for training outputs (checkpoints, transcription error log).
# NOTE(review): the training cell reads _BASE_DIR but no cell in this notebook
# defined it, so a fresh kernel failed with NameError. './' is an assumed
# default that works with both ways the training cell builds paths — adjust
# it to the intended output location.
_BASE_DIR = './'

In [None]:
# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'


import torchaudio
import numpy as np

# 2. Dataset

In [None]:
from datasets import Dataset, Audio, DatasetDict, load_from_disk
import pandas as pd
import os

In [None]:
# Build one split per transcript CSV: the 'file' column is resolved against
# _DATASETS_DIR and then cast to an Audio feature with a 16 kHz sampling rate.
names = ['train', 'test']
dataset = DatasetDict()

for name in names:
    df = pd.read_csv(_TRANSCRIPT_DIR + name + '.csv')
    df['file'] = [os.path.join(_DATASETS_DIR, str(rel_path)) for rel_path in df['file']]

    split = Dataset.from_pandas(df)
    dataset[name] = split.cast_column('file', Audio(sampling_rate=16000))

In [None]:
# Display the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'text'],
        num_rows: 4200
    })
    test: Dataset({
        features: ['file', 'text'],
        num_rows: 940
    })
})

# 3. Load WhisperProcessor

In [None]:
from transformers import WhisperProcessor

# Load the Whisper processor for the chosen checkpoint, configured with
# language="ko" and task="transcribe".
model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name, language="ko", task="transcribe")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

In [None]:
def prepare_dataset(batch):
    """Turn one example into model inputs.

    Reads the decoded audio from ``batch["file"]`` and the transcript from
    ``batch["text"]``, and stores the processor outputs under
    ``batch["input_features"]`` and ``batch["labels"]``.

    Args:
        batch: a single example (mapping) with "file" and "text" entries.

    Returns:
        The same mapping with "input_features" and "labels" added.

    Raises:
        Exception: whatever the processor (or a missing key) raised; it is
            re-raised unchanged after a short diagnostic line is printed.
    """
    try:
        audio = batch["file"]
        batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
        batch["labels"] = processor(text=batch["text"]).input_ids

    except Exception as e:
        # Report only what identifies the example; printing the whole batch
        # would dump the raw audio samples into the notebook output.
        failed_audio = batch.get("file")
        failed_path = failed_audio.get("path") if isinstance(failed_audio, dict) else None
        print(f"Error processing example (path={failed_path!r}, text={batch.get('text')!r}), Error: {e}")
        raise  # bare raise keeps the original traceback

    return batch

def prepare_dataset_batched(batch):
    """Turn a batch of examples into model inputs.

    Batched counterpart of ``prepare_dataset``: ``batch["file"]`` is a list of
    decoded audio entries and ``batch["text"]`` a list of transcripts. Each
    entry is passed through the processor individually.

    Args:
        batch: mapping of column name to list of values.

    Returns:
        The same mapping with list-valued "input_features" and "labels" added.

    Raises:
        Exception: whatever the processor (or a missing key) raised; it is
            re-raised unchanged after a short diagnostic line is printed.
    """
    try:
        # Audio: one processor call per clip in the batch
        batch["input_features"] = [
            processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
            for audio in batch["file"]
        ]

        # Text: one tokenization per transcript in the batch
        batch["labels"] = [processor(text=text).input_ids for text in batch["text"]]

    except Exception as e:
        # Report only what identifies the examples; printing the whole batch
        # would dump every raw audio array into the notebook output.
        failed_audios = batch.get("file") or []
        failed_paths = [a.get("path") if isinstance(a, dict) else None for a in failed_audios]
        print(f"Error processing batch (paths={failed_paths!r}, texts={batch.get('text')!r}), Error: {e}")
        raise  # bare raise keeps the original traceback

    return batch

In [None]:
# dataset = dataset.map(prepare_dataset_batched, remove_columns=dataset['train'].column_names, batched=True, batch_size=16, num_proc=12)

In [None]:
# dataset.save_to_disk(_DATASETS_DIR + 'speech_dataset_Whisper')

In [None]:
# Reuse the preprocessed dataset when it is already cached on disk; otherwise
# build it from the raw splits once and cache it, so the notebook also runs
# from a fresh kernel on a machine that does not have the cache yet.
_PROCESSED_DIR = _DATASETS_DIR + 'speech_dataset_Whisper'

if os.path.isdir(_PROCESSED_DIR):
    dataset = load_from_disk(_PROCESSED_DIR)
else:
    dataset = dataset.map(
        prepare_dataset_batched,
        remove_columns=dataset['train'].column_names,
        batched=True,
        batch_size=16,
        num_proc=cores,  # one worker per CPU core counted in the environment section
    )
    dataset.save_to_disk(_PROCESSED_DIR)

# 4. Training & Evaluation

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperConfig


# Load the pretrained checkpoint named by model_name.
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Generation settings: Korean language, transcription task.
model.generation_config.language = "korean"
model.generation_config.task = "transcribe"

# Clear forced_decoder_ids on the generation config.
# NOTE(review): presumably so the language/task settings above drive generation — confirm.
model.generation_config.forced_decoder_ids = None

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

## Define a data collator

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollator:
    """Collate preprocessed examples into one padded batch.

    Audio features and label token ids are padded separately, through the
    processor's feature extractor and tokenizer respectively.
    """

    # Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
    processor: Any
    # Token id dropped from the front of the labels when every row starts with it.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry.

        Args:
            features: examples, each with "input_features" and "labels".

        Returns:
            The padded audio batch with "labels" added; padded label positions
            are set to -100.
        """
        # Audio side: wrap each example's features and pad them into tensors.
        audio_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad the token id sequences to a common length.
        label_items = []
        for example in features:
            label_items.append({"input_ids": example["labels"]})
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions outside the attention mask become -100 so they are ignored by the loss.
        is_padding = padded_labels.attention_mask != 1
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every row begins with the start token, strip it here (it is added again later).
        every_row_has_start = bool((labels[:, 0] == self.decoder_start_token_id).all())
        if every_row_has_start:
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance passed to the trainer; the start-token id is read from the
# loaded model's config.
data_collator = DataCollator(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## Evaluation metrics

In [None]:
import evaluate

# NOTE(review): `metric` is reassigned to evaluate.load("cer") in the training
# cell further down, so this WER metric is not the one used by compute_metrics.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
from transformers import WhisperTokenizer

# Tokenizer used by compute_metrics to decode predicted and reference token ids.
tokenizer = WhisperTokenizer.from_pretrained(model_name, language="ko", task="transcribe")

In [None]:
# Suffix appended to this run's output names (error log, checkpoint dir, saved model).
_SUFFIX = '_early_stop'

from transformers import WhisperForConditionalGeneration
import evaluate


# Set the dropout rates when loading the model
model = WhisperForConditionalGeneration.from_pretrained(model_name, attention_dropout=0.2, activation_dropout=0.2)
model.generation_config.language = "korean"
model.generation_config.task = "transcribe"

model.generation_config.forced_decoder_ids = None


# Load the CER metric
metric = evaluate.load("cer")

def compute_metrics(pred):
    """Compute CER (in percent) and exact-match accuracy for one evaluation pass.

    Every reference/prediction pair is also appended to
    ``transcription_errors<_SUFFIX>.txt`` under ``_BASE_DIR``.

    Args:
        pred: object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 on ignored positions).

    Returns:
        dict with "cer" (100 * metric value) and "accuracy" (fraction of
        predictions whose decoded text equals the reference; 0.0 when there
        are no examples).
    """
    pad_id = tokenizer.pad_token_id

    # Replace the -100 sentinel with the pad id on copies, so the arrays owned
    # by the caller are not modified in place.
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)
    # Defensive: apply the same substitution to predictions in case they carry
    # -100 padding as well (a no-op otherwise).
    pred_ids = np.where(pred.predictions == -100, pad_id, pred.predictions)

    # Decode predictions and references to strings
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Character error rate, as a percentage
    cer = 100 * metric.compute(predictions=pred_str, references=label_str)

    matches = [ref == hyp for ref, hyp in zip(label_str, pred_str)]

    # os.path.join works whether or not _BASE_DIR ends with a separator;
    # explicit UTF-8 because the transcripts are Korean text.
    log_path = os.path.join(_BASE_DIR, "transcription_errors" + _SUFFIX + '.txt')
    with open(log_path, "a", encoding="utf-8") as f:
        for ref, hyp, is_match in zip(label_str, pred_str, matches):
            f.write(f"Reference: {ref}\n")
            f.write(f"Prediction: {hyp}\n")
            f.write(f"Correct: {is_match}\n")
            f.write("\n")

    # Guard against an empty evaluation set instead of dividing by zero.
    accuracy = sum(matches) / len(label_str) if label_str else 0.0

    return {"cer": cer, "accuracy": accuracy}

from transformers import Seq2SeqTrainingArguments

# Training configuration
training_args = Seq2SeqTrainingArguments(
    # Built with os.path.join so it works whether or not _BASE_DIR ends with a
    # separator, matching how the transcription error log path treats _BASE_DIR.
    output_dir=os.path.join(_BASE_DIR, 'whisper-child' + _SUFFIX),  # directory for training outputs
    per_device_train_batch_size=8,          # batch size per device (larger values use more memory)
    gradient_accumulation_steps=2,          # accumulate gradients over 2 steps before each update
    learning_rate=1e-5,                     # step size of the weight updates
    lr_scheduler_type="cosine",
    warmup_steps=250,                       # number of learning-rate warmup steps
    weight_decay=0.1,
    max_steps=3000,                         # upper bound on the number of training steps
    gradient_checkpointing=True,            # enabled to reduce memory usage
    fp16=True,                              # 16-bit floating point for memory efficiency and speed
    eval_strategy="steps",                  # evaluate every `eval_steps` steps
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    eval_steps=250,                         # evaluation interval
    save_steps=250,                         # checkpoint interval (same as the evaluation interval)
    logging_steps=10,                       # logging interval
    report_to=["tensorboard"],              # log to TensorBoard
    load_best_model_at_end=True,            # reload the best checkpoint when training ends
    metric_for_best_model="cer",            # "cer" key returned by compute_metrics selects the best checkpoint
    greater_is_better=False,                # CER is an error rate: lower is better
    push_to_hub=False,                      # do not upload to the Hugging Face Hub when done
)

from transformers import Seq2SeqTrainer, EarlyStoppingCallback

# Trainer wiring: train on dataset['train'], evaluate on dataset['test'] with
# compute_metrics, and attach early stopping with a patience of 3.
# The processor's feature extractor is what gets passed in the `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Run training
trainer.train()


# Save the trained model
model.save_pretrained('model/' + _SUFFIX)           # output path for the trained model
processor.save_pretrained('model/' + _SUFFIX)       # save the processor alongside it so the same settings can be reused when loading the model later


max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Cer,Accuracy
250,0.0185,0.01992,1.985816,0.979787
500,0.0015,0.010002,1.241135,0.981915
750,0.0002,0.014868,1.950355,0.979787
1000,0.0,0.000754,0.035461,0.998936
1250,0.0,0.001479,0.106383,0.996809
1500,0.0,8.7e-05,0.0,1.0
1750,0.0,0.000424,0.035461,0.998936
2000,0.0,0.000508,0.141844,0.997872
2250,0.0,0.000433,0.035461,0.998936


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


[]