<a href="https://colab.research.google.com/github/KTFplus/KTFfintune/blob/master/huggingspace_save.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchaudio transformers huggingface_hub




In [None]:
import os
import json
from pathlib import Path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

def save_model_for_huggingface(checkpoint_path: str, output_dir: str,
                               processor_path: "str | None" = None):
    """Convert a training checkpoint into a Hugging Face model directory.

    Loads the Whisper weights from ``checkpoint_path`` and the processor from
    ``processor_path``, writes both to ``output_dir`` with ``save_pretrained``
    and adds a small ``training_info.json`` next to them.

    Args:
        checkpoint_path: Directory of the checkpoint whose weights are loaded.
        output_dir: Destination directory for the converted model files.
        processor_path: Directory the processor is loaded from. Defaults to
            the parent directory of ``checkpoint_path``. Previously this was
            read from an undeclared global named ``checkpoint_dir``, so the
            function failed with ``NameError`` unless that global existed.
    """
    if processor_path is None:
        # Same location the notebook's ``checkpoint_dir`` global pointed at:
        # the run directory that contains the ``checkpoint-*`` folders.
        processor_path = str(Path(checkpoint_path).parent)

    model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path)
    processor = WhisperProcessor.from_pretrained(processor_path)
    print("모델 로드 성공!")

    # Save the files required by the Hub
    model.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)

    # Attach training information (optional)
    # NOTE(review): "training_steps" is hard-coded and is not derived from the
    # checkpoint being converted — confirm it matches the checkpoint in use.
    training_info = {
        "training_steps": 2500,
        "fine_tuned": "Korean",
        "dataset": "Custom Korean Dataset"
    }
    with open(Path(output_dir) / "training_info.json", 'w', encoding='utf-8') as f:
        json.dump(training_info, f)

def upload_to_huggingface(model_path: str, repo_name: str, token: "str | None" = None):
    """Upload a local model directory to the Hugging Face Hub.

    Creates the repository if it does not exist yet, then uploads every file
    under ``model_path`` in a single commit.

    Args:
        model_path: Local directory containing the files to upload.
        repo_name: Target repository id, e.g. ``"user/model"``.
        token: Hugging Face access token. When ``None`` the value is simply
            forwarded as ``None`` and huggingface_hub resolves credentials on
            its own (e.g. a token stored by ``huggingface_hub.login``).

    Raises:
        Exception: Whatever ``create_repo`` or ``upload_folder`` raises; a
            repository-creation failure is logged before being re-raised.
    """
    import logging

    from huggingface_hub import HfApi, create_repo

    # Resolved locally so the function does not depend on a module-level
    # ``logger`` that is only defined in a later notebook cell.
    log = logging.getLogger(__name__)

    api = HfApi()
    try:
        create_repo(repo_name, token=token, exist_ok=True)
    except Exception as e:
        log.error(f"Repository creation failed: {e}")
        raise

    api.upload_folder(
        folder_path=model_path,
        repo_id=repo_name,
        commit_message="Initial upload of Korean fine-tuned Whisper model",
        repo_type="model",
        # Use the same credentials for the upload as for repo creation.
        token=token,
    )

if __name__ == "__main__":
    # 1. Convert the checkpoint
    # checkpoint_dir is the run directory the processor is loaded from;
    # checkpoint_path is one checkpoint sub-directory inside it (model weights).
    checkpoint_dir = "/content/drive/MyDrive/whisper-small-korean-finetuned"
    checkpoint_path = "/content/drive/MyDrive/whisper-small-korean-finetuned/checkpoint-2000"
    # Local directory that receives the converted model files.
    output_dir = "finetuned2000"
    save_model_for_huggingface(checkpoint_path, output_dir)


모델 로드 성공!


In [None]:
# upload_hf.py
import os
from getpass import getpass

from huggingface_hub import login

# Token created on the Hugging Face website. It is read from the HF_TOKEN
# environment variable or prompted for, so the secret is never stored in the
# notebook itself.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

upload_to_huggingface(
    model_path="finetuned2000",
    repo_name="urewui/ktf2000",
    token=hf_token,
)

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

In [None]:
import zipfile

# Source archive and the directory its contents are unpacked into
zip_path = '/content/val_features.zip'
extract_folder = '/content/preprocessed_whisper/features'

# ZipFile opens in read mode by default; the context manager closes it again.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_folder)


In [None]:
    # Install the libraries required by the evaluation cells below
    !pip install transformers
    !pip install datasets
    !pip install jiwer
    !pip install torchaudio
    !pip install tqdm

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from tor



token 설정 필요

In [None]:
import torch
import json
import numpy as np
from pathlib import Path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Dataset
from jiwer import wer, cer
import logging
from tqdm import tqdm
import gc
from google.colab import files
import time
import os

# Logging setup.
# basicConfig() does nothing when the root logger already has handlers, which
# is presumably why the recorded cell outputs show progress bars but no log
# records. force=True (Python 3.8+) replaces any existing root handlers so the
# INFO-level results are actually displayed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

class WhisperDataset(torch.utils.data.Dataset):
    """Torch dataset serving precomputed features together with their transcripts.

    Every record of ``dataset`` is read for three keys: ``feature_path`` (a
    file loadable with ``np.load``), ``text`` and ``file_name``.
    """

    def __init__(self, dataset, processor, max_length=448):
        """Store the records, the processor and the label truncation length.

        Args:
            dataset: Indexable collection of records (see class docstring).
            processor: Object exposing a ``tokenizer`` callable.
            max_length: Maximum number of label tokens kept per transcript.
        """
        self.dataset = dataset
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return features, label ids, transcript and file name for one record.

        Raises:
            ValueError: If the first dimension of the loaded feature array is
                not 80.
        """
        item = self.dataset[idx]
        features = np.load(item["feature_path"])
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if features.shape[0] != 80:
            raise ValueError(f"Invalid feature shape: {features.shape}")
        features = torch.tensor(features, dtype=torch.float)

        encoded = self.processor.tokenizer(
            item["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        # squeeze(0) drops only the batch axis, so a one-token transcript
        # stays a 1-D tensor.
        labels = encoded.input_ids.squeeze(0)
        # Ignore padded positions via the attention mask rather than by token
        # id: a single text is never padded, so comparing against
        # pad_token_id could only hit genuine tokens (presumably the
        # end-of-text token when it shares its id with the pad token).
        padded = encoded.attention_mask.squeeze(0) != 1
        labels = labels.masked_fill(padded, -100)

        return {
            "input_features": features,
            "labels": labels,
            "text": item["text"],
            "file_name": item["file_name"]
        }

def evaluate_single_model(model_name, val_metadata_path, features_dir, num_samples=5, batch_size=4,
                          max_samples=800):
    """Evaluate one Whisper model on precomputed validation features.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            processor and the model.
        val_metadata_path: JSON file containing a list of records; each record
            is read for ``feature_path``, ``text`` and ``file_name``.
        features_dir: Directory containing the feature files. Only the file
            name of each record's ``feature_path`` is kept and re-rooted here.
            (Previously this argument was ignored in favour of a hard-coded
            path.)
        num_samples: Number of leading examples whose reference/prediction
            pair is stored in the result.
        batch_size: Number of examples passed to ``model.generate`` at once.
        max_samples: Evaluate only the first ``max_samples`` records; ``None``
            evaluates all of them. The default of 800 matches the cap that
            used to be hard-coded.

    Returns:
        dict with keys ``model_name``, ``wer``, ``cer``, ``samples`` (list of
        ``file_name``/``reference``/``prediction`` dicts) and ``duration`` in
        seconds. The timer starts before the model is loaded, so loading time
        is included.
    """
    # Check whether a GPU is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {device}")

    # Report GPU memory
    if device == "cuda":
        logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        # memory_allocated() is the memory in use, not the free memory, so the
        # message is labelled accordingly.
        logger.info(f"Allocated GPU Memory: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    # Load the validation metadata
    logger.info("Loading validation metadata...")
    with open(val_metadata_path, 'r', encoding='utf-8') as f:
        val_metadata = json.load(f)
    # Optionally restrict the evaluation to the leading records
    if max_samples is not None:
        val_metadata = val_metadata[:max_samples]

    # Re-root every feature file under features_dir, keeping only its name
    for item in val_metadata:
        file_name = os.path.basename(item["feature_path"])
        item["feature_path"] = os.path.join(features_dir, file_name)

    val_dataset = Dataset.from_list(val_metadata)
    logger.info(f"Loaded {len(val_dataset)} validation samples")

    start_time = time.time()

    # Load the model and its processor
    logger.info(f"Loading model and processor: {model_name}")
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)

    model = model.to(device)
    model.eval()
    logger.info("Model loaded successfully")

    # Wrap the records so each access loads features from disk
    val_torch_dataset = WhisperDataset(val_dataset, processor)
    total = len(val_torch_dataset)

    # Accumulators for the evaluation output
    all_predictions = []
    all_references = []
    sample_results = []

    # Evaluate batch by batch
    logger.info("Starting evaluation...")
    with torch.no_grad():
        pbar = tqdm(range(0, total, batch_size),
                   desc=f"Evaluating {model_name}",
                   ncols=100)

        for idx in pbar:
            batch_end = min(idx + batch_size, total)
            batch_items = [val_torch_dataset[i] for i in range(idx, batch_end)]

            # torch.stack requires every feature tensor in the batch to have
            # the same shape.
            input_features = torch.stack([item["input_features"] for item in batch_items]).to(device)

            # Predict and decode
            predicted_ids = model.generate(input_features)
            transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

            # Collect the results
            for i, (transcription, item) in enumerate(zip(transcriptions, batch_items)):
                all_predictions.append(transcription)
                all_references.append(item["text"])

                # Keep the first num_samples examples for manual inspection
                if idx + i < num_samples:
                    sample_results.append({
                        "file_name": item["file_name"],
                        "reference": item["text"],
                        "prediction": transcription
                    })

            # Update the progress bar
            pbar.set_postfix({
                'processed': f"{batch_end}/{total}",
                'memory': f"{torch.cuda.memory_allocated() / 1e9:.1f}GB"
            })

            # Free per-batch memory
            del input_features, predicted_ids
            torch.cuda.empty_cache()
            gc.collect()

    # Compute the metrics over all examples at once
    logger.info("Calculating metrics...")
    wer_score = wer(all_references, all_predictions)
    cer_score = cer(all_references, all_predictions)

    results = {
        "model_name": model_name,
        "wer": wer_score,
        "cer": cer_score,
        "samples": sample_results,
        "duration": time.time() - start_time
    }

    logger.info(f"\nEvaluation completed in {results['duration']:.2f} seconds")
    logger.info(f"WER: {wer_score:.4f}")
    logger.info(f"CER: {cer_score:.4f}")

    # Log the stored sample predictions
    logger.info("\nSample Results:")
    for i, sample in enumerate(sample_results, 1):
        logger.info(f"\nSample {i}:")
        logger.info(f"File: {sample['file_name']}")
        logger.info(f"Reference: {sample['reference']}")
        logger.info(f"Prediction: {sample['prediction']}")

    # Release the model before the next one is loaded
    del model
    torch.cuda.empty_cache()
    gc.collect()

    return results

def compare_results(results_dir):
    """Log a comparison table of the evaluation results of several models.

    Reads every file in ``results_dir`` whose name ends with
    ``_results.json`` and logs one row per model with its ``wer``, ``cer``
    and ``duration`` values. Results are keyed by the ``model_name`` stored
    in each file, so a later file with the same model name replaces an
    earlier one.

    Args:
        results_dir: Directory containing the ``*_results.json`` files.
    """
    all_results = {}
    # os.listdir returns names in arbitrary order; sorting keeps the table
    # order reproducible between runs.
    for result_file in sorted(os.listdir(results_dir)):
        if not result_file.endswith('_results.json'):
            continue
        with open(os.path.join(results_dir, result_file), 'r', encoding='utf-8') as f:
            results = json.load(f)
        all_results[results['model_name']] = {
            'wer': results['wer'],
            'cer': results['cer'],
            'duration': results['duration']
        }

    # Log the comparison table
    logger.info("\nModel Comparison Summary:")
    logger.info("="*70)
    logger.info(f"{'Model Name':<30} {'WER':<10} {'CER':<10} {'Duration (s)':<15}")
    logger.info("-"*70)
    for model_name, metrics in all_results.items():
        logger.info(f"{model_name:<30} {metrics['wer']:<10.4f} {metrics['cer']:<10.4f} {metrics['duration']:<15.2f}")
    logger.info("="*70)

def main():
    """Evaluate each configured model, save and download its results, then compare.

    For every model name the result dict returned by
    ``evaluate_single_model`` is written to
    ``evaluation_results/<model>_results.json`` (``/`` in the model name is
    replaced by ``_``) and handed to ``files.download``. Finally the saved
    files are summarised with ``compare_results``.
    """
    # The former "Installing required packages..." message was removed:
    # nothing is installed here, so the log line was misleading.

    # Release cached GPU memory before the first model is loaded
    logger.info("Optimizing GPU memory...")
    torch.cuda.empty_cache()
    gc.collect()

    # Models to evaluate
    model_names = [
        "urewui/ktf",             # presumably a fine-tuned model
        "urewui/ktf2000",         # presumably a fine-tuned model
        "openai/whisper-small",   # base model
    ]

    # Create the directory that receives the result files
    results_dir = "evaluation_results"
    os.makedirs(results_dir, exist_ok=True)

    # Evaluate every model separately
    for model_name in model_names:
        logger.info(f"\n{'='*50}")
        logger.info(f"Starting evaluation for {model_name}")
        logger.info(f"{'='*50}")

        # Run the evaluation
        results = evaluate_single_model(
            model_name=model_name,
            val_metadata_path="/content/drive/MyDrive/preprocessed_whisper/val_metadata.json",
            features_dir="/content/drive/MyDrive/preprocessed_whisper/features",
            num_samples=5,
            batch_size=2
        )

        # Save the results
        result_file = os.path.join(results_dir, f"{model_name.replace('/', '_')}_results.json")
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Download the result file
        files.download(result_file)

    # Compare all saved results
    compare_results(results_dir)

    logger.info("All evaluations completed!")

if __name__ == "__main__":
    main()

Evaluating urewui/ktf: 100%|█████| 400/400 [09:21<00:00,  1.40s/it, processed=800/800, memory=2.4GB]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

preprocessor_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/2.91k [00:00<?, ?B/s]

Evaluating urewui/ktf2000: 100%|█| 400/400 [09:35<00:00,  1.44s/it, processed=800/800, memory=2.4GB]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Evaluating openai/whisper-small: 100%|█| 400/400 [10:18<00:00,  1.55s/it, processed=800/800, memory=


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import torch
import json
import numpy as np
from pathlib import Path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Dataset
from jiwer import wer, cer
import logging
from tqdm import tqdm
import gc
from google.colab import files
import time
import os

# Logging setup.
# basicConfig() does nothing when the root logger already has handlers, which
# is presumably why the recorded cell outputs show progress bars but no log
# records. force=True (Python 3.8+) replaces any existing root handlers so the
# INFO-level results are actually displayed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

class WhisperDataset(torch.utils.data.Dataset):
    """Torch dataset serving precomputed features together with their transcripts.

    Every record of ``dataset`` is read for three keys: ``feature_path`` (a
    file loadable with ``np.load``), ``text`` and ``file_name``.
    """

    def __init__(self, dataset, processor, max_length=448):
        """Store the records, the processor and the label truncation length.

        Args:
            dataset: Indexable collection of records (see class docstring).
            processor: Object exposing a ``tokenizer`` callable.
            max_length: Maximum number of label tokens kept per transcript.
        """
        self.dataset = dataset
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return features, label ids, transcript and file name for one record.

        Raises:
            ValueError: If the first dimension of the loaded feature array is
                not 80.
        """
        item = self.dataset[idx]
        features = np.load(item["feature_path"])
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if features.shape[0] != 80:
            raise ValueError(f"Invalid feature shape: {features.shape}")
        features = torch.tensor(features, dtype=torch.float)

        encoded = self.processor.tokenizer(
            item["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        # squeeze(0) drops only the batch axis, so a one-token transcript
        # stays a 1-D tensor.
        labels = encoded.input_ids.squeeze(0)
        # Ignore padded positions via the attention mask rather than by token
        # id: a single text is never padded, so comparing against
        # pad_token_id could only hit genuine tokens (presumably the
        # end-of-text token when it shares its id with the pad token).
        padded = encoded.attention_mask.squeeze(0) != 1
        labels = labels.masked_fill(padded, -100)

        return {
            "input_features": features,
            "labels": labels,
            "text": item["text"],
            "file_name": item["file_name"]
        }

def evaluate_single_model(model_name, val_metadata_path, features_dir, num_samples=5, batch_size=4,
                          max_samples=None):
    """Evaluate one Whisper model on precomputed validation features.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            processor and the model.
        val_metadata_path: JSON file containing a list of records; each record
            is read for ``feature_path``, ``text`` and ``file_name``.
        features_dir: Directory containing the feature files. Only the file
            name of each record's ``feature_path`` is kept and re-rooted here.
            (Previously this argument was ignored in favour of a hard-coded
            path.)
        num_samples: Number of leading examples whose reference/prediction
            pair is stored in the result.
        batch_size: Number of examples passed to ``model.generate`` at once.
        max_samples: Evaluate only the first ``max_samples`` records; the
            default ``None`` evaluates all of them.

    Returns:
        dict with keys ``model_name``, ``wer``, ``cer``, ``samples`` (list of
        ``file_name``/``reference``/``prediction`` dicts) and ``duration`` in
        seconds. The timer starts before the model is loaded, so loading time
        is included.
    """
    # Check whether a GPU is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {device}")

    # Report GPU memory
    if device == "cuda":
        logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        # memory_allocated() is the memory in use, not the free memory, so the
        # message is labelled accordingly.
        logger.info(f"Allocated GPU Memory: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    # Load the validation metadata
    logger.info("Loading validation metadata...")
    with open(val_metadata_path, 'r', encoding='utf-8') as f:
        val_metadata = json.load(f)
    # Optionally restrict the evaluation to the leading records
    if max_samples is not None:
        val_metadata = val_metadata[:max_samples]

    # Re-root every feature file under features_dir, keeping only its name
    for item in val_metadata:
        file_name = os.path.basename(item["feature_path"])
        item["feature_path"] = os.path.join(features_dir, file_name)

    val_dataset = Dataset.from_list(val_metadata)
    logger.info(f"Loaded {len(val_dataset)} validation samples")

    start_time = time.time()

    # Load the model and its processor
    logger.info(f"Loading model and processor: {model_name}")
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)

    model = model.to(device)
    model.eval()
    logger.info("Model loaded successfully")

    # Wrap the records so each access loads features from disk
    val_torch_dataset = WhisperDataset(val_dataset, processor)
    total = len(val_torch_dataset)

    # Accumulators for the evaluation output
    all_predictions = []
    all_references = []
    sample_results = []

    # Evaluate batch by batch
    logger.info("Starting evaluation...")
    with torch.no_grad():
        pbar = tqdm(range(0, total, batch_size),
                   desc=f"Evaluating {model_name}",
                   ncols=100)

        for idx in pbar:
            batch_end = min(idx + batch_size, total)
            batch_items = [val_torch_dataset[i] for i in range(idx, batch_end)]

            # torch.stack requires every feature tensor in the batch to have
            # the same shape.
            input_features = torch.stack([item["input_features"] for item in batch_items]).to(device)

            # Predict and decode
            predicted_ids = model.generate(input_features)
            transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

            # Collect the results
            for i, (transcription, item) in enumerate(zip(transcriptions, batch_items)):
                all_predictions.append(transcription)
                all_references.append(item["text"])

                # Keep the first num_samples examples for manual inspection
                if idx + i < num_samples:
                    sample_results.append({
                        "file_name": item["file_name"],
                        "reference": item["text"],
                        "prediction": transcription
                    })

            # Update the progress bar
            pbar.set_postfix({
                'processed': f"{batch_end}/{total}",
                'memory': f"{torch.cuda.memory_allocated() / 1e9:.1f}GB"
            })

            # Free per-batch memory
            del input_features, predicted_ids
            torch.cuda.empty_cache()
            gc.collect()

    # Compute the metrics over all examples at once
    logger.info("Calculating metrics...")
    wer_score = wer(all_references, all_predictions)
    cer_score = cer(all_references, all_predictions)

    results = {
        "model_name": model_name,
        "wer": wer_score,
        "cer": cer_score,
        "samples": sample_results,
        "duration": time.time() - start_time
    }

    logger.info(f"\nEvaluation completed in {results['duration']:.2f} seconds")
    logger.info(f"WER: {wer_score:.4f}")
    logger.info(f"CER: {cer_score:.4f}")

    # Log the stored sample predictions
    logger.info("\nSample Results:")
    for i, sample in enumerate(sample_results, 1):
        logger.info(f"\nSample {i}:")
        logger.info(f"File: {sample['file_name']}")
        logger.info(f"Reference: {sample['reference']}")
        logger.info(f"Prediction: {sample['prediction']}")

    # Release the model before the next one is loaded
    del model
    torch.cuda.empty_cache()
    gc.collect()

    return results

def compare_results(results_dir):
    """Log a comparison table of the evaluation results of several models.

    Reads every file in ``results_dir`` whose name ends with
    ``_results.json`` and logs one row per model with its ``wer``, ``cer``
    and ``duration`` values. Results are keyed by the ``model_name`` stored
    in each file, so a later file with the same model name replaces an
    earlier one.

    Args:
        results_dir: Directory containing the ``*_results.json`` files.
    """
    all_results = {}
    # os.listdir returns names in arbitrary order; sorting keeps the table
    # order reproducible between runs.
    for result_file in sorted(os.listdir(results_dir)):
        if not result_file.endswith('_results.json'):
            continue
        with open(os.path.join(results_dir, result_file), 'r', encoding='utf-8') as f:
            results = json.load(f)
        all_results[results['model_name']] = {
            'wer': results['wer'],
            'cer': results['cer'],
            'duration': results['duration']
        }

    # Log the comparison table
    logger.info("\nModel Comparison Summary:")
    logger.info("="*70)
    logger.info(f"{'Model Name':<30} {'WER':<10} {'CER':<10} {'Duration (s)':<15}")
    logger.info("-"*70)
    for model_name, metrics in all_results.items():
        logger.info(f"{model_name:<30} {metrics['wer']:<10.4f} {metrics['cer']:<10.4f} {metrics['duration']:<15.2f}")
    logger.info("="*70)

def main():
    """Evaluate each configured model, save and download its results, then compare.

    For every model name the result dict returned by
    ``evaluate_single_model`` is written to
    ``evaluation_results/<model>_results.json`` (``/`` in the model name is
    replaced by ``_``) and handed to ``files.download``. Finally the saved
    files are summarised with ``compare_results``.
    """
    # The former "Installing required packages..." message was removed:
    # nothing is installed here, so the log line was misleading.

    # Release cached GPU memory before the first model is loaded
    logger.info("Optimizing GPU memory...")
    torch.cuda.empty_cache()
    gc.collect()

    # Models to evaluate
    model_names = [
        "urewui/ktf",             # presumably the fine-tuned model
        "openai/whisper-small",   # base model
    ]

    # Create the directory that receives the result files
    results_dir = "evaluation_results"
    os.makedirs(results_dir, exist_ok=True)

    # Evaluate every model separately
    for model_name in model_names:
        logger.info(f"\n{'='*50}")
        logger.info(f"Starting evaluation for {model_name}")
        logger.info(f"{'='*50}")

        # Run the evaluation
        results = evaluate_single_model(
            model_name=model_name,
            val_metadata_path="/content/drive/MyDrive/preprocessed_whisper/val_metadata.json",
            features_dir="/content/drive/MyDrive/preprocessed_whisper/features",
            num_samples=10,
            batch_size=4
        )

        # Save the results
        result_file = os.path.join(results_dir, f"{model_name.replace('/', '_')}_results.json")
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Download the result file
        files.download(result_file)

    # Compare all saved results
    compare_results(results_dir)

    logger.info("All evaluations completed!")

if __name__ == "__main__":
    main()

Evaluating urewui/ktf: 100%|███| 516/516 [18:14<00:00,  2.12s/it, processed=2063/2063, memory=2.4GB]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Evaluating openai/whisper-small: 100%|█| 516/516 [20:30<00:00,  2.38s/it, processed=2063/2063, memor


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>