# ASR Testing Notebook

This notebook contains tests of several ASR (Automatic Speech Recognition) models, plus some simple audio preprocessing:
- NeMo ASR (Parakeet TDT)
- Custom Whisper ASR (with VAD)
- Whisper (OpenAI)
- NVIDIA Canary
- Audio denoising techniques

## Import Required Libraries

In [None]:
# Standard libraries
import os
import warnings
warnings.filterwarnings('ignore')

# Audio processing libraries
import numpy as np
import librosa
import soundfile as sf
from scipy.signal import butter, filtfilt

# NeMo ASR
import nemo.collections.asr as nemo_asr
from nemo.collections.speechlm2.models import SALM

# Whisper ASR
import whisper

# Custom models (project-local `models` package).
# The import is best-effort: a failure is reported but does not stop the cell.
# `whisper_asr` is used by the custom Whisper cell further down, which will
# raise NameError if this import did not succeed.
try:
    from models import separate_fast, dnsmos, whisper_asr, silero_vad
except ImportError as e:
    custom_models_available = False
    print(f"Note: Custom models not found - {e}")
    print("You may need to run this from the correct directory")
else:
    custom_models_available = True
    print("Custom models imported successfully")

# Jupyter display
import IPython.display as ipd

# Report what actually happened instead of unconditionally claiming success.
if custom_models_available:
    print("All libraries imported successfully!")
else:
    print("Core libraries imported; custom models are unavailable, so cells that use them will fail.")

## 1. NeMo ASR (NVIDIA Parakeet TDT)

In [None]:
# Load the pretrained Parakeet TDT 0.6B (v2) checkpoint through NeMo's ASRModel class.
asr_model = nemo_asr.models.ASRModel.from_pretrained(
    model_name="nvidia/parakeet-tdt-0.6b-v2",
)

### Download sample audio file

In [2]:
# -nc (no-clobber): skip the download when the file already exists, so re-running
# this cell does not leave a duplicate `2086-149220-0033.wav.1` next to the original.
!wget -nc https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav

--2025-12-02 02:04:15--  https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
Resolving dldata-public.s3.us-east-2.amazonaws.com (dldata-public.s3.us-east-2.amazonaws.com)... 52.219.94.146, 3.5.133.198, 52.219.177.162, ...
Connecting to dldata-public.s3.us-east-2.amazonaws.com (dldata-public.s3.us-east-2.amazonaws.com)|52.219.94.146|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 237964 (232K) [audio/wav]
Saving to: ‘2086-149220-0033.wav’


2025-12-02 02:04:17 (254 KB/s) - ‘2086-149220-0033.wav’ saved [237964/237964]



### Transcribe with NeMo

In [None]:
# Run the Parakeet model on the downloaded sample; `transcribe` takes a list of paths.
sample_paths = ['2086-149220-0033.wav']
output = asr_model.transcribe(sample_paths)

# Show only the recognized text of the first (and only) result.
first_hypothesis = output[0]
print("Transcription:", first_hypothesis.text)

### Detailed output (Hypothesis object)

In [None]:
# Print the complete first result object, not just its `.text` field.
first_result = output[0]
print(first_result)

## 2. Custom Whisper ASR (with VAD)

In [None]:
# Load the project's custom Whisper ASR wrapper (the variant that works with VAD).
# Requires `whisper_asr` from the project-local `models` package imported in the first cell.
whisper_load_options = dict(
    compute_type="float16",
    threads=4,
    language="en",
)
asr_model_whisper = whisper_asr.load_asr_model("large-v3", "cuda", **whisper_load_options)

In [None]:
# Note: the custom model's transcribe() requires a `vad_segments` argument,
# so it is not called here. This cell only prints a reminder.
# Example (uncomment and provide VAD segments):
# transcribe = asr_model_whisper.transcribe('2086-149220-0033.wav', vad_segments=your_vad_segments)
print("This custom Whisper model requires VAD segments as input.")

## 3. NVIDIA Canary (Speech Language Model)

In [None]:
# Select GPU 0 through the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): this runs after the Parakeet model was already loaded in an earlier
# cell; presumably CUDA is initialized by then, in which case changing the variable
# here has no effect. Confirm, and consider moving it to the top of the notebook.
import os

os.environ.update(CUDA_VISIBLE_DEVICES='0')

# Load the NVIDIA Canary checkpoint as a SALM (speech language model) instance.
model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')

### Generate transcription with Canary

In [None]:
# Build a single-turn chat prompt: the text carries the model's audio placeholder tag
# and the "audio" entry lists the file to transcribe.
user_turn = {
    "role": "user",
    "content": f"Transcribe the following: {model.audio_locator_tag}",
    "audio": ["2086-149220-0033.wav"],
}

# One conversation (a list of turns) in a batch of one; cap the reply at 128 new tokens.
answer_ids = model.generate(prompts=[[user_turn]], max_new_tokens=128)

# Move the first answer's token ids to the CPU and decode them to text.
first_answer_ids = answer_ids[0].cpu()
transcription = model.tokenizer.ids_to_text(first_answer_ids)
print("Canary Transcription:", transcription)

## 4. OpenAI Whisper (Standard)

In [3]:
import whisper

# Path of the audio file to transcribe.
# NOTE(review): machine-specific absolute path; point this at your own data when
# running the notebook elsewhere.
file_path = "/mnt/ddn/kyudan/Audio-data-centric/podcast-pipeline/data/test/_final/-sepreformer-True-vad-True-diaModel-dia3-initPrompt-False-merge_gap-2.0-seg_th-0.11-cl_min-11-cl-th-0.5-LLM-case_0/test_english_with_overlap/test_english_with_overlap/00003_SPEAKER_01.mp3"

# Load the model (large-v3).
# CUDA is used automatically when a GPU is available; passing device="cuda"
# explicitly is also fine.
# The model is bound to `whisper_model`, not `model`, so the Canary SALM object
# held in `model` is not overwritten and the Canary cells stay re-runnable.
whisper_model = whisper.load_model("large-v3")

print("Transcribing...")
result = whisper_model.transcribe(file_path)

# Print the result
print(f"Text: {result['text']}")

# Optionally save the result
# with open("result.txt", "w") as f:
#     f.write(result['text'])

  import pynvml  # type: ignore[import]
100%|██████████████████████████████████████| 2.88G/2.88G [00:11<00:00, 260MiB/s]


Transcribing...
Text:  You're the emcee on the show, Ira. Oh, great. Ira, are you Ira?


In [None]:
# Optionally write the result of one denoising method to a WAV file.
# NOTE: `denoised_audios` and `sr` are created by cells that appear further down in
# this notebook; those cells must have been run before this one.
output_method = 'spectral_gate'  # or 'bandpass', 'wiener'
output_path = f"denoised_{output_method}.wav"
selected_audio = denoised_audios[output_method]

sf.write(output_path, selected_audio, sr)
print(f"Denoised audio saved to: {output_path}")

In [None]:
# Play back the result of each denoising method, one audio widget per method.
# NOTE: `denoised_audios` and `sr` are created by cells that appear further down in
# this notebook; those cells must have been run before this one.
playback_sections = (
    ("=== Spectral Gate Denoising ===", 'spectral_gate'),
    ("\n=== Bandpass Filter Denoising ===", 'bandpass'),
    ("\n=== Wiener Filter Denoising ===", 'wiener'),
)

for heading, method_key in playback_sections:
    print(heading)
    ipd.display(ipd.Audio(denoised_audios[method_key], rate=sr))

In [None]:
def simple_denoising(audio, sr, method='spectral_gate'):
    """
    Apply one of three simple denoising algorithms to an audio signal.

    Parameters:
    - audio: audio signal (numpy array)
    - sr: sampling rate in Hz
    - method: 'spectral_gate', 'bandpass', or 'wiener'

    Returns:
    - denoised_audio: the denoised signal, with the same number of samples as `audio`

    Raises:
    - ValueError: if `method` is not a supported name, or if `sr` is too low for
      the 80 Hz lower edge used by the 'bandpass' method
    """
    # STFT geometry. These are the values librosa uses by default; they are spelled
    # out so the noise-frame arithmetic below is tied to the hop size actually used.
    n_fft = 2048
    hop_length = 512

    # The first 0.5 seconds are assumed to contain noise only. Keep at least one
    # frame so the noise estimate is never computed over an empty slice (NaN).
    noise_frames = max(1, int(0.5 * sr / hop_length))

    if method == 'spectral_gate':
        # Spectral gating: zero every time-frequency bin that is not clearly above the noise floor.
        print("Applying spectral gating denoising...")

        D = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
        magnitude = np.abs(D)

        # Per-frequency noise profile: median magnitude over the leading noise frames.
        noise_profile = np.median(magnitude[:, :noise_frames], axis=1, keepdims=True)

        # Keep a bin only if it exceeds twice the noise profile.
        threshold = 2.0
        mask = magnitude > (noise_profile * threshold)

        # Multiplying the complex STFT by the real mask scales the magnitude and
        # leaves the phase unchanged. `length` restores the input sample count,
        # which the inverse transform would otherwise truncate.
        denoised = librosa.istft(D * mask, hop_length=hop_length, length=len(audio))

    elif method == 'bandpass':
        # Band-pass filter for the speech band (80-8000 Hz).
        print("Applying bandpass filter denoising...")

        # Local import: second-order-section filtering is numerically more robust
        # than the (b, a) polynomial form for a band-pass of this order.
        from scipy.signal import sosfiltfilt

        nyquist = sr / 2
        low_hz = 80.0
        # The upper edge must stay strictly below Nyquist; without the clamp the
        # filter design fails for sr <= 16000 Hz.
        high_hz = min(8000.0, 0.99 * nyquist)
        if low_hz >= high_hz:
            raise ValueError(
                f"Sample rate {sr} Hz is too low for a band-pass filter starting at {low_hz:g} Hz"
            )

        sos = butter(5, [low_hz, high_hz], btype='band', fs=sr, output='sos')
        # Forward-backward filtering gives zero phase distortion.
        denoised = sosfiltfilt(sos, audio)

    elif method == 'wiener':
        # Wiener filtering (simplified version).
        print("Applying Wiener filter denoising...")

        D = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
        signal_power = np.abs(D) ** 2

        # Per-frequency noise power: mean power over the leading noise frames.
        noise_power = np.mean(signal_power[:, :noise_frames], axis=1, keepdims=True)

        # Wiener-style gain, floored at 0.1 so no bin is suppressed completely;
        # the small epsilon avoids division by zero in silent bins.
        wiener_gain = np.maximum(1 - noise_power / (signal_power + 1e-10), 0.1)

        # Real gain times complex STFT: magnitude is scaled, phase is kept.
        denoised = librosa.istft(D * wiener_gain, hop_length=hop_length, length=len(audio))

    else:
        raise ValueError(f"Unknown method: {method}")

    return denoised

# Run every denoising method on the loaded audio and collect the results by method name.
# Each call receives its own copy of `audio`.
# NOTE: `audio` and `sr` come from the audio-loading cell, which appears further down
# in this notebook and must have been run first.
methods = ['spectral_gate', 'bandpass', 'wiener']
denoised_audios = {}

for method in methods:
    print(f"\n--- Method: {method} ---")
    signal_copy = audio.copy()
    denoised = simple_denoising(signal_copy, sr, method=method)
    print(f"Denoised audio length: {len(denoised)} samples")
    denoised_audios[method] = denoised

In [None]:
import os

import numpy as np
import librosa
import soundfile as sf
from scipy.signal import butter, filtfilt
import IPython.display as ipd

# File path.
# Same file as the OpenAI Whisper cell, whose recorded output shows that path loads;
# note the directory segment is "cl-th-0.5" (hyphen), not "cl_th-0.5".
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
file_path = "/mnt/ddn/kyudan/Audio-data-centric/podcast-pipeline/data/test/_final/-sepreformer-True-vad-True-diaModel-dia3-initPrompt-False-merge_gap-2.0-seg_th-0.11-cl_min-11-cl-th-0.5-LLM-case_0/test_english_with_overlap/test_english_with_overlap/00003_SPEAKER_01.mp3"

# Fail early with a clear message instead of a decoder traceback.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"Audio file not found: {file_path}")

# Load the audio
print("Loading audio...")
audio, sr = librosa.load(file_path, sr=None)
print(f"Sample rate: {sr} Hz, Duration: {len(audio)/sr:.2f} seconds")

# Play the original
print("\n=== Original Audio ===")
ipd.display(ipd.Audio(audio, rate=sr))