## 30 seconds processing

In [1]:
# app.py
import os
import time
import torch
import numpy as np
import gradio as gr

# Audio I/O
import soundfile as sf
import librosa

from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

# -----------------------------
# Paths
# -----------------------------

# Run root that contains tokenizer/preprocessor files (tokenizer.json, preprocessor_config.json, etc.)
RUN_ROOT = r"C:\Proxima Tech Solutions\Proxima AI Voice Call Center\Project-Code\Fine-Tuned-STT\Whisper-FT-V2"

# Checkpoint folder that contains model weights/config (pytorch_model.bin / model.safetensors, config.json, etc.)
CKPT_DIR = os.path.join(RUN_ROOT, "checkpoint-3672")  # change to the checkpoint you want

# -----------------------------
# Load model & processor
# -----------------------------
# NOTE: everything in this section runs at import time, so importing this
# module loads the checkpoint from disk and moves it to the selected device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bf16 is selected when the GPU reports compute capability major >= 8
# (Ampere or newer -- not only Hopper, as an earlier comment here said).
use_bf16 = (device == "cuda") and torch.cuda.get_device_capability(0)[0] >= 8
# Precision ladder: bf16 on capable GPUs, fp16 on other GPUs, fp32 on CPU.
dtype = torch.bfloat16 if use_bf16 else torch.float16 if device == "cuda" else torch.float32

# Processor (tokenizer + feature extractor) lives in RUN_ROOT
processor = WhisperProcessor.from_pretrained(RUN_ROOT)

# Model weights live in the checkpoint directory
model = WhisperForConditionalGeneration.from_pretrained(CKPT_DIR, torch_dtype=dtype)
model.to(device)
model.eval()  # inference only: switches off dropout and other train-time behaviour

# Make sure generation settings are right for AZ transcription
model.generation_config.language = "az"
model.generation_config.task = "transcribe"
# Clear any forced decoder ids stored with the checkpoint; presumably so the
# language/task settings above are the ones that take effect -- TODO confirm.
model.generation_config.forced_decoder_ids = None
# Empty list: no token ids are listed for suppression during generation.
model.generation_config.suppress_tokens = []

SR = 16000  # Whisper expects 16kHz features

def _load_audio_from_path(path: str, target_sr: int = SR) -> np.ndarray:
    """Read an audio file and return it as float32 mono at ``target_sr``.

    The file is decoded with soundfile at its native rate, multi-channel
    audio is averaged down to one channel, and librosa resamples only when
    the native rate differs from ``target_sr``. If the resulting peak
    magnitude exceeds 1.0 the whole signal is divided by that peak
    (a rescale, not a hard clip).
    """
    samples, native_sr = sf.read(path, always_2d=False)

    # Down-mix (frames, channels) to a single channel.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    needs_resample = native_sr != target_sr
    if needs_resample:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)

    samples = samples.astype(np.float32)
    if not samples.size:
        # Nothing to measure a peak on.
        return samples

    peak = np.max(np.abs(samples))
    return samples / peak if peak > 1.0 else samples

def _to_float32(wav: np.ndarray) -> np.ndarray:
    """Return ``wav`` as float32 with peak magnitude at most 1.0.

    Integer input (e.g. int16 microphone samples) is divided by the maximum
    value of its dtype. Other dtypes are cast to float32. If the resulting
    peak magnitude is above 1.0 the signal is divided by that peak.

    The input array is never modified. When the input is already float32 and
    within range, the same array object is returned without copying.
    """
    if np.issubdtype(wav.dtype, np.integer):
        # common case: int16 mic input
        full_scale = np.iinfo(wav.dtype).max
        out = wav.astype(np.float32) / full_scale
    else:
        # copy=False: no allocation when the data is already float32
        out = wav.astype(np.float32, copy=False)

    if out.size:
        peak = np.max(np.abs(out))
        if peak > 1.0:
            # Out-of-place division: ``out`` may still alias the caller's
            # array here, and an in-place ``/=`` would silently rescale it.
            out = out / peak
    return out

def _normalize_input(audio):
    """
    Convert a supported audio payload to float32 mono at SR (16 kHz).

    Accepts: (sr, np.ndarray) from mic, np.ndarray, dict{'path':...}, or str path.
    Returns: float32 mono @ 16 kHz.
    Raises: ValueError for any other payload (including None, or a string
            that is not an existing path).
    """
    # Step 1: obtain (sample_rate, samples) from whichever payload we got.
    if isinstance(audio, tuple) and len(audio) == 2:
        # mic: (sr, wav)
        sr, wav = audio
    elif isinstance(audio, np.ndarray):
        # bare array: the true rate is unknown, so assume it is already SR
        sr, wav = SR, audio
    elif isinstance(audio, dict) and "path" in audio:
        # file upload: dict with 'path'
        wav, sr = sf.read(audio["path"], always_2d=False)
    elif isinstance(audio, str) and os.path.exists(audio):
        # raw string path
        wav, sr = sf.read(audio, always_2d=False)
    else:
        raise ValueError("Unsupported audio input format")

    # Step 2: scale to float32 BEFORE down-mixing. mean() turns integer
    # stereo into float64, which would skip the integer full-scale division
    # in _to_float32 and fall through to peak normalisation instead.
    wav = _to_float32(wav)
    if wav.ndim == 2:
        wav = wav.mean(axis=1)

    # Step 3: resample only when the source rate differs from SR.
    if sr != SR:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=SR)
    return wav

@torch.inference_mode()
def transcribe(audio_in):
    """Transcribe one audio payload and return the text with timing info.

    ``audio_in`` is whatever the Gradio audio component hands over; it is
    converted by ``_normalize_input`` to float32 mono at SR. The whole clip
    goes through the feature extractor in one call, so this path is meant
    for short clips; presumably anything beyond Whisper's 30 s window is
    truncated by the extractor's defaults -- TODO confirm.

    Returns a single string: the transcript followed by the elapsed time, or
    a short message for missing/empty audio, or ``"Error: ..."`` if anything
    raised (errors are reported in the UI rather than propagated).
    """
    # Clicking the button with nothing recorded/uploaded passes None; say so
    # plainly instead of reporting an unsupported-format error.
    if audio_in is None:
        return "No audio provided."

    try:
        t0 = time.perf_counter()  # monotonic clock for elapsed-time measurement
        wav = _normalize_input(audio_in)
        if wav.size == 0:
            return "Empty audio."

        # Feature extraction (log-Mel) via processor
        feats = processor.feature_extractor(
            wav, sampling_rate=SR, return_tensors="pt"
        ).input_features  # [1, n_mels, frames]
        feats = feats.to(device)

        # Deterministic beam search (3 beams, no sampling). On CUDA, autocast
        # runs the model in the reduced-precision dtype chosen at load time.
        with torch.autocast(device_type="cuda", dtype=dtype) if device == "cuda" else torch.no_grad():
            out_ids = model.generate(
                inputs=feats,
                max_length=448,           # consistent with your training
                do_sample=False,
                num_beams=3,
            )

        text = processor.tokenizer.batch_decode(out_ids, skip_special_tokens=True)[0]
        elapsed = time.perf_counter() - t0
        return f"{text}\n\n‚è±Ô∏è {elapsed:.2f}s"
    except Exception as e:
        # UI boundary: show the failure in the textbox instead of crashing.
        return f"Error: {e}"

# -----------------------------
# Gradio UI
# -----------------------------
# Single-page UI: one audio input, a button, and a textbox that displays the
# string returned by transcribe().
with gr.Blocks(title="Whisper AZ Transcriber") as demo:
    gr.Markdown("## Whisper AZ Transcriber\nRecord or upload audio; model returns the transcript.")

    with gr.Row():
        audio = gr.Audio(
            sources=["microphone", "upload"],
            # Original note: gives (sr, numpy) for mic; file uploads return dict.
            # NOTE(review): confirm the upload payload type for the installed
            # Gradio version; _normalize_input accepts either shape.
            type="numpy",
            label="Microphone or upload (.wav/.mp3/etc.)",
        )
    btn = gr.Button("Transcribe")
    out = gr.Textbox(label="Transcript", lines=6)

    # Button click: audio payload -> transcribe() -> transcript textbox.
    btn.click(transcribe, inputs=audio, outputs=out)

if __name__ == "__main__":
    # No server_name is passed, so this does NOT listen on all interfaces:
    # the recorded output shows the app on http://127.0.0.1:8000 only.
    # To reach it from another machine, pass server_name="0.0.0.0"
    # deliberately (that exposes the app to the network).
    demo.launch(server_port=8000)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

* Running on local URL:  http://127.0.0.1:8000

To create a public link, set `share=True` in `launch()`.


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "C:\LLM Instruction Fine-Tuning\.venv\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\LLM Instruction Fine-Tuning\.venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\LLM Instruction Fine-Tuning\.venv\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "C:\LLM Instruction Fine-Tuning\.venv\Lib\site-packages\starlette\applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "C:\LLM Instruction Fine-Tuning\.venv\Lib\site-packages\starlette\middleware\errors.py", line 187, in __call__
    raise exc
  Fil

## Chunking for long audio processing

In [None]:
# app.py
import os
import time
import torch
import numpy as np
import gradio as gr

# Audio I/O
import soundfile as sf
import librosa

from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

# -----------------------------
# Paths
# -----------------------------
# If you exported to a single folder, set MODEL_DIR to that folder.
# Otherwise, set it to your run root that also contains tokenizer/preprocessor files.
# NOTE(review): this first MODEL_DIR value is dead -- it is overwritten by
# the os.path.join assignment a few lines below before anything reads it.
MODEL_DIR = r"checkpoint-3672"  # or ".../export-278145"

# The folder that contains tokenizer.json, vocab.json, merges.txt, etc.
ROOT_DIR = r"C:\Proxima Tech Solutions\Proxima AI Voice Call Center\Project-Code\Fine-Tuned-STT\Whisper-FT-V2"
# Effective checkpoint location: the checkpoint sub-folder of ROOT_DIR.
MODEL_DIR = os.path.join(ROOT_DIR, "checkpoint-3672")

# -----------------------------
# Load model & processor
# -----------------------------
# NOTE: everything in this section runs at import time, so importing this
# module loads the checkpoint from disk and moves it to the selected device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bf16 is selected when the GPU reports compute capability major >= 8
# (Ampere or newer -- not only Hopper, as an earlier comment here said).
use_bf16 = (device == "cuda") and torch.cuda.get_device_capability(0)[0] >= 8
# Precision ladder: bf16 on capable GPUs, fp16 on other GPUs, fp32 on CPU.
dtype = torch.bfloat16 if use_bf16 else torch.float16 if device == "cuda" else torch.float32

# Tokenizer + feature extractor come from ROOT_DIR; weights from MODEL_DIR.
processor = WhisperProcessor.from_pretrained(ROOT_DIR)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_DIR, torch_dtype=dtype)
model.to(device)
model.eval()  # inference only: switches off dropout and other train-time behaviour

# Make sure generation settings are right for AZ transcription
model.generation_config.language = "az"
model.generation_config.task = "transcribe"
# Clear any forced decoder ids stored with the checkpoint; presumably so the
# language/task settings above are the ones that take effect -- TODO confirm.
model.generation_config.forced_decoder_ids = None
# Empty list: no token ids are listed for suppression during generation.
model.generation_config.suppress_tokens = []

SR = 16000  # Whisper expects 16kHz features

def _load_audio_from_path(path: str, target_sr: int = SR) -> np.ndarray:
    """Read an audio file and return it as float32 mono at ``target_sr``.

    The file is decoded with soundfile at its native rate, multi-channel
    audio is averaged down to one channel, and librosa resamples only when
    the native rate differs from ``target_sr``. If the resulting peak
    magnitude exceeds 1.0 the whole signal is divided by that peak
    (a rescale, not a hard clip).
    """
    samples, native_sr = sf.read(path, always_2d=False)

    # Down-mix (frames, channels) to a single channel.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    needs_resample = native_sr != target_sr
    if needs_resample:
        samples = librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr)

    samples = samples.astype(np.float32)
    if not samples.size:
        # Nothing to measure a peak on.
        return samples

    peak = np.max(np.abs(samples))
    return samples / peak if peak > 1.0 else samples

def _to_float32(wav: np.ndarray) -> np.ndarray:
    """Return ``wav`` as float32 with peak magnitude at most 1.0.

    Integer input (e.g. int16 microphone samples) is divided by the maximum
    value of its dtype. Other dtypes are cast to float32. If the resulting
    peak magnitude is above 1.0 the signal is divided by that peak.

    The input array is never modified. When the input is already float32 and
    within range, the same array object is returned without copying.
    """
    if np.issubdtype(wav.dtype, np.integer):
        # common case: int16 mic input
        full_scale = np.iinfo(wav.dtype).max
        out = wav.astype(np.float32) / full_scale
    else:
        # copy=False: no allocation when the data is already float32
        out = wav.astype(np.float32, copy=False)

    if out.size:
        peak = np.max(np.abs(out))
        if peak > 1.0:
            # Out-of-place division: ``out`` may still alias the caller's
            # array here, and an in-place ``/=`` would silently rescale it
            # (which would also alter the "original" audio shown for playback).
            out = out / peak
    return out

def _normalize_input(audio):
    """
    Convert a supported audio payload to float32 mono at SR (16 kHz).

    Accepts: (sr, np.ndarray) from mic, np.ndarray, dict{'path':...}, or str path.
    Returns: float32 mono @ 16 kHz.
    Raises: ValueError for any other payload (including None, or a string
            that is not an existing path).
    """
    # Step 1: obtain (sample_rate, samples) from whichever payload we got.
    if isinstance(audio, tuple) and len(audio) == 2:
        # mic: (sr, wav)
        sr, wav = audio
    elif isinstance(audio, np.ndarray):
        # bare array: the true rate is unknown, so assume it is already SR
        sr, wav = SR, audio
    elif isinstance(audio, dict) and "path" in audio:
        # file upload: dict with 'path'
        wav, sr = sf.read(audio["path"], always_2d=False)
    elif isinstance(audio, str) and os.path.exists(audio):
        # raw string path
        wav, sr = sf.read(audio, always_2d=False)
    else:
        raise ValueError("Unsupported audio input format")

    # Step 2: scale to float32 BEFORE down-mixing. mean() turns integer
    # stereo into float64, which would skip the integer full-scale division
    # in _to_float32 and fall through to peak normalisation instead.
    wav = _to_float32(wav)
    if wav.ndim == 2:
        wav = wav.mean(axis=1)

    # Step 3: resample only when the source rate differs from SR.
    if sr != SR:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=SR)
    return wav

def extract_original_audio_tuple(audio):
    """
    Build the (sr, wav) pair of the audio as it was provided, for playback.

    - 2-tuple: handed back untouched (already (sr, wav)).
    - Bare numpy array: paired with SR, since no rate accompanies it.
    - dict with 'path' / existing path string: decoded from disk as float32.

    Raises ValueError for any other payload.
    """
    # In-memory payloads need no decoding.
    if isinstance(audio, tuple) and len(audio) == 2:
        return audio
    if isinstance(audio, np.ndarray):
        return (SR, audio)

    # Everything else must resolve to a file on disk.
    if isinstance(audio, dict) and "path" in audio:
        source = audio["path"]
    elif isinstance(audio, str) and os.path.exists(audio):
        source = audio
    else:
        raise ValueError("Unsupported audio input format (original)")

    samples, rate = sf.read(source, always_2d=False, dtype="float32")
    return (rate, samples)

# Chunking parameters read by transcribe(), chunk_audio() and calc_max_len().
CHUNK_SEC  = 15   # window length in seconds (was 25)
STRIDE_SEC = 3    # seconds shared by consecutive windows (was 5)
BATCH_SIZE = 4    # windows per generate() call; increase if you have GPU room

def calc_max_len(chunk_sec):
    """Return the generation token budget for a chunk of ``chunk_sec`` seconds.

    Allows roughly 16 tokens per second of audio, never exceeding 448
    (the Whisper limit used throughout this file).
    """
    budget = int(chunk_sec * 16)
    return budget if budget < 448 else 448

def chunk_audio(wav: np.ndarray, sr: int = SR,
                chunk_sec: int = CHUNK_SEC, stride_sec: int = STRIDE_SEC):
    """Return list of (start_sample, end_sample) windows with overlap.

    Each window is ``chunk_sec`` seconds long (the last may be shorter) and
    consecutive windows share ``stride_sec`` seconds. Audio that fits in a
    single window yields one span covering all of it.

    Raises:
        ValueError: if the audio needs more than one window but the window
            advance (chunk minus overlap) is not positive. The loop would
            otherwise never terminate.
    """
    chunk = int(chunk_sec * sr)
    stride = int(stride_sec * sr)
    total = wav.shape[0]
    if total <= chunk:
        return [(0, total)]

    step = chunk - stride  # samples by which each window start advances
    if chunk <= 0 or step <= 0:
        raise ValueError(
            "chunk_sec must be positive and greater than stride_sec "
            f"(got chunk_sec={chunk_sec}, stride_sec={stride_sec})"
        )

    spans = []
    start = 0
    while start < total:
        end = min(start + chunk, total)
        spans.append((start, end))
        if end == total:
            # The final window reached the end of the signal.
            break
        start += step
    return spans

def dedup_join(texts, max_overlap_words=12):
    """Join chunk transcripts, dropping words repeated at chunk boundaries.

    For each chunk, the longest run of leading words (at most
    ``max_overlap_words``) that exactly matches the end of the transcript
    merged so far is removed before the chunk is appended. Blank chunks are
    skipped.

    The comparison uses the tail of the whole merged transcript rather than
    only the previous fragment, and a chunk that is entirely overlap adds
    nothing. This avoids stray double spaces and keeps de-duplication
    working for the chunk that follows.

    Returns the merged transcript as a single string.
    """
    pieces = []        # text fragments in output order
    merged_words = []  # every word emitted so far, used for tail matching

    for text in texts:
        text = text.strip()
        if not text:
            continue

        curr_words = text.split()
        if not pieces:
            pieces.append(text)
            merged_words.extend(curr_words)
            continue

        # Longest exact suffix/prefix overlap, bounded by max_overlap_words.
        cut = 0
        limit = min(len(merged_words), len(curr_words), max_overlap_words)
        for k in range(limit, 0, -1):
            if merged_words[-k:] == curr_words[:k]:
                cut = k
                break

        remainder = curr_words[cut:]
        if remainder:  # a chunk that is pure overlap contributes nothing
            pieces.append(" ".join(remainder))
            merged_words.extend(remainder)

    return " ".join(pieces).strip()

from difflib import SequenceMatcher

def fuzzy_join(texts, max_tail_words=30, min_ratio=0.6):
    """
    Join chunk texts while removing near-duplicate overlap even if punctuation/
    casing differ.

    For each chunk, candidate overlap lengths k are tried from longest to
    shortest (at most ``max_tail_words``). The first k whose last-k merged
    words and first-k chunk words have a case-insensitive
    ``SequenceMatcher`` ratio of at least ``min_ratio`` is trimmed from the
    front of the chunk.

    The comparison uses the tail of the whole merged transcript rather than
    only the previous fragment, and a chunk that is entirely overlap adds
    nothing. This avoids stray double spaces and keeps overlap detection
    working for the chunk that follows.

    Returns the merged transcript as a single string.
    """
    pieces = []        # text fragments in output order
    merged_words = []  # every word emitted so far, used for tail matching

    for text in texts:
        text = text.strip()
        if not text:
            continue

        curr_words = text.split()
        if not pieces:
            pieces.append(text)
            merged_words.extend(curr_words)
            continue

        # Compare up to last N merged words with first N words of this chunk.
        limit = min(max_tail_words, len(merged_words), len(curr_words))
        cut = 0
        for k in range(limit, 0, -1):
            tail = " ".join(merged_words[-k:]).lower()
            head = " ".join(curr_words[:k]).lower()
            if SequenceMatcher(None, tail, head).ratio() >= min_ratio:
                # first good-enough match from the longest side wins
                cut = k
                break

        remainder = curr_words[cut:]
        if remainder:  # a chunk that is pure overlap contributes nothing
            pieces.append(" ".join(remainder))
            merged_words.extend(remainder)

    return " ".join(pieces).strip()

@torch.inference_mode()
def transcribe(audio_in):
    """Transcribe audio of any length by chunking, and return playback copies.

    The normalised waveform is split into overlapping windows by
    ``chunk_audio``, decoded in batches of ``BATCH_SIZE``, and the per-window
    texts are merged with ``dedup_join``.

    Returns a 3-tuple matching the three UI outputs:
        (transcript text, (orig_sr, orig_wav), (SR, processed_wav))
    For missing or empty audio, and on any error, the two audio slots are
    ``None`` and the text slot carries the message. Errors are reported in
    the UI rather than propagated.
    """
    # Clicking the button with nothing recorded/uploaded passes None; say so
    # plainly instead of reporting an unsupported-format error.
    if audio_in is None:
        return "No audio provided.", None, None

    try:
        t0 = time.perf_counter()  # monotonic clock for elapsed-time measurement

        # ORIGINAL (for playback)
        orig_sr, orig_wav = extract_original_audio_tuple(audio_in)

        # PROCESSED (what Whisper actually gets)
        wav = _normalize_input(audio_in)
        if wav.size == 0:
            # Do not hand zero-length arrays to the audio players.
            return "Empty audio.", None, None

        spans = chunk_audio(wav, SR, CHUNK_SEC, STRIDE_SEC)
        max_len = calc_max_len(CHUNK_SEC)  # same budget for every batch
        texts = []

        for i in range(0, len(spans), BATCH_SIZE):
            batch_spans = spans[i:i + BATCH_SIZE]
            waves = [wav[s:e] for (s, e) in batch_spans]

            feats = processor.feature_extractor(
                waves, sampling_rate=SR, return_tensors="pt"
            ).input_features.to(device)

            # On CUDA, autocast runs the model in the reduced-precision
            # dtype chosen at load time.
            with (torch.autocast(device_type="cuda", dtype=dtype)
                  if device == "cuda" else torch.no_grad()):
                out_ids = model.generate(
                    inputs=feats,
                    max_length=max_len,
                    do_sample=False,
                )

            texts.extend(
                processor.tokenizer.batch_decode(out_ids, skip_special_tokens=True)
            )

        full_text = dedup_join(texts, max_overlap_words=12)
        elapsed = time.perf_counter() - t0
        transcript = f"{full_text}\n\n‚è±Ô∏è {elapsed:.2f}s  (chunks: {len(spans)})"

        # Return: text, original audio, processed audio
        return transcript, (orig_sr, orig_wav), (SR, wav)

    except Exception as e:
        # UI boundary: show the failure in the textbox instead of crashing.
        return f"Error: {e}", None, None

# -----------------------------
# Gradio UI
# -----------------------------
# Layout: audio input, a button, the transcript textbox, and two audio
# players so the original and processed signals can be compared by ear.
with gr.Blocks(title="Whisper AZ Transcriber") as demo:
    gr.Markdown("## Whisper AZ Transcriber\nRecord or upload audio; model returns the transcript.\n\n"
                "**Tip:** Compare the original vs processed audio below.")

    with gr.Row():
        audio = gr.Audio(
            sources=["microphone", "upload"],
            # Original note: mic = (sr, numpy); uploads may be dict in some cases.
            # NOTE(review): confirm the upload payload type for the installed
            # Gradio version; _normalize_input accepts either shape.
            type="numpy",
            label="Microphone or upload (.wav/.mp3/etc.)",
        )

    btn = gr.Button("Transcribe")

    with gr.Row():
        out_text = gr.Textbox(label="Transcript", lines=6)

    with gr.Row():
        out_orig = gr.Audio(label="Original (as provided)", type="numpy")
        out_proc = gr.Audio(label="Processed (mono @ 16 kHz for Whisper)", type="numpy")

    # three outputs now: text, original, processed
    # (order must match the 3-tuple returned by transcribe)
    btn.click(transcribe, inputs=audio, outputs=[out_text, out_orig, out_proc])

if __name__ == "__main__":
    # No server_name is passed, so only server_port is overridden here.
    demo.launch(server_port=8000)

---

### 🎧 Why the “Original” sounds noisier than the mic preview

The **first waveform at the top** (from the Gradio input widget) is actually how the browser encodes and streams the audio to Python — typically as **compressed WebM/Opus or MP3**, and Gradio plays it back *before* any decoding.
When you play that same audio again inside the Gradio app (after it’s passed to Python, decoded, processed, possibly resampled, and re-encoded as WAV for the player), you’re hearing a slightly different signal path.

This introduces three differences:

1. **Codec differences** – The browser preview is using a lossy but psychoacoustically optimized codec (Opus). When Gradio sends it to Python, it’s decoded to PCM, then re-encoded to WAV for playback. The WAV has no compression masking, so low-level background noise is more audible.
2. **Resampling noise** – Your code resamples 8 kHz → 16 kHz using a high-quality band-limited algorithm. That upsampling *doesn’t* add new information above 4 kHz but may emphasize existing broadband noise in that band.
3. **Mono down-mixing** – Stereo averaging (`mean(axis=1)`) can bring out noise that was previously panned differently in each channel. If the two channels weren’t perfectly correlated, summing them may slightly decorrelate ambient hiss or room noise, making it sound “fuller.”

---

👉 **The one that Whisper actually “hears” and transcribes** is the **processed version** — the one labeled

> 🎧 *Processed (mono @ 16 kHz for Whisper)*

That’s the float-32, mono, 16 kHz waveform that you create inside `_normalize_input()` before feature extraction.

---

### ✅ Does this affect transcription quality?

No — this is **completely safe** and **expected**.
Here’s why:

| Step                           | Purpose                                                                                            | Effect on ASR quality |
| ------------------------------ | -------------------------------------------------------------------------------------------------- | --------------------- |
| Stereo → Mono                  | Whisper was trained on mono audio; down-mixing is required                                         | **Safe / required**   |
| 8 kHz → 16 kHz                 | Whisper expects 16 kHz input; upsampling simply doubles the sample rate so the FFTs match training | **Safe / required**   |
| 8-bit → float32                | Expands to full dynamic range, no extra quantization noise                                         | **Safe / required**   |
| Float normalization / clamping | Prevents clipping, ensures consistent loudness                                                     | **Safe**              |

So the “noise” you hear in playback is a *perceptual artifact* of resampling and mixing — not a data-quality issue.
Whisper itself sees exactly what it was trained for (mono 16 kHz float PCM), and that small background hiss doesn’t change the model’s ability to recognize speech.

---

### 🧠 Bottom line

* The **processed 16 kHz mono signal** is what Whisper uses.
* The extra hiss is *audible* only in playback; it **does not hurt transcription accuracy**.
* Your preprocessing pipeline is **fully correct** for Whisper.