In [6]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install -q transformers torchaudio torchcodec
!apt install ffmpeg


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [8]:
import glob
import os
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

AUDIO_DIR = "/content/drive/MyDrive/adversarial-audio/Normal-Examples/long-signals"
TARGET_SR = 16000

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def pick_sample(audio_dir=AUDIO_DIR):
    wavs = sorted(glob.glob(os.path.join(audio_dir, "*.wav")))
    if not wavs:
        raise FileNotFoundError(f"No .wav files found in {audio_dir}. Update AUDIO_DIR to your folder.")
    print(f"Found {len(wavs)} .wav files. Using: {wavs[0]}")
    return wavs[0]

SAMPLE_FILE = pick_sample()
# To target a specific file instead of the first one, uncomment and set the path below:
# SAMPLE_FILE = os.path.join(AUDIO_DIR, "your-file.wav")

def load_audio(path, target_sr=TARGET_SR):
    waveform, sr = torchaudio.load(path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # downmix to mono
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)
    return waveform.to(device)

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)


Using device: cuda
Found 300 .wav files. Using: /content/drive/MyDrive/adversarial-audio/Normal-Examples/long-signals/sample-070236.wav


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PGD-Based White-Box Non-Targeted Attack for ASR (Gao et al. (2024))

In [9]:
# pgd_attack.py

import torch
from torch.nn import functional as F
import torchaudio

class PGDAudioAttack:
    def __init__(self, model, processor, epsilon=0.002, alpha=0.0004, steps=10, sample_rate=16000, device=None):
        self.model = model.eval()
        self.processor = processor  # kept for consistency/decoding if needed
        self.epsilon = epsilon
        self.alpha = alpha
        self.steps = steps
        self.sample_rate = sample_rate
        self.device = device or next(model.parameters()).device

    def _prepare_audio(self, audio_tensor):
        # Flatten to mono 1D and add batch dim -> shape [1, time]
        if audio_tensor.dim() == 0:
            raise ValueError("Audio tensor must have at least 1 dimension")
        audio = audio_tensor
        if audio.dim() > 1:
            audio = audio.mean(dim=0)
        audio = audio.view(-1)  # ensure 1D
        return audio.unsqueeze(0).to(self.device)

    def _normalize(self, audio_batch):
        # Simple zero-mean, peak normalization in torch to keep gradients.
        audio = audio_batch - audio_batch.mean(dim=1, keepdim=True)
        peak = audio.abs().max(dim=1, keepdim=True).values.clamp(min=1e-6)
        return audio / peak

    def forward(self, audio_tensor):
        base_audio = self._prepare_audio(audio_tensor).detach()
        perturbed = base_audio.clone().detach().requires_grad_(True)

        with torch.no_grad():
            orig_logits = self.model(self._normalize(base_audio)).logits
            orig_pred = torch.argmax(orig_logits, dim=-1)

        for _ in range(self.steps):
            norm_audio = self._normalize(perturbed)
            logits = self.model(norm_audio).logits
            log_probs = logits.log_softmax(dim=-1)

            input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long, device=self.device)
            target_lengths = torch.full((orig_pred.size(0),), orig_pred.size(1), dtype=torch.long, device=self.device)

            loss = -F.ctc_loss(log_probs.transpose(0, 1), orig_pred,
                               input_lengths=input_lengths,
                               target_lengths=target_lengths,
                               blank=0, reduction='mean', zero_infinity=True)

            loss.backward()
            grad_sign = perturbed.grad.sign()
            perturbed = perturbed + self.alpha * grad_sign
            perturbation = torch.clamp(perturbed - base_audio, min=-self.epsilon, max=self.epsilon)
            perturbed = torch.clamp(base_audio + perturbation, min=-1.0, max=1.0).detach().requires_grad_(True)

        return perturbed.detach()


waveform = load_audio(SAMPLE_FILE)

attacker = PGDAudioAttack(model, processor, sample_rate=TARGET_SR, device=device)
adv_audio = attacker.forward(waveform)

torchaudio.save("/content/adv_sample_pgd.wav", adv_audio.cpu(), TARGET_SR)
print("Saved PGD adversarial example to /content/adv_sample_pgd.wav")


Saved PGD adversarial example to /content/adv_sample_pgd.wav


Imperceptible White-Box ASR Attack Using Psychoacoustic Masking (Abdullah et al. (2021))

In [None]:
# imperceptible_attack.py

import torch
import torchaudio
import torchaudio.transforms as T
from torch.nn import functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

class ImperceptibleAttack:
    def __init__(self, model, processor, epsilon=0.002, alpha=0.0003, steps=15, sample_rate=16000, masking_db=20, device=None):
        self.model = model.eval()
        self.processor = processor
        self.epsilon = epsilon
        self.alpha = alpha
        self.steps = steps
        self.sample_rate = sample_rate
        self.masking_db = masking_db
        self.device = device or next(model.parameters()).device
        self.melspec = T.MelSpectrogram(sample_rate=sample_rate, n_mels=64).to(self.device)

    def _prepare_audio(self, audio_tensor):
        # Flatten to mono 1D and add batch dim -> shape [1, time]
        if audio_tensor.dim() == 0:
            raise ValueError("Audio tensor must have at least 1 dimension")
        audio = audio_tensor
        if audio.dim() > 1:
            audio = audio.mean(dim=0)
        audio = audio.view(-1)
        return audio.unsqueeze(0).to(self.device)

    def _normalize(self, audio_batch):
        # Zero-mean, peak-normalize to keep values in [-1, 1].
        audio = audio_batch - audio_batch.mean(dim=1, keepdim=True)
        peak = audio.abs().max(dim=1, keepdim=True).values.clamp(min=1e-6)
        return audio / peak

    def masking_threshold(self, audio):
        # Generate a crude auditory masking threshold.
        with torch.no_grad():
            spec = self.melspec(audio)
            threshold = spec.mean(dim=-1, keepdim=True) - self.masking_db
            mask = (spec >= threshold).float()
        return mask

    def apply_mask(self, grad, mask):
        # Scale gradients based on Mel mask energy. Interpolate mask to waveform length.
        # mask shape: [batch, n_mels, time_frames]
        # grad shape: [batch, time]
        mel_mask = mask.mean(dim=1)  # [batch, time_frames] - average across mel bins
        # Interpolate to match grad time dimension
        if mel_mask.dim() == 2:
            mel_mask = mel_mask.unsqueeze(1)  # [batch, 1, time_frames] for interpolate
        mel_mask = F.interpolate(mel_mask, size=grad.shape[-1], mode='linear', align_corners=False)
        mel_mask = mel_mask.squeeze(1)  # [batch, time] to match grad
        return grad * mel_mask

    def forward(self, audio_tensor):
        base_audio = self._prepare_audio(audio_tensor).detach()
        perturbed = base_audio.clone().detach().requires_grad_(True)

        with torch.no_grad():
            orig_logits = self.model(self._normalize(base_audio)).logits
            orig_pred = torch.argmax(orig_logits, dim=-1)

        masking_mask = self.masking_threshold(base_audio)

        for _ in range(self.steps):
            norm_audio = self._normalize(perturbed)
            logits = self.model(norm_audio).logits
            log_probs = logits.log_softmax(dim=-1)

            input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long, device=self.device)
            target_lengths = torch.full((orig_pred.size(0),), orig_pred.size(1), dtype=torch.long, device=self.device)

            loss = -F.ctc_loss(log_probs.transpose(0, 1), orig_pred,
                               input_lengths=input_lengths,
                               target_lengths=target_lengths,
                               blank=0, reduction='mean', zero_infinity=True)
            loss.backward()

            grad_sign = perturbed.grad.sign()
            grad_masked = self.apply_mask(grad_sign, masking_mask)
            perturbed = perturbed + self.alpha * grad_masked
            perturbation = torch.clamp(perturbed - base_audio, min=-self.epsilon, max=self.epsilon)
            perturbed = torch.clamp(base_audio + perturbation, min=-1.0, max=1.0).detach().requires_grad_(True)

        return perturbed.detach()


waveform = load_audio(SAMPLE_FILE)

attacker = ImperceptibleAttack(model, processor, sample_rate=TARGET_SR, device=device)
adv_audio = attacker.forward(waveform)

torchaudio.save("/content/adv_imperceptible.wav", adv_audio.cpu(), TARGET_SR)
print("Saved imperceptible adversarial example to /content/adv_imperceptible.wav")


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 1, 105600]

AdvReverb: Convolutional White-Box Adversarial Attack (Chen et al. (2023))

In [None]:
import torch
import torch.nn.functional as F
import torchaudio

class AdvReverbAttack:
    def __init__(self, model, processor, rir_len=2048, lr=1e-3, steps=300, sample_rate=16000, device=None):
        self.model = model.eval()
        self.processor = processor
        self.rir_len = rir_len
        self.lr = lr
        self.steps = steps
        self.sample_rate = sample_rate
        self.device = device or next(model.parameters()).device

    def _prepare_audio(self, audio_tensor):
        # Ensure a single-channel batch tensor shaped [1, time]
        audio = audio_tensor
        if audio.dim() == 3:  # [batch, channel, time]
            audio = audio.mean(dim=1)  # [batch, time]
        elif audio.dim() == 2:  # could be [channels, time] or [batch, time]
            if audio.size(0) > 1:
                audio = audio.mean(dim=0, keepdim=True)  # [1, time]
            # If already [1, time], keep it as is
        elif audio.dim() == 1:
            audio = audio.unsqueeze(0)  # [1, time]
        else:
            raise ValueError(f"Unexpected audio tensor shape: {audio.shape}")
        
        # Ensure final shape is exactly [1, time]
        if audio.dim() != 2 or audio.size(0) != 1:
            # Flatten to 1D and add batch dimension
            audio = audio.view(-1).unsqueeze(0)
        
        return audio.to(self.device)
    
    def _normalize(self, audio_batch):
        # Zero-mean, peak-normalize to keep values in [-1, 1].
        audio = audio_batch - audio_batch.mean(dim=1, keepdim=True)
        peak = audio.abs().max(dim=1, keepdim=True).values.clamp(min=1e-6)
        return audio / peak

    def forward(self, audio_tensor):
        base_audio = self._prepare_audio(audio_tensor).detach()  # [1, time]

        rir = torch.nn.Parameter(torch.randn(self.rir_len, device=self.device) * 1e-3)
        optimizer = torch.optim.Adam([rir], lr=self.lr)

        # Get original prediction using direct model input (not processor)
        with torch.no_grad():
            base_audio_norm = self._normalize(base_audio)
            orig_logits = self.model(base_audio_norm).logits
            orig_pred = torch.argmax(orig_logits, dim=-1)

        for _ in range(self.steps):
            optimizer.zero_grad()
            rir_windowed = rir * torch.hann_window(self.rir_len, device=self.device)
            rir_norm = rir_windowed / torch.norm(rir_windowed, p=2)

            # Apply convolution: base_audio is [1, time], unsqueeze to [1, 1, time]
            adv_audio = torch.nn.functional.conv1d(base_audio.unsqueeze(1), rir_norm.view(1, 1, -1), padding=self.rir_len // 2)
            # adv_audio is [1, 1, time], squeeze to [1, time]
            adv_audio = adv_audio.squeeze(1)
            adv_audio_norm = self._normalize(adv_audio)

            # Pass normalized audio directly to model
            logits = self.model(adv_audio_norm).logits
            log_probs = logits.log_softmax(dim=-1)

            input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long, device=self.device)
            target_lengths = torch.full((orig_pred.size(0),), orig_pred.size(1), dtype=torch.long, device=self.device)

            loss = -F.ctc_loss(log_probs.transpose(0, 1), orig_pred,
                               input_lengths=input_lengths,
                               target_lengths=target_lengths,
                               blank=0, reduction='mean', zero_infinity=True)
            loss.backward()
            optimizer.step()

        # Final convolution with optimized RIR
        with torch.no_grad():
            final_rir = rir * torch.hann_window(self.rir_len, device=self.device)
            final_rir = final_rir / torch.norm(final_rir, p=2)
            # base_audio is [1, time], unsqueeze to [1, 1, time]
            adv_audio = torch.nn.functional.conv1d(base_audio.unsqueeze(1), final_rir.view(1, 1, -1), padding=self.rir_len // 2)
            # Squeeze to [1, time]
            adv_audio = adv_audio.squeeze(1)

        return adv_audio.detach(), final_rir.detach()


waveform = load_audio(SAMPLE_FILE)

attacker = AdvReverbAttack(model, processor, sample_rate=TARGET_SR, device=device)
adv_audio, adv_rir = attacker.forward(waveform)

torchaudio.save("/content/adv_reverb.wav", adv_audio.cpu(), TARGET_SR)
print("Saved reverb adversarial example to /content/adv_reverb.wav")


In [None]:
# Install additional dependencies for metrics and ASR
%pip install -q pesq pystoi openai-whisper librosa scipy soundfile


In [None]:
# Metrics computation functions
import numpy as np
import subprocess
import tempfile
import json
from pathlib import Path
from typing import Dict, Tuple, Optional
import whisper
from pesq import pesq as pesq_metric
from pystoi.stoi import stoi as stoi_metric
import librosa

# Initialize Whisper model
print("Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("Whisper model loaded!")

def compute_snr(original: np.ndarray, degraded: np.ndarray) -> float:
    """Calculate Signal-to-Noise Ratio (SNR) in dB."""
    # Ensure same length by trimming to minimum
    min_len = min(len(original), len(degraded))
    original = original[:min_len]
    degraded = degraded[:min_len]
    
    signal_power = np.mean(original ** 2)
    noise_power = np.mean((degraded - original) ** 2)
    if noise_power == 0:
        return float('inf')
    snr = 10 * np.log10(signal_power / noise_power)
    return float(snr)

def compute_pesq(reference: np.ndarray, degraded: np.ndarray, sr: int = TARGET_SR) -> float:
    """Compute PESQ (ITU-T P.862). Uses wideband mode for 16 kHz."""
    min_len = min(len(reference), len(degraded))
    reference = reference[:min_len]
    degraded = degraded[:min_len]
    try:
        return float(pesq_metric(sr, reference, degraded, 'wb'))
    except:
        return 0.0

def compute_stoi(reference: np.ndarray, degraded: np.ndarray, sr: int = TARGET_SR) -> float:
    """Compute STOI (0..1)."""
    min_len = min(len(reference), len(degraded))
    reference = reference[:min_len]
    degraded = degraded[:min_len]
    try:
        return float(stoi_metric(reference, degraded, sr, extended=False))
    except:
        return 0.0

def compute_wer_cer(reference_text: str, hypothesis_text: str) -> Tuple[float, float]:
    """Compute Word Error Rate (WER) and Character Error Rate (CER)."""
    ref_words = reference_text.lower().split()
    hyp_words = hypothesis_text.lower().split()
    
    # WER using Levenshtein distance on words
    if len(ref_words) == 0:
        wer = 1.0 if len(hyp_words) > 0 else 0.0
    else:
        # Simple word-level edit distance
        from difflib import SequenceMatcher
        matcher = SequenceMatcher(None, ref_words, hyp_words)
        wer = 1.0 - matcher.ratio()
    
    # CER
    ref_chars = list(reference_text.lower().replace(" ", ""))
    hyp_chars = list(hypothesis_text.lower().replace(" ", ""))
    if len(ref_chars) == 0:
        cer = 1.0 if len(hyp_chars) > 0 else 0.0
    else:
        from difflib import SequenceMatcher
        matcher = SequenceMatcher(None, ref_chars, hyp_chars)
        cer = 1.0 - matcher.ratio()
    
    return float(wer), float(cer)

def transcribe_audio(audio_array: np.ndarray, sr: int = TARGET_SR) -> str:
    """Transcribe audio using Whisper."""
    # Save to temporary file for Whisper
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
        tmp_path = tmp_file.name
        import soundfile as sf
        sf.write(tmp_path, audio_array, sr)
    
    try:
        result = whisper_model.transcribe(tmp_path)
        transcript = result["text"].strip()
    except Exception as e:
        print(f"Whisper transcription error: {e}")
        transcript = ""
    finally:
        import os
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    
    return transcript

def compute_all_metrics(original_audio: np.ndarray, processed_audio: np.ndarray, 
                        original_transcript: Optional[str] = None, 
                        sr: int = TARGET_SR) -> Dict[str, float]:
    """Compute all metrics: SNR, PESQ, STOI, WER, CER."""
    metrics = {}
    
    # Audio quality metrics
    metrics['snr'] = compute_snr(original_audio, processed_audio)
    metrics['pesq'] = compute_pesq(original_audio, processed_audio, sr)
    metrics['stoi'] = compute_stoi(original_audio, processed_audio, sr)
    
    # ASR metrics (if transcript provided)
    if original_transcript:
        processed_transcript = transcribe_audio(processed_audio, sr)
        wer, cer = compute_wer_cer(original_transcript, processed_transcript)
        metrics['wer'] = wer
        metrics['cer'] = cer
        metrics['transcript'] = processed_transcript
    else:
        # Get transcript from original audio
        original_transcript = transcribe_audio(original_audio, sr)
        processed_transcript = transcribe_audio(processed_audio, sr)
        wer, cer = compute_wer_cer(original_transcript, processed_transcript)
        metrics['wer'] = wer
        metrics['cer'] = cer
        metrics['transcript'] = processed_transcript
    
    return metrics

print("Metrics functions ready!")


In [None]:
# Compression functions using ffmpeg
import soundfile as sf

def compress_opus(audio_array: np.ndarray, output_path: str, sr: int = TARGET_SR, bitrate: int = 64) -> np.ndarray:
    """Compress audio to Opus format and decode back to numpy array."""
    # Save input as temporary WAV
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_in:
        tmp_in_path = tmp_in.name
        sf.write(tmp_in_path, audio_array, sr)
    
    # Compress to Opus
    try:
        subprocess.run([
            'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
            '-i', tmp_in_path,
            '-codec:a', 'libopus',
            '-b:a', f'{bitrate}k',
            output_path
        ], check=True, capture_output=True)
        
        # Decode back to WAV
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_out:
            tmp_out_path = tmp_out.name
            subprocess.run([
                'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
                '-i', output_path,
                tmp_out_path
            ], check=True, capture_output=True)
            
            # Load decoded audio
            decoded_audio, decoded_sr = librosa.load(tmp_out_path, sr=sr, mono=True)
            
            # Cleanup
            import os
            if os.path.exists(tmp_out_path):
                os.remove(tmp_out_path)
            if os.path.exists(tmp_in_path):
                os.remove(tmp_in_path)
            
            return decoded_audio
    except subprocess.CalledProcessError as e:
        print(f"Opus compression error: {e}")
        import os
        if os.path.exists(tmp_in_path):
            os.remove(tmp_in_path)
        return audio_array  # Return original on error

def compress_amr(audio_array: np.ndarray, output_path: str, sr: int = TARGET_SR, bitrate: float = 12.65) -> np.ndarray:
    """Compress audio to AMR-WB format and decode back to numpy array.
    Falls back to AAC if AMR-WB encoder is not available."""
    # Save input as temporary WAV
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_in:
        tmp_in_path = tmp_in.name
        sf.write(tmp_in_path, audio_array, sr)
    
    import os
    
    # Try AMR-WB first, fall back to AAC if not available
    encoders_to_try = [
        ('libopencore-amrwb', 'amr'),  # AMR-WB decoder (read-only)
        ('libvo_amrwbenc', 'amr'),      # AMR-WB encoder
        ('aac', 'm4a'),                  # AAC fallback
    ]
    
    for encoder, ext in encoders_to_try:
        try:
            # Adjust output path extension if needed
            if ext != output_path.split('.')[-1]:
                output_path_adj = output_path.rsplit('.', 1)[0] + '.' + ext
            else:
                output_path_adj = output_path
            
            # Compress
            if encoder == 'aac':
                # AAC doesn't need special sample rate
                subprocess.run([
                    'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
                    '-i', tmp_in_path,
                    '-codec:a', encoder,
                    '-b:a', f'{int(bitrate)}k',
                    output_path_adj
                ], check=True, capture_output=True, text=True)
            else:
                # AMR-WB requires 16kHz
                bitrate_bps = int(bitrate * 1000)
                subprocess.run([
                    'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
                    '-i', tmp_in_path,
                    '-ar', '16000',
                    '-ac', '1',
                    '-codec:a', encoder,
                    '-b:a', str(bitrate_bps),
                    output_path_adj
                ], check=True, capture_output=True, text=True)
            
            # Decode back to WAV
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_out:
                tmp_out_path = tmp_out.name
                subprocess.run([
                    'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
                    '-i', output_path_adj,
                    tmp_out_path
                ], check=True, capture_output=True, text=True)
                
                # Load decoded audio
                decoded_audio, decoded_sr = librosa.load(tmp_out_path, sr=sr, mono=True)
                
                # Cleanup
                if os.path.exists(tmp_out_path):
                    os.remove(tmp_out_path)
                if os.path.exists(tmp_in_path):
                    os.remove(tmp_in_path)
                
                # Update output_path to actual path used
                if output_path_adj != output_path:
                    # Move file to expected location
                    if os.path.exists(output_path_adj):
                        import shutil
                        shutil.move(output_path_adj, output_path)
                
                return decoded_audio
                
        except subprocess.CalledProcessError:
            # Try next encoder
            continue
    
    # All encoders failed
    print(f"AMR-WB compression failed: No suitable encoder found. Using original audio.")
    if os.path.exists(tmp_in_path):
        os.remove(tmp_in_path)
    return audio_array  # Return original on error

print("Compression functions ready!")


In [None]:
# Main pipeline function
import time
from datetime import datetime

def torch_to_numpy(audio_tensor):
    """Convert torch tensor to numpy array."""
    if isinstance(audio_tensor, torch.Tensor):
        audio_np = audio_tensor.cpu().numpy()
        if audio_np.ndim > 1:
            audio_np = audio_np.squeeze()
        return audio_np
    return audio_tensor

def process_audio_file(audio_path: str, output_dir: str = "/content/results") -> Dict:
    """Process a single audio file through all pipelines."""
    print(f"\n{'='*80}")
    print(f"Processing: {os.path.basename(audio_path)}")
    print(f"{'='*80}")
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    result = {
        "original_file": os.path.basename(audio_path),
        "baseline_metrics": {},
        "pgd_attack": {"opus": {}, "amr": {}},
        "advreverb_attack": {"opus": {}, "amr": {}}
    }
    
    try:
        # Load original audio
        waveform = load_audio(audio_path)
        original_np = torch_to_numpy(waveform)
        
        # Normalize for metrics computation
        original_np = original_np / np.max(np.abs(original_np)) if np.max(np.abs(original_np)) > 0 else original_np
        
        # Compute baseline metrics (SNR is inf, PESQ/STOI are perfect)
        print("\n[1] Computing baseline metrics...")
        result["baseline_metrics"] = {
            "snr": float('inf'),
            "pesq": 5.0,
            "stoi": 1.0
        }
        print(f"  Baseline: SNR=inf, PESQ=5.0, STOI=1.0")
        
        # Get original transcript
        original_transcript = transcribe_audio(original_np, TARGET_SR)
        print(f"  Original transcript: '{original_transcript}'")
        
        # ========== PGD ATTACK PIPELINE ==========
        print("\n[2] PGD Attack Pipeline")
        print("-" * 80)
        
        # Apply PGD attack
        print("  Applying PGD attack...")
        pgd_attacker = PGDAudioAttack(model, processor, sample_rate=TARGET_SR, device=device)
        pgd_audio = pgd_attacker.forward(waveform)
        pgd_np = torch_to_numpy(pgd_audio)
        pgd_np = pgd_np / np.max(np.abs(pgd_np)) if np.max(np.abs(pgd_np)) > 0 else pgd_np
        
        # PGD → Opus
        print("  Compressing to Opus...")
        opus_path = os.path.join(output_dir, f"{Path(audio_path).stem}_pgd_opus.opus")
        pgd_opus_decoded = compress_opus(pgd_np, opus_path, TARGET_SR)
        print("  Computing metrics on Opus-compressed audio...")
        pgd_opus_metrics = compute_all_metrics(original_np, pgd_opus_decoded, original_transcript, TARGET_SR)
        result["pgd_attack"]["opus"] = {
            "compressed_file": os.path.basename(opus_path),
            "metrics": pgd_opus_metrics
        }
        print(f"    WER: {pgd_opus_metrics['wer']:.3f}, CER: {pgd_opus_metrics['cer']:.3f}")
        print(f"    SNR: {pgd_opus_metrics['snr']:.2f} dB, PESQ: {pgd_opus_metrics['pesq']:.2f}, STOI: {pgd_opus_metrics['stoi']:.3f}")
        
        # PGD → AMR-WB
        print("  Compressing to AMR-WB...")
        amr_path = os.path.join(output_dir, f"{Path(audio_path).stem}_pgd_amr.amr")
        pgd_amr_decoded = compress_amr(pgd_np, amr_path, TARGET_SR)
        print("  Computing metrics on AMR-WB-compressed audio...")
        pgd_amr_metrics = compute_all_metrics(original_np, pgd_amr_decoded, original_transcript, TARGET_SR)
        result["pgd_attack"]["amr"] = {
            "compressed_file": os.path.basename(amr_path),
            "metrics": pgd_amr_metrics
        }
        print(f"    WER: {pgd_amr_metrics['wer']:.3f}, CER: {pgd_amr_metrics['cer']:.3f}")
        print(f"    SNR: {pgd_amr_metrics['snr']:.2f} dB, PESQ: {pgd_amr_metrics['pesq']:.2f}, STOI: {pgd_amr_metrics['stoi']:.3f}")
        
        # ========== ADVREVERB ATTACK PIPELINE ==========
        print("\n[3] AdvReverb Attack Pipeline")
        print("-" * 80)
        
        # Apply AdvReverb attack
        print("  Applying AdvReverb attack...")
        reverb_attacker = AdvReverbAttack(model, processor, sample_rate=TARGET_SR, device=device)
        reverb_audio, _ = reverb_attacker.forward(waveform)
        reverb_np = torch_to_numpy(reverb_audio)
        reverb_np = reverb_np / np.max(np.abs(reverb_np)) if np.max(np.abs(reverb_np)) > 0 else reverb_np
        
        # AdvReverb → Opus
        print("  Compressing to Opus...")
        opus_path = os.path.join(output_dir, f"{Path(audio_path).stem}_advreverb_opus.opus")
        reverb_opus_decoded = compress_opus(reverb_np, opus_path, TARGET_SR)
        print("  Computing metrics on Opus-compressed audio...")
        reverb_opus_metrics = compute_all_metrics(original_np, reverb_opus_decoded, original_transcript, TARGET_SR)
        result["advreverb_attack"]["opus"] = {
            "compressed_file": os.path.basename(opus_path),
            "metrics": reverb_opus_metrics
        }
        print(f"    WER: {reverb_opus_metrics['wer']:.3f}, CER: {reverb_opus_metrics['cer']:.3f}")
        print(f"    SNR: {reverb_opus_metrics['snr']:.2f} dB, PESQ: {reverb_opus_metrics['pesq']:.2f}, STOI: {reverb_opus_metrics['stoi']:.3f}")
        
        # AdvReverb → AMR-WB
        print("  Compressing to AMR-WB...")
        amr_path = os.path.join(output_dir, f"{Path(audio_path).stem}_advreverb_amr.amr")
        reverb_amr_decoded = compress_amr(reverb_np, amr_path, TARGET_SR)
        print("  Computing metrics on AMR-WB-compressed audio...")
        reverb_amr_metrics = compute_all_metrics(original_np, reverb_amr_decoded, original_transcript, TARGET_SR)
        result["advreverb_attack"]["amr"] = {
            "compressed_file": os.path.basename(amr_path),
            "metrics": reverb_amr_metrics
        }
        print(f"    WER: {reverb_amr_metrics['wer']:.3f}, CER: {reverb_amr_metrics['cer']:.3f}")
        print(f"    SNR: {reverb_amr_metrics['snr']:.2f} dB, PESQ: {reverb_amr_metrics['pesq']:.2f}, STOI: {reverb_amr_metrics['stoi']:.3f}")
        
        print(f"\n✓ Completed processing: {os.path.basename(audio_path)}")
        
    except Exception as e:
        print(f"\n✗ Error processing {audio_path}: {e}")
        import traceback
        traceback.print_exc()
        result["error"] = str(e)
    
    return result

def process_all_files(audio_dir: str = AUDIO_DIR, output_json: str = "/content/pipeline_results.json"):
    """Process all audio files in directory."""
    wav_files = sorted(glob.glob(os.path.join(audio_dir, "*.wav")))
    
    if not wav_files:
        print(f"No .wav files found in {audio_dir}")
        return
    
    print(f"\n{'='*80}")
    print(f"Starting batch processing of {len(wav_files)} files")
    print(f"{'='*80}")
    
    all_results = []
    start_time = time.time()
    
    for i, audio_path in enumerate(wav_files, 1):
        print(f"\n\n[{i}/{len(wav_files)}] Processing file...")
        result = process_audio_file(audio_path)
        all_results.append(result)
        
        # Save intermediate results periodically
        if i % 10 == 0:
            with open(output_json, 'w') as f:
                json.dump(all_results, f, indent=2)
            print(f"\n  Intermediate save: {i} files processed")
    
    # Final save
    with open(output_json, 'w') as f:
        json.dump(all_results, f, indent=2)
    
    elapsed = time.time() - start_time
    print(f"\n{'='*80}")
    print(f"Batch processing complete!")
    print(f"  Total files: {len(wav_files)}")
    print(f"  Total time: {elapsed/60:.2f} minutes")
    print(f"  Results saved to: {output_json}")
    print(f"{'='*80}")

print("Pipeline functions ready!")


In [None]:
# Run the complete pipeline on all files
process_all_files(AUDIO_DIR, "/content/pipeline_results.json")
