<a href="https://colab.research.google.com/github/KaifAhmad1/deepfake/blob/main/Audio_Deepfake_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Audio Deepfake Detection, Fake Calls, Spoofing, Fraud Calls and Voice Cloning Analysis for Defensive Forensics**

In [23]:
!pip install -q numpy librosa soundfile matplotlib IPython webrtcvad pydub noisereduce pyAudioAnalysis speechbrain langchain openai langgraph transformers vllm requests ipywidgets
!pip install -q audiomentations hmmlearn eyed3 langchain_community

In [25]:
import os
import json
import asyncio
import concurrent.futures
import requests
from datetime import datetime
from typing import List, Dict, Any, Tuple

import numpy as np
import librosa
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
import IPython.display as ipd

import webrtcvad
from pydub import AudioSegment
from audiomentations import Compose, AddGaussianNoise
import noisereduce as nr

from pyAudioAnalysis import audioSegmentation as aS
import speechbrain as sb
from speechbrain.inference.speaker import SpeakerRecognition

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import openai

from langgraph.graph import StateGraph, END

from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams

import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

In [26]:
#############################################
# Data Model for Forensic Report
#############################################
class ForensicReport:
    """Container for everything produced by one forensic analysis run.

    Every field is supplied as a keyword argument. A field that is not
    supplied is ``None``, except ``extra_info`` and ``vllm_model_outputs``,
    which default to a fresh empty dict.
    """

    def __init__(self, **kwargs):
        self.file = kwargs.get("file")
        self.verdict = kwargs.get("verdict")
        self.mean_score = kwargs.get("mean_score")
        self.confidence = kwargs.get("confidence")
        self.all_model_scores = kwargs.get("all_model_scores")
        self.all_anomalies = kwargs.get("all_anomalies")
        self.natural_summary = kwargs.get("natural_summary")
        self.asr_transcript = kwargs.get("asr_transcript")
        self.asr_lang = kwargs.get("asr_lang")
        self.speaker_identities = kwargs.get("speaker_identities")
        self.speaker_spoof_score = kwargs.get("speaker_spoof_score")
        self.noise_quality_score = kwargs.get("noise_quality_score")
        self.gender_distribution = kwargs.get("gender_distribution")
        self.detailed_results = kwargs.get("detailed_results")
        self.timestamp = kwargs.get("timestamp")
        self.extra_info = kwargs.get("extra_info", {})
        self.vllm_model_outputs = kwargs.get("vllm_model_outputs", {})

    @staticmethod
    def _json_default(value):
        """Convert values the stock JSON encoder rejects into plain Python.

        NumPy scalars (e.g. ``np.float32``) and arrays are not JSON
        serializable by default, yet the analysis functions in this notebook
        produce them; anything else unknown falls back to its string form so
        that serializing a report never fails.
        """
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, (set, frozenset)):
            return sorted(value, key=str)
        return str(value)

    def json(self, indent=2):
        """Return the report's attributes as a JSON string.

        Args:
            indent: Indentation width passed to ``json.dumps``.

        Returns:
            The serialized attribute dictionary of this instance.
        """
        return json.dumps(self.__dict__, indent=indent, default=self._json_default)

In [27]:
#############################################
# Audio Preprocessing and Feature Extraction
#############################################
def preprocess_audio(audio_path: str, out_sr: int = 16000, mono: bool = True, reduce_noise: bool = True) -> Tuple[np.ndarray, int]:
    """Load an audio file, resample it, peak-normalize it and optionally denoise it.

    WAV files are read directly with soundfile; any other extension is first
    converted to WAV through pydub and then read back.

    Args:
        audio_path: Path to the input audio file.
        out_sr: Target sample rate in Hz.
        mono: When True, multi-channel audio is averaged down to one channel.
        reduce_noise: When True, ``noisereduce`` is applied after normalization;
            a failure there is reported and the un-denoised audio is kept.

    Returns:
        ``(audio, sr)`` where ``audio`` is a float array and ``sr`` equals
        ``out_sr``.

    Raises:
        RuntimeError: If a WAV file cannot be read.
        ValueError: If the file contains no samples.
    """
    import tempfile  # local import: only needed for the conversion scratch file

    print("[Step 1] Loading and preprocessing audio...")
    ext = os.path.splitext(audio_path)[1].lower()
    if ext == ".wav":
        try:
            # soundfile.read() has no ``mmap`` keyword; passing it made every read fail.
            audio, sr = sf.read(audio_path, dtype='float32', always_2d=False)
        except Exception as e:
            raise RuntimeError(f"Error reading WAV file {audio_path}: {e}") from e
        if mono and audio.ndim > 1:
            audio = np.mean(audio, axis=1)
        if sr != out_sr:
            print("[Step 1] Resampling from {} to {} Hz".format(sr, out_sr))
            # Time runs along axis 0 (channels, if any, are on axis 1 as the
            # mono down-mix above shows), so resample along axis 0.
            audio = librosa.resample(audio, orig_sr=sr, target_sr=out_sr, axis=0)
            sr = out_sr
    else:
        print("[Step 1] Converting non-WAV file to WAV format...")
        audio_seg = AudioSegment.from_file(audio_path)
        audio_seg = audio_seg.set_frame_rate(out_sr).set_channels(1 if mono else 2)
        # A unique scratch file avoids clobbering a "temp_input.wav" in the
        # working directory and collisions between concurrent runs.
        fd, temp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            exported = audio_seg.export(temp_wav, format="wav")
            if hasattr(exported, "close"):
                exported.close()
            audio, sr = sf.read(temp_wav, dtype='float32', always_2d=False)
        finally:
            # Always clean up, including when export or read fails.
            if os.path.exists(temp_wav):
                os.remove(temp_wav)
        if mono and audio.ndim > 1:
            audio = np.mean(audio, axis=1)
    if audio.size == 0:
        raise ValueError(f"Audio file {audio_path} contains no samples.")
    # Peak normalization; the epsilon guards against division by zero on silence.
    audio = audio / (np.max(np.abs(audio)) + 1e-8)
    if reduce_noise:
        print("[Step 1] Applying noise reduction...")
        try:
            audio = nr.reduce_noise(y=audio, sr=sr)
        except Exception as e:
            print(f"[Step 1] Noise reduction failed; proceeding without. ({e})")
    print("[Step 1] Audio loaded successfully with {} samples at {} Hz".format(len(audio), sr))
    return audio, sr

def extract_features(audio: np.ndarray, sr: int) -> Dict[str, float]:
    """Compute summary acoustic features for a mono waveform.

    Args:
        audio: Waveform samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        A dict of plain Python floats: duration, RMS energy, zero-crossing
        rate, spectral centroid/bandwidth/rolloff/flatness, MFCC and chroma
        mean/std, short-term energy std and an SNR estimate in dB.
    """
    # feature_extraction lives in ShortTermFeatures; audioSegmentation does not
    # expose it as a module attribute.
    from pyAudioAnalysis import ShortTermFeatures

    print("[Step 2] Extracting audio features...")
    feat = {}
    feat['duration'] = len(audio) / sr
    feat['energy'] = np.sqrt(np.mean(audio ** 2))
    feat['zcr'] = np.mean(librosa.feature.zero_crossing_rate(y=audio))
    feat['rmse'] = np.mean(librosa.feature.rms(y=audio))
    feat['spec_centroid'] = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
    feat['spec_bandwidth'] = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr))
    feat['spec_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr))
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    feat['mfcc_mean'] = np.mean(mfccs)
    feat['mfcc_std'] = np.std(mfccs)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    feat['chroma_mean'] = np.mean(chroma)
    feat['chroma_std'] = np.std(chroma)
    # 50 ms window, 25 ms step.
    st_feats, _ = ShortTermFeatures.feature_extraction(audio, sr, int(0.050 * sr), int(0.025 * sr))
    # NOTE(review): row 1 is taken to be the short-term energy feature — confirm
    # against the feature-name list returned alongside st_feats.
    feat['st_energy_std'] = np.std(st_feats[1, :])
    feat['spectral_flatness'] = np.mean(librosa.feature.spectral_flatness(y=audio))
    try:
        signal_power = np.mean(audio ** 2)
        # hpss() returns (harmonic, percussive); the signal minus its percussive
        # part is what this heuristic treats as "noise".
        # NOTE(review): confirm this is the intended noise estimate.
        noise_est = audio - librosa.effects.hpss(audio)[1]
        noise_power = np.mean(noise_est ** 2)
        feat['snr_est'] = 10 * np.log10((signal_power + 1e-6) / (noise_power + 1e-6))
    except Exception:
        feat['snr_est'] = 0
    print("[Step 2] Features extracted: Duration {:.2f} sec, Energy {:.3f}, ZCR {:.3f}".format(feat['duration'], feat['energy'], feat['zcr']))
    # NumPy scalars are converted so the result matches the annotation and can
    # be JSON-serialized downstream.
    return {name: float(value) for name, value in feat.items()}

def extract_vad_ratio(audio: np.ndarray, sr: int) -> float:
    """Return the fraction of 10 ms frames that WebRTC VAD classifies as speech.

    Args:
        audio: Mono waveform; values are clipped to [-1, 1] before conversion
            to 16-bit PCM.
        sr: Sample rate in Hz, passed straight to ``webrtcvad``.

    Returns:
        Speech-frame ratio in [0, 1]. Any failure (including no complete
        frame) yields 0; the failure reason is printed.
    """
    print("[Step 3] Running VAD (Voice Activity Detection)...")
    try:
        vad = webrtcvad.Vad(2)
        # Clip and scale by 32767: scaling +1.0 by 32768 overflows int16 and
        # wraps a positive peak to the most negative value.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        audio_bytes = pcm.tobytes()
        # 10 ms of 16-bit mono audio; equals the former fixed 320 bytes at 16 kHz.
        frame_length = int(sr * 0.010) * 2
        speech_frames = 0
        total_frames = 0
        for i in range(0, len(audio_bytes) - frame_length + 1, frame_length):
            total_frames += 1
            if vad.is_speech(audio_bytes[i:i+frame_length], sr):
                speech_frames += 1
        ratio = speech_frames / total_frames if total_frames else 0.0
    except Exception as e:
        print(f"[Step 3] VAD failed ({e}); reporting a speech ratio of 0.")
        ratio = 0
    print(f"[Step 3] VAD ratio: {ratio:.3f}")
    return ratio

def extract_gender_distribution(audio_path: str) -> Tuple[int, int, Dict[str, float]]:
    """Summarize the "male"/"female" labels returned by ``py_audio_segmentation``.

    Args:
        audio_path: Path to the audio file to segment.

    Returns:
        ``(segment_count, male_segment_count, distribution)`` where
        ``distribution`` maps ``"male"`` and ``"female"`` to their share of the
        labelled segments. On any failure ``(0, 0, {})`` is returned.
    """
    print("[Step 4] Estimating gender distribution via speaker diarization...")
    try:
        segs, _ = py_audio_segmentation(audio_path)
        labels = [s.lower() for s in segs]
        # Prefix matching, not substring matching: "female" contains "male",
        # so a substring test counted every segment as male.
        male_segs = sum(1 for label in labels if label.startswith("male"))
        female_segs = sum(1 for label in labels if label.startswith("female"))
        total = male_segs + female_segs
        distribution = {"male": male_segs/total if total>0 else 0,
                        "female": female_segs/total if total>0 else 0}
        print("[Step 4] Gender distribution: {}".format(distribution))
        return (len(segs), male_segs, distribution)
    except Exception:
        return (0, 0, {})

def py_audio_segmentation(audio_path: str) -> Tuple[List[str], List[Any]]:
    """Run two-speaker diarization and label each segment by cluster id.

    Cluster 0 is labelled ``"male"`` and every other cluster ``"female"``.
    NOTE(review): this mapping is a fixed convention of this function; nothing
    here ties a cluster id to an actual gender.

    Args:
        audio_path: Path to the audio file.

    Returns:
        ``(labels, cluster_ids)``; both lists are empty if diarization fails.
    """
    try:
        result = aS.speaker_diarization(audio_path, 2, plot_res=False)
        # The per-segment cluster ids are the first element when a tuple is
        # returned (the remaining elements are purity metrics, not labels);
        # a bare array is also accepted.
        cluster_ids = result[0] if isinstance(result, tuple) else result
        classes = [int(c) for c in cluster_ids]
        seg_labels = ["male" if c == 0 else "female" for c in classes]
        return seg_labels, classes
    except Exception:
        return [], []

In [28]:
#############################################
# Forensic Agent Functions using SpeechBrain
#############################################
def run_speechbrain_speaker(audio_path: str) -> Tuple[List[str], List[Any]]:
    """Return the distinct speaker labels found in a file, plus the per-segment labels.

    The labels come from ``py_audio_segmentation``; despite the log text, no
    SpeechBrain model is invoked in this function.

    Args:
        audio_path: Path to the audio file.

    Returns:
        ``(speakers, segs)`` where ``speakers`` is the sorted list of unique
        labels and ``segs`` is the label of every segment.
    """
    print("[Step 5] Running speaker diarization (SpeechBrain)...")
    segs, _ = py_audio_segmentation(audio_path)
    # Sorted so repeated runs report speakers in the same order (set iteration
    # order is not stable across interpreter runs for strings).
    speakers = sorted(set(segs))
    print("[Step 5] Detected speakers: {}".format(speakers))
    return speakers, segs

def run_speechbrain_verification(audio_path: str) -> float:
    """Score a file against itself with the SpeechBrain ECAPA speaker-verification model.

    NOTE(review): both sides of the comparison are the same file, so this
    measures self-similarity rather than similarity to a reference voice.

    Args:
        audio_path: Path to the audio file.

    Returns:
        The verification score as a float, or 0.0 if the model cannot be
        loaded or the comparison fails (the reason is printed).
    """
    print("[Step 6] Running speaker verification (SpeechBrain)...")
    try:
        # Loading is inside the try so a download/initialization failure
        # degrades to the default score like any other failure here.
        spkr_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmp_spkrec")
        # verify_files returns a (score, prediction) pair, not a mapping.
        score_value, _ = spkr_model.verify_files(audio_path, audio_path)
        score = float(score_value)
    except Exception as e:
        print(f"[Step 6] Speaker verification failed: {e}")
        score = 0.0
    print("[Step 6] Verification score: {:.3f}".format(score))
    return score

def run_speechbrain_spoof(audio_path: str) -> Tuple[float, List[str]]:
    """Run a SpeechBrain classifier intended for anti-spoofing on a file.

    Args:
        audio_path: Path to the audio file.

    Returns:
        ``(score, anomalies)``. ``score`` is the classifier output for class
        index 1; ``anomalies`` holds one message when the score exceeds 0.5.
        If the model cannot be loaded or run, ``(0.3, [])`` is returned and
        the reason is printed.
    """
    print("[Step 7] Running spoof detection (SpeechBrain)...")
    try:
        # Imported here because the notebook's import cell does not provide it;
        # without it this function silently fell back to the default every time.
        from speechbrain.inference.classifiers import EncoderClassifier

        # NOTE(review): confirm this model id exists on the Hugging Face Hub.
        model = EncoderClassifier.from_hparams(source="speechbrain/anti-spoofing-ecapa-voxceleb", savedir="tmp_spoof")
        output = model.classify_file(audio_path)[0]
        # Flatten away the leading batch dimension before picking class 1.
        class_scores = output.detach().cpu().numpy().reshape(-1)
        score = float(class_scores[1])
        anomalies = ["SpeechBrain spoof detected"] if score > 0.5 else []
    except Exception as e:
        print(f"[Step 7] Spoof detection unavailable ({e}); using default score.")
        score, anomalies = 0.3, []
    print("[Step 7] Spoof score: {:.3f}".format(score))
    return score, anomalies

def run_language_id(audio_path: str) -> Tuple[str, float]:
    """Identify the spoken language of a file with a SpeechBrain classifier.

    Args:
        audio_path: Path to the audio file.

    Returns:
        ``(language_label, score)``. ``("unknown", 0.0)`` is returned when the
        model cannot be loaded or run (the reason is printed).
        NOTE(review): the score is the classifier's raw best-class score;
        confirm whether it is a probability or a log-probability.
    """
    print("[Step 8] Running language identification (SpeechBrain)...")
    try:
        # The language-id model is served through the generic EncoderClassifier
        # interface; the previously referenced class name was never defined.
        from speechbrain.inference.classifiers import EncoderClassifier

        langid = EncoderClassifier.from_hparams(source="speechbrain/lang-id-commonlanguage_ecapa", savedir="tmp_langid")
        result = langid.classify_file(audio_path)
        lang = result[3][0] if result[3] else "unknown"
        conf = float(result[1][0]) if len(result[1]) else 0.0
    except Exception as e:
        print(f"[Step 8] Language identification failed: {e}")
        lang, conf = "unknown", 0.0
    print("[Step 8] Detected language: {} with confidence {:.2f}".format(lang, conf))
    return lang, conf

def run_wave2vec_fake_detection(audio_path: str) -> Tuple[float, List[str]]:
    """Flag a file as suspicious when its mean zero-crossing rate is high.

    Despite the name, the check is a zero-crossing-rate threshold on the
    waveform loaded at 16 kHz; no wav2vec model is involved.

    Args:
        audio_path: Path to the audio file.

    Returns:
        ``(0.8, [message])`` when the mean ZCR exceeds 0.2, else ``(0.3, [])``.
    """
    print("[Step 9] Running wave2vec-based fake detection...")
    samples, _ = librosa.load(audio_path, sr=16000)
    mean_zcr = np.mean(librosa.feature.zero_crossing_rate(samples))
    exceeds_threshold = mean_zcr > 0.2
    if not exceeds_threshold:
        return 0.3, []
    return 0.8, ["High ZCR detected: potential synthetic voice."]

def run_replay_attack_detection(audio: np.ndarray, sr: int) -> Tuple[float, List[str]]:
    """Flag a waveform as a possible replay when its mean RMS level is very low.

    Args:
        audio: Waveform samples.
        sr: Sample rate in Hz (not used by the check).

    Returns:
        ``(0.7, [message])`` when the mean RMS is below 0.01, else ``(0.2, [])``.
    """
    print("[Step 10] Running replay attack detection...")
    mean_rms = np.mean(librosa.feature.rms(y=audio))
    is_quiet = mean_rms < 0.01
    if not is_quiet:
        return 0.2, []
    return 0.7, ["Low RMS detected: potential replay attack."]

def run_augmentation_tests(audio: np.ndarray, sr: int) -> Dict[str, float]:
    """Measure how much added Gaussian noise shifts the mean zero-crossing rate.

    Args:
        audio: Waveform samples.
        sr: Sample rate in Hz, forwarded to the augmentation pipeline.

    Returns:
        ``{'zcr_aug_diff': d}`` where ``d`` is the absolute difference between
        the mean ZCR of the original and of the noise-augmented waveform.
    """
    print("[Step 11] Running augmentation tests...")
    noise_pipeline = Compose([AddGaussianNoise(min_amplitude=0.01, max_amplitude=0.05, p=1.0)])
    noisy_audio = noise_pipeline(samples=audio, sample_rate=sr)

    def mean_zcr(signal):
        # Average zero-crossing rate over all frames.
        return np.mean(librosa.feature.zero_crossing_rate(signal))

    baseline = mean_zcr(audio)
    perturbed = mean_zcr(noisy_audio)
    return {'zcr_aug_diff': abs(baseline - perturbed)}

def run_enhanced_emotion_detection(audio: np.ndarray, sr: int) -> Tuple[float, List[str], str]:
    """Classify the emotion of a waveform with SpeechBrain's wav2vec2 IEMOCAP model.

    Args:
        audio: Mono waveform samples.
        sr: Sample rate in Hz (not used; NOTE(review): the model presumably
            expects 16 kHz input — confirm against the model card).

    Returns:
        ``(confidence, anomalies, description)``. On failure the defaults
        ``(0.1, ["Enhanced emotion detection failed; default value used."],
        "emotion:unknown")`` are returned and the reason is printed.
    """
    print("[Step 12] Running enhanced emotion detection (SpeechBrain)...")
    try:
        import torch
        # This model ships its own interface class, loaded through
        # foreign_class; the class name used before was never imported, so the
        # function always fell back to its defaults.
        from speechbrain.inference.interfaces import foreign_class

        classifier = foreign_class(
            source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
            savedir="tmp_emotion",
        )
        # Force float32: upstream processing may hand over float64 samples.
        waveform = torch.as_tensor(np.asarray(audio), dtype=torch.float32).unsqueeze(0)
        out_prob, score, index, text_lab = classifier.classify_batch(waveform)
        emotion = text_lab[0]
        conf = float(score[0])
        anomalies = [f"Emotion detected: {emotion} (conf: {conf:.2f})"]
        desc = f"Emotion: {emotion} (confidence: {conf:.2f})"
    except Exception as e:
        print(f"[Step 12] Emotion detection failed: {e}")
        conf, anomalies, desc = 0.1, ["Enhanced emotion detection failed; default value used."], "emotion:unknown"
    print("[Step 12] Detected emotion: {}".format(desc))
    return conf, anomalies, desc

def run_asr_transcription(audio_path: str) -> str:
    """Transcribe a file with SpeechBrain's CRDNN + RNN-LM LibriSpeech model.

    Args:
        audio_path: Path to the audio file.

    Returns:
        The transcript, or a string starting with
        ``"ASR transcription failed: "`` followed by the error.
    """
    print("[Step 13] Running ASR transcription (SpeechBrain)...")
    try:
        # Imported here because the notebook's import cell does not provide it;
        # without it every call ended in the failure branch.
        from speechbrain.inference.ASR import EncoderDecoderASR

        asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", savedir="tmp_asr")
        transcript = asr_model.transcribe_file(audio_path)
    except Exception as e:
        transcript = f"ASR transcription failed: {e}"
    print("[Step 13] Transcript: {}".format(transcript[:100]))
    return transcript

In [29]:
#############################################
# vLLM Audio Model Integrations
#############################################
def run_minicpmo(question: str, audio_count: int) -> str:
    """Ask the MiniCPM-o model a question through vLLM.

    The prompt reserves ``audio_count`` audio slots, but the request built
    here attaches an empty audio list, so no waveform is sent from this
    function.

    Args:
        question: Text of the question placed after the audio placeholders.
        audio_count: Number of audio placeholders to put in the prompt; also
            used as the per-prompt audio limit of the engine.

    Returns:
        The generated text, or an error marker string if no output is present.
    """
    print("[vLLM] Running MiniCPM-o model...")
    model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
    )
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in stop_tokens]
    # MiniCPM-o marks each audio slot with "(<audio>./</audio>)"; bare "()"
    # is not an audio placeholder.
    audio_placeholder = "(<audio>./</audio>)" * audio_count
    messages = [{'role': 'user', 'content': f'{audio_placeholder}\n{question}'}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    llm = LLM(**engine_args.__dict__)
    sampling_params = SamplingParams(temperature=0.2, max_tokens=128, stop_token_ids=stop_token_ids)
    inputs = {"prompt": prompt, "multi_modal_data": {"audio": []}}
    outputs = llm.generate([inputs], sampling_params=sampling_params)
    try:
        text = outputs[0].outputs[0].text
    except Exception:
        text = "[vLLM MiniCPM-o error: no output]"
    print("[vLLM] MiniCPM-o output: {}".format(text))
    return text

def run_phi4_mm(question: str, audio_count: int) -> str:
    """Ask the Phi-4 multimodal model a question through vLLM.

    The model snapshot is downloaded from the Hugging Face Hub first. The
    prompt reserves ``audio_count`` audio slots, while the request attaches an
    empty audio list.

    Args:
        question: Text of the question placed after the audio placeholders.
        audio_count: Number of ``<|audio_N|>`` placeholders; also the engine's
            per-prompt audio limit.

    Returns:
        The generated text, or an error marker string if no output is present.
    """
    print("[vLLM] Running Phi-4 MM model...")
    from huggingface_hub import snapshot_download
    local_model_dir = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    audio_tags = "".join(f"<|audio_{slot}|>" for slot in range(1, audio_count + 1))
    prompt = f"<|user|>{audio_tags}{question}<|end|><|assistant|>"
    engine_config = EngineArgs(
        model=local_model_dir,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
        limit_mm_per_prompt={"audio": audio_count},
    )
    engine = LLM(**vars(engine_config))
    decoding = SamplingParams(temperature=0.2, max_tokens=128)
    request = {"prompt": prompt, "multi_modal_data": {"audio": []}}
    generations = engine.generate([request], sampling_params=decoding)
    try:
        answer = generations[0].outputs[0].text
    except Exception:
        answer = "[vLLM Phi-4 MM error: no output]"
    print("[vLLM] Phi-4 MM output: {}".format(answer))
    return answer

def run_qwen2_audio(question: str, audio_count: int) -> str:
    """Ask the Qwen2-Audio instruct model a question through vLLM.

    The prompt is assembled by hand in chat-markup form with one
    ``Audio N:`` line per slot; the request attaches an empty audio list.

    Args:
        question: Text of the question placed after the audio lines.
        audio_count: Number of audio slots in the prompt; also the engine's
            per-prompt audio limit.

    Returns:
        The generated text, or an error marker string if no output is present.
    """
    print("[vLLM] Running Qwen2-Audio model...")
    engine_config = EngineArgs(
        model="Qwen/Qwen2-Audio-7B-Instruct",
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"audio": audio_count},
    )
    audio_lines = []
    for slot in range(audio_count):
        audio_lines.append(f"Audio {slot+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n")
    audio_in_prompt = "".join(audio_lines)
    system_turn = "<|im_start|>system\nYou are an audio forensic assistant.<|im_end|>\n"
    user_turn = "<|im_start|>user\n" + audio_in_prompt + question + "<|im_end|>\n"
    prompt = system_turn + user_turn + "<|im_start|>assistant\n"
    engine = LLM(**vars(engine_config))
    decoding = SamplingParams(temperature=0.2, max_tokens=128)
    request = {"prompt": prompt, "multi_modal_data": {"audio": []}}
    generations = engine.generate([request], sampling_params=decoding)
    try:
        answer = generations[0].outputs[0].text
    except Exception:
        answer = "[vLLM Qwen2-Audio error: no output]"
    print("[vLLM] Qwen2-Audio output: {}".format(answer))
    return answer

def run_ultravox(question: str, audio_count: int) -> str:
    """Ask the Ultravox model a question through vLLM.

    The prompt is produced by the model tokenizer's chat template from one
    user message containing ``audio_count`` ``<|audio|>`` markers; the request
    attaches an empty audio list.

    Args:
        question: Text of the question placed after the audio markers.
        audio_count: Number of audio markers; also the engine's per-prompt
            audio limit.

    Returns:
        The generated text, or an error marker string if no output is present.
    """
    print("[vLLM] Running Ultravox model...")
    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    user_content = ("<|audio|>\n" * audio_count) + question
    chat = [{'role': 'user', 'content': user_content}]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    engine_config = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        trust_remote_code=True,
        limit_mm_per_prompt={"audio": audio_count},
    )
    engine = LLM(**vars(engine_config))
    decoding = SamplingParams(temperature=0.2, max_tokens=128)
    request = {"prompt": prompt, "multi_modal_data": {"audio": []}}
    generations = engine.generate([request], sampling_params=decoding)
    try:
        answer = generations[0].outputs[0].text
    except Exception:
        answer = "[vLLM Ultravox error: no output]"
    print("[vLLM] Ultravox output: {}".format(answer))
    return answer

def run_whisper(question: str, audio_count: int) -> str:
    """Run the Whisper large-v3-turbo model through vLLM.

    Only a single audio input is accepted; any other ``audio_count`` returns
    an explanatory message without loading the model. The prompt is the fixed
    start-of-transcript token (``question`` is not used), and the request
    attaches an empty audio list.

    Args:
        question: Accepted for interface parity with the other model runners.
        audio_count: Must be 1.

    Returns:
        The generated text, the single-input message, or an error marker
        string if no output is present.
    """
    print("[vLLM] Running Whisper model...")
    single_input = audio_count == 1
    if not single_input:
        return "Whisper supports only a single audio input."
    engine_config = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=5,
        limit_mm_per_prompt={"audio": audio_count},
    )
    engine = LLM(**vars(engine_config))
    decoding = SamplingParams(temperature=0.2, max_tokens=128)
    request = {"prompt": "<|startoftranscript|>", "multi_modal_data": {"audio": []}}
    generations = engine.generate([request], sampling_params=decoding)
    try:
        answer = generations[0].outputs[0].text
    except Exception:
        answer = "[vLLM Whisper error: no output]"
    print("[vLLM] Whisper output: {}".format(answer))
    return answer

def get_vllm_audio_model_configs() -> Dict[str, Any]:
    """Return the registry of vLLM model runners, keyed by short model name.

    Each value is a callable taking ``(question, audio_count)`` and returning
    the model's text output.
    """
    registry = dict(
        minicpmo=run_minicpmo,
        phi4_mm=run_phi4_mm,
        qwen2_audio=run_qwen2_audio,
        ultravox=run_ultravox,
        whisper=run_whisper,
    )
    return registry

In [30]:
#############################################
# Groq LLM Integration using Real API Calls
#############################################
def _call_groq_model(label: str, slug: str, question: str, audio_count: int, timeout: float = 60.0) -> str:
    """POST a question to one Groq inference endpoint and return its text.

    NOTE(review): the ``api.groq.ai/v1/inference/<slug>`` URL is kept exactly
    as written in this notebook — confirm it against Groq's API documentation.

    Args:
        label: Human-readable model name used in the log lines.
        slug: Endpoint suffix, also used in the error string.
        question: Question text sent in the payload.
        audio_count: Audio count sent in the payload.
        timeout: Seconds to wait for the HTTP call; without a timeout a
            stalled connection would block the whole pipeline indefinitely.

    Returns:
        The ``"text"`` field of a 200 response, or an error string.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    print(f"[Groq] Running Groq {label}...")
    url = f"https://api.groq.ai/v1/inference/{slug}"
    payload = {"question": question, "audio_count": audio_count}
    # The key comes from the environment; an unset key yields an empty bearer token.
    headers = {"Authorization": f"Bearer {os.environ.get('GROQ_API_KEY', '')}"}
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    if response.status_code == 200:
        try:
            body = response.json()
        except ValueError:
            # A 200 with a non-JSON body should not escape as a decode error.
            body = None
        if isinstance(body, dict):
            result = body.get("text", "")
        else:
            result = f"Groq {slug} error: invalid JSON response"
    else:
        result = f"Groq {slug} error: {response.status_code}"
    print(f"[Groq] Groq {label} output: {result}")
    return result


def run_groq_model1(question: str, audio_count: int) -> str:
    """Query the first Groq endpoint; see ``_call_groq_model`` for details."""
    return _call_groq_model("Model 1", "model1", question, audio_count)


def run_groq_model2(question: str, audio_count: int) -> str:
    """Query the second Groq endpoint; see ``_call_groq_model`` for details."""
    return _call_groq_model("Model 2", "model2", question, audio_count)

def get_groq_audio_model_configs() -> Dict[str, Any]:
    """Return the registry of Groq model runners, keyed by model name.

    Each value is a callable taking ``(question, audio_count)`` and returning
    the model's text output.
    """
    registry = dict(
        groq_model1=run_groq_model1,
        groq_model2=run_groq_model2,
    )
    return registry

def run_vllm_inference(audio_path: str, question: str) -> Dict[str, str]:
    """Send one question to every registered vLLM and Groq model runner.

    Each runner is called with ``audio_count=1``. A runner that raises does
    not stop the others; its entry holds an error string instead.
    ``audio_path`` is accepted but not forwarded to any runner.

    Args:
        audio_path: Path of the audio under analysis.
        question: Question text passed to every runner.

    Returns:
        Mapping from model name to that model's output or error string.
    """
    print("[Step 14] Running multimodal integration (vLLM & Groq)...")
    results = {}
    # (registry factory, prefix used when a runner from that registry fails)
    backends = (
        (get_vllm_audio_model_configs, "vLLM error: "),
        (get_groq_audio_model_configs, "Groq error: "),
    )
    for load_registry, error_prefix in backends:
        for model_name, model_fn in load_registry().items():
            try:
                results[model_name] = model_fn(question, audio_count=1)
            except Exception as e:
                results[model_name] = f"{error_prefix}{e}"
    print("[Step 14] Multimodal integration completed.")
    return results

In [31]:
#############################################
# LangGraph Integration for Report Aggregation
#############################################
def langgraph_forensic_report(report_data: Dict[str, Any]) -> str:
    """Assemble a three-part forensic report with a small LangGraph workflow.

    Three branches run from the graph start — an evidence summary, the
    LangChain narrative and a model-score table — and a final node joins
    their outputs once all three have finished.

    Args:
        report_data: Must contain ``all_anomalies``, ``features`` and
            ``all_model_scores``, plus everything ``langchain_llm_report``
            reads.

    Returns:
        The combined report text.
    """
    # Local imports: the notebook's import cell provides neither name.
    from typing import TypedDict
    from langgraph.graph import START

    print("[Step 15] Running LangGraph aggregation...")

    class _ReportState(TypedDict, total=False):
        # Input payload followed by one output slot per node.
        report_data: dict
        evidence_aggregation: str
        analysis: str
        evidence_table: str
        final_report: str

    def aggregate_evidence(state):
        data = state["report_data"]
        return {"evidence_aggregation": f"Anomalies: {data['all_anomalies']}\nFeatures: {data['features']}"}

    def analyze(state):
        return {"analysis": langchain_llm_report(state["report_data"])}

    def tabulate_scores(state):
        return {"evidence_table": f"Model Scores: {state['report_data']['all_model_scores']}"}

    def combine(state):
        return {"final_report": f"{state['evidence_aggregation']}\n\n{state['analysis']}\n\n{state['evidence_table']}"}

    # Node names deliberately differ from the state keys they write.
    branches = {
        "aggregate_evidence": aggregate_evidence,
        "analyze": analyze,
        "tabulate_scores": tabulate_scores,
    }
    graph = StateGraph(_ReportState)
    for node_name, node_fn in branches.items():
        graph.add_node(node_name, node_fn)
        graph.add_edge(START, node_name)
    graph.add_node("combine", combine)
    # A list of sources makes "combine" wait for every branch.
    graph.add_edge(list(branches), "combine")
    graph.add_edge("combine", END)
    result = graph.compile().invoke({"report_data": report_data})
    final_report = result.get("final_report", "LangGraph aggregation failed.")
    print("[Step 15] LangGraph aggregation completed.")
    return final_report

In [32]:
#############################################
# Concurrency Helper to run blocking functions
#############################################
async def async_run_in_executor(func, *args):
    """Run a blocking callable in a worker thread and await its result.

    Args:
        func: The blocking callable.
        *args: Positional arguments passed to ``func``.

    Returns:
        Whatever ``func(*args)`` returns; exceptions raised by ``func``
        propagate to the awaiting caller.
    """
    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine; passing None reuses the loop's default thread pool instead of
    # creating and tearing down a new pool on every call.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, func, *args)

async def gather_agents(audio: np.ndarray, sr: int, audio_path: str) -> Dict[str, Any]:
    """Run the forensic agent functions concurrently in worker threads.

    Args:
        audio: Preprocessed waveform.
        sr: Sample rate of ``audio`` in Hz.
        audio_path: Path to the original audio file.

    Returns:
        Mapping from agent name (``wave2vec``, ``replay``, ``emotion``,
        ``speechbrain_spoof``) to that agent's return value.

    Raises:
        Exception: The first exception raised by any agent is propagated.
    """
    print("[Step 16] Concurrently running forensic agent functions...")
    tasks = {
        "wave2vec": async_run_in_executor(run_wave2vec_fake_detection, audio_path),
        "replay": async_run_in_executor(run_replay_attack_detection, audio, sr),
        "emotion": async_run_in_executor(run_enhanced_emotion_detection, audio, sr),
        "speechbrain_spoof": async_run_in_executor(run_speechbrain_spoof, audio_path)
    }
    # Awaiting coroutines one at a time runs them back to back; gather()
    # schedules them all first so they actually overlap.
    outputs = await asyncio.gather(*tasks.values())
    results = dict(zip(tasks.keys(), outputs))
    print("[Step 16] Forensic agent functions completed.")
    return results

In [33]:
#############################################
# LangChain Forensic Report Generation
#############################################
def _format_feature_value(value: Any) -> str:
    """Render a feature value with three decimals, or as text if not numeric."""
    try:
        return f"{value:.3f}"
    except (TypeError, ValueError):
        return str(value)


def langchain_llm_report(report_data: Dict[str, Any]) -> str:
    """Generate a narrative forensic summary with an OpenAI LLM via LangChain.

    Args:
        report_data: Must contain ``verdict``, ``mean_score``,
            ``all_anomalies``, ``asr_transcript``, ``speaker_identities``,
            ``speaker_spoof_score`` and ``features``. Optional keys:
            ``asr_lang``, ``noise_quality_score``, ``gender_distribution``,
            ``extra_info`` and ``vllm_outputs``.

    Returns:
        The text produced by the LLM chain.
    """
    print("[Step 17] Generating forensic report summary via LangChain...")
    prompt = PromptTemplate(
        input_variables=["verdict", "mean_score", "anomalies", "asr", "asr_lang",
                           "speakers", "spoof", "features", "noise_quality", "gender_dist", "extra", "vllm_outputs"],
        template=(
            "You are a digital audio forensics expert. Provide a detailed explanation of every analysis step below.\n\n"
            "Verdict: {verdict}\n"
            # The score is formatted before substitution so the template holds
            # only plain placeholders.
            "Mean Deepfake Score: {mean_score}\n"
            "Detected Anomalies: {anomalies}\n"
            "ASR Transcript: {asr}\n"
            "ASR Language: {asr_lang}\n"
            "Speaker Identities: {speakers}\n"
            "Spoof Score: {spoof}\n"
            "Feature Summary: {features}\n"
            "Noise/Quality Score: {noise_quality}\n"
            "Gender Distribution: {gender_dist}\n"
            "Additional Analysis: {extra}\n"
            "vLLM Audio Model Outputs: {vllm_outputs}\n\n"
            "Provide a risk assessment with actionable recommendations."
        )
    )
    transcript = report_data['asr_transcript']
    if transcript:
        # Only mark the excerpt as truncated when it really is.
        asr_excerpt = transcript[:400] + ("..." if len(transcript) > 400 else "")
    else:
        asr_excerpt = "N/A"
    # Feature values may be non-numeric (e.g. a language label), so each value
    # is formatted defensively instead of forcing a float format.
    feature_summary = "; ".join(
        f"{k}: {_format_feature_value(v)}" for k, v in report_data['features'].items()
    )
    llm = OpenAI(temperature=0.2, max_tokens=700)
    chain = LLMChain(llm=llm, prompt=prompt)
    summary = chain.run(
        verdict=report_data['verdict'],
        mean_score=f"{report_data['mean_score']:.2f}",
        anomalies=", ".join(str(a) for a in report_data['all_anomalies']),
        asr=asr_excerpt,
        asr_lang=report_data.get('asr_lang', 'unknown'),
        speakers=", ".join(str(s) for s in report_data['speaker_identities']),
        spoof=str(report_data['speaker_spoof_score']),
        features=feature_summary,
        noise_quality=str(report_data.get('noise_quality_score', 'N/A')),
        # default=str keeps NumPy scalars and other non-JSON values from
        # aborting the report.
        gender_dist=json.dumps(report_data.get('gender_distribution', {}), default=str),
        extra=json.dumps(report_data.get('extra_info', {}), default=str),
        vllm_outputs=json.dumps(report_data.get('vllm_outputs', {}), default=str)
    )
    print("[Step 17] LangChain summary generated.")
    return summary

In [34]:
#############################################
# Aggregation & Final Report Generation
#############################################
def aggregate_and_report(audio_path: str, results: Dict[str, Any],
                         feats: Dict[str, float], asr: str,
                         asr_lang: str, speaker_identities: List[str],
                         spk_score: float, noise_quality_score: float,
                         gender_dist: Dict[str, float],
                         vllm_model_outputs: Dict[str, str],
                         extra_results: Dict[str, Any] = None) -> ForensicReport:
    """Combine all analysis outputs into a verdict and a ``ForensicReport``.

    The mean of the agent scores and the speaker-verification score decides
    the verdict: above 0.7 "Likely FAKE", above 0.5 "Possibly FAKE",
    otherwise "Likely REAL". Confidence is one minus the standard deviation
    of those scores.
    NOTE(review): every agent score is averaged in as-is, including the
    emotion confidence and the verification score — confirm that is intended.

    Args:
        audio_path: Path to the analysed audio file.
        results: Agent name -> ``(score, anomalies)`` or
            ``(score, anomalies, detail)``.
        feats: Acoustic features forwarded to the report generators.
        asr: ASR transcript.
        asr_lang: Detected language label.
        speaker_identities: Known speaker labels; when empty, diarization is
            run to obtain them.
        spk_score: Speaker-verification score; when ``None`` it is computed
            here.
        noise_quality_score: Noise/quality score stored in the report.
        gender_dist: Gender distribution stored in the report.
        vllm_model_outputs: Outputs of the multimodal model runners.
        extra_results: Optional extra entries merged into the detailed results.

    Returns:
        The populated ``ForensicReport``.
    """
    print("[Step 18] Aggregating all forensic analysis results...")
    # None instead of a shared mutable default.
    extra_results = extra_results or {}
    scores = []
    all_anomalies = []
    model_scores = {}
    detailed = {}
    for agent, outcome in results.items():
        # Agents return either (score, anomalies) or (score, anomalies, detail);
        # a fixed three-way unpack raised on the two-element form.
        score = float(outcome[0])
        anomalies = list(outcome[1])
        detail = outcome[2] if len(outcome) > 2 else ""
        scores.append(score)
        model_scores[agent] = score
        all_anomalies.extend(anomalies)
        detailed[agent] = {"score": score, "anomalies": anomalies, "detail": detail}
    # Diarization is only needed when the caller supplied no speaker labels.
    speakers = speaker_identities if speaker_identities else run_speechbrain_speaker(audio_path)[0]
    # Use the score the caller already computed instead of re-running the model.
    if spk_score is not None:
        spk_verif_score = float(spk_score)
    else:
        spk_verif_score = run_speechbrain_verification(audio_path)
    spoof_score = float(results.get("speechbrain_spoof", (0.0, []))[0])
    combined_scores = scores + [spk_verif_score]
    mean_score = float(np.mean(combined_scores))
    confidence = 1.0 - float(np.std(combined_scores))
    verdict = ("Likely FAKE (spoof/scam/deepfake detected)" if mean_score > 0.7
               else "Possibly FAKE (review anomalies)" if mean_score > 0.5
               else "Likely REAL")
    # The augmentation test needs a waveform; it was previously handed the
    # scalar 'energy' feature. Reload the audio and keep the step best-effort.
    try:
        waveform, waveform_sr = librosa.load(audio_path, sr=16000)
        augmentation = {k: float(v) for k, v in run_augmentation_tests(waveform, waveform_sr).items()}
    except Exception as e:
        augmentation = {"error": str(e)}
    extra_info = {
        "speaker_diarization": speakers,
        "augmentation_tests": augmentation
    }
    # Sorted for a stable, reproducible report.
    unique_anomalies = sorted(set(all_anomalies))
    if noise_quality_score is not None:
        noise_quality_score = float(noise_quality_score)
    summary_inputs = {
        "verdict": verdict,
        "mean_score": mean_score,
        "all_anomalies": unique_anomalies,
        "asr_transcript": asr,
        "asr_lang": asr_lang,
        "speaker_identities": speakers,
        "speaker_spoof_score": spoof_score,
        "features": feats,
        "noise_quality_score": noise_quality_score,
        "gender_distribution": gender_dist,
        "extra_info": extra_info,
        "vllm_outputs": vllm_model_outputs
    }
    # Each generator is guarded separately so a failure in one does not throw
    # away the output of the other.
    try:
        natural_summary = langchain_llm_report(summary_inputs)
    except Exception as e:
        natural_summary = f"Verdict: {verdict} (Error generating summary: {e})"
    try:
        lg_summary = langgraph_forensic_report({**summary_inputs, "all_model_scores": model_scores})
    except Exception as e:
        lg_summary = f"(Error generating LangGraph analysis: {e})"
    natural_summary = f"{natural_summary}\n\nLangGraph Analysis:\n{lg_summary}"
    detailed.update(extra_results)
    print("[Step 18] Aggregation complete. Report ready.")
    return ForensicReport(
        file=audio_path,
        verdict=verdict,
        mean_score=mean_score,
        confidence=confidence,
        all_model_scores=model_scores,
        all_anomalies=unique_anomalies,
        natural_summary=natural_summary,
        asr_transcript=asr,
        asr_lang=asr_lang,
        speaker_identities=speakers,
        speaker_spoof_score=spoof_score,
        noise_quality_score=noise_quality_score,
        gender_distribution=gender_dist,
        detailed_results=detailed,
        timestamp=datetime.utcnow().isoformat(),
        extra_info=extra_info,
        vllm_model_outputs=vllm_model_outputs
    )

In [35]:
#############################################
# Main Pipeline Execution Function (Async)
#############################################
async def deepfake_defensive_pipeline(audio_path: str, vllm_question: str = None) -> ForensicReport:
    """Run the full forensic analysis on one audio file.

    Steps: preprocessing, feature extraction, VAD, gender/segment estimation,
    language id, the concurrent forensic agents, ASR, the multimodal model
    runners and finally aggregation into a report.

    Args:
        audio_path: Path to the audio file to analyse.
        vllm_question: Question sent to the multimodal models; a default
            forensic prompt is used when ``None``.

    Returns:
        The ``ForensicReport`` built by ``aggregate_and_report``.
    """
    print("=== Deepfake Audio Forensic Analysis Pipeline Initiated ===")
    if vllm_question is None:
        vllm_question = (
            "Analyze this audio for potential deepfake, spoofing, scam, or synthetic voice indicators. "
            "Extract evidence including speaker details, replay attacks, spoof cues, emotion, ASR, "
            "spectral features, and augmentation artifacts. Provide a risk score (0 to 1), a detailed evidence table, and actionable recommendations."
        )
    audio, sr = await async_run_in_executor(preprocess_audio, audio_path)
    feats = extract_features(audio, sr)
    feats['vad_ratio'] = extract_vad_ratio(audio, sr)
    # The segment count is already returned by the gender estimator.
    nb_segments, _, gender_dist = extract_gender_distribution(audio_path)
    feats['nb_segments'] = nb_segments
    # The language label is reported through asr_lang; keeping it out of
    # `feats` leaves that dict purely numeric for downstream formatting.
    langid_label, _ = run_language_id(audio_path)
    agent_results = await gather_agents(audio, sr, audio_path)
    asr_transcript = run_asr_transcription(audio_path)
    asr_lang = langid_label
    vllm_outputs = run_vllm_inference(audio_path, vllm_question)
    # Cast to a plain float so the report can be JSON-serialized.
    noise_quality_score = float(nr.reduce_noise(y=audio, sr=sr).std())
    report = aggregate_and_report(
        audio_path,
        agent_results,
        feats,
        asr_transcript,
        asr_lang,
        speaker_identities=[],
        spk_score=run_speechbrain_verification(audio_path),
        noise_quality_score=noise_quality_score,
        gender_dist=gender_dist,
        vllm_model_outputs=vllm_outputs,
        extra_results={}
    )
    print("=== Pipeline Completed ===")
    return report

In [36]:
#############################################
# Mermaid.js Flowchart for Pipeline Visualization
#############################################
def generate_mermaid_flowchart() -> str:
    """Return the Mermaid source for the pipeline flowchart.

    The diagram is a top-down graph of ten stages, from the audio upload to
    the generated forensic report.
    """
    return """
    %% Mermaid Flowchart for Deepfake Audio Forensic Pipeline
    graph TD
      A[Upload Audio File] -->|File size info| B[Preprocess Audio]
      B --> C[Extract Audio Features]
      C --> D[Compute VAD Ratio]
      D --> E[Estimate Gender Distribution]
      E --> F[Speaker Diarization & Verification]
      F --> G[ASR Transcription]
      G --> H[vLLM & Groq Multimodal Analysis]
      H --> I[Aggregate Forensic Evidence]
      I --> J[Generate Forensic Report]
    """

def display_mermaid_chart(mermaid_code: str):
    """Show a Mermaid diagram in the notebook output.

    The diagram source is embedded in an HTML page that imports Mermaid v10
    from the jsDelivr CDN and initializes it with ``startOnLoad`` enabled.

    Args:
        mermaid_code: Mermaid diagram source, inserted verbatim into the page.
    """
    page = f"""
    <html>
    <head>
      <script type="module">
        import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.js';
        mermaid.initialize({{startOnLoad:true, theme:'default'}});
      </script>
    </head>
    <body>
      <div class="mermaid">
      {mermaid_code}
      </div>
    </body>
    </html>
    """
    rendered = HTML(page)
    display(rendered)

In [37]:
#############################################
# UI for Google Colab with Enhanced Features
#############################################
def display_audio_file(audio_path: str, sr: int = 16000):
    """Plot the waveform of an audio file and show an inline audio player.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load`` and reused for the
            waveform plot and the player (default 16000).
    """
    samples, _ = librosa.load(audio_path, sr=sr)

    # Explicit figure/axes pair instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(14, 3))
    librosa.display.waveshow(samples, sr=sr, ax=ax)
    ax.set_title('Audio Waveform')
    fig.tight_layout()
    plt.show()

    player = ipd.Audio(samples, rate=sr)
    ipd.display(player)

def _first_uploaded_file(upload_value) -> Tuple[str, bytes]:
    """Return ``(filename, content_bytes)`` for the first file in a FileUpload value.

    Two layouts are accepted:

    * a dict keyed by filename whose entries hold a ``'content'`` item
      (the ipywidgets 7 layout), and
    * a sequence of per-file dicts holding ``'name'`` and ``'content'`` items
      (the ipywidgets 8 layout, where ``content`` is a memoryview).

    The caller must make sure ``upload_value`` is non-empty.
    """
    if isinstance(upload_value, dict):
        filename = next(iter(upload_value))
        entry = upload_value[filename]
    else:
        entry = upload_value[0]
        filename = entry["name"]
    # bytes() normalises a memoryview so len() and file.write() behave the same
    # for both layouts.
    return filename, bytes(entry["content"])


def _run_coroutine_blocking(coro):
    """Run ``coro`` to completion and return its result.

    ``loop.run_until_complete`` raises ``RuntimeError`` ("This event loop is
    already running") when called from a thread whose loop is running, which is
    the usual situation for callbacks executed inside a Jupyter/Colab kernel.
    In that case the coroutine is driven by a fresh loop in a worker thread
    while the calling thread waits for the result.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running here: reuse (or create) this thread's loop.
        try:
            loop = asyncio.get_event_loop()
            if loop.is_closed():
                raise RuntimeError("event loop is closed")
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        return loop.run_until_complete(coro)

    # A loop is already running in this thread; use a private loop elsewhere.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


def run_pipeline_ui():
    """Build and display the ipywidgets front end for the detection pipeline.

    The UI consists of a file-upload control with size/progress indicators, a
    status label, a run button, a three-tab area (Summary, Logs, Raw JSON) and
    a separate log output. Clicking the run button saves the uploaded file as
    ``uploaded_audio<ext>`` in the working directory, shows its waveform and the
    pipeline flowchart in the Logs tab, runs ``deepfake_defensive_pipeline`` on
    it, and prints the report's ``natural_summary`` and ``json(indent=2)`` into
    the Summary and Raw JSON tabs. Errors raised while processing are reported
    in the status label and the log output instead of propagating.
    """
    upload_widget = widgets.FileUpload(accept=".wav,.mp3,.flac,.m4a", multiple=False)
    size_label = widgets.Label(value="File Size: Not Uploaded")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, description='Upload:', bar_style='info')
    status_label = widgets.Label(value="Status: Waiting for file upload...")
    run_button = widgets.Button(description="Run Deepfake Detection", button_style='success')

    # Tabs for displaying Results, Logs and Raw JSON Report
    tab_children = [widgets.Output(), widgets.Output(), widgets.Output()]
    tab = widgets.Tab(children=tab_children)
    tab.set_title(0, "Summary")
    tab.set_title(1, "Logs")
    tab.set_title(2, "Raw JSON")

    log_area = widgets.Output()

    def append_log(message: str):
        """Print ``message`` into the standalone log output below the tabs."""
        with log_area:
            print(message)

    def on_file_upload(change):
        """Update size/progress/status widgets once a file is present."""
        if upload_widget.value:
            progress_bar.value = 100
            uploaded_filename, content = _first_uploaded_file(upload_widget.value)
            file_size = len(content)
            size_label.value = f"File Size: {file_size} bytes (Upload Complete)"
            status_label.value = "Status: File uploaded. Ready to run detection."
            append_log(f"File uploaded: {uploaded_filename} ({file_size} bytes)")

    upload_widget.observe(on_file_upload, names='value')

    def on_run_clicked(_button):
        """Save the uploaded file, run the pipeline and populate the tabs."""
        # Start from a clean slate on every run.
        for output in (tab_children[1], tab_children[0], tab_children[2], log_area):
            with output:
                clear_output()
        if not upload_widget.value:
            status_label.value = "Status: Please upload an audio file."
            return

        status_label.value = "Status: Saving file..."
        uploaded_filename, content = _first_uploaded_file(upload_widget.value)
        # Keep the original extension so downstream loaders can detect the format.
        local_filename = "uploaded_audio" + os.path.splitext(uploaded_filename)[1]
        with open(local_filename, "wb") as f:
            f.write(content)
        append_log(f"Audio file '{uploaded_filename}' saved as '{local_filename}'.")
        # Display waveform in logs tab
        with tab_children[1]:
            print("Displaying Audio Waveform:")
            display_audio_file(local_filename)
        # Display pipeline flowchart in logs tab
        with tab_children[1]:
            print("\nVisualizing Pipeline Flowchart:")
            mermaid_code = generate_mermaid_flowchart()
            display_mermaid_chart(mermaid_code)

        status_label.value = "Status: Processing audio..."
        append_log("Starting Deepfake Detection Pipeline...")
        try:
            # Widget callbacks usually run inside the kernel's running event
            # loop, so the coroutine cannot be driven with run_until_complete
            # on that loop; the helper handles both situations.
            report = _run_coroutine_blocking(deepfake_defensive_pipeline(local_filename))
            status_label.value = "Status: Processing complete."
            append_log("Pipeline processing completed.")
            # Show summary in Summary tab
            with tab_children[0]:
                print("=== Forensic Report Summary ===\n")
                print(report.natural_summary)
            # Show raw JSON in Raw JSON tab
            with tab_children[2]:
                print("=== Complete JSON Report ===\n")
                print(report.json(indent=2))
        except Exception as e:
            # Surface the failure in the UI rather than losing it in the callback.
            status_label.value = f"Status: Error during processing: {e}"
            append_log(f"Error during processing: {e}")

    run_button.on_click(on_run_clicked)
    ui = widgets.VBox([
        widgets.HBox([upload_widget, size_label, progress_bar]),
        status_label,
        run_button,
        tab,
        widgets.Label("Pipeline Logs:"),
        log_area
    ])
    display(ui)

# Launch the enhanced UI when this cell is executed in Colab.
# The call builds the widget layout and displays it as this cell's output;
# the detection pipeline itself only runs once the "Run Deepfake Detection"
# button is clicked.
run_pipeline_ui()

VBox(children=(HBox(children=(FileUpload(value={}, accept='.wav,.mp3,.flac,.m4a', description='Upload'), Label…