<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_and_Manipulated_Media_Analysis_R%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Deepfake Detection and Manipulated Media Analysis using Multiagent System and Compound AI Approach**

In [1]:
%pip install -q langchain langchain-community langgraph torch transformers opencv-python librosa numpy face-recognition dlib mediapipe scipy pillow tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.2/138.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone


In [2]:
# Required imports
import os
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain.tools import tool
from langgraph.graph import Graph, END
import numpy as np
import torch
import torch.nn.functional as F
from transformers import (
    AutoProcessor,
    AutoModelForVideoClassification,
    AutoModelForAudioClassification,
    CLIPProcessor,
    CLIPModel,
    Blip2Processor,
    Blip2ForConditionalGeneration,
    VideoMAEFeatureExtractor,
    VideoMAEForVideoClassification,
    WhisperProcessor,
    WhisperForAudioClassification,
    LayoutLMv3Processor,
    LayoutLMv3ForSequenceClassification
)
import cv2
import librosa
import json
import face_recognition
import dlib
import scipy
from scipy.signal import welch
from PIL import Image
import mediapipe as mp
import warnings
from tqdm import tqdm

In [3]:
def setup_models(device: torch.device) -> dict:
    """Initialize all AI models used in the pipeline"""
    models = {}

    try:
        # Video understanding models
        models["videomae"] = VideoMAEForVideoClassification.from_pretrained(
            "MCG-NJU/videomae-base-finetuned-kinetics"
        ).to(device)
        models["videomae_processor"] = VideoMAEFeatureExtractor.from_pretrained(
            "MCG-NJU/videomae-base-finetuned-kinetics"
        )

        # Visual analysis models
        models["clip"] = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
        models["clip_processor"] = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

        # Multimodal understanding
        models["blip2"] = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b"
        ).to(device)
        models["blip2_processor"] = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

        # Audio analysis
        models["whisper"] = WhisperForAudioClassification.from_pretrained(
            "openai/whisper-large-v3"
        ).to(device)
        models["whisper_processor"] = WhisperProcessor.from_pretrained("openai/whisper-large-v3")

        # Layout analysis for visual artifacts
        models["layoutlm"] = LayoutLMv3ForSequenceClassification.from_pretrained(
            "microsoft/layoutlmv3-base"
        ).to(device)
        models["layoutlm_processor"] = LayoutLMv3Processor.from_pretrained(
            "microsoft/layoutlmv3-base"
        )

        # Face analysis
        models["face_detector"] = dlib.get_frontal_face_detector()
        models["face_predictor"] = dlib.shape_predictor(
            "shape_predictor_68_face_landmarks.dat"
        )

        # MediaPipe for advanced face mesh analysis
        models["face_mesh"] = mp.solutions.face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

        # Additional models for specific tasks
        models["action_recognition"] = hub.load(
            "https://tfhub.dev/deepmind/i3d-kinetics-400/1"
        )

    except Exception as e:
        print(f"Error loading models: {str(e)}")
        raise

    return models

In [4]:
import cv2
import librosa
import numpy as np
from tqdm import tqdm

def enhanced_video_processing(video_path: str, models: dict) -> dict:
    """Enhanced video preprocessing with advanced feature extraction"""
    processed_data = {
        "frames": [],
        "audio": {},
        "metadata": {},
        "features": {},
        "face_data": [],
        "motion_data": [],
        "temporal_features": []
    }

    try:
        # Extract frames and basic metadata
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Process frames with advanced features
        prev_frame = None
        flow_accumulator = []

        with tqdm(total=total_frames, desc="Processing frames") as pbar:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Store original frame
                processed_data["frames"].append(frame)

                # Extract face data
                face_data = extract_face_features(frame, models)
                processed_data["face_data"].append(face_data)

                # Calculate optical flow if we have a previous frame
                if prev_frame is not None:
                    flow = calculate_optical_flow(prev_frame, frame)
                    flow_accumulator.append(flow)

                prev_frame = frame.copy()
                pbar.update(1)

        cap.release()

        # Process audio
        processed_data["audio"] = extract_audio_features(video_path)

        # Extract temporal features
        processed_data["temporal_features"] = analyze_temporal_coherence(
            processed_data["frames"],
            flow_accumulator
        )

        # Generate comprehensive metadata
        processed_data["metadata"] = extract_enhanced_metadata(
            video_path,
            processed_data
        )

        return processed_data

    except Exception as e:
        print(f"Error in video processing: {str(e)}")
        raise

def enhanced_image_processing(image_path: str, models: dict) -> dict:
    """Enhanced image preprocessing with advanced feature extraction"""
    processed_data = {
        "image": None,
        "metadata": {},
        "features": {},
        "face_data": []
    }

    try:
        # Load image
        image = cv2.imread(image_path)
        processed_data["image"] = image

        # Extract face data
        face_data = extract_face_features(image, models)
        processed_data["face_data"].append(face_data)

        # Generate comprehensive metadata
        processed_data["metadata"] = extract_enhanced_metadata(
            image_path,
            processed_data
        )

        return processed_data

    except Exception as e:
        print(f"Error in image processing: {str(e)}")
        raise

def extract_face_features(frame: np.ndarray, models: dict) -> dict:
    """Extract comprehensive face features using multiple models"""
    face_data = {
        "landmarks": [],
        "mesh": [],
        "emotions": [],
        "quality_metrics": {}
    }

    try:
        # Basic face detection with dlib
        faces = models["face_detector"](frame)

        for face in faces:
            # Get facial landmarks
            shape = models["face_predictor"](frame, face)
            landmarks = np.array([[p.x, p.y] for p in shape.parts()])
            face_data["landmarks"].append(landmarks)

            # MediaPipe face mesh for detailed analysis
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mesh_results = models["face_mesh"].process(frame_rgb)

            if mesh_results.multi_face_landmarks:
                mesh_points = np.array([
                    [point.x, point.y, point.z]
                    for point in mesh_results.multi_face_landmarks[0].landmark
                ])
                face_data["mesh"].append(mesh_points)

            # Extract face quality metrics
            quality_metrics = calculate_face_quality_metrics(frame, face, landmarks)
            face_data["quality_metrics"].update(quality_metrics)

            # Emotion detection using pretrained model
            emotions = detect_emotions(frame, face)
            face_data["emotions"].append(emotions)

    except Exception as e:
        print(f"Error in face feature extraction: {str(e)}")

    return face_data

def calculate_face_quality_metrics(
    frame: np.ndarray,
    face: dlib.rectangle,
    landmarks: np.ndarray
) -> dict:
    """Calculate various metrics for face quality assessment"""
    metrics = {}

    try:
        # Face size relative to frame
        face_size = (face.right() - face.left()) * (face.bottom() - face.top())
        frame_size = frame.shape[0] * frame.shape[1]
        metrics["relative_size"] = face_size / frame_size

        # Face symmetry
        symmetry_score = calculate_face_symmetry(landmarks)
        metrics["symmetry"] = symmetry_score

        # Blur detection
        laplacian_var = cv2.Laplacian(
            cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)[
                face.top():face.bottom(),
                face.left():face.right()
            ]
        ).var()
        metrics["sharpness"] = laplacian_var

        # Lighting consistency
        lighting_score = analyze_lighting_consistency(
            frame[face.top():face.bottom(), face.left():face.right()]
        )
        metrics["lighting"] = lighting_score

    except Exception as e:
        print(f"Error calculating face quality metrics: {str(e)}")

    return metrics

def detect_emotions(frame: np.ndarray, face: dlib.rectangle) -> dict:
    """Detect emotions in facial expressions"""
    emotions = {}

    try:
        # Extract face region
        face_img = frame[face.top():face.bottom(), face.left():face.right()]
        face_img = cv2.resize(face_img, (224, 224))

        # Use pre-trained emotion detection model
        # This is a placeholder - you would need to implement actual emotion detection
        emotions = {
            "happy": 0.0,
            "sad": 0.0,
            "angry": 0.0,
            "neutral": 0.0,
            "surprise": 0.0
        }

    except Exception as e:
        print(f"Error in emotion detection: {str(e)}")

    return emotions

def extract_audio_features(video_path: str) -> dict:
    """Extract comprehensive audio features"""
    audio_features = {}

    try:
        # Load audio
        y, sr = librosa.load(video_path)

        # Basic features
        audio_features["mfcc"] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        audio_features["spectral_contrast"] = librosa.feature.spectral_contrast(y=y, sr=sr)
        audio_features["chroma"] = librosa.feature.chroma_stft(y=y, sr=sr)

        # Advanced features
        audio_features["spectral_rolloff"] = librosa.feature.spectral_rolloff(y=y, sr=sr)
        audio_features["spectral_bandwidth"] = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        audio_features["spectral_flatness"] = librosa.feature.spectral_flatness(y=y)

        # Temporal features
        audio_features["onset_strength"] = librosa.onset.onset_strength(y=y, sr=sr)
        audio_features["tempo"] = librosa.beat.tempo(onset_envelope=audio_features["onset_strength"], sr=sr)

        # Additional analysis
        audio_features["zero_crossing_rate"] = librosa.feature.zero_crossing_rate(y)
        audio_features["rms"] = librosa.feature.rms(y=y)

        # Calculate overall audio quality metrics
        audio_features["quality_metrics"] = calculate_audio_quality_metrics(y, sr)

    except Exception as e:
        print(f"Error extracting audio features: {str(e)}")

    return audio_features

def calculate_audio_quality_metrics(y: np.ndarray, sr: int) -> dict:
    """Calculate various audio quality metrics"""
    metrics = {}

    try:
        # Signal-to-noise ratio estimation
        noise_floor = np.mean(np.abs(y[y < np.mean(y)]))
        signal_power = np.mean(np.abs(y[y >= np.mean(y)]))
        metrics["snr"] = 20 * np.log10(signal_power / noise_floor) if noise_floor > 0 else 0

        # Dynamic range
        metrics["dynamic_range"] = np.max(np.abs(y)) - np.min(np.abs(y))

        # Spectral centroid (brightness)
        metrics["spectral_centroid"] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

        # Clarity metrics
        metrics["clarity"] = calculate_audio_clarity(y, sr)

    except Exception as e:
        print(f"Error calculating audio quality metrics: {str(e)}")

    return metrics

def analyze_temporal_coherence(
    frames: List[np.ndarray],
    optical_flow: List[np.ndarray]
) -> dict:
    """Analyze temporal coherence with advanced metrics"""
    temporal_features = {}

    try:
        # Motion analysis
        motion_metrics = analyze_motion_patterns(optical_flow)
        temporal_features["motion"] = motion_metrics

        # Frame consistency
        consistency_metrics = analyze_frame_consistency(frames)
        temporal_features["consistency"] = consistency_metrics

        # Scene transition analysis
        transition_metrics = detect_scene_transitions(frames)
        temporal_features["transitions"] = transition_metrics

        # Calculate overall temporal quality score
        temporal_features["quality_score"] = calculate_temporal_quality(
            motion_metrics,
            consistency_metrics,
            transition_metrics
        )

    except Exception as e:
        print(f"Error analyzing temporal coherence: {str(e)}")

    return temporal_features

In [5]:
import torch

def analyze_visual_content(
    frames: List[np.ndarray],
    models: dict,
    device: torch.device
) -> dict:
    """Analyze visual content using advanced models"""
    visual_results = {}

    try:
        # CLIP analysis
        clip_inputs = models["clip_processor"](images=frames, return_tensors="pt").to(device)
        clip_outputs = models["clip"](**clip_inputs)
        visual_results["clip"] = clip_outputs.logits.softmax(dim=1).cpu().numpy()

        # VideoMAE analysis
        if len(frames) > 1:
            videomae_inputs = models["videomae_processor"](frames, return_tensors="pt").to(device)
            videomae_outputs = models["videomae"](**videomae_inputs)
            visual_results["videomae"] = videomae_outputs.logits.softmax(dim=1).cpu().numpy()

        # Blip2 analysis
        blip2_inputs = models["blip2_processor"](images=frames, return_tensors="pt").to(device)
        blip2_outputs = models["blip2"].generate(**blip2_inputs)
        visual_results["blip2"] = blip2_outputs

    except Exception as e:
        print(f"Error in visual content analysis: {str(e)}")

    return visual_results

def analyze_audio_content(
    audio_data: dict,
    models: dict,
    device: torch.device
) -> dict:
    """Analyze audio content using advanced models"""
    audio_results = {}

    try:
        # Whisper analysis
        whisper_inputs = models["whisper_processor"](audio_data["raw"], return_tensors="pt").to(device)
        whisper_outputs = models["whisper"](**whisper_inputs)
        audio_results["whisper"] = whisper_outputs.logits.softmax(dim=1).cpu().numpy()

    except Exception as e:
        print(f"Error in audio content analysis: {str(e)}")

    return audio_results

def analyze_crossmodal_coherence(
    visual_results: dict,
    audio_results: dict,
    models: dict,
    device: torch.device
) -> dict:
    """Analyze cross-modal coherence using advanced models"""
    coherence_results = {}

    try:
        # LayoutLM analysis
        layoutlm_inputs = models["layoutlm_processor"](visual_results, audio_results, return_tensors="pt").to(device)
        layoutlm_outputs = models["layoutlm"](**layoutlm_inputs)
        coherence_results["layoutlm"] = layoutlm_outputs.logits.softmax(dim=1).cpu().numpy()

    except Exception as e:
        print(f"Error in cross-modal coherence analysis: {str(e)}")

    return coherence_results

In [6]:
import numpy as np

def feature_fusion(results: List[dict], metadata: dict) -> dict:
    """Enhanced feature fusion with adaptive weighting"""
    # Initialize weights based on metadata
    base_weights = {
        "spatial_confidence": 0.25,
        "temporal_confidence": 0.20,
        "facial_confidence": 0.20,
        "audio_confidence": 0.15,
        "semantic_confidence": 0.20
    }

    # Adjust weights based on video properties
    if metadata.get("video_info", {}).get("fps", 0) < 20:
        base_weights["temporal_confidence"] *= 0.8
        base_weights["spatial_confidence"] *= 1.2

    # Collect all confidence scores
    confidence_scores = {}
    for result in results:
        for key, value in result.items():
            if "confidence" in key and isinstance(value, (int, float)):
                confidence_scores[key] = value

    # Calculate weighted average
    total_weight = 0
    weighted_sum = 0

    for key, value in confidence_scores.items():
        weight = base_weights.get(key, 0.1)  # Default weight for unknown metrics
        weighted_sum += value * weight
        total_weight += weight

    weighted_avg = weighted_sum / total_weight if total_weight > 0 else 0.0

    # Calculate uncertainty
    variances = [(score - weighted_avg) ** 2 for score in confidence_scores.values()]
    uncertainty = np.sqrt(np.mean(variances)) if variances else 0.0

    return {
        "final_confidence": float(weighted_avg),
        "uncertainty": float(uncertainty),
        "individual_scores": confidence_scores
    }

def make_decision(fusion_result: dict) -> dict:
    """Enhanced decision making with detailed analysis"""
    confidence = fusion_result["final_confidence"]
    uncertainty = fusion_result["uncertainty"]
    individual_scores = fusion_result["individual_scores"]

    # Dynamic thresholding based on uncertainty
    base_threshold = 0.7
    adjusted_threshold = base_threshold + (uncertainty * 0.1)

    # Analyze score distribution
    score_distribution = analyze_score_distribution(individual_scores)

    # Decision making
    is_fake = confidence < adjusted_threshold
    certainty_level = calculate_certainty_level(confidence, uncertainty)

    # Generate detailed explanation
    explanation = generate_detailed_explanation(
        is_fake, confidence, uncertainty,
        individual_scores, score_distribution
    )

    return {
        "is_fake": is_fake,
        "confidence": confidence,
        "uncertainty": uncertainty,
        "certainty_level": certainty_level,
        "threshold_used": adjusted_threshold,
        "score_distribution": score_distribution,
        "explanation": explanation
    }

def analyze_score_distribution(scores: dict) -> dict:
    """Analyze the distribution of individual scores"""
    values = np.array(list(scores.values()))
    return {
        "mean": float(np.mean(values)),
        "std": float(np.std(values)),
        "min": float(np.min(values)),
        "max": float(np.max(values)),
        "range": float(np.ptp(values)),
        "consistency": float(1 - (np.std(values) / np.mean(values))) if np.mean(values) != 0 else 0.0
    }

def calculate_certainty_level(confidence: float, uncertainty: float) -> str:
    """Calculate the certainty level of the decision"""
    if uncertainty > 0.3:
        return "Low"
    elif uncertainty > 0.15:
        return "Medium"
    else:
        return "High"

def generate_detailed_explanation(
    is_fake: bool,
    confidence: float,
    uncertainty: float,
    individual_scores: dict,
    score_distribution: dict
) -> str:
    """Generate a detailed explanation of the decision"""
    explanation_parts = []

    # Overall decision
    decision_text = "likely manipulated" if is_fake else "likely authentic"
    explanation_parts.append(f"The media is {decision_text} with {confidence:.1%} confidence.")

    # Uncertainty analysis
    explanation_parts.append(
        f"The uncertainty level is {uncertainty:.1%}, indicating a "
        f"{'high' if uncertainty > 0.3 else 'moderate' if uncertainty > 0.15 else 'low'} "
        "level of prediction variability."
    )

    # Individual score analysis
    strongest_evidence = max(individual_scores.items(), key=lambda x: x[1])
    weakest_evidence = min(individual_scores.items(), key=lambda x: x[1])

    explanation_parts.append(
        f"The strongest evidence comes from {strongest_evidence[0]} "
        f"({strongest_evidence[1]:.1%}), while the weakest indicator is "
        f"{weakest_evidence[0]} ({weakest_evidence[1]:.1%})."
    )

    # Score consistency
    explanation_parts.append(
        f"The consistency between different detection methods is "
        f"{score_distribution['consistency']:.1%}."
    )

    return " ".join(explanation_parts)

In [7]:
from langgraph.graph import Graph, END

def create_detection_graph():
    """Create enhanced LangGraph workflow"""
    workflow = Graph()

    def preprocess(state):
        if state["input_type"] == "video":
            video_data = enhanced_video_processing(state["input_path"], state["env"]["models"])
        else:
            video_data = enhanced_image_processing(state["input_path"], state["env"]["models"])
        return {"video_data": video_data}

    def visual_analysis(state):
        frames = state["video_data"]["frames"] if "frames" in state["video_data"] else [state["video_data"]["image"]]
        optical_flow = state["video_data"].get("optical_flow", [])

        # Run visual analysis agents in parallel
        spatial_results = spatial_inconsistency_agent(frames, state["env"]["device"])
        temporal_results = temporal_coherence_agent(frames, optical_flow)
        facial_results = facial_analysis_agent(frames, state["env"]["models"]["face_detector"])

        return {
            "visual_results": {
                "spatial": spatial_results,
                "temporal": temporal_results,
                "facial": facial_results
            }
        }

    def audio_analysis(state):
        if "audio" in state["video_data"]:
            audio_data = state["video_data"]["audio"]
            results = audio_analysis_agent(audio_data)
            return {"audio_results": results}
        return {"audio_results": {}}

    def semantic_analysis(state):
        results = semantic_analysis_agent(state["video_data"])
        return {"semantic_results": results}

    def decision(state):
        all_results = [
            state["visual_results"]["spatial"],
            state["visual_results"]["temporal"],
            state["visual_results"]["facial"],
            state["audio_results"],
            state["semantic_results"]
        ]

        fusion_result = feature_fusion(all_results, state["video_data"]["metadata"])
        final_decision = make_decision(fusion_result)

        return {"decision": final_decision, "end": True}

    # Define workflow nodes
    workflow.add_node("preprocess", preprocess)
    workflow.add_node("visual_analysis", visual_analysis)
    workflow.add_node("audio_analysis", audio_analysis)
    workflow.add_node("semantic_analysis", semantic_analysis)
    workflow.add_node("decision", decision)

    # Define workflow edges
    workflow.add_edge("preprocess", "visual_analysis")
    workflow.add_edge("preprocess", "audio_analysis")
    workflow.add_edge("preprocess", "semantic_analysis")
    workflow.add_edge("visual_analysis", "decision")
    workflow.add_edge("audio_analysis", "decision")
    workflow.add_edge("semantic_analysis", "decision")

    return workflow

In [None]:
import datetime
import json

def detect_deepfake(input_path: str, input_type: str = "video", verbose: bool = False) -> dict:
    """Enhanced main function to run deepfake detection"""
    try:
        # Setup environment
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        models = setup_models(device)
        env = {
            "device": device,
            "models": models
        }

        # Create and compile workflow
        workflow = create_detection_graph()

        # Configure logging
        if verbose:
            print("Verbose mode enabled")
        else:
            print("Verbose mode disabled")

        # Run detection
        config = {
            "input_path": input_path,
            "input_type": input_type,
            "env": env
        }

        print("Starting deepfake detection pipeline...")
        result = workflow.run(config)
        print("Detection pipeline completed successfully.")

        return format_forensic_report(result["decision"])

    except Exception as e:
        print(f"Error in deepfake detection: {e}")
        return {
            "error": str(e),
            "is_fake": None,
            "confidence": 0.0,
            "explanation": "An error occurred during detection."
        }

def format_forensic_report(decision: dict) -> dict:
    """Format the decision as a forensic report"""
    report = {
        "timestamp": datetime.datetime.now().isoformat(),
        "is_fake": decision["is_fake"],
        "confidence": decision["confidence"],
        "uncertainty": decision["uncertainty"],
        "certainty_level": decision["certainty_level"],
        "threshold_used": decision["threshold_used"],
        "score_distribution": decision["score_distribution"],
        "explanation": decision["explanation"]
    }
    return report

# Example usage
if __name__ == "__main__":
    # Directly specify the input path, type, and verbosity
    input_path = "path/to/your/video.mp4"  # or "path/to/your/image.jpg"
    input_type = "video"  # or "image"
    verbose = True

    result = detect_deepfake(input_path, input_type, verbose=verbose)
    print("\nDeepfake Detection Forensic Report:")
    print(json.dumps(result, indent=2))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/22.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

In [None]:
# Semantic Analysis Agents

def setup_llm_agent():
    """Setup enhanced LLM agent"""
    llm = ChatOpenAI(model="gpt-4-vision-preview", temperature=0.2)

    prompt = ChatPromptTemplate.from_messages([
        ("system", """Analyze the following media content for signs of manipulation.
        Consider:
        1. Visual consistency and artifacts
        2. Audio-visual synchronization
        3. Natural movement and expressions
        4. Lighting and shadow consistency
        5. Edge artifacts and blending issues

        Provide a detailed analysis with confidence scores."""),
        ("human", "{input}")
    ])

    return llm, prompt

@tool
def semantic_analysis_agent(input_data: Dict[str, Any]) -> Dict[str, float]:
    """Enhanced semantic analysis"""
    llm, prompt = setup_llm_agent()

    # Prepare input data
    analysis_text = {
        "video_metadata": input_data.get("metadata", {}),
        "detected_faces": len(face_recognition.face_locations(input_data["frames"][0])) if "frames" in input_data else 0,
        "video_length": len(input_data["frames"]) if "frames" in input_data else 0,
        "audio_features": bool(input_data.get("audio", {}))
    }

    chain = prompt | llm | JsonOutputParser()

    try:
        result = chain.invoke({"input": json.dumps(analysis_text)})
        confidence = result.get("confidence", 0.0)

        # Additional analysis of response content
        response_detail = result.get("analysis", "")
        detail_score = analyze_llm_response_detail(response_detail)

        return {
            "semantic_confidence": float(confidence),
            "detail_score": float(detail_score),
            "analysis": response_detail
        }
    except Exception as e:
        print(f"Error in semantic analysis: {e}")
        return {"semantic_confidence": 0.0, "error": str(e)}

def analyze_llm_response_detail(response: str) -> float:
    """Analyze the detail level of LLM response"""
    # Consider factors like response length, specific terminology use, etc.
    detail_metrics = {
        "length": len(response) / 1000,  # Normalize by 1000 chars
        "technical_terms": len([word for word in response.lower().split()
                              if word in ["artifact", "consistency", "synchronization",
                                        "manipulation", "synthetic", "generated"]]) / 10
    }
    return np.mean(list(detail_metrics.values()))

# Integration and Decision Making

def feature_fusion(results: List[Dict[str, float]], metadata: Dict[str, Any]) -> Dict[str, float]:
    """Enhanced feature fusion with adaptive weighting"""
    # Initialize weights based on metadata
    base_weights = {
        "spatial_confidence": 0.25,
        "temporal_confidence": 0.20,
        "facial_confidence": 0.20,
        "audio_confidence": 0.15,
        "semantic_confidence": 0.20
    }

    # Adjust weights based on video properties
    if metadata.get("video_info", {}).get("fps", 0) < 20:
        base_weights["temporal_confidence"] *= 0.8
        base_weights["spatial_confidence"] *= 1.2

    # Collect all confidence scores
    confidence_scores = {}
    for result in results:
        for key, value in result.items():
            if "confidence" in key and isinstance(value, (int, float)):
                confidence_scores[key] = value

    # Calculate weighted average
    total_weight = 0
    weighted_sum = 0

    for key, value in confidence_scores.items():
        weight = base_weights.get(key, 0.1)  # Default weight for unknown metrics
        weighted_sum += value * weight
        total_weight += weight

    weighted_avg = weighted_sum / total_weight if total_weight > 0 else 0.0

    # Calculate uncertainty
    variances = [(score - weighted_avg) ** 2 for score in confidence_scores.values()]
    uncertainty = np.sqrt(np.mean(variances)) if variances else 0.0

    return {
        "final_confidence": float(weighted_avg),
        "uncertainty": float(uncertainty),
        "individual_scores": confidence_scores
    }

In [None]:
def make_decision(fusion_result: Dict[str, float]) -> Dict[str, Any]:
    """Enhanced decision making with detailed analysis"""
    confidence = fusion_result["final_confidence"]
    uncertainty = fusion_result["uncertainty"]
    individual_scores = fusion_result["individual_scores"]

    # Dynamic thresholding based on uncertainty
    base_threshold = 0.7
    adjusted_threshold = base_threshold + (uncertainty * 0.1)

    # Analyze score distribution
    score_distribution = analyze_score_distribution(individual_scores)

    # Decision making
    is_fake = confidence < adjusted_threshold
    certainty_level = calculate_certainty_level(confidence, uncertainty)

    # Generate detailed explanation
    explanation = generate_detailed_explanation(
        is_fake, confidence, uncertainty,
        individual_scores, score_distribution
    )

    return {
        "is_fake": is_fake,
        "confidence": confidence,
        "uncertainty": uncertainty,
        "certainty_level": certainty_level,
        "threshold_used": adjusted_threshold,
        "score_distribution": score_distribution,
        "explanation": explanation
    }

def analyze_score_distribution(scores: Dict[str, float]) -> Dict[str, Any]:
    """Analyze the distribution of individual scores"""
    values = np.array(list(scores.values()))
    return {
        "mean": float(np.mean(values)),
        "std": float(np.std(values)),
        "min": float(np.min(values)),
        "max": float(np.max(values)),
        "range": float(np.ptp(values)),
        "consistency": float(1 - (np.std(values) / np.mean(values))) if np.mean(values) != 0 else 0.0
    }

def calculate_certainty_level(confidence: float, uncertainty: float) -> str:
    """Calculate the certainty level of the decision"""
    if uncertainty > 0.3:
        return "Low"
    elif uncertainty > 0.15:
        return "Medium"
    else:
        return "High"

def generate_detailed_explanation(
    is_fake: bool,
    confidence: float,
    uncertainty: float,
    individual_scores: Dict[str, float],
    score_distribution: Dict[str, Any]
) -> str:
    """Generate a detailed explanation of the decision"""
    explanation_parts = []

    # Overall decision
    decision_text = "likely manipulated" if is_fake else "likely authentic"
    explanation_parts.append(f"The media is {decision_text} with {confidence:.1%} confidence.")

    # Uncertainty analysis
    explanation_parts.append(
        f"The uncertainty level is {uncertainty:.1%}, indicating a "
        f"{'high' if uncertainty > 0.3 else 'moderate' if uncertainty > 0.15 else 'low'} "
        "level of prediction variability."
    )

    # Individual score analysis
    strongest_evidence = max(individual_scores.items(), key=lambda x: x[1])
    weakest_evidence = min(individual_scores.items(), key=lambda x: x[1])

    explanation_parts.append(
        f"The strongest evidence comes from {strongest_evidence[0]} "
        f"({strongest_evidence[1]:.1%}), while the weakest indicator is "
        f"{weakest_evidence[0]} ({weakest_evidence[1]:.1%})."
    )

    # Score consistency
    explanation_parts.append(
        f"The consistency between different detection methods is "
        f"{score_distribution['consistency']:.1%}."
    )

    return " ".join(explanation_parts)

In [None]:
# Main Pipeline

def create_detection_graph():
    """Create enhanced LangGraph workflow"""
    workflow = Graph()

    @workflow.node("preprocess")
    def preprocess(state):
        video_data = preprocess_video(state["input_path"])
        return {"video_data": video_data}

    @workflow.node("visual_analysis")
    def visual_analysis(state):
        frames = state["video_data"]["frames"]
        optical_flow = state["video_data"]["optical_flow"]

        # Run visual analysis agents in parallel
        spatial_results = spatial_inconsistency_agent(frames, state["env"]["device"])
        temporal_results = temporal_coherence_agent(frames, optical_flow)
        facial_results = facial_analysis_agent(frames, state["env"]["face_detector"])

        return {
            "visual_results": {
                "spatial": spatial_results,
                "temporal": temporal_results,
                "facial": facial_results
            }
        }

    @workflow.node("audio_analysis")
    def audio_analysis(state):
        audio_data = state["video_data"]["audio"]
        results = audio_analysis_agent(audio_data)
        return {"audio_results": results}

    @workflow.node("semantic_analysis")
    def semantic_analysis(state):
        results = semantic_analysis_agent(state["video_data"])
        return {"semantic_results": results}

    @workflow.node("decision")
    def decision(state):
        all_results = [
            state["visual_results"]["spatial"],
            state["visual_results"]["temporal"],
            state["visual_results"]["facial"],
            state["audio_results"],
            state["semantic_results"]
        ]

        fusion_result = feature_fusion(all_results, state["video_data"]["metadata"])
        final_decision = make_decision(fusion_result)

        return {"decision": final_decision, "end": True}

    # Define workflow
    workflow.set_entry_point("preprocess")
    workflow.add_edge("preprocess", "visual_analysis")
    workflow.add_edge("preprocess", "audio_analysis")
    workflow.add_edge("preprocess", "semantic_analysis")
    workflow.add_edge("visual_analysis", "decision")
    workflow.add_edge("audio_analysis", "decision")
    workflow.add_edge("semantic_analysis", "decision")

    return workflow

def detect_deepfake(video_path: str, verbose: bool = False) -> Dict[str, Any]:
    """Enhanced main function to run deepfake detection"""
    try:
        # Setup environment
        env = setup_environment()

        # Create and compile workflow
        workflow = create_detection_graph()

        # Configure logging
        if verbose:
            print("Verbose mode enabled")
        else:
            print("Verbose mode disabled")

        # Run detection
        config = {
            "input_path": video_path,
            "env": env
        }

        print("Starting deepfake detection pipeline...")
        result = workflow.run(config)
        print("Detection pipeline completed successfully.")

        return result["decision"]

    except Exception as e:
        print(f"Error in deepfake detection: {e}")
        return {
            "error": str(e),
            "is_fake": None,
            "confidence": 0.0,
            "explanation": "An error occurred during detection."
        }

In [None]:
# Main Pipeline

def create_detection_graph():
    """Create enhanced LangGraph workflow"""
    workflow = Graph()

    @workflow.node("preprocess")
    def preprocess(state):
        video_data = preprocess_video(state["input_path"])
        return {"video_data": video_data}

    @workflow.node("visual_analysis")
    def visual_analysis(state):
        frames = state["video_data"]["frames"]
        optical_flow = state["video_data"]["optical_flow"]

        # Run visual analysis agents in parallel
        spatial_results = spatial_inconsistency_agent(frames, state["env"]["device"])
        temporal_results = temporal_coherence_agent(frames, optical_flow)
        facial_results = facial_analysis_agent(frames, state["env"]["face_detector"])

        return {
            "visual_results": {
                "spatial": spatial_results,
                "temporal": temporal_results,
                "facial": facial_results
            }
        }

    @workflow.node("audio_analysis")
    def audio_analysis(state):
        audio_data = state["video_data"]["audio"]
        results = audio_analysis_agent(audio_data)
        return {"audio_results": results}

    @workflow.node("semantic_analysis")
    def semantic_analysis(state):
        results = semantic_analysis_agent(state["video_data"])
        return {"semantic_results": results}

    @workflow.node("decision")
    def decision(state):
        all_results = [
            state["visual_results"]["spatial"],
            state["visual_results"]["temporal"],
            state["visual_results"]["facial"],
            state["audio_results"],
            state["semantic_results"]
        ]

        fusion_result = feature_fusion(all_results, state["video_data"]["metadata"])
        final_decision = make_decision(fusion_result)

        return {"decision": final_decision, "end": True}

    # Define workflow
    workflow.set_entry_point("preprocess")
    workflow.add_edge("preprocess", "visual_analysis")
    workflow.add_edge("preprocess", "audio_analysis")
    workflow.add_edge("preprocess", "semantic_analysis")
    workflow.add_edge("visual_analysis", "decision")
    workflow.add_edge("audio_analysis", "decision")
    workflow.add_edge("semantic_analysis", "decision")

    return workflow

def detect_deepfake(video_path: str, verbose: bool = False) -> Dict[str, Any]:
    """Enhanced main function to run deepfake detection"""
    try:
        # Setup environment
        env = setup_environment()

        # Create and compile workflow
        workflow = create_detection_graph()

        # Configure logging
        if verbose:
            print("Verbose mode enabled")
        else:
            print("Verbose mode disabled")

        # Run detection
        config = {
            "input_path": video_path,
            "env": env
        }

        print("Starting deepfake detection pipeline...")
        result = workflow.run(config)
        print("Detection pipeline completed successfully.")

        return result["decision"]

    except Exception as e:
        print(f"Error in deepfake detection: {e}")
        return {
            "error": str(e),
            "is_fake": None,
            "confidence": 0.0,
            "explanation": "An error occurred during detection."
        }

In [None]:
# Example usage
if __name__ == "__main__":
    # Directly specify the video path and verbosity
    video_path = "path/to/your/video.mp4"
    verbose = True

    result = detect_deepfake(video_path, verbose=verbose)
    print("\nDeepfake Detection Results:")
    print(json.dumps(result, indent=2))

Using device: cuda
Error in deepfake detection: 'Graph' object has no attribute 'node'

Deepfake Detection Results:
{
  "error": "'Graph' object has no attribute 'node'",
  "is_fake": null,
  "confidence": 0.0,
  "explanation": "An error occurred during detection."
}
