<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_and_Manipulated_Media_Analysis_R%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Deepfake Detection and Manipulated Media Analysis using Multiagent System and Compound AI Approach**

In [1]:
!pip install -q langchain langchain-community langgraph torch transformers opencv-python librosa numpy face-recognition dlib mediapipe scipy pillow tqdm pydantic

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.2/138.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone


In [3]:
from typing import Dict, List, Any, Tuple, Optional
import torch
import numpy as np
import cv2
import librosa
import face_recognition
import mediapipe as mp
from pydantic import BaseModel, Field
from datetime import datetime
import json
from transformers import (
    AutoProcessor, AutoModelForVideoClassification, AutoModelForAudioClassification,
    CLIPProcessor, CLIPModel, Blip2Processor, Blip2ForConditionalGeneration,
    VideoMAEFeatureExtractor, VideoMAEForVideoClassification, WhisperProcessor,
    WhisperForAudioClassification, LayoutLMv3Processor, LayoutLMv3ForSequenceClassification,
    OwlViTProcessor, OwlViTForObjectDetection, InstructBlipProcessor,
    InstructBlipForConditionalGeneration, ImageGPTForCausalImageModeling,
    TimesformerForVideoClassification, AutoModelForAudioClassification,
    ASTForAudioClassification, Wav2Vec2ForSequenceClassification
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langgraph.graph import Graph, END

In [4]:
# Pydantic models for structured output
class AnalysisMetrics(BaseModel):
    score: float = Field(..., description="Confidence score between 0 and 1")
    anomalies: List[str] = Field(default_factory=list, description="Detected anomalies")
    confidence_level: str = Field(..., description="Low/Medium/High confidence assessment")

class ModalityAnalysis(BaseModel):
    visual: AnalysisMetrics
    temporal: AnalysisMetrics
    facial: AnalysisMetrics
    audio: AnalysisMetrics
    semantic: AnalysisMetrics
    behavioral: AnalysisMetrics

class ForensicReport(BaseModel):
    timestamp: datetime = Field(default_factory=datetime.now)
    file_metadata: Dict[str, Any]
    analysis_results: ModalityAnalysis
    final_verdict: str
    confidence_score: float
    risk_assessment: str
    evidence_summary: List[str]
    recommendations: List[str]

In [6]:
def setup_enhanced_environment():
    """Initialize advanced environment with additional models"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    models = {
        # Video Analysis Models
        "videomae": VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics"),
        "timesformer": TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400"),

        # Vision Models
        "clip": CLIPModel.from_pretrained("openai/clip-vit-large-patch14"),
        "owlvit": OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32"),
        "instructblip": InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b"),

        # Audio Models
        "whisper": WhisperForAudioClassification.from_pretrained("openai/whisper-large-v3"),
        "wav2vec2": Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base"),
        "ast": ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593"),

        # Face Analysis
        "face_detector": mp.solutions.face_detection.FaceDetection(min_detection_confidence=0.7),
        "face_mesh": mp.solutions.face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.7
        ),

        # Processors
        "processors": {
            "clip": CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14"),
            "owlvit": OwlViTProcessor.from_pretrained("google/owlvit-base-patch32"),
            "instructblip": InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b"),
            "whisper": WhisperProcessor.from_pretrained("openai/whisper-large-v3")
        }
    }

    # Move models to device
    for key, model in models.items():
        if isinstance(model, torch.nn.Module):
            models[key] = model.to(device)

    return {"device": device, "models": models}

In [7]:
def enhanced_preprocess_video(video_path: str) -> Dict[str, Any]:
    """Enhanced preprocessing with additional feature extraction"""
    cap = cv2.VideoCapture(video_path)
    frames = []
    audio_data = None
    metadata = {
        "fps": cap.get(cv2.CAP_PROP_FPS),
        "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
        "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        "duration": float(cap.get(cv2.CAP_PROP_FRAME_COUNT)) / float(cap.get(cv2.CAP_PROP_FPS))
    }

    # Enhanced frame extraction with quality metrics
    frame_quality_metrics = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(rgb_frame)

        # Calculate frame quality metrics
        blur = cv2.Laplacian(frame, cv2.CV_64F).var()
        noise = np.std(frame)
        frame_quality_metrics.append({
            "blur": blur,
            "noise": noise
        })

    cap.release()

    # Enhanced audio extraction
    try:
        y, sr = librosa.load(video_path)

        # Extract advanced audio features
        audio_data = {
            "raw": y,
            "sr": sr,
            "mfcc": librosa.feature.mfcc(y=y, sr=sr),
            "spectral_contrast": librosa.feature.spectral_contrast(y=y, sr=sr),
            "chroma": librosa.feature.chroma_stft(y=y, sr=sr),
            "onset_env": librosa.onset.onset_strength(y=y, sr=sr),
            "tempo": librosa.beat.tempo(y=y, sr=sr)[0]
        }
    except Exception as e:
        print(f"Audio extraction error: {e}")
        audio_data = None

    return {
        "frames": frames,
        "audio": audio_data,
        "metadata": metadata,
        "frame_quality": frame_quality_metrics
    }

def advanced_visual_analysis_agent(
    frames: List[np.ndarray],
    models: Dict[str, Any],
    device: torch.device
) -> AnalysisMetrics:
    """Advanced visual analysis using multiple models"""
    anomalies = []
    scores = []

    # CLIP Analysis
    for frame in frames[::10]:  # Analyze every 10th frame for efficiency
        clip_inputs = models["processors"]["clip"](images=frame, return_tensors="pt").to(device)
        clip_features = models["clip"](**clip_inputs).image_features
        clip_score = float(clip_features.mean().item())
        scores.append(clip_score)

        if clip_score < 0.5:
            anomalies.append(f"Low CLIP confidence at frame {len(scores) * 10}")

    # OWL-ViT Object Detection
    for frame in frames[::30]:
        inputs = models["processors"]["owlvit"](images=frame, return_tensors="pt").to(device)
        outputs = models["owlvit"](**inputs)

        # Analyze object detection confidence
        if outputs.logits.mean().item() < 0.3:
            anomalies.append("Suspicious object detection patterns detected")

    # InstructBLIP Analysis
    prompts = [
        "Describe any visual artifacts or inconsistencies in this image.",
        "Are there any unnatural elements in this image?",
        "Analyze the lighting and shadows in this image."
    ]

    for frame in frames[::60]:
        for prompt in prompts:
            inputs = models["processors"]["instructblip"](images=frame, text=prompt, return_tensors="pt").to(device)
            outputs = models["instructblip"].generate(**inputs)

            # Analysis logic here
            if "artifact" in outputs or "unnatural" in outputs:
                anomalies.append(f"InstructBLIP detected potential artifacts: {outputs}")

    # Calculate final metrics
    avg_score = np.mean(scores)
    confidence_level = "High" if avg_score > 0.8 else "Medium" if avg_score > 0.6 else "Low"

    return AnalysisMetrics(
        score=float(avg_score),
        anomalies=anomalies,
        confidence_level=confidence_level
    )

def advanced_audio_analysis_agent(
    audio_data: Dict[str, Any],
    models: Dict[str, Any],
    device: torch.device
) -> AnalysisMetrics:
    """Advanced audio analysis using multiple models"""
    if not audio_data:
        return AnalysisMetrics(
            score=0.0,
            anomalies=["No audio data available"],
            confidence_level="Low"
        )

    anomalies = []
    scores = []

    # Whisper Analysis
    whisper_inputs = models["processors"]["whisper"](
        audio_data["raw"],
        return_tensors="pt"
    ).to(device)
    whisper_outputs = models["whisper"](**whisper_inputs)
    whisper_score = whisper_outputs.logits.softmax(dim=1).mean().item()
    scores.append(whisper_score)

    # Wav2Vec2 Analysis
    wav2vec_outputs = models["wav2vec2"](
        torch.tensor(audio_data["raw"]).unsqueeze(0).to(device)
    )
    wav2vec_score = wav2vec_outputs.logits.softmax(dim=1).max().item()
    scores.append(wav2vec_score)

    # AST Analysis
    ast_outputs = models["ast"](
        torch.tensor(audio_data["raw"]).unsqueeze(0).to(device)
    )
    ast_score = ast_outputs.logits.softmax(dim=1).max().item()
    scores.append(ast_score)

    # Advanced audio feature analysis
    mfcc_variance = np.var(audio_data["mfcc"])
    if mfcc_variance > 1.5:
        anomalies.append("Unusual MFCC patterns detected")

    tempo_consistency = np.std(audio_data["onset_env"]) / np.mean(audio_data["onset_env"])
    if tempo_consistency > 0.5:
        anomalies.append("Inconsistent tempo patterns detected")

    # Calculate final metrics
    avg_score = np.mean(scores)
    confidence_level = "High" if avg_score > 0.8 else "Medium" if avg_score > 0.6 else "Low"

    return AnalysisMetrics(
        score=float(avg_score),
        anomalies=anomalies,
        confidence_level=confidence_level
    )

In [8]:
def generate_forensic_report(
    analysis_results: Dict[str, AnalysisMetrics],
    metadata: Dict[str, Any]
) -> ForensicReport:
    """Generate comprehensive forensic report"""
    modality_analysis = ModalityAnalysis(
        visual=analysis_results["visual"],
        temporal=analysis_results["temporal"],
        facial=analysis_results["facial"],
        audio=analysis_results["audio"],
        semantic=analysis_results["semantic"],
        behavioral=analysis_results["behavioral"]
    )

    # Calculate final confidence score
    weights = {
        "visual": 0.25,
        "temporal": 0.2,
        "facial": 0.2,
        "audio": 0.15,
        "semantic": 0.1,
        "behavioral": 0.1
    }

    final_score = sum(
        analysis_results[key].score * weights[key]
        for key in weights
    )

    # Determine verdict
    verdict = "AUTHENTIC" if final_score > 0.7 else "MANIPULATED"

    # Risk assessment
    risk_levels = {
        (0.8, 1.0): "Low Risk - High confidence in authenticity",
        (0.6, 0.8): "Medium Risk - Some suspicious patterns detected",
        (0.0, 0.6): "High Risk - Strong indicators of manipulation"
    }

    risk_assessment = next(
        desc for (lower, upper), desc in risk_levels.items()
        if lower <= final_score < upper
    )

    # Compile evidence summary
    evidence_summary = []
    for modality, metrics in analysis_results.items():
        if metrics.anomalies:
            evidence_summary.extend(
                f"{modality.capitalize()} Analysis: {anomaly}"
                for anomaly in metrics.anomalies
            )

    # Generate recommendations
    recommendations = [
        "Conduct additional manual review by forensic experts",
        "Cross-reference with original source material if available",
        "Verify temporal consistency across all frames",
        "Check audio-visual synchronization in detail"
    ]

    if final_score < 0.6:
        recommendations.extend([
            "Flag content for immediate review",
            "Consider restricting distribution until verification"
        ])

    return ForensicReport(
        file_metadata=metadata,
        analysis_results=modality_analysis,
        final_verdict=verdict,
        confidence_score=final_score,
        risk_assessment=risk_assessment,
        evidence_summary=evidence_summary,
        recommendations=recommendations
    )

def run_enhanced_detection(video_path: str, verbose: bool = False) -> ForensicReport:
    """Main function to run enhanced deepfake detection pipeline"""
    try:
        # Setup environment
        env = setup_enhanced_environment()

        if verbose:
            print("Initialized environment and models")

        # Preprocess video
        preprocessed_data = enhanced_preprocess_video(video_path)

        if verbose:
            print("Completed video preprocessing")

        # Run analysis agents
        analysis_results = {
            "visual": advanced_visual_analysis_agent(
                preprocessed_data["frames"],
                env["models"],
                env["device"]
            ),
            "temporal": temporal_coherence_agent(
                preprocessed_data["frames"],
                preprocessed_data.get("optical_flow", [])
            ),
            "facial": facial_analysis_agent(
                preprocessed_data["frames"],
                env["models"]["face_detector"]
            ),
            "audio": advanced_audio_analysis_agent(
                preprocessed_data["audio"],
                env["models"],
                env["device"]
            ),
            "semantic": semantic_analysis_agent(preprocessed_data),
            "behavioral": behavioral_analysis_agent(preprocessed_data)

SyntaxError: incomplete input (<ipython-input-8-f070329f0ab6>, line 114)

In [9]:
def temporal_coherence_agent(
    frames: List[np.ndarray],
    optical_flow: List[np.ndarray]
) -> AnalysisMetrics:
    """Advanced temporal coherence analysis"""
    anomalies = []
    scores = []

    # Calculate frame-to-frame consistency
    for i in range(len(frames) - 1):
        # Calculate structural similarity
        ssim = cv2.compareSSIM(frames[i], frames[i+1], multichannel=True)
        scores.append(ssim)

        if ssim < 0.5:
            anomalies.append(f"Abrupt scene change detected at frame {i}")

    # Analyze motion patterns
    if optical_flow:
        flow_magnitudes = [np.mean(np.abs(flow)) for flow in optical_flow]
        flow_consistency = np.std(flow_magnitudes) / np.mean(flow_magnitudes)

        if flow_consistency > 0.5:
            anomalies.append("Inconsistent motion patterns detected")
            scores.append(1 - flow_consistency)

    avg_score = np.mean(scores)
    confidence_level = "High" if avg_score > 0.8 else "Medium" if avg_score > 0.6 else "Low"

    return AnalysisMetrics(
        score=float(avg_score),
        anomalies=anomalies,
        confidence_level=confidence_level
    )

In [10]:
def behavioral_analysis_agent(video_data: Dict[str, Any]) -> AnalysisMetrics:
    """Analyze behavioral patterns and inconsistencies"""
    frames = video_data["frames"]
    anomalies = []
    scores = []

    # Initialize pose estimation
    pose_estimator = mp.solutions.pose.Pose(
        static_image_mode=False,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.7
    )

    # Analyze pose landmarks over time
    pose_landmarks = []
    for frame in frames:
        results = pose_estimator.process(frame)
        if results.pose_landmarks:
            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in results.pose_landmarks.landmark])
            pose_landmarks.append(landmarks)

    if pose_landmarks:
        # Analyze movement smoothness
        movement_smoothness = analyze_movement_smoothness(pose_landmarks)
        scores.append(movement_smoothness)

        if movement_smoothness < 0.6:
            anomalies.append("Unnatural body movement patterns detected")

        # Analyze joint angles
        joint_consistency = analyze_joint_angles(pose_landmarks)
        scores.append(joint_consistency)

        if joint_consistency < 0.5:
            anomalies.append("Inconsistent joint movements detected")

    # Add gaze analysis if faces are detected
    face_mesh = mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=1,
        min_detection_confidence=0.7
    )

    gaze_patterns = analyze_gaze_patterns(frames, face_mesh)
    if gaze_patterns:
        scores.append(gaze_patterns)
        if gaze_patterns < 0.6:
            anomalies.append("Unnatural gaze patterns detected")

    avg_score = np.mean(scores) if scores else 0.0
    confidence_level = "High" if avg_score > 0.8 else "Medium" if avg_score > 0.6 else "Low"

    return AnalysisMetrics(
        score=float(avg_score),
        anomalies=anomalies,
        confidence_level=confidence_level
    )

In [11]:
def analyze_movement_smoothness(pose_landmarks: List[np.ndarray]) -> float:
    """Analyze smoothness of movement trajectories"""
    if len(pose_landmarks) < 3:
        return 0.0

    # Calculate velocity and acceleration
    velocities = np.diff(pose_landmarks, axis=0)
    accelerations = np.diff(velocities, axis=0)

    # Calculate jerk (rate of change of acceleration)
    jerk = np.diff(accelerations, axis=0)

    # Normalize jerk score (lower jerk = smoother movement)
    jerk_score = 1.0 - min(np.mean(np.abs(jerk)), 1.0)

    return float(jerk_score)

def analyze_joint_angles(pose_landmarks: List[np.ndarray]) -> float:
    """Analyze consistency of joint angles over time"""
    if len(pose_landmarks) < 2:
        return 0.0

    # Define key joint connections
    joint_connections = [
        # Upper body
        ([11, 13, 15], "right_arm"),  # right shoulder -> elbow -> wrist
        ([12, 14, 16], "left_arm"),   # left shoulder -> elbow -> wrist
        # Lower body
        ([23, 25, 27], "right_leg"),  # right hip -> knee -> ankle
        ([24, 26, 28], "left_leg")    # left hip -> knee -> ankle
    ]

    angle_scores = []
    for joints, _ in joint_connections:
        angles = []
        for landmarks in pose_landmarks:
            v1 = landmarks[joints[1]] - landmarks[joints[0]]
            v2 = landmarks[joints[2]] - landmarks[joints[1]]
            angle = np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
            angles.append(angle)

        # Calculate angle consistency
        angle_consistency = 1.0 - min(np.std(angles) / np.pi, 1.0)
        angle_scores.append(angle_consistency)

    return float(np.mean(angle_scores)) if angle_scores else 0.0

def analyze_gaze_patterns(
    frames: List[np.ndarray],
    face_mesh: mp.solutions.face_mesh.FaceMesh
) -> float:
    """Analyze naturalness of gaze patterns"""
    # Eye landmarks indices for MediaPipe Face Mesh
    LEFT_EYE = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]
    RIGHT_EYE = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]

    gaze_directions = []
    for frame in frames:
        results = face_mesh.process(frame)
        if results.multi_face_landmarks:
            face_landmarks = results.multi_face_landmarks[0]

            # Extract eye landmarks
            left_eye_points = np.array([[lm.x, lm.y, lm.z] for i, lm in enumerate(face_landmarks.landmark) if i in LEFT_EYE])
            right_eye_points = np.array([[lm.x, lm.y, lm.z] for i, lm in enumerate(face_landmarks.landmark) if i in RIGHT_EYE])

            # Calculate gaze direction
            left_gaze = calculate_gaze_direction(left_eye_points)
            right_gaze = calculate_gaze_direction(right_eye_points)

            # Check for gaze consistency between eyes
            gaze_consistency = 1.0 - min(np.linalg.norm(left_gaze - right_gaze), 1.0)
            gaze_directions.append(gaze_consistency)

    return float(np.mean(gaze_directions)) if gaze_directions else 0.0

def calculate_gaze_direction(eye_points: np.ndarray) -> np.ndarray:
    """Calculate gaze direction from eye landmarks"""
    if len(eye_points) < 2:
        return np.zeros(3)

    # Calculate eye center
    eye_center = np.mean(eye_points, axis=0)

    # Calculate principal direction (approximate gaze direction)
    U, S, Vt = np.linalg.svd(eye_points - eye_center)
    gaze_direction = Vt[0]

    return gaze_direction

In [12]:
def semantic_analysis_agent(video_data: Dict[str, Any]) -> AnalysisMetrics:
    """Enhanced semantic analysis using LLM"""
    llm = ChatOpenAI(model="gpt-4", temperature=0.2)

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """Analyze the following video content for potential manipulation:
        1. Evaluate visual coherence and continuity
        2. Assess natural behavior patterns
        3. Check for physical inconsistencies
        4. Analyze temporal logic
        5. Look for contextual anomalies

        Provide detailed analysis with specific anomalies found."""),
        ("human", "{input}")
    ])

    # Prepare context for LLM
    context = {
        "frame_count": len(video_data["frames"]),
        "duration": video_data["metadata"]["duration"],
        "resolution": f"{video_data['metadata']['width']}x{video_data['metadata']['height']}",
        "has_audio": video_data["audio"] is not None
    }

    parser = PydanticOutputParser(pydantic_object=AnalysisMetrics)
    chain = analysis_prompt | llm | parser

    try:
        result = chain.invoke({"input": str(context)})
        return result
    except Exception as e:
        print(f"Semantic analysis error: {e}")
        return AnalysisMetrics(
            score=0.0,
            anomalies=["Failed to perform semantic analysis"],
            confidence_level="Low"
        )

In [None]:
# Example usage
if __name__ == "__main__":
    video_path = "path/to/your/video.mp4"
    forensic_report = run_enhanced_detection(video_path, verbose=True)

    # Print report in formatted JSON
    print("\nDeepfake Detection Forensic Report:")
    print(json.dumps(forensic_report.dict(), indent=2, default=str))