<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_and_Manipulated_Media_Analysis_R%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Deepfake Detection and Manipulated Media Analysis Using a Multi-Agent System and a Compound AI Approach**

In [None]:
%pip install -q langchain langchain-community langgraph torch transformers opencv-python librosa numpy face-recognition dlib mediapipe scipy pillow tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.2/138.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone


In [11]:
import json
from typing import Dict, List, Any, Tuple, TypedDict

import cv2
import face_recognition
import librosa
import mediapipe as mp
import numpy as np
import torch
from langchain_community.chat_models import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import Graph, StateGraph, END
from transformers import (
    AutoProcessor,
    AutoModelForVideoClassification,
    AutoModelForAudioClassification,
    CLIPProcessor,
    CLIPModel,
    Blip2Processor,
    Blip2ForConditionalGeneration,
    VideoMAEFeatureExtractor,
    VideoMAEForVideoClassification,
    WhisperProcessor,
    WhisperForAudioClassification,
    LayoutLMv3Processor,
    LayoutLMv3ForSequenceClassification
)

In [12]:
def setup_environment():
    """Pick the compute device and load every model used by the analysis agents.

    The analysis agents in this notebook look models up through a module-level
    ``models`` mapping. This function therefore publishes the loaded mapping
    under that global name in addition to returning it, so the agents work as
    soon as the environment has been set up once.

    Returns:
        dict: ``{"device": torch.device, "models": dict}`` where ``models``
        maps a short name (``"clip"``, ``"clip_processor"``, ...) to the
        loaded model, processor or MediaPipe solution.
    """
    global models

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    videomae_checkpoint = "MCG-NJU/videomae-base-finetuned-kinetics"
    clip_checkpoint = "openai/clip-vit-large-patch14"
    blip2_checkpoint = "Salesforce/blip2-opt-2.7b"
    whisper_checkpoint = "openai/whisper-large-v3"

    loaded = {
        "videomae": VideoMAEForVideoClassification.from_pretrained(
            videomae_checkpoint
        ).to(device),
        "videomae_processor": VideoMAEFeatureExtractor.from_pretrained(
            videomae_checkpoint
        ),
        "clip": CLIPModel.from_pretrained(clip_checkpoint).to(device),
        "clip_processor": CLIPProcessor.from_pretrained(clip_checkpoint),
        "blip2": Blip2ForConditionalGeneration.from_pretrained(
            blip2_checkpoint
        ).to(device),
        "blip2_processor": Blip2Processor.from_pretrained(blip2_checkpoint),
        "whisper": WhisperForAudioClassification.from_pretrained(
            whisper_checkpoint
        ).to(device),
        "whisper_processor": WhisperProcessor.from_pretrained(whisper_checkpoint),
        "face_detector": mp.solutions.face_detection.FaceDetection(
            min_detection_confidence=0.5
        ),
        "face_mesh": mp.solutions.face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            min_detection_confidence=0.5
        )
    }

    # Publish for the agents, which read the module-level name `models`.
    models = loaded

    return {"device": device, "models": loaded}

In [13]:
def preprocess_video(video_path: str) -> Dict[str, Any]:
    """Decode a video into RGB frames, audio samples and dense optical flow.

    Args:
        video_path: Path of the video file to read.

    Returns:
        dict with
          * ``"frames"``: list of RGB frames (converted from OpenCV's BGR),
          * ``"audio"``: ``{"raw": samples, "sr": sample_rate}`` or ``None``
            when librosa could not load an audio track,
          * ``"optical_flow"``: one Farneback flow field per consecutive
            frame pair (empty for a single-frame video),
          * ``"metadata"``: ``{"fps": ...}`` as reported by OpenCV.

    Raises:
        ValueError: if the file cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        # Query properties while the capture is still open; asking after
        # release() does not give the real frame rate.
        fps = cap.get(cv2.CAP_PROP_FPS)

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the decoder, even when opening or decoding failed.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be decoded from video: {video_path}")

    # Extract audio if available. Missing audio is not fatal: the audio agent
    # treats None as "no audio".
    audio_data = None
    try:
        y, sr = librosa.load(video_path)
        audio_data = {"raw": y, "sr": sr}
    except Exception as exc:
        print(f"No audio found or error extracting audio: {exc}")

    # Calculate optical flow between consecutive frames. Each frame is
    # converted to grayscale once and reused as the "previous" image of the
    # next pair.
    optical_flow = []
    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_RGB2GRAY)
    for frame in frames[1:]:
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        flow = cv2.calcOpticalFlowFarneback(
            prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
        )
        optical_flow.append(flow)
        prev_gray = gray

    return {
        "frames": frames,
        "audio": audio_data,
        "optical_flow": optical_flow,
        "metadata": {"fps": fps}
    }

In [15]:
def spatial_inconsistency_agent(frames: List[np.ndarray], device: torch.device) -> Dict[str, float]:
    """Score per-frame spatial plausibility with CLIP and BLIP-2.

    Reads the CLIP and BLIP-2 models/processors from the module-level
    ``models`` mapping.

    Args:
        frames: RGB frames to analyse.
        device: Device the CLIP and BLIP-2 inputs are moved to.

    Returns:
        ``{"spatial_confidence": float}`` - 0.6 * mean CLIP score plus
        0.4 * mean BLIP-2 score, or ``0.0`` when ``frames`` is empty.
    """
    if len(frames) == 0:
        # np.mean([]) would yield NaN and poison the downstream fusion.
        return {"spatial_confidence": 0.0}

    # CLIPModel scores images against text, so it needs text inputs and
    # exposes `logits_per_image` (there is no `.logits`). Index 0 is the
    # "authentic" prompt; its probability is the per-frame score.
    prompts = [
        "an authentic, unedited photograph",
        "a digitally manipulated or AI-generated image",
    ]

    clip_scores = []
    blip_scores = []

    # Inference only: do not build autograd graphs for every frame.
    with torch.no_grad():
        for frame in frames:
            # CLIP analysis
            clip_inputs = models["clip_processor"](
                text=prompts, images=frame, return_tensors="pt", padding=True
            ).to(device)
            clip_outputs = models["clip"](**clip_inputs)
            authentic_prob = clip_outputs.logits_per_image.softmax(dim=1)[0, 0]
            clip_scores.append(authentic_prob.item())

            # BLIP2 analysis
            blip_inputs = models["blip2_processor"](images=frame, return_tensors="pt").to(device)
            blip_outputs = models["blip2"].generate(**blip_inputs)
            # generate() returns integer token ids; mean() needs a float tensor.
            # NOTE(review): the mean of token ids is not a probability and is
            # not bounded to [0, 1] - replace with a calibrated signal before
            # trusting this term.
            blip_scores.append(float(blip_outputs.float().mean().item()))

    return {
        "spatial_confidence": float(np.mean(clip_scores) * 0.6 + np.mean(blip_scores) * 0.4)
    }

def temporal_coherence_agent(
    frames: List[np.ndarray],
    optical_flow: List[np.ndarray]
) -> Dict[str, float]:
    """Score temporal coherence from a VideoMAE pass and optical-flow magnitude.

    Reads the VideoMAE model/processor from the module-level ``models``
    mapping.

    Args:
        frames: RGB frames of the video.
        optical_flow: Flow fields between consecutive frames; may be empty.

    Returns:
        ``{"temporal_confidence": float}`` - 0.7 * VideoMAE score plus
        0.3 * flow score, or ``0.0`` when ``frames`` is empty.
    """
    if len(frames) == 0:
        return {"temporal_confidence": 0.0}

    videomae = models["videomae"]

    # VideoMAE works on a fixed-length clip, so sample that many frames evenly
    # across the video instead of feeding every decoded frame. Short videos
    # simply repeat indices.
    clip_len = videomae.config.num_frames
    sample_idx = np.linspace(0, len(frames) - 1, num=clip_len).round().astype(int)
    clip = [frames[i] for i in sample_idx]

    # VideoMAE temporal analysis. Inputs must live on the same device as the
    # model, and no gradients are needed for inference.
    videomae_inputs = models["videomae_processor"](clip, return_tensors="pt").to(videomae.device)
    with torch.no_grad():
        videomae_outputs = videomae(**videomae_inputs)
    # The mean of a softmax row is always 1/num_classes, so it carries no
    # information; use the top-class probability as the model's confidence.
    # NOTE(review): the checkpoint is an action classifier - confirm this is
    # the intended signal for manipulation detection.
    probs = videomae_outputs.logits.softmax(dim=1)
    temporal_score = probs.max(dim=1).values.mean().item()

    # Optical flow analysis. No flow fields (single-frame input) is treated as
    # zero motion rather than producing NaN from np.mean([]).
    if len(optical_flow) > 0:
        flow_consistency = float(np.mean([np.mean(np.abs(flow)) for flow in optical_flow]))
    else:
        flow_consistency = 0.0
    flow_score = 1.0 - min(flow_consistency / 100.0, 1.0)  # Normalize and invert

    return {
        "temporal_confidence": float(temporal_score * 0.7 + flow_score * 0.3)
    }

def facial_analysis_agent(frames: List[np.ndarray], face_detector) -> Dict[str, float]:
    """Score detected faces by detector confidence and landmark spread.

    For each detection the face crop is passed to the face-mesh solution read
    from the module-level ``models`` mapping. The per-face score is
    ``confidence * (1 - min(std(landmarks), 1))``.

    Args:
        frames: RGB frames to analyse.
        face_detector: Object whose ``process(frame)`` returns a result with a
            ``detections`` attribute (MediaPipe FaceDetection in this notebook).

    Returns:
        ``{"facial_confidence": float}`` - mean per-face score, or ``0.0``
        when no face produced landmarks.
    """
    face_scores = []

    for frame in frames:
        # Face detection
        results = face_detector.process(frame)
        if not results.detections:
            continue

        h, w = frame.shape[:2]

        # Analyze each detected face
        for detection in results.detections:
            bbox = detection.location_data.relative_bounding_box
            confidence = detection.score[0]

            # Extract face region. Relative boxes can extend past the image;
            # clamp them so a negative start does not wrap around (numpy
            # negative indexing) and yield a wrong or empty crop.
            x, y = int(bbox.xmin * w), int(bbox.ymin * h)
            width, height = int(bbox.width * w), int(bbox.height * h)
            x0, y0 = max(x, 0), max(y, 0)
            x1, y1 = min(x + width, w), min(y + height, h)
            if x1 <= x0 or y1 <= y0:
                continue
            face_region = np.ascontiguousarray(frame[y0:y1, x0:x1])

            # Face mesh analysis
            mesh_results = models["face_mesh"].process(face_region)
            if not mesh_results.multi_face_landmarks:
                continue

            landmarks = mesh_results.multi_face_landmarks[0]
            # Calculate landmark consistency
            landmark_positions = np.array([[lm.x, lm.y, lm.z] for lm in landmarks.landmark])
            landmark_consistency = np.std(landmark_positions)

            # Combine metrics
            face_score = confidence * (1.0 - min(landmark_consistency, 1.0))
            face_scores.append(float(face_score))

    return {
        "facial_confidence": float(np.mean(face_scores)) if face_scores else 0.0
    }

def audio_analysis_agent(audio_data: Dict[str, Any]) -> Dict[str, float]:
    """Derive an audio score from MFCC/spectral statistics and a Whisper pass.

    Reads the Whisper model/processor from the module-level ``models``
    mapping.

    Args:
        audio_data: ``{"raw": samples, "sr": sample_rate}`` or a falsy value
            when the video has no audio.

    Returns:
        ``{"audio_confidence": float}`` - the mean of ``std(mfcc)``,
        ``mean(spectral_contrast)`` and the Whisper score, or ``0.0`` when
        there is no audio.
    """
    if not audio_data:
        return {"audio_confidence": 0.0}

    # Basic audio features
    y, sr = audio_data["raw"], audio_data["sr"]
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    # Whisper analysis. The feature extractor is built for one sampling rate,
    # so resample to it and pass the rate explicitly instead of feeding audio
    # at whatever rate it was loaded with.
    whisper = models["whisper"]
    whisper_processor = models["whisper_processor"]
    target_sr = whisper_processor.feature_extractor.sampling_rate
    whisper_audio = y if sr == target_sr else librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    whisper_inputs = whisper_processor(
        whisper_audio, sampling_rate=target_sr, return_tensors="pt"
    ).to(whisper.device)  # inputs must be on the model's device
    with torch.no_grad():
        whisper_outputs = whisper(**whisper_inputs)
    # The mean of a softmax row is always 1/num_labels; use the top-class
    # probability as the model's confidence instead.
    whisper_score = whisper_outputs.logits.softmax(dim=1).max(dim=1).values.mean().item()

    # Calculate audio quality metrics
    # NOTE(review): std(mfcc) and mean(spectral_contrast) are not bounded to
    # [0, 1], so this value is not on the same scale as the other agents'
    # confidences - confirm the intended normalisation.
    audio_quality = np.mean([
        np.std(mfcc),
        np.mean(spectral_contrast),
        whisper_score
    ])

    return {
        "audio_confidence": float(audio_quality)
    }

In [16]:
def setup_semantic_agent():
    """Build the chat model and prompt used by the semantic analysis agent.

    The caller pipes the model output through ``JsonOutputParser`` and reads a
    ``"confidence"`` field, so the system prompt explicitly asks for a JSON
    object carrying that field. The instruction is written without literal
    curly braces because the prompt template treats braces as variables.

    Returns:
        tuple: ``(llm, prompt)`` - the ``ChatOpenAI`` instance and the
        ``ChatPromptTemplate`` with a single ``{input}`` variable.
    """
    llm = ChatOpenAI(model="gpt-4", temperature=0.2)

    prompt = ChatPromptTemplate.from_messages([
        ("system", """Analyze the following media content for potential manipulation:
        1. Check for visual artifacts and inconsistencies
        2. Evaluate temporal coherence
        3. Assess facial feature naturality
        4. Analyze audio-visual synchronization

        Provide a detailed analysis with confidence scores.

        Respond with a single JSON object and nothing else. It must contain a
        numeric "confidence" field between 0 and 1 (1 means the media looks
        authentic) and an "analysis" field holding your explanation."""),
        ("human", "{input}")
    ])

    return llm, prompt

def semantic_analysis_agent(video_data: Dict[str, Any]) -> Dict[str, float]:
    """Ask the LLM for a confidence score based on a summary of the video.

    Only the frame count, an audio flag and the metadata dict are sent to the
    model; no pixel or audio content is included.

    Args:
        video_data: Mapping with ``"frames"``, ``"audio"`` and ``"metadata"``.

    Returns:
        ``{"semantic_confidence": float}`` clamped to ``[0, 1]``. ``0.5`` is
        used when the reply has no ``"confidence"`` field and ``0.0`` when the
        call or the parsing fails.
    """
    llm, prompt = setup_semantic_agent()

    analysis_input = {
        "frame_count": len(video_data["frames"]),
        "has_audio": video_data["audio"] is not None,
        "metadata": video_data["metadata"]
    }

    chain = prompt | llm | JsonOutputParser()

    try:
        result = chain.invoke({"input": str(analysis_input)})
        if not isinstance(result, dict):
            # Valid JSON that is not an object (list, number, ...) has no fields.
            raise ValueError(f"expected a JSON object, got {type(result).__name__}")

        confidence = float(result.get("confidence", 0.5))
        if not np.isfinite(confidence):
            raise ValueError(f"non-finite confidence: {confidence}")

        # The value comes from free-form model output; keep it on the same
        # [0, 1] scale as the other agents before it is fused.
        return {
            "semantic_confidence": min(max(confidence, 0.0), 1.0)
        }
    except Exception as e:
        print(f"Semantic analysis error: {e}")
        return {"semantic_confidence": 0.0}

def feature_fusion(results: List[Dict[str, float]]) -> Dict[str, Any]:
    """Fuse the agents' scores into one weighted average.

    Only keys listed in the weight table contribute. The sum is divided by the
    total weight of the scores actually used, so missing agents do not drag
    the average down. Scores that are not finite numbers (for example NaN from
    an agent that averaged an empty list) are skipped instead of turning the
    fused result into NaN.

    Args:
        results: One score dict per agent.

    Returns:
        ``{"final_confidence": float, "individual_scores": results}``;
        ``final_confidence`` is ``0.0`` when no usable score is present.
    """
    weights = {
        "spatial_confidence": 0.25,
        "temporal_confidence": 0.25,
        "facial_confidence": 0.2,
        "audio_confidence": 0.15,
        "semantic_confidence": 0.15
    }

    final_score = 0.0
    total_weight = 0.0

    for result in results:
        for key, value in result.items():
            weight = weights.get(key)
            if weight is None:
                continue
            try:
                score = float(value)
            except (TypeError, ValueError):
                continue
            if not np.isfinite(score):
                continue
            final_score += score * weight
            total_weight += weight

    if total_weight > 0:
        final_score /= total_weight

    return {
        "final_confidence": float(final_score),
        "individual_scores": results
    }

def make_decision(fusion_result: Dict[str, Any], threshold: float = 0.7) -> Dict[str, Any]:
    """Turn the fused confidence into a fake/authentic verdict.

    Args:
        fusion_result: Mapping with ``"final_confidence"`` and
            ``"individual_scores"``.
        threshold: Media whose fused confidence is strictly below this value
            is flagged as fake. Defaults to ``0.7``.

    Returns:
        dict with ``"is_fake"``, ``"confidence"``, ``"threshold"``,
        ``"individual_scores"`` and a human-readable ``"explanation"``.
    """
    confidence = fusion_result["final_confidence"]
    individual_scores = fusion_result["individual_scores"]

    is_fake = confidence < threshold

    return {
        "is_fake": is_fake,
        "confidence": confidence,
        "threshold": threshold,
        "individual_scores": individual_scores,
        "explanation": generate_explanation(is_fake, confidence, individual_scores)
    }

def generate_explanation(
    is_fake: bool,
    confidence: float,
    individual_scores: List[Dict[str, float]]
) -> str:
    """Render the verdict and every per-metric score as plain text.

    The first line states the verdict and the overall confidence as a
    percentage; it is followed by an "Analysis breakdown" section with one
    bullet per metric, in the order the score dicts are given.
    """
    verdict = "authentic"
    if is_fake:
        verdict = "manipulated"

    # Collect the pieces and join once instead of growing a string in place.
    pieces = [
        f"The media is likely {verdict} with {confidence:.1%} confidence.\n\n",
        "Analysis breakdown:\n",
    ]
    pieces.extend(
        f"- {metric}: {score:.1%}\n"
        for score_dict in individual_scores
        for metric, score in score_dict.items()
    )

    return "".join(pieces)

class DetectionState(TypedDict, total=False):
    """Keys shared between the workflow nodes; each node fills in its own."""
    input_path: str
    env: Dict[str, Any]
    video_data: Dict[str, Any]
    visual_results: Dict[str, Any]
    audio_results: Dict[str, float]
    semantic_results: Dict[str, float]
    final_decision: Dict[str, Any]


def create_detection_workflow():
    """Create the detection workflow graph.

    The graph is a ``StateGraph`` so that the partial dict each node returns
    is merged into a shared state. The analysis nodes need ``env`` and
    ``video_data`` from earlier steps, and the decision node needs the output
    of all three analysis nodes.

    Layout: ``preprocess`` fans out to ``visual_analysis``,
    ``audio_analysis`` and ``semantic_analysis``; ``decision`` runs once all
    three have finished and leads to ``END``.

    Returns:
        The uncompiled graph. Call ``.compile()`` on it and then ``.invoke()``
        with ``{"input_path": ..., "env": ...}``.
    """
    def preprocess(state):
        """Decode the input video."""
        return {"video_data": preprocess_video(state["input_path"])}

    def visual_analysis(state):
        """Run the spatial, temporal and facial agents on the frames."""
        video_data, env = state["video_data"], state["env"]
        return {
            "visual_results": {
                "spatial": spatial_inconsistency_agent(video_data["frames"], env["device"]),
                "temporal": temporal_coherence_agent(video_data["frames"], video_data["optical_flow"]),
                "facial": facial_analysis_agent(video_data["frames"], env["models"]["face_detector"])
            }
        }

    def audio_analysis(state):
        """Run the audio agent."""
        return {"audio_results": audio_analysis_agent(state["video_data"]["audio"])}

    def semantic_analysis(state):
        """Run the LLM-based semantic agent."""
        return {"semantic_results": semantic_analysis_agent(state["video_data"])}

    def decision(state):
        """Fuse every agent's score and produce the verdict."""
        visual = state["visual_results"]
        return {
            "final_decision": make_decision(feature_fusion([
                visual["spatial"],
                visual["temporal"],
                visual["facial"],
                state["audio_results"],
                state["semantic_results"]
            ]))
        }

    workflow = StateGraph(DetectionState)

    # Define nodes
    workflow.add_node("preprocess", preprocess)
    workflow.add_node("visual_analysis", visual_analysis)
    workflow.add_node("audio_analysis", audio_analysis)
    workflow.add_node("semantic_analysis", semantic_analysis)
    workflow.add_node("decision", decision)

    # Define edges
    workflow.set_entry_point("preprocess")
    workflow.add_edge("preprocess", "visual_analysis")
    workflow.add_edge("preprocess", "audio_analysis")
    workflow.add_edge("preprocess", "semantic_analysis")
    # A list of start nodes makes "decision" wait for all three branches.
    workflow.add_edge(
        ["visual_analysis", "audio_analysis", "semantic_analysis"], "decision"
    )
    workflow.add_edge("decision", END)

    return workflow

def run_deepfake_detection(video_path: str, verbose: bool = False) -> Dict[str, Any]:
    """Main function to run the deepfake detection pipeline.

    Loads the models, builds the workflow graph, compiles it and invokes it
    on ``video_path``.

    Args:
        video_path: Path of the video to analyse.
        verbose: When true, print progress messages.

    Returns:
        The ``"final_decision"`` dict produced by the workflow. If anything
        fails, a dict with ``"error"``, ``"is_fake": None``,
        ``"confidence": 0.0`` and a generic ``"explanation"`` is returned
        instead of raising.
    """
    try:
        # Setup environment
        env = setup_environment()

        # Create workflow. A graph definition has to be compiled before it can
        # be executed; the hasattr guard also accepts an already-compiled
        # graph.
        workflow = create_detection_workflow()
        app = workflow.compile() if hasattr(workflow, "compile") else workflow

        if verbose:
            print("Starting deepfake detection pipeline...")

        # Run detection
        result = app.invoke({
            "input_path": video_path,
            "env": env
        })

        if verbose:
            print("Detection completed successfully.")

        return result["final_decision"]

    except Exception as e:
        error_msg = f"Error in deepfake detection: {str(e)}"
        print(error_msg)
        return {
            "error": error_msg,
            "is_fake": None,
            "confidence": 0.0,
            "explanation": "An error occurred during detection."
        }

In [None]:
# Example usage
if __name__ == "__main__":
    # Placeholder path: point this at a real video before running the cell.
    video_path = "path/to/your/video.mp4"
    result = run_deepfake_detection(video_path, verbose=True)
    print("\nDeepfake Detection Results:")
    # default=str keeps the dump from failing on values that json cannot
    # encode natively.
    print(json.dumps(result, indent=2, default=str))

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]