<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_and_Manipulated_Media_Analysis_R%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Deepfake Detection and Manipulated Media Analysis using Multiagent System and Compound AI Approach**

In [1]:
# Installation
%pip install -q langchain langchain-community langgraph torch transformers opencv-python librosa numpy face-recognition dlib mediapipe scipy pillow tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.2/138.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone


In [2]:
# Required imports
import os
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain.tools import tool
from langgraph.graph import Graph, END
import numpy as np
import torch
import torch.nn.functional as F
from transformers import (
    AutoProcessor,
    AutoModelForVideoClassification,
    AutoModelForAudioClassification,
    CLIPProcessor,
    CLIPModel,
    AutoModelForImageClassification,
    ViTForImageClassification,
    Wav2Vec2ForSequenceClassification,
    AutoFeatureExtractor
)
import cv2
import librosa
import json
import face_recognition
import dlib
import scipy
from scipy.signal import welch
from PIL import Image
import mediapipe as mp
import logging
import warnings
from tqdm import tqdm

In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
def setup_environment():
    """Setup necessary environment variables and configurations"""
    # Configure device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    # Load face detection model
    face_detector = dlib.get_frontal_face_detector()

    # Initialize MediaPipe
    mp_face_mesh = mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=1,
        min_detection_confidence=0.5
    )

    return {
        "device": device,
        "face_detector": face_detector,
        "face_mesh": mp_face_mesh
    }

In [6]:
def preprocess_video(video_path: str) -> Dict[str, Any]:
    """Enhanced video preprocessing with advanced feature extraction"""
    cap = cv2.VideoCapture(video_path)
    frames = []
    face_landmarks_seq = []
    optical_flow_seq = []

    # Initialize optical flow
    ret, prev_frame = cap.read()
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Store original frame
        frames.append(frame)

        # Calculate optical flow
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(
            prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0
        )
        optical_flow_seq.append(flow)
        prev_gray = gray

    cap.release()

    # Extract audio with advanced features
    y, sr = librosa.load(video_path)

    # Extract audio features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    return {
        "frames": frames,
        "audio": {
            "raw": (y, sr),
            "mfcc": mfcc,
            "chroma": chroma,
            "spectral_contrast": spectral_contrast
        },
        "optical_flow": optical_flow_seq,
        "metadata": extract_metadata(video_path)
    }

def extract_metadata(file_path: str) -> Dict[str, Any]:
    """Enhanced metadata extraction"""
    metadata = {}
    try:
        # Basic file info
        metadata["file_info"] = {
            "size": os.path.getsize(file_path),
            "created": os.path.getctime(file_path),
            "modified": os.path.getmtime(file_path)
        }

        # Video-specific metadata
        cap = cv2.VideoCapture(file_path)
        metadata["video_info"] = {
            "fps": cap.get(cv2.CAP_PROP_FPS),
            "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
            "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        }
        cap.release()

    except Exception as e:
        metadata["error"] = str(e)
        logger.error(f"Error extracting metadata: {e}")

    return metadata

In [7]:
# Visual Analysis Agents

def spatial_inconsistency_agent(frames: List[np.ndarray], device: torch.device) -> Dict[str, float]:
    """Enhanced spatial inconsistency detection"""
    # Load multiple models for ensemble detection
    models = {
        "xclip": AutoModelForVideoClassification.from_pretrained("microsoft/xclip-base-patch32"),
        "vit": ViTForImageClassification.from_pretrained("google/vit-base-patch16-224"),
        "clip": CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    }

    processors = {
        "xclip": AutoProcessor.from_pretrained("microsoft/xclip-base-patch32"),
        "vit": AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224"),
        "clip": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    }

    results = []
    for frame in tqdm(frames, desc="Analyzing spatial inconsistencies"):
        frame_results = []

        # XClip analysis
        xclip_inputs = processors["xclip"](images=frame, return_tensors="pt").to(device)
        xclip_outputs = models["xclip"](**xclip_inputs)
        frame_results.append(xclip_outputs.logits.softmax(dim=1))

        # ViT analysis
        frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        vit_inputs = processors["vit"](images=frame_pil, return_tensors="pt").to(device)
        vit_outputs = models["vit"](**vit_inputs)
        frame_results.append(vit_outputs.logits.softmax(dim=1))

        # Ensemble results
        frame_confidence = torch.mean(torch.stack([r.mean() for r in frame_results]))
        results.append(frame_confidence.item())

    return {
        "spatial_confidence": float(np.mean(results)),
        "frame_confidences": results
    }

In [8]:
def facial_analysis_agent(frames: List[np.ndarray], face_detector: dlib.fhog_object_detector) -> Dict[str, float]:
    """Analyze facial features for inconsistencies"""
    face_confidences = []
    landmark_movements = []

    for i in range(len(frames) - 1):
        # Detect faces
        curr_faces = face_recognition.face_locations(frames[i])
        next_faces = face_recognition.face_locations(frames[i + 1])

        if curr_faces and next_faces:
            # Get facial landmarks
            curr_landmarks = face_recognition.face_landmarks(frames[i], curr_faces)
            next_landmarks = face_recognition.face_landmarks(frames[i + 1], next_faces)

            # Calculate landmark movement consistency
            if curr_landmarks and next_landmarks:
                movement = calculate_landmark_movement(curr_landmarks[0], next_landmarks[0])
                landmark_movements.append(movement)

                # Calculate face similarity
                curr_encoding = face_recognition.face_encodings(frames[i], [curr_faces[0]])[0]
                next_encoding = face_recognition.face_encodings(frames[i + 1], [next_faces[0]])[0]
                similarity = face_recognition.face_distance([curr_encoding], next_encoding)[0]
                face_confidences.append(1 - similarity)

    return {
        "facial_confidence": float(np.mean(face_confidences)) if face_confidences else 0.0,
        "landmark_consistency": float(np.mean(landmark_movements)) if landmark_movements else 0.0
    }

In [9]:
def calculate_landmark_movement(curr_landmarks: Dict, next_landmarks: Dict) -> float:
    """Calculate consistency of facial landmark movements"""
    total_movement = 0
    num_points = 0

    for feature in curr_landmarks.keys():
        curr_points = np.array(curr_landmarks[feature])
        next_points = np.array(next_landmarks[feature])

        # Calculate movement vectors
        movements = next_points - curr_points

        # Check for consistent movement
        movement_directions = np.sign(movements)
        consistency = np.mean(np.abs(np.diff(movement_directions, axis=0)))

        total_movement += consistency
        num_points += 1

    return total_movement / num_points if num_points > 0 else 0.0

In [10]:
def temporal_coherence_agent(frames: List[np.ndarray], optical_flow: List[np.ndarray]) -> Dict[str, float]:
    """Enhanced temporal coherence analysis"""
    flow_consistency = []
    frame_consistency = []

    for i in range(len(frames) - 1):
        # Analyze optical flow consistency
        flow = optical_flow[i]
        flow_magnitude = np.sqrt(flow[..., 0]**2 + flow[..., 1]**2)
        flow_consistency.append(np.mean(flow_magnitude))

        # Analyze frame-to-frame consistency
        curr_frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
        next_frame = cv2.cvtColor(frames[i + 1], cv2.COLOR_BGR2GRAY)

        # Calculate structural similarity
        ssim = calculate_ssim(curr_frame, next_frame)
        frame_consistency.append(ssim)

    return {
        "temporal_confidence": float(np.mean(frame_consistency)),
        "flow_consistency": float(np.mean(flow_consistency))
    }

def calculate_ssim(img1: np.ndarray, img2: np.ndarray, window_size: int = 7) -> float:
    """Calculate Structural Similarity Index (SSIM)"""
    C1 = (0.01 * 255)**2
    C2 = (0.03 * 255)**2

    window = cv2.getGaussianKernel(window_size, 1.5)
    window = np.outer(window, window)

    mu1 = cv2.filter2D(img1, -1, window)[::2, ::2]
    mu2 = cv2.filter2D(img2, -1, window)[::2, ::2]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2

    sigma1_sq = cv2.filter2D(img1**2, -1, window)[::2, ::2] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[::2, ::2] - mu2_sq
    sigma12 = cv2.filter2D(img1 * img2, -1, window)[::2, ::2] - mu1_mu2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \
               ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    return float(np.mean(ssim_map))

In [11]:
# Audio Analysis Agents

def audio_analysis_agent(audio_data: Dict[str, Any]) -> Dict[str, float]:
    """Enhanced audio analysis"""
    # Unpack audio data
    y, sr = audio_data["raw"]
    mfcc = audio_data["mfcc"]
    chroma = audio_data["chroma"]
    spectral_contrast = audio_data["spectral_contrast"]

    # Analyze MFCC consistency
    mfcc_consistency = np.mean(np.std(mfcc, axis=1))

    # Analyze spectral properties
    frequencies, power = welch(y, sr, nperseg=2048)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)

    # Detect audio artifacts
    zero_crossings = librosa.zero_crossings(y)
    zero_crossing_rate = np.mean(zero_crossings)

    # Calculate overall confidence
    features = [
        mfcc_consistency,
        np.mean(spectral_contrast),
        np.mean(spectral_rolloff),
        np.mean(spectral_bandwidth),
        zero_crossing_rate
    ]

    confidence = calculate_audio_confidence(features)

    return {
        "audio_confidence": float(confidence),
        "mfcc_consistency": float(mfcc_consistency),
        "spectral_contrast": float(np.mean(spectral_contrast)),
        "zero_crossing_rate": float(zero_crossing_rate)
    }

def calculate_audio_confidence(features: List[float]) -> float:
    """Calculate overall audio confidence score"""
    # Normalize features
    normalized_features = []
    for feature in features:
        if np.isfinite(feature):
            normalized = (feature - np.min(features)) / (np.max(features) - np.min(features))
            normalized_features.append(normalized)

    # Weight and combine features
    weights = [0.3, 0.2, 0.2, 0.15, 0.15]  # Adjustable weights
    confidence = np.average(normalized_features, weights=weights[:len(normalized_features)])

    return float(confidence)

In [12]:
# Semantic Analysis Agents

def setup_llm_agent():
    """Setup enhanced LLM agent"""
    llm = ChatOpenAI(model="gpt-4-vision-preview", temperature=0.2)

    prompt = ChatPromptTemplate.from_messages([
        ("system", """Analyze the following media content for signs of manipulation.
        Consider:
        1. Visual consistency and artifacts
        2. Audio-visual synchronization
        3. Natural movement and expressions
        4. Lighting and shadow consistency
        5. Edge artifacts and blending issues

        Provide a detailed analysis with confidence scores."""),
        ("human", "{input}")
    ])

    return llm, prompt

@tool
def semantic_analysis_agent(input_data: Dict[str, Any]) -> Dict[str, float]:
    """Enhanced semantic analysis"""
    llm, prompt = setup_llm_agent()

    # Prepare input data
    analysis_text = {
        "video_metadata": input_data.get("metadata", {}),
        "detected_faces": len(face_recognition.face_locations(input_data["frames"][0])) if "frames" in input_data else 0,
        "video_length": len(input_data["frames"]) if "frames" in input_data else 0,
        "audio_features": bool(input_data.get("audio", {}))
    }

    chain = prompt | llm | JsonOutputParser()

    try:
        result = chain.invoke({"input": json.dumps(analysis_text)})
        confidence = result.get("confidence", 0.0)

        # Additional analysis of response content
        response_detail = result.get("analysis", "")
        detail_score = analyze_llm_response_detail(response_detail)

        return {
            "semantic_confidence": float(confidence),
            "detail_score": float(detail_score),
            "analysis": response_detail
        }
    except Exception as e:
        logger.error(f"Error in semantic analysis: {e}")
        return {"semantic_confidence": 0.0, "error": str(e)}

def analyze_llm_response_detail(response: str) -> float:
    """Analyze the detail level of LLM response"""
    # Consider factors like response length, specific terminology use, etc.
    detail_metrics = {
        "length": len(response) / 1000,  # Normalize by 1000 chars
        "technical_terms": len([word for word in response.lower().split()
                              if word in ["artifact", "consistency", "synchronization",
                                        "manipulation", "synthetic", "generated"]]) / 10
    }
    return np.mean(list(detail_metrics.values()))

In [13]:
# Integration and Decision Making

def feature_fusion(results: List[Dict[str, float]], metadata: Dict[str, Any]) -> Dict[str, float]:
    """Enhanced feature fusion with adaptive weighting"""
    # Initialize weights based on metadata
    base_weights = {
        "spatial_confidence": 0.25,
        "temporal_confidence": 0.20,
        "facial_confidence": 0.20,
        "audio_confidence": 0.15,
        "semantic_confidence": 0.20
    }

    # Adjust weights based on video properties
    if metadata.get("video_info", {}).get("fps", 0) < 20:
        base_weights["temporal_confidence"] *= 0.8
        base_weights["spatial_confidence"] *= 1.2

    # Collect all confidence scores
    confidence_scores = {}
    for result in results:
        for key, value in result.items():
            if "confidence" in key and isinstance(value, (int, float)):
                confidence_scores[key] = value

    # Calculate weighted average
    total_weight = 0
    weighted_sum = 0

    for key, value in confidence_scores.items():
        weight = base_weights.get(key, 0.1)  # Default weight for unknown metrics
        weighted_sum += value * weight
        total_weight += weight

    weighted_avg = weighted_sum / total_weight if total_weight > 0 else 0.0

    # Calculate uncertainty
    variances = [(score - weighted_avg) ** 2 for score in confidence_scores.values()]
    uncertainty = np.sqrt(np.mean(variances)) if variances else 0.0

    return {
        "final_confidence": float(weighted_avg),
        "uncertainty": float(uncertainty),
        "individual_scores": confidence_scores
    }

In [14]:
def make_decision(fusion_result: Dict[str, float]) -> Dict[str, Any]:
    """Enhanced decision making with detailed analysis"""
    confidence = fusion_result["final_confidence"]
    uncertainty = fusion_result["uncertainty"]
    individual_scores = fusion_result["individual_scores"]

    # Dynamic thresholding based on uncertainty
    base_threshold = 0.7
    adjusted_threshold = base_threshold + (uncertainty * 0.1)

    # Analyze score distribution
    score_distribution = analyze_score_distribution(individual_scores)

    # Decision making
    is_fake = confidence < adjusted_threshold
    certainty_level = calculate_certainty_level(confidence, uncertainty)

    # Generate detailed explanation
    explanation = generate_detailed_explanation(
        is_fake, confidence, uncertainty,
        individual_scores, score_distribution
    )

    return {
        "is_fake": is_fake,
        "confidence": confidence,
        "uncertainty": uncertainty,
        "certainty_level": certainty_level,
        "threshold_used": adjusted_threshold,
        "score_distribution": score_distribution,
        "explanation": explanation
    }

def analyze_score_distribution(scores: Dict[str, float]) -> Dict[str, Any]:
    """Analyze the distribution of individual scores"""
    values = np.array(list(scores.values()))
    return {
        "mean": float(np.mean(values)),
        "std": float(np.std(values)),
        "min": float(np.min(values)),
        "max": float(np.max(values)),
        "range": float(np.ptp(values)),
        "consistency": float(1 - (np.std(values) / np.mean(values))) if np.mean(values) != 0 else 0.0
    }

def calculate_certainty_level(confidence: float, uncertainty: float) -> str:
    """Calculate the certainty level of the decision"""
    if uncertainty > 0.3:
        return "Low"
    elif uncertainty > 0.15:
        return "Medium"
    else:
        return "High"

def generate_detailed_explanation(
    is_fake: bool,
    confidence: float,
    uncertainty: float,
    individual_scores: Dict[str, float],
    score_distribution: Dict[str, Any]
) -> str:
    """Generate a detailed explanation of the decision"""
    explanation_parts = []

    # Overall decision
    decision_text = "likely manipulated" if is_fake else "likely authentic"
    explanation_parts.append(f"The media is {decision_text} with {confidence:.1%} confidence.")

    # Uncertainty analysis
    explanation_parts.append(
        f"The uncertainty level is {uncertainty:.1%}, indicating a "
        f"{'high' if uncertainty > 0.3 else 'moderate' if uncertainty > 0.15 else 'low'} "
        "level of prediction variability."
    )

    # Individual score analysis
    strongest_evidence = max(individual_scores.items(), key=lambda x: x[1])
    weakest_evidence = min(individual_scores.items(), key=lambda x: x[1])

    explanation_parts.append(
        f"The strongest evidence comes from {strongest_evidence[0]} "
        f"({strongest_evidence[1]:.1%}), while the weakest indicator is "
        f"{weakest_evidence[0]} ({weakest_evidence[1]:.1%})."
    )

    # Score consistency
    explanation_parts.append(
        f"The consistency between different detection methods is "
        f"{score_distribution['consistency']:.1%}."
    )

    return " ".join(explanation_parts)

In [15]:
# Main Pipeline

def create_detection_graph():
    """Create enhanced LangGraph workflow"""
    workflow = Graph()

    @workflow.node("preprocess")
    def preprocess(state):
        video_data = preprocess_video(state["input_path"])
        return {"video_data": video_data}

    @workflow.node("visual_analysis")
    def visual_analysis(state):
        frames = state["video_data"]["frames"]
        optical_flow = state["video_data"]["optical_flow"]

        # Run visual analysis agents in parallel
        spatial_results = spatial_inconsistency_agent(frames, state["env"]["device"])
        temporal_results = temporal_coherence_agent(frames, optical_flow)
        facial_results = facial_analysis_agent(frames, state["env"]["face_detector"])

        return {
            "visual_results": {
                "spatial": spatial_results,
                "temporal": temporal_results,
                "facial": facial_results
            }
        }

    @workflow.node("audio_analysis")
    def audio_analysis(state):
        audio_data = state["video_data"]["audio"]
        results = audio_analysis_agent(audio_data)
        return {"audio_results": results}

    @workflow.node("semantic_analysis")
    def semantic_analysis(state):
        results = semantic_analysis_agent(state["video_data"])
        return {"semantic_results": results}

    @workflow.node("decision")
    def decision(state):
        all_results = [
            state["visual_results"]["spatial"],
            state["visual_results"]["temporal"],
            state["visual_results"]["facial"],
            state["audio_results"],
            state["semantic_results"]
        ]

        fusion_result = feature_fusion(all_results, state["video_data"]["metadata"])
        final_decision = make_decision(fusion_result)

        return {"decision": final_decision, "end": True}

    # Define workflow
    workflow.set_entry_point("preprocess")
    workflow.add_edge("preprocess", "visual_analysis")
    workflow.add_edge("preprocess", "audio_analysis")
    workflow.add_edge("preprocess", "semantic_analysis")
    workflow.add_edge("visual_analysis", "decision")
    workflow.add_edge("audio_analysis", "decision")
    workflow.add_edge("semantic_analysis", "decision")

    return workflow

def detect_deepfake(video_path: str, verbose: bool = False) -> Dict[str, Any]:
    """Enhanced main function to run deepfake detection"""
    try:
        # Setup environment
        env = setup_environment()

        # Create and compile workflow
        workflow = create_detection_graph()

        # Configure logging
        if verbose:
            logging.getLogger().setLevel(logging.INFO)
        else:
            logging.getLogger().setLevel(logging.WARNING)

        # Run detection
        config = {
            "input_path": video_path,
            "env": env
        }

        logger.info("Starting deepfake detection pipeline...")
        result = workflow.run(config)
        logger.info("Detection pipeline completed successfully.")

        return result["decision"]

    except Exception as e:
        logger.error(f"Error in deepfake detection: {e}")
        return {
            "error": str(e),
            "is_fake": None,
            "confidence": 0.0,
            "explanation": "An error occurred during detection."
        }

In [16]:
# Example usage
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Deepfake Detection System")
    parser.add_argument("video_path", help="Path to the video file to analyze")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    result = detect_deepfake(args.video_path, verbose=args.verbose)
    print("\nDeepfake Detection Results:")
    print(json.dumps(result, indent=2))

usage: colab_kernel_launcher.py [-h] [--verbose] video_path
colab_kernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
