<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_and_Manipulated_Media_Analysis_R%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Deepfake Detection and Manipulated Media Analysis using Multiagent System and Compound AI Approach**

**Install Required Packages**

In [None]:
!pip install -q groq langchain langchain-community langgraph torch transformers opencv-python librosa numpy face-recognition
!pip install -qU dlib mediapipe scipy pillow tqdm pydantic moviepy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.6/142.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing meta

In [None]:
!pip install -qU vllm

In [None]:
import asyncio
import websockets
import json
import torch
import os
import getpass
import gc
import cv2
import numpy as np
import librosa
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
from pydantic import BaseModel, Field
from transformers import (
    CLIPModel, Wav2Vec2ForSequenceClassification,
    CLIPProcessor, Wav2Vec2Processor,
    EfficientNetForImageClassification, LlavaForConditionalGeneration
)
# Commenting out the larger models
# from transformers import (
#     VideoMAEForVideoClassification, TimesformerForVideoClassification, LlavaForConditionalGeneration,
#     HubertForCTC, InstructBlipForConditionalGeneration
# )
# from transformers import (
#     AutoImageProcessor, ViTImageProcessor, LlavaProcessor, InstructBlipProcessor
# )
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.tools import BaseTool
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent
from langgraph.graph import Graph, StateGraph, END
from moviepy.editor import VideoFileClip

  if event.key is 'enter':



**Set Up the Groq Client**

In [None]:
# Configuration
class Config:
    """Static settings: the Groq credential and the Hugging Face checkpoint ids."""

    # Resolved once, when the class body runs: the GROQ_API_KEY environment
    # variable wins; otherwise the user is prompted (input is not echoed).
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass.getpass("Enter your Groq API key: ")

    # Short name -> checkpoint id, for models and their processors.
    # The hubert, vit, llava-processor and instructblip-processor entries are
    # currently disabled.
    MODEL_PATHS = dict(
        llava="lmms-lab/LLaVA-Video-7B-Qwen2",
        efficientnet="google/efficientnet-b7",
        clip="openai/clip-vit-large-patch14",
        wav2vec2="facebook/wav2vec2-large-960h",
        clip_processor="openai/clip-vit-large-patch14",
        wav2vec2_processor="facebook/wav2vec2-large-960h",
    )

Enter your Groq API key: ··········


**More Models for Text, Audio and Video Analysis**

In [None]:
def initialize_groq_client():
    """Create the Groq API client together with the per-modality model ids.

    Returns:
        A ``(client, models)`` tuple: ``client`` is a ``groq.Groq`` instance
        authenticated with ``Config.GROQ_API_KEY`` and ``models`` maps
        ``"text"``, ``"vision"`` and ``"audio"`` to Groq model identifiers.
    """
    # Imported here so the groq package is only needed once this is called.
    from groq import Groq

    groq = Groq(api_key=Config.GROQ_API_KEY)
    model_ids = dict(
        text="llama-3.3-70b-versatile",
        vision="llama-3.2-90b-vision-preview",
        audio="whisper-large-v3-turbo",
    )
    print("Groq models initialized successfully!")
    return groq, model_ids

# Build the shared Groq client and model-id table once, when this cell runs.
groq_client, groq_models = initialize_groq_client()

# Cache for loaded models
# Maps a model name to the object returned by load_model(); filled by
# get_model() and emptied by free_memory().
loaded_models = {}

def load_model(model_name):
    """Load one model or processor by its short name.

    Only the requested checkpoint is instantiated. The table below stores the
    loader *classes*; ``from_pretrained`` is called for the selected entry
    only, so asking for ``"clip"`` no longer downloads and loads every other
    checkpoint as well.

    Args:
        model_name: A key of ``Config.MODEL_PATHS`` (e.g. ``"llava"``,
            ``"clip"``, ``"clip_processor"``).

    Returns:
        The object returned by the matching ``from_pretrained`` call.

    Raises:
        KeyError: If ``model_name`` is not a known name.
    """
    loaders = {
        "llava": LlavaForConditionalGeneration,
        "efficientnet": EfficientNetForImageClassification,
        "clip": CLIPModel,
        "wav2vec2": Wav2Vec2ForSequenceClassification,
        "clip_processor": CLIPProcessor,
        "wav2vec2_processor": Wav2Vec2Processor,
    }
    loader = loaders[model_name]
    # Each short name is also its key in Config.MODEL_PATHS.
    return loader.from_pretrained(Config.MODEL_PATHS[model_name])

def get_model(model_name):
    """Return the cached model for ``model_name``, loading it on first use."""
    if model_name in loaded_models:
        return loaded_models[model_name]
    # Cache miss: load once and remember the instance for later calls.
    model = load_model(model_name)
    loaded_models[model_name] = model
    return model

def load_models_in_parallel(model_names):
    """Load several models concurrently into the ``loaded_models`` cache.

    Args:
        model_names: Iterable of names accepted by ``get_model``.

    Raises:
        Exception: Whatever ``get_model`` raised for the first failing name.
            Previously such failures were silently discarded.
    """
    with ThreadPoolExecutor() as executor:
        # Executor.map only re-raises a worker's exception when its result is
        # retrieved, so the iterator must be drained for failures to surface.
        for _ in executor.map(get_model, model_names):
            pass

def free_memory(model_names):
    """Drop the named models from the cache and run a garbage collection pass.

    Names that are not currently cached are ignored.
    """
    for name in model_names:
        loaded_models.pop(name, None)
    gc.collect()

# Example usage
# Smoke test: load the four models through the thread pool, then immediately
# remove them from the cache again.
# NOTE(review): this triggers the multi-GB LLaVA shard downloads seen in the
# cell output below.
model_names = ["llava", "efficientnet", "clip", "wav2vec2"]
load_models_in_parallel(model_names)
free_memory(model_names)
print("Models loaded and memory freed successfully!")

Groq models initialized successfully!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/80.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

**Define Data Models**
- Define the data models for storing analysis results and reports:


In [None]:
# Define Data Models
class DeepfakeAnalysisResult(BaseModel):
    """Outcome of a single-modality deepfake analysis."""

    # Required score; the analysis functions in this notebook label a result
    # "REAL" when it is above 0.7.
    score: float = Field(..., description="Confidence score (0-1)")
    # Required label, e.g. "REAL", "FAKE" or "NO_AUDIO" as produced in this notebook.
    label: str = Field(..., description="Classification label")
    # Human-readable findings; each instance gets its own empty list by default.
    anomalies: List[str] = Field(default_factory=list)
    artifacts: List[str] = Field(default_factory=list)
    confidence: float = Field(..., description="Model confidence")
    method: str = Field(..., description="Detection method used")
    # Creation time, taken from datetime.now() when the instance is built.
    timestamp: datetime = Field(default_factory=datetime.now)

class MultimodalAnalysisReport(BaseModel):
    """Final report combining the per-modality results into a single verdict."""

    case_id: str
    file_info: Dict[str, Any]
    # A modality that was not analysed is None. The explicit default keeps
    # these fields optional under pydantic v2 as well, where a bare
    # Optional[...] annotation is still a required field.
    video_analysis: Optional[DeepfakeAnalysisResult] = None
    audio_analysis: Optional[DeepfakeAnalysisResult] = None
    image_analysis: Optional[DeepfakeAnalysisResult] = None
    text_analysis: Optional[DeepfakeAnalysisResult] = None
    # Weighted combination of the modality scores and the cross-modal score.
    multimodal_score: float
    verdict: str
    # One entry per anomaly reported by any modality.
    evidence: List[Dict[str, Any]]
    metadata: Dict[str, Any]

**Frame Stabilization**


In [None]:
import cv2
import numpy as np
from typing import List
def stabilize_frames(frames: List[np.ndarray]) -> List[np.ndarray]:
    """Align every frame to the frame that precedes it.

    For each consecutive pair, corner features are tracked from the previous
    frame into the current one, and a partial affine transform (rotation,
    uniform scale, translation) mapping the *current* frame back onto the
    previous one is estimated and applied.

    ``cv2.estimateRigidTransform`` is not available in OpenCV 4, so the
    estimate is built from ``goodFeaturesToTrack`` + ``calcOpticalFlowPyrLK``
    + ``estimateAffinePartial2D``.

    Args:
        frames: BGR frames in temporal order.

    Returns:
        A new list of the same length. The first frame is passed through
        unchanged, as is any frame for which no transform could be estimated.
    """
    if len(frames) < 2:
        # Nothing to align against.
        return list(frames)

    stabilized_frames = [frames[0]]
    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)

    for frame in frames[1:]:
        curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        transform = None

        prev_pts = cv2.goodFeaturesToTrack(
            prev_gray, maxCorners=200, qualityLevel=0.01, minDistance=30
        )
        if prev_pts is not None:
            curr_pts, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, prev_pts, None)
            tracked = status.reshape(-1) == 1
            if np.count_nonzero(tracked) >= 3:
                # Estimate current -> previous so that warping the current
                # frame moves its content back onto the previous frame.
                transform, _ = cv2.estimateAffinePartial2D(curr_pts[tracked], prev_pts[tracked])

        if transform is None:
            # Featureless or untrackable frame: keep it as-is instead of crashing.
            stabilized_frames.append(frame)
        else:
            height, width = frame.shape[:2]
            stabilized_frames.append(cv2.warpAffine(frame, transform, (width, height)))
        prev_gray = curr_gray

    return stabilized_frames

**Adaptive Noise Reduction**

In [None]:
import librosa
import numpy as np

def adaptive_noise_reduction(audio_data: np.ndarray, noise_samples: int = 1000) -> np.ndarray:
    """Subtract the mean of the leading samples from the whole signal.

    The mean of the first ``noise_samples`` samples is treated as the noise
    level and removed from every sample; in effect this removes a constant
    offset.

    Args:
        audio_data: Audio samples.
        noise_samples: Number of leading samples used to estimate the noise
            level. Defaults to 1000.

    Returns:
        A new array of the same shape. An empty input, or a non-positive
        ``noise_samples``, returns an unmodified copy rather than NaNs.
    """
    if audio_data.size == 0 or noise_samples <= 0:
        # The mean of an empty slice is NaN and would poison every sample.
        return audio_data.copy()
    noise_profile = np.mean(audio_data[:noise_samples])
    return audio_data - noise_profile

**Define Functions for Preprocessing**
- Define helper functions for preprocessing audio, image, and video data:

In [None]:
# Define Functions for Preprocessing
def enhance_resolution(
    image: np.ndarray,
    model_path: str = "path/to/your/model.pb",
    model_name: str = "edsr",
    scale: int = 4,
) -> np.ndarray:
    """Upscale an image with an OpenCV DNN super-resolution model.

    Args:
        image: Image to upscale.
        model_path: Path to the pretrained ``.pb`` model file. The default is
            a placeholder and must be replaced with a real model file.
        model_name: Algorithm name passed to ``setModel``; must match the
            model file.
        scale: Upscaling factor passed to ``setModel``; must match the model
            file.

    Returns:
        The upsampled image.

    NOTE(review): ``cv2.dnn_superres`` presumably requires the OpenCV contrib
    build -- confirm it is available in the target environment.
    """
    sr = cv2.dnn_superres.DnnSuperResImpl_create()
    sr.readModel(model_path)
    sr.setModel(model_name, scale)
    return sr.upsample(image)

def deblur_image(image: np.ndarray) -> np.ndarray:
    """Convolve the image with a fixed 3x3 kernel (centre 5, 4-neighbours -1)."""
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    # ddepth=-1 keeps the output depth equal to the input depth.
    sharpened = cv2.filter2D(image, -1, kernel)
    return sharpened

def extract_audio_features(audio_data: np.ndarray, sample_rate: int) -> Dict[str, Any]:
    """Compute MFCC, chroma and mel-spectrogram features with librosa.

    Returns:
        A dict with the keys ``"mfcc"`` (13 coefficients), ``"chroma"`` and
        ``"mel"``.
    """
    features: Dict[str, Any] = {}
    features["mfcc"] = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)
    features["chroma"] = librosa.feature.chroma_stft(y=audio_data, sr=sample_rate)
    features["mel"] = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)
    return features

def extract_image_features(image: np.ndarray) -> Dict[str, Any]:
    """Detect SIFT keypoints and descriptors on the grayscale version of a BGR image.

    Returns:
        A dict with the keys ``"keypoints"`` and ``"descriptors"``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    detector = cv2.SIFT_create()
    # No mask: search the whole image.
    found_keypoints, found_descriptors = detector.detectAndCompute(grayscale, None)
    return dict(keypoints=found_keypoints, descriptors=found_descriptors)

def calculate_dense_optical_flow(prev_frame: np.ndarray, curr_frame: np.ndarray) -> np.ndarray:
    """Compute dense Farneback optical flow between two BGR frames.

    Returns:
        The flow array produced by ``cv2.calcOpticalFlowFarneback``.
    """
    before = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    after = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)
    return cv2.calcOpticalFlowFarneback(
        prev=before,
        next=after,
        flow=None,
        pyr_scale=0.5,
        levels=3,
        winsize=15,
        iterations=3,
        poly_n=5,
        poly_sigma=1.2,
        flags=0,
    )

def extract_temporal_features(optical_flow: np.ndarray) -> Dict[str, Any]:
    """Convert a flow field's two components into polar magnitude and angle.

    Returns:
        A dict with the keys ``"magnitude"`` and ``"angle"``.
    """
    flow_x = optical_flow[..., 0]
    flow_y = optical_flow[..., 1]
    magnitude, angle = cv2.cartToPolar(flow_x, flow_y)
    return dict(magnitude=magnitude, angle=angle)

def estimate_noise(frame: np.ndarray) -> float:
    """Return the variance of the frame's Laplacian response."""
    laplacian = cv2.Laplacian(frame, cv2.CV_64F)
    variance = laplacian.var()
    return np.mean(variance)

def calculate_contrast(frame: np.ndarray) -> float:
    """Return the standard deviation of the grayscale intensities of a BGR frame."""
    intensities = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    contrast = intensities.std()
    return contrast

def detect_compression_artifacts(frame: np.ndarray) -> float:
    """Return the mean absolute DCT coefficient of the frame's luminance.

    ``cv2.dct`` only accepts single-channel floating-point arrays, so colour
    frames are converted to grayscale first. The frame is also cropped to even
    dimensions, because odd-sized transforms are not guaranteed to be
    supported.

    Args:
        frame: A BGR (or already single-channel) frame with values in 0-255.

    Returns:
        The mean absolute DCT coefficient, or ``0.0`` when the frame is too
        small to transform.
    """
    if frame.ndim == 3:
        if frame.shape[2] == 1:
            frame = frame[..., 0]
        else:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    height, width = frame.shape[:2]
    even = frame[: height - height % 2, : width - width % 2]
    if even.size == 0:
        return 0.0

    coefficients = cv2.dct(even.astype(np.float32) / 255.0)
    return float(np.mean(np.abs(coefficients)))

**Define Preprocessing Functions**
- Define functions for preprocessing audio, image, and video data

In [None]:
# Define Preprocessing Functions
async def enhanced_preprocessing(file_path: str) -> Dict[str, Any]:
    """Route a media file to the audio, image or video preprocessor.

    The decision is made on the lower-cased file extension only.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    _, extension = os.path.splitext(file_path)
    extension = extension.lower()

    if extension in (".mp3", ".wav", ".flac"):
        return await enhanced_audio_preprocessing(file_path)
    if extension in (".jpg", ".jpeg", ".png", ".bmp"):
        return await enhanced_image_preprocessing(file_path)
    if extension in (".mp4", ".avi", ".mov", ".mkv"):
        return await enhanced_video_preprocessing(file_path)
    raise ValueError("Unsupported file format")

async def enhanced_audio_preprocessing(audio_path: str) -> Dict[str, Any]:
    """Load an audio file at its native sample rate, denoise it and extract features.

    Returns:
        A dict with the keys ``"audio"`` (denoised samples), ``"sample_rate"``
        and ``"audio_features"``.
    """
    # sr=None tells librosa to keep the file's own sample rate.
    waveform, native_rate = librosa.load(audio_path, sr=None)
    denoised = adaptive_noise_reduction(waveform)
    return {
        "audio": denoised,
        "sample_rate": native_rate,
        "audio_features": extract_audio_features(denoised, native_rate),
    }

async def enhanced_image_preprocessing(image_path: str) -> Dict[str, Any]:
    """Load an image, upscale and sharpen it, and extract SIFT features.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with the keys ``"image"`` (the processed image in RGB channel
        order) and ``"image_features"``.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise ValueError(f"Could not read image file: {image_path}")

    image = enhance_resolution(image)
    image = deblur_image(image)
    # extract_image_features converts with COLOR_BGR2GRAY, so it must be given
    # the BGR image, not the RGB copy.
    image_features = extract_image_features(image)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return {"image": image_rgb, "image_features": image_features}

async def enhanced_video_preprocessing(video_path: str) -> Dict[str, Any]:
    """Decode a video into enhanced frames, per-frame metrics, flow and audio features.

    Each frame is upscaled and scored for quality in a thread pool. For every
    consecutive pair of enhanced frames the dense optical flow is computed
    first and its magnitude/angle features second, since the latter needs the
    former. The audio track, if any, is mixed down to mono before feature
    extraction.

    Args:
        video_path: Path of the video file.

    Returns:
        A dict with the keys ``"frames"`` (RGB), ``"audio"`` (feature dict
        that also carries ``"waveform"`` and ``"sample_rate"``, or ``None``),
        ``"metadata"``, ``"quality_metrics"``, ``"optical_flow"`` and
        ``"temporal_features"``.
    """
    frames = []
    audio_data = None
    frame_quality_metrics = []
    optical_flow_data = []
    temporal_features = []

    def compute_quality_metrics(frame):
        # Runs in a worker thread; only reads the frame.
        return {
            "blur": cv2.Laplacian(frame, cv2.CV_64F).var(),
            "noise": estimate_noise(frame),
            "brightness": np.mean(frame),
            "contrast": calculate_contrast(frame),
            "compression_artifacts": detect_compression_artifacts(frame)
        }

    loop = asyncio.get_running_loop()
    executor = ThreadPoolExecutor(max_workers=4)
    cap = cv2.VideoCapture(video_path)
    try:
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        frame_count = float(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        metadata = {
            "fps": fps,
            "frame_count": int(frame_count),
            "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
            "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            # The reported fps can be 0 for an unreadable stream.
            "duration": frame_count / fps if fps > 0 else 0.0,
            # The FOURCC bytes are not guaranteed to be valid UTF-8.
            "codec": int(cap.get(cv2.CAP_PROP_FOURCC)).to_bytes(4, byteorder='little').decode(errors="replace"),
            "file_size": os.path.getsize(video_path)
        }

        prev_frame = None
        while cap.isOpened():
            ret, raw_frame = cap.read()
            if not ret:
                break
            # Upscaling and quality scoring are independent; run them together.
            frame, quality_metrics = await asyncio.gather(
                loop.run_in_executor(executor, enhance_resolution, raw_frame),
                loop.run_in_executor(executor, compute_quality_metrics, raw_frame),
            )
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_quality_metrics.append(quality_metrics)
            if prev_frame is not None:
                # Sequential on purpose: the temporal features are derived
                # from the flow, so the two cannot be gathered together.
                flow = await loop.run_in_executor(
                    executor, calculate_dense_optical_flow, prev_frame, frame
                )
                temp_features = await loop.run_in_executor(
                    executor, extract_temporal_features, flow
                )
                optical_flow_data.append(flow)
                temporal_features.append(temp_features)
            prev_frame = frame.copy()
    finally:
        # Release the capture and the worker threads even when decoding fails.
        cap.release()
        executor.shutdown(wait=False)

    # Audio is best-effort: a video without a usable track still yields frames.
    try:
        video = VideoFileClip(video_path)
        try:
            audio = video.audio
            if audio is not None:
                audio_array = audio.to_soundarray()
                # moviepy returns (n_samples, n_channels); librosa expects a
                # mono 1-D signal, so average the channels.
                if audio_array.ndim > 1:
                    waveform = audio_array.mean(axis=1)
                else:
                    waveform = audio_array
                audio_data = extract_audio_features(waveform, audio.fps)
                # Keep the raw signal next to the features for downstream
                # consumers that read "waveform".
                audio_data["waveform"] = waveform
                audio_data["sample_rate"] = audio.fps
        finally:
            video.close()
    except Exception as e:
        print(f"Audio extraction error: {e}")
        audio_data = None

    return {
        "frames": frames,
        "audio": audio_data,
        "metadata": metadata,
        "quality_metrics": frame_quality_metrics,
        "optical_flow": optical_flow_data,
        "temporal_features": temporal_features
    }

**Define Analysis Functions**
- functions for analyzing transcription, vision response, and other features

In [None]:
# Define Analysis Functions
def analyze_transcription(transcription: str) -> List[str]:
    """Flag a transcription that contains known marker phrases (case-insensitive).

    Returns:
        The anomaly messages for every marker found, in a fixed order.
    """
    lowered = transcription.lower()
    markers = (
        ("repeated phrase", "Repetitive phrases detected"),
        ("inconsistent timestamp", "Inconsistent timestamps detected"),
    )
    return [message for needle, message in markers if needle in lowered]

def analyze_vision_response(response: str) -> List[str]:
    """Flag a vision-model answer that mentions known marker phrases (case-insensitive).

    Returns:
        The anomaly messages for every marker found, in a fixed order.
    """
    lowered = response.lower()
    markers = (
        ("blurry region", "Blurry regions detected"),
        ("unnatural shadow", "Unnatural shadows detected"),
    )
    return [message for needle, message in markers if needle in lowered]

def analyze_av_sync(frames: List[np.ndarray], audio: np.ndarray) -> float:
    """Return the audio/video synchronisation score.

    Placeholder: neither argument is inspected; the score is always 0.9.
    """
    return 0.9

def analyze_temporal_features(temporal_features: List[Dict[str, Any]]) -> List[str]:
    """Report one anomaly for every feature set whose peak flow magnitude exceeds 1.0."""
    return [
        "Abrupt changes in motion detected"
        for feature in temporal_features
        if feature["magnitude"].max() > 1.0
    ]

def analyze_optical_flow(optical_flow: List[np.ndarray]) -> List[str]:
    """Report one anomaly for every flow field whose largest component exceeds 1.0."""
    return [
        "Inconsistent flow patterns detected"
        for field in optical_flow
        if field.max() > 1.0
    ]

def analyze_biometric_consistency(frames: List[np.ndarray], models: Dict[str, Any]) -> float:
    """Return the biometric consistency score.

    Placeholder: neither argument is inspected; the score is always 0.9.
    """
    return 0.9

**Define Analysis Functions**
- Define functions for analyzing audio, image, and video data using Groq models

**Advanced Audio Analysis**

In [None]:
# Define Analysis Functions
async def advanced_audio_analysis(audio_data: Dict[str, Any], models: Dict[str, Any], device: torch.device) -> DeepfakeAnalysisResult:
    """Transcribe the audio with Groq Whisper and scan the transcript for anomalies.

    Args:
        audio_data: Preprocessed audio; must be a dict carrying the signal
            under ``"waveform"``. ``None`` means the source had no audio.
        models: Unused here; kept for a uniform analysis signature.
        device: Unused here; kept for a uniform analysis signature.

    Returns:
        A ``DeepfakeAnalysisResult`` labelled ``"NO_AUDIO"`` when no usable
        waveform is available, otherwise ``"REAL"`` / ``"FAKE"``.

    NOTE(review): the only score is a fixed 0.9 placeholder, so ``confidence``
    (the standard deviation of the scores) is always 0.0 -- confirm intended.
    NOTE(review): confirm that the value stored under ``"waveform"`` is in a
    form the Groq transcription endpoint accepts for ``file``.
    """
    if audio_data is None:
        return DeepfakeAnalysisResult(
            score=0.0,
            label="NO_AUDIO",
            confidence=0.0,
            method="audio_analysis",
            anomalies=["No audio data available"]
        )
    if not isinstance(audio_data, dict) or "waveform" not in audio_data:
        # Report the missing signal explicitly instead of failing with a
        # KeyError on the lookup below.
        return DeepfakeAnalysisResult(
            score=0.0,
            label="NO_AUDIO",
            confidence=0.0,
            method="audio_analysis",
            anomalies=["Audio waveform missing from preprocessed data"]
        )

    anomalies = []
    scores = []

    response = groq_client.audio.transcriptions.create(
        file=audio_data["waveform"],
        # Same id as before, read from the shared table so it is defined once.
        model=groq_models["audio"],
    )
    transcription = response.text

    anomalies.extend(analyze_transcription(transcription))
    scores.append(0.9)

    final_score = np.mean(scores)
    return DeepfakeAnalysisResult(
        score=float(final_score),
        label="REAL" if final_score > 0.7 else "FAKE",
        confidence=float(np.std(scores)),
        method="audio_analysis",
        anomalies=anomalies
    )

**Advanced Image Analysis**

In [None]:
async def advanced_image_analysis(image: np.ndarray, models: Dict[str, Any], device: torch.device) -> DeepfakeAnalysisResult:
    """Ask the Groq vision model to assess an image for manipulation.

    The image is PNG-encoded, sent as a base64 data URL, and the model is
    asked to finish with a ``Score: <0.0-1.0>`` line (1.0 = authentic), which
    is then parsed.

    Args:
        image: Image array to analyse. ``cv2.imencode`` presumably expects
            BGR channel order -- confirm against the caller.
        models: Unused here; kept for a uniform analysis signature.
        device: Unused here; kept for a uniform analysis signature.

    Returns:
        A ``DeepfakeAnalysisResult``; the score falls back to a neutral 0.5
        when the model's answer contains no parsable score.

    Raises:
        ValueError: If the image cannot be encoded as PNG.
    """
    # Neither module is imported at the top of the notebook.
    import base64
    import re

    anomalies = []
    scores = []

    encoded_ok, png_buffer = cv2.imencode('.png', image)
    if not encoded_ok:
        raise ValueError("Could not encode image as PNG")
    image_url = "data:image/png;base64," + base64.b64encode(png_buffer.tobytes()).decode()

    prompt = (
        "Analyze this image for signs of digital manipulation or inconsistencies. "
        "Finish with a line of the form 'Score: <number between 0.0 and 1.0>', "
        "where 1.0 means the image is certainly authentic."
    )
    response = groq_client.chat.completions.create(
        messages=[
            {
                "role": "user",
                # Multimodal content is a list of typed parts; a top-level
                # "image_url" key on the message is not part of the schema.
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        model="llama-3.2-90b-vision-preview",
    )
    content = response.choices[0].message.content or ""

    score_match = re.search(r"Score:\s*([01](?:\.\d+)?)", content)
    parsed = float(score_match.group(1)) if score_match else None
    if parsed is not None and 0.0 <= parsed <= 1.0:
        scores.append(parsed)
    else:
        # Neutral fallback; the mean of an empty list would be NaN.
        scores.append(0.5)
    anomalies.extend(analyze_vision_response(content))

    final_score = np.mean(scores)
    return DeepfakeAnalysisResult(
        score=float(final_score),
        label="REAL" if final_score > 0.7 else "FAKE",
        confidence=float(np.std(scores)),
        method="image_analysis",
        anomalies=anomalies
    )

**Advanced Video Analysis**

In [None]:
async def advanced_video_analysis(video_data: Dict[str, Any], models: Dict[str, Any], device: torch.device) -> DeepfakeAnalysisResult:
    # Scores a preprocessed video with three models (VideoMAE, Timesformer,
    # LLaVA), averages the three scores, and collects anomalies from the
    # temporal features and the optical flow.
    #
    # video_data is read for the keys "frames", "temporal_features" and
    # "optical_flow"; models for "videomae", "timesformer", "llava" and
    # models["processors"]["llava"].
    #
    # NOTE(review): VideoMAEFeatureExtractor is not imported in the notebook's
    # import cell, and Config.MODEL_PATHS as defined above has no "videomae" or
    # "timesformer" entry -- as written these lookups fail.
    # NOTE(review): analyze_llava_response is not defined in the part of the
    # file reviewed here -- confirm it exists.
    anomalies = []
    scores = []

    # VideoMAE model analysis
    videomae_model = models["videomae"].to(device)
    videomae_processor = VideoMAEFeatureExtractor.from_pretrained(Config.MODEL_PATHS["videomae"])
    inputs = videomae_processor(video_data["frames"], return_tensors="pt").to(device)
    with torch.no_grad():
        videomae_output = videomae_model(**inputs)
        videomae_score = torch.softmax(videomae_output.logits, dim=-1)
        # The score is the largest class probability, whichever class it is.
        scores.append(videomae_score.max().item())

    # Timesformer model analysis
    timesformer_model = models["timesformer"].to(device)
    timesformer_processor = VideoMAEFeatureExtractor.from_pretrained(Config.MODEL_PATHS["timesformer"])
    inputs = timesformer_processor(video_data["frames"], return_tensors="pt").to(device)
    with torch.no_grad():
        timesformer_output = timesformer_model(**inputs)
        timesformer_score = torch.softmax(timesformer_output.logits, dim=-1)
        scores.append(timesformer_score.max().item())

    # Llava model analysis
    llava_model = models["llava"].to(device)
    llava_processor = models["processors"]["llava"]
    prompt = "Analyze this video for signs of digital manipulation or inconsistencies."
    inputs = llava_processor(video=video_data["frames"], text=prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = llava_model.generate(**inputs)
        analysis = llava_processor.decode(outputs[0], skip_special_tokens=True)
        llava_score = analyze_llava_response(analysis)
        scores.append(llava_score)

    # Additional analyses
    temporal_anomalies = analyze_temporal_features(video_data["temporal_features"])
    anomalies.extend(temporal_anomalies)
    optical_flow_anomalies = analyze_optical_flow(video_data["optical_flow"])
    anomalies.extend(optical_flow_anomalies)

    # Mean of the three model scores; their spread is reported as "confidence".
    final_score = np.mean(scores)
    return DeepfakeAnalysisResult(
        score=float(final_score),
        label="REAL" if final_score > 0.7 else "FAKE",
        confidence=float(np.std(scores)),
        method="video_analysis",
        anomalies=anomalies
    )

**Define Tool Creation Functions**
- Define functions for creating deepfake detection tools and agents

In [None]:
# Define Tool Creation Functions
def create_deepfake_detection_tools(models: Dict[str, Any], device: torch.device) -> List[Tool]:
    """Build the LangChain tools that expose the individual analyses.

    Every tool wraps a one-argument lambda that forwards its input, together
    with ``models`` / ``device``, to the matching analysis function. The
    target names are resolved only when a tool is invoked.

    NOTE(review): several targets (e.g. ``advanced_video_analysis``) are
    ``async def``, so calling the tool function returns a coroutine rather
    than a result -- confirm how the agent is expected to run them.
    """
    tool_specs = [
        (
            "analyze_video",
            lambda x: advanced_video_analysis(x, models, device),
            "Analyzes video content for signs of manipulation",
        ),
        (
            "analyze_audio",
            lambda x: advanced_audio_analysis(x, models, device),
            "Analyzes audio content for signs of manipulation",
        ),
        (
            "analyze_image",
            lambda x: advanced_image_analysis(x, models, device),
            "Analyzes image content for signs of manipulation",
        ),
        (
            "semantic_analysis",
            lambda x: semantic_consistency_analysis(x, models["llms"]),
            "Analyzes semantic consistency across modalities",
        ),
        (
            "face_forgery_detection",
            lambda x: face_forgery_detection(x, models, device),
            "Detects face forgeries in video content",
        ),
        (
            "lip_sync_detection",
            lambda x: lip_sync_detection(x, models, device),
            "Analyzes lip-sync consistency between audio and video",
        ),
        (
            "background_consistency",
            lambda x: background_consistency_analysis(x, models, device),
            "Analyzes background consistency across frames",
        ),
        (
            "real_time_streaming_analysis",
            lambda x: real_time_streaming_analysis(x, models, device),
            "Analyzes live video streams for signs of manipulation",
        ),
        (
            "text_analysis",
            lambda x: text_analysis(x, models["llms"]),
            "Analyzes text content for signs of manipulation",
        ),
        (
            "metadata_analysis",
            lambda x: metadata_analysis(x),
            "Analyzes metadata for signs of manipulation",
        ),
    ]
    return [
        Tool(name=tool_name, func=tool_func, description=tool_description)
        for tool_name, tool_func, tool_description in tool_specs
    ]

**Create Detection Agent**
- Define the agent that selects and runs the detection tools

In [None]:
# Create Detection Agent
def create_detection_agent(tools: List[Tool], llm: ChatOpenAI) -> AgentExecutor:
    """Build an agent executor that drives the detection tools with an LLM.

    Args:
        tools: Tools the agent may call.
        llm: Chat model used to choose actions and write the final answer.

    Returns:
        An ``AgentExecutor`` wrapping an ``LLMSingleActionAgent``.

    NOTE(review): the prompt has no slot for earlier actions and observations
    (no scratchpad), so the model does not see previous tool results --
    confirm whether multi-step use is intended.
    """
    # None of these names are imported at the top of the notebook.
    import re
    from langchain.agents import AgentOutputParser
    from langchain.schema import AgentAction, AgentFinish

    class DeepfakeDetectionOutputParser(AgentOutputParser):
        """Turn raw LLM text into either a tool call or the final answer."""

        def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
            if "Final Answer:" in llm_output:
                return AgentFinish(
                    return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                    log=llm_output,
                )
            action_match = re.search(r"Action: (.*?)\nAction Input: (.*)", llm_output, re.DOTALL)
            if not action_match:
                raise ValueError("Could not parse LLM output: " + llm_output)
            action = action_match.group(1).strip()
            action_input = action_match.group(2).strip()
            return AgentAction(tool=action, tool_input=action_input, log=llm_output)

    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("""
            You are an expert deepfake detection system. Your goal is to analyze content across multiple modalities
            to determine authenticity. Consider all available evidence and patterns including:

            1. Visual elements: inconsistencies, artifacts, unnatural patterns
            2. Audio characteristics: synthetic artifacts, unnatural transitions
            3. Semantic coherence: logical consistency across modalities
            4. Temporal patterns: synchronization, continuity
            5. Biometric features: facial landmarks, expressions, movements

            Available tools:
            {tools}

            Process:
            1. Analyze the input using appropriate tools
            2. Evaluate evidence across modalities
            3. Make a final determination on authenticity

            Format your response as:
            Action: [tool name]
            Action Input: [tool input]
            Observation: [result]
            ... (repeat for additional tools as needed)
            Final Answer: [detailed analysis and verdict]
            """),
        HumanMessagePromptTemplate.from_template("{input}")
    ])
    # The template declares {tools} but nothing supplied it; bind the tool
    # listing up front so the chain only needs {input}.
    prompt = prompt.partial(
        tools="\n".join(f"{tool.name}: {tool.description}" for tool in tools)
    )

    return AgentExecutor.from_agent_and_tools(
        agent=LLMSingleActionAgent(
            llm_chain=LLMChain(llm=llm, prompt=prompt),
            output_parser=DeepfakeDetectionOutputParser(),
            # Stop only before a model-written observation. Stopping on
            # "Final Answer:" would cut the text before the parser could see it.
            stop=["Observation:"],
            allowed_tools=[tool.name for tool in tools]
        ),
        tools=tools,
        verbose=True
    )

**Define Detection Graph**
- Define the detection graph for processing the input data

In [None]:
# Define Detection Graph
def create_detection_graph() -> StateGraph:
    """Build the four-stage detection workflow as an uncompiled LangGraph graph.

    Stages, in order: ``preprocess`` -> ``analyze_modalities`` ->
    ``cross_modal_analysis`` -> ``generate_report`` -> ``END``. The initial
    state must provide ``"input"`` (file path), ``"models"`` and ``"device"``.

    Returns:
        The ``StateGraph``; call ``.compile()`` on it before invoking.

    NOTE(review): the nodes use ``asyncio.run``, which raises if an event
    loop is already running (as is typical inside a notebook kernel) --
    confirm how the graph is meant to be invoked.
    """
    from typing import TypedDict

    class DetectionState(TypedDict, total=False):
        # Keys read and written by the nodes below.
        input: str
        models: Dict[str, Any]
        device: Any
        processed_data: Dict[str, Any]
        modality_results: Dict[str, Any]
        cross_modal_score: float
        final_report: Any

    def preprocess(state):
        input_data = state["input"]
        processed_data = asyncio.run(enhanced_preprocessing(input_data))
        return {**state, "processed_data": processed_data}

    def analyze_modalities(state):
        processed_data = state["processed_data"]
        models = state["models"]
        device = state["device"]
        # A modality is analysed only when preprocessing produced its key.
        results = {
            "video": asyncio.run(advanced_video_analysis(processed_data, models, device)) if "frames" in processed_data else None,
            "audio": asyncio.run(advanced_audio_analysis(processed_data.get("audio"), models, device)) if "audio" in processed_data else None,
            "image": asyncio.run(advanced_image_analysis(processed_data["image"], models, device)) if "image" in processed_data else None,
            "text": asyncio.run(text_analysis(processed_data.get("text"), models["llms"])) if "text" in processed_data else None
        }
        return {**state, "modality_results": results}

    def cross_modal_analysis(state):
        results = state["modality_results"]
        processed_data = state["processed_data"]
        models = state["models"]
        cross_modal_score = analyze_cross_modal_consistency(results, processed_data, models)
        return {**state, "cross_modal_score": cross_modal_score}

    def generate_report(state):
        results = state["modality_results"]
        cross_modal_score = state["cross_modal_score"]
        processed_data = state["processed_data"]
        report = generate_comprehensive_report(results, cross_modal_score, processed_data)
        return {**state, "final_report": report}

    # StateGraph takes a state schema; nodes are registered one by one.
    workflow = StateGraph(DetectionState)
    workflow.add_node("preprocess", preprocess)
    workflow.add_node("analyze_modalities", analyze_modalities)
    workflow.add_node("cross_modal_analysis", cross_modal_analysis)
    workflow.add_node("generate_report", generate_report)

    # Without an entry point the graph cannot be compiled.
    workflow.set_entry_point("preprocess")
    workflow.add_edge("preprocess", "analyze_modalities")
    workflow.add_edge("analyze_modalities", "cross_modal_analysis")
    workflow.add_edge("cross_modal_analysis", "generate_report")
    workflow.add_edge("generate_report", END)
    return workflow

**Define Functions for Analyzing Cross-Modal Consistency and Generating Reports**

In [None]:
def analyze_cross_modal_consistency(
    results: Dict[str, DeepfakeAnalysisResult],
    processed_data: Dict[str, Any],
    models: Dict[str, Any]
) -> float:
    """Average the consistency checks that apply to the available data.

    The semantic check always runs. Audio/video sync needs both an audio and
    a video result. The temporal and biometric checks need the
    ``"temporal_features"`` and ``"frames"`` keys respectively, which are
    absent for audio-only and image-only input.

    Args:
        results: Per-modality results (``None`` for modalities not analysed).
        processed_data: Output of the preprocessing stage.
        models: Model table; ``models["llms"]`` feeds the semantic check.

    Returns:
        The mean of the applicable scores.
    """
    scores = []
    has_frames = "frames" in processed_data

    if has_frames and results.get("audio") and results.get("video"):
        sync_score = analyze_av_sync(processed_data["frames"], processed_data.get("audio"))
        scores.append(sync_score)

    semantic_score = analyze_semantic_consistency(results, processed_data, models["llms"])
    scores.append(semantic_score)

    # Video-only checks: skip them rather than raising KeyError on other input.
    if "temporal_features" in processed_data:
        temporal_score = analyze_temporal_coherence(processed_data["temporal_features"])
        scores.append(temporal_score)
    if has_frames:
        bio_score = analyze_biometric_consistency(processed_data["frames"], models)
        scores.append(bio_score)

    return float(np.mean(scores))

def analyze_semantic_consistency(
    results: Dict[str, DeepfakeAnalysisResult],
    processed_data: Dict[str, Any],
    llms: Dict[str, ChatOpenAI]
) -> float:
    """Ask an LLM to rate how consistent the per-modality findings are.

    Args:
        results: Per-modality results; missing ones are sent as ``"N/A"``.
        processed_data: Unused here; kept for a uniform signature.
        llms: Table of chat models; the ``"gpt4"`` entry is used.

    Returns:
        The score parsed from the ``Score:`` line of the answer, or a neutral
        0.5 when no value in [0, 1] can be parsed.
    """
    # Not imported at the top of the notebook.
    import re

    prompt = ChatPromptTemplate.from_template("""
        Analyze the consistency between different modalities in the content:

        Video Analysis: {video_analysis}
        Audio Analysis: {audio_analysis}
        Image Analysis: {image_analysis}
        Text Analysis: {text_analysis}

        Consider:
        1. Do the modalities tell a coherent story?
        2. Are there logical contradictions?
        3. Do temporal patterns align?
        4. Is the emotional content consistent?

        Rate the consistency from 0 to 1, where 1 is perfectly consistent.
        Provide detailed reasoning for your rating.

        Output format:
        Score: [0-1]
        Reasoning: [detailed explanation]
    """)
    chain = LLMChain(llm=llms["gpt4"], prompt=prompt)
    response = chain.run({
        modality + "_analysis": results[modality].dict() if results[modality] else "N/A"
        for modality in ("video", "audio", "image", "text")
    })

    # Accept "0.8", "1", "0" and a bracketed "[0.8]" (the prompt shows brackets).
    score_match = re.search(r"Score:\s*\[?\s*(\d*\.?\d+)", response)
    if score_match:
        score = float(score_match.group(1))
        if 0.0 <= score <= 1.0:
            return score
    return 0.5

def generate_comprehensive_report(
    results: Dict[str, DeepfakeAnalysisResult],
    cross_modal_score: float,
    processed_data: Dict[str, Any]
) -> MultimodalAnalysisReport:
    """Fuse the per-modality results into one MultimodalAnalysisReport.

    The final score is a weighted average of the scores of the modalities
    that produced a result plus the cross-modal score. Weights are
    renormalised over the terms that are present, so an absent modality no
    longer drags the score towards 0.5. With the original placeholder
    behaviour a single-modality input could never pass the 0.7 threshold.
    When all four modalities are present the result matches the plain
    weighted sum up to floating-point rounding.

    Args:
        results: Per-modality results keyed by "video", "audio", "image" and
            "text"; an entry may be missing or None.
        cross_modal_score: Cross-modal consistency score (weight 0.1).
        processed_data: Pre-processed media data; only its optional
            "metadata" entry is read.

    Returns:
        The assembled report. The verdict is "AUTHENTIC" when the fused
        score is above 0.7 and "MANIPULATED" otherwise.
    """
    modality_weights = {"video": 0.3, "audio": 0.2, "image": 0.2, "text": 0.2}
    cross_modal_weight = 0.1

    weighted_sum = cross_modal_weight * cross_modal_score
    total_weight = cross_modal_weight
    for modality, weight in modality_weights.items():
        result = results.get(modality)
        if result:
            weighted_sum += weight * result.score
            total_weight += weight
    final_score = weighted_sum / total_weight

    # One evidence entry per reported anomaly, tagged with its modality.
    evidence = [
        {
            "type": modality,
            "description": anomaly,
            "confidence": result.confidence,
            "method": result.method
        }
        for modality, result in results.items() if result
        for anomaly in result.anomalies
    ]

    # Take the timestamp once so case_id and processing_time agree.
    now = datetime.now()
    return MultimodalAnalysisReport(
        case_id=f"DFD-{now.strftime('%Y%m%d-%H%M%S')}",
        file_info=processed_data.get("metadata", {}),
        video_analysis=results.get("video"),
        audio_analysis=results.get("audio"),
        image_analysis=results.get("image"),
        text_analysis=results.get("text"),
        multimodal_score=float(final_score),
        verdict="AUTHENTIC" if final_score > 0.7 else "MANIPULATED",
        evidence=evidence,
        metadata={
            "processing_time": now.isoformat(),
            "models_used": list(results.keys()),
            "cross_modal_score": cross_modal_score,
            "confidence_distribution": {
                modality: result.confidence
                for modality, result in results.items() if result
            }
        }
    )

**Define Real-Time Streaming Analysis**
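- Define an async function that reads frames from a video stream, computes per-frame quality metrics and optical flow, and passes the collected data to the video analysis routine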

In [None]:
async def real_time_streaming_analysis(video_stream: Any, models: Dict[str, Any], device: torch.device) -> DeepfakeAnalysisResult:
    """Collect frames from an async video stream and run video analysis on them.

    Each frame is enhanced and measured for quality on a worker thread pool.
    From the second frame on, optical flow against the previous enhanced
    frame and the temporal features derived from that flow are computed too.
    The collected data is then handed to ``advanced_video_analysis``.

    Args:
        video_stream: Object with an awaitable ``read()`` that returns the
            next frame, or None at end of stream. Frames are converted with
            ``cv2.COLOR_BGR2RGB``, so BGR images are presumably expected.
        models: Passed through to ``advanced_video_analysis``.
        device: Passed through to ``advanced_video_analysis``.

    Returns:
        Whatever ``advanced_video_analysis`` returns for the collected data.
    """
    frames = []
    audio_data = None  # no audio is read from the stream
    metadata = {
        "fps": 30,
        "frame_count": 0,
        "width": 0,
        "height": 0,
        "duration": 0,
        "codec": "N/A",
        "file_size": 0
    }
    frame_quality_metrics = []
    optical_flow_data = []
    temporal_features = []
    prev_frame = None

    loop = asyncio.get_running_loop()
    executor = ThreadPoolExecutor(max_workers=4)

    def compute_quality_metrics(raw_frame):
        # Quality is measured on the frame as received, before enhancement.
        return {
            "blur": cv2.Laplacian(raw_frame, cv2.CV_64F).var(),
            "noise": estimate_noise(raw_frame),
            "brightness": np.mean(raw_frame),
            "contrast": calculate_contrast(raw_frame),
            "compression_artifacts": detect_compression_artifacts(raw_frame)
        }

    try:
        while True:
            raw_frame = await video_stream.read()
            if raw_frame is None:
                break
            # Enhancement and quality metrics are independent, so they can
            # run concurrently.
            frame, quality_metrics = await asyncio.gather(
                loop.run_in_executor(executor, enhance_resolution, raw_frame),
                loop.run_in_executor(executor, compute_quality_metrics, raw_frame)
            )
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_quality_metrics.append(quality_metrics)
            if prev_frame is not None:
                # Temporal features are derived from the flow, so these two
                # steps must run one after the other, not in the same gather.
                flow = await loop.run_in_executor(
                    executor, calculate_dense_optical_flow, prev_frame, frame
                )
                temp_features = await loop.run_in_executor(
                    executor, extract_temporal_features, flow
                )
                optical_flow_data.append(flow)
                temporal_features.append(temp_features)
            prev_frame = frame.copy()
    finally:
        # Release the worker threads even if reading or processing fails.
        # wait=False avoids blocking the event loop on in-flight work.
        executor.shutdown(wait=False)

    # Replace the placeholder metadata with what was actually observed.
    metadata["frame_count"] = len(frames)
    if frames:
        metadata["height"], metadata["width"] = frames[0].shape[:2]
        # Estimate only: the stream reports no frame rate, so the assumed
        # fps value above is used.
        metadata["duration"] = len(frames) / metadata["fps"]

    video_data = {
        "frames": frames,
        "audio": audio_data,
        "metadata": metadata,
        "quality_metrics": frame_quality_metrics,
        "optical_flow": optical_flow_data,
        "temporal_features": temporal_features
    }

    return await advanced_video_analysis(video_data, models, device)

**Define Text Analysis**
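- Define an async function that asks a Groq-hosted Llama model to assess a text for signs of manipulation and parses the reply into a `DeepfakeAnalysisResult`: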

In [None]:
async def text_analysis(text: str, llms: Dict[str, ChatOpenAI]) -> DeepfakeAnalysisResult:
    """Ask the Groq-hosted Llama model whether a text shows signs of manipulation.

    The prompt now requests the "Score:" / "Reasoning:" layout that the
    parsing below relies on. Without it the reply rarely contained a
    parseable score, so the result fell back to 0.5 and the "FAKE" label.

    Args:
        text: The text to analyse.
        llms: Not read by this function (the module-level ``groq_client`` is
            used); kept so the signature stays compatible with callers.

    Returns:
        A DeepfakeAnalysisResult whose score is the parsed value, or 0.5 when
        none can be parsed. The label is "REAL" when the score is above 0.7,
        and the anomalies are the non-empty lines that follow "Reasoning:".
    """
    prompt = (
        "Analyze the following text for signs of manipulation or inconsistencies:\n\n"
        f"{text}\n\n"
        "Rate the authenticity from 0 to 1, where 1 means no signs of manipulation.\n\n"
        "Output format:\n"
        "Score: [0-1]\n"
        "Reasoning: [one finding per line]"
    )

    def request_completion():
        return groq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model="llama-3.3-70b-versatile",
        )

    # The client call is synchronous; run it on the default executor so the
    # event loop stays responsive while waiting for the API.
    response = await asyncio.get_running_loop().run_in_executor(None, request_completion)
    # The message content may be None; treat that as an empty reply.
    content = response.choices[0].message.content or ""

    score = 0.5
    # Accept any plain decimal ("1", "0", "0.85", ".5"), optionally wrapped
    # in the brackets shown in the requested output format.
    score_match = re.search(r"Score:\s*\[?\s*(\d*\.?\d+)", content, re.IGNORECASE)
    if score_match:
        parsed = float(score_match.group(1))
        # Ignore values outside the requested 0-1 scale.
        if 0.0 <= parsed <= 1.0:
            score = parsed

    anomalies = [line.strip() for line in content.split("Reasoning:")[-1].strip().split("\n") if line.strip()]
    return DeepfakeAnalysisResult(
        score=score,
        label="REAL" if score > 0.7 else "FAKE",
        confidence=0.0,
        method="text_analysis",
        anomalies=anomalies
    )

**Define Metadata Analysis**
- Define a function that flags suspicious file metadata (very low frame rate, very short duration, very small file size)

In [None]:
def metadata_analysis(metadata: Dict[str, Any]) -> DeepfakeAnalysisResult:
    """Flag suspicious values in a media file's metadata.

    Three fields are checked against fixed minimums: "fps" (10), "duration"
    (1) and "file_size" (100000). Each value below its minimum adds an
    anomaly and a 0.2 score contribution. A field that is absent or None is
    skipped instead of raising TypeError on the comparison.

    Args:
        metadata: Metadata dictionary; only the three fields above are read.

    Returns:
        A DeepfakeAnalysisResult whose score is the mean contribution, or
        0.5 when nothing was flagged. NOTE(review): 0.5 is below the 0.7
        "REAL" threshold, so a clean file is still labelled "FAKE" here.
        Confirm whether that is intended.
    """
    checks = (
        ("fps", 10, "Unusually low frame rate"),
        ("duration", 1, "Unusually short duration"),
        ("file_size", 100000, "Unusually small file size"),
    )

    anomalies = []
    scores = []
    for key, minimum, description in checks:
        value = metadata.get(key)
        if value is None:
            # A missing field gives no evidence either way.
            continue
        if value < minimum:
            anomalies.append(description)
            scores.append(0.2)

    final_score = np.mean(scores) if scores else 0.5
    return DeepfakeAnalysisResult(
        score=float(final_score),
        label="REAL" if final_score > 0.7 else "FAKE",
        confidence=float(np.std(scores)) if scores else 0.0,
        method="metadata_analysis",
        anomalies=anomalies
    )

**Define the Main Function**
- Define the entry point that builds the detection tools, agent and workflow graph, runs the workflow on a file, and returns the final report

In [None]:
async def run_deepfake_detection(file_path: str, mode: str = "all") -> MultimodalAnalysisReport:
    """Run the detection workflow on one file and return its final report.

    The tools, agent and workflow graph are built from the module-level
    ``env`` settings. The workflow is run with the file path, the mode, the
    models and the device as its initial state.

    Args:
        file_path: Stored under "input" in the workflow's initial state.
        mode: Stored under "mode" in the initial state; defaults to "all".

    Returns:
        The "final_report" entry of the state returned by the workflow.

    Raises:
        Exception: Any error raised along the way is printed and re-raised
            unchanged.
    """
    try:
        detection_tools = create_deepfake_detection_tools(env["models"], env["device"])
        # The agent is built here but is not handed to the workflow below.
        detection_agent = create_detection_agent(
            detection_tools, env["models"]["llms"]["gpt4"]
        )
        graph = create_detection_graph()
        outcome = graph.run(
            dict(
                input=file_path,
                mode=mode,
                models=env["models"],
                device=env["device"],
            )
        )
        return outcome["final_report"]
    except Exception as exc:
        print(f"Error in deepfake detection: {str(exc)}")
        raise

In [None]:
# WebSocket Server
async def handle_client(websocket, path=None):
    """Serve deepfake-detection requests arriving on one WebSocket connection.

    Every incoming message must be a JSON object with a "file_path" entry
    and an optional "mode" entry (default "all"). The reply is the analysis
    report as JSON, or a JSON object with an "error" entry. A bad request or
    a failed analysis produces an error reply and leaves the connection
    open for further messages.

    Args:
        websocket: The connection; it is iterated for incoming messages and
            its ``send`` coroutine is used for replies.
        path: Request path passed by older websockets releases. It is unused
            and optional, so the handler also works when the library calls
            it with the connection only.
    """
    async for message in websocket:
        try:
            data = json.loads(message)
        except (TypeError, ValueError) as exc:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            await websocket.send(json.dumps({"error": f"Invalid JSON: {exc}"}, indent=2))
            continue
        if not isinstance(data, dict):
            await websocket.send(json.dumps({"error": "Request must be a JSON object"}, indent=2))
            continue

        # NOTE(review): file_path comes straight from the client and is not
        # validated here. Confirm the workflow restricts which paths it reads.
        file_path = data.get("file_path")
        mode = data.get("mode", "all")
        if not file_path:
            await websocket.send(json.dumps({"error": "File path is required"}, indent=2))
            continue

        try:
            report = await run_deepfake_detection(file_path, mode)
            reply = json.dumps(report.dict(), indent=2, default=str)
        except Exception as exc:
            # Connection boundary: report the failure to the client instead
            # of tearing the connection down.
            reply = json.dumps({"error": f"Analysis failed: {exc}"}, indent=2)
        await websocket.send(reply)

async def main():
    """Serve ``handle_client`` over WebSocket on localhost:8765 indefinitely."""
    server = websockets.serve(handle_client, "localhost", 8765)
    async with server:
        # Awaiting a future that is never resolved keeps the server running.
        never_done = asyncio.Future()
        await never_done

if __name__ == "__main__":
    # asyncio.run() raises RuntimeError when an event loop is already running
    # in this thread, which is the case inside a notebook kernel. Detect that
    # and schedule the server on the existing loop instead.
    try:
        _running_loop = asyncio.get_running_loop()
    except RuntimeError:
        _running_loop = None

    if _running_loop is None:
        # Plain script execution: start a fresh event loop.
        asyncio.run(main())
    else:
        # Keep a reference so the task is not garbage-collected while running.
        _server_task = _running_loop.create_task(main())