<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Deepfake_and_Manipulated_Media_Analysis_R%26D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Deepfake Detection and Manipulated Media Analysis using Multiagent System and Compound AI Approach**

In [1]:
!pip install -q langchain langchain-community langgraph torch transformers opencv-python librosa numpy face-recognition dlib mediapipe scipy pillow tqdm pydantic

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.2/138.2 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone


In [2]:
# Import additional required packages
import torch
import numpy as np
import cv2
import librosa
import face_recognition
import mediapipe as mp
from typing import Dict, List, Any, Tuple, Optional
from pydantic import BaseModel, Field
from datetime import datetime
import json
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoModelForAudioClassification,
    CLIPProcessor, CLIPModel, VideoMAEFeatureExtractor, VideoMAEForVideoClassification,
    Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, WhisperProcessor, WhisperForAudioClassification,
    BlenderbotTokenizer, BlenderbotForConditionalGeneration, VisionEncoderDecoderModel,
    ViTImageProcessor, InstructBlipProcessor, InstructBlipForConditionalGeneration,
    ImageGPTForCausalImageModeling, TimesformerForVideoClassification
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.tools import BaseTool
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent
from langchain.schema import AgentAction, AgentFinish
from langgraph.graph import Graph, StateGraph
from PIL import Image
import io
import soundfile as sf
import wave
import scipy
import openai
from moviepy.editor import VideoFileClip
import hashlib
from scipy.spatial.distance import cosine
from sklearn.metrics import roc_curve, auc

  if event.key is 'enter':



In [3]:
# Enhanced Pydantic Models
class DeepfakeAnalysisResult(BaseModel):
    score: float = Field(..., description="Confidence score (0-1)")
    label: str = Field(..., description="Classification label")
    anomalies: List[str] = Field(default_factory=list)
    artifacts: List[str] = Field(default_factory=list)
    confidence: float = Field(..., description="Model confidence")
    method: str = Field(..., description="Detection method used")
    timestamp: datetime = Field(default_factory=datetime.now)

class MultimodalAnalysisReport(BaseModel):
    case_id: str
    file_info: Dict[str, Any]
    video_analysis: Optional[DeepfakeAnalysisResult]
    audio_analysis: Optional[DeepfakeAnalysisResult]
    image_analysis: Optional[DeepfakeAnalysisResult]
    text_analysis: Optional[DeepfakeAnalysisResult]
    multimodal_score: float
    verdict: str
    evidence: List[Dict[str, Any]]
    metadata: Dict[str, Any]

In [4]:
def setup_enhanced_detection_environment():
    """Initialize all required models and processors"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    models = {
        # Video Analysis
        "videomae": VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics"),
        "timesformer": TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400"),

        # Image Analysis
        "clip": CLIPModel.from_pretrained("openai/clip-vit-large-patch14"),
        "instructblip": InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b"),
        "vit_gpt2": VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning"),

        # Audio Analysis
        "wav2vec2": Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks"),
        "whisper": WhisperForAudioClassification.from_pretrained("openai/whisper-large-v3"),

        # Text Analysis
        "roberta_fake": AutoModelForSequenceClassification.from_pretrained("deepset/roberta-base-squad2"),
        "blenderbot": BlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill"),

        # Face Analysis
        "face_detector": mp.solutions.face_detection.FaceDetection(min_detection_confidence=0.7),
        "face_mesh": mp.solutions.face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            min_detection_confidence=0.7
        ),

        # Processors
        "processors": {
            "clip": CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14"),
            "instructblip": InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b"),
            "vit": ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k"),
            "wav2vec2": Wav2Vec2Processor.from_pretrained("superb/wav2vec2-base-superb-ks"),
            "whisper": WhisperProcessor.from_pretrained("openai/whisper-large-v3"),
            "roberta": AutoTokenizer.from_pretrained("deepset/roberta-base-squad2"),
            "blenderbot": BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
        },

        # LLMs
        "llms": {
            "gpt4": ChatOpenAI(model="gpt-4", temperature=0.2),
            "claude": ChatOpenAI(model="claude-3-opus", temperature=0.2)
        }
    }

    return {"device": device, "models": models}

In [6]:
def enhanced_video_preprocessing(video_path: str) -> Dict[str, Any]:
    """Advanced video preprocessing with comprehensive feature extraction"""
    cap = cv2.VideoCapture(video_path)
    frames = []
    audio_data = None

    # Extract video metadata
    metadata = {
        "fps": cap.get(cv2.CAP_PROP_FPS),
        "frame_count": int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
        "width": int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
        "height": int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        "duration": float(cap.get(cv2.CAP_PROP_FRAME_COUNT)) / float(cap.get(cv2.CAP_PROP_FPS)),
        "codec": int(cap.get(cv2.CAP_PROP_FOURCC)).to_bytes(4, byteorder='little').decode(),
        "file_size": os.path.getsize(video_path)
    }

    # Enhanced frame extraction
    frame_quality_metrics = []
    optical_flow_data = []
    prev_frame = None
    temporal_features = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert to RGB and store
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(rgb_frame)

        # Calculate quality metrics
        quality_metrics = {
            "blur": cv2.Laplacian(frame, cv2.CV_64F).var(),
            "noise": estimate_noise(frame),
            "brightness": np.mean(frame),
            "contrast": calculate_contrast(frame),
            "compression_artifacts": detect_compression_artifacts(frame)
        }
        frame_quality_metrics.append(quality_metrics)

        # Calculate optical flow
        if prev_frame is not None:
            flow = calculate_dense_optical_flow(prev_frame, frame)
            optical_flow_data.append(flow)

            # Extract temporal features
            temp_features = extract_temporal_features(flow)
            temporal_features.append(temp_features)

        prev_frame = frame.copy()

    cap.release()

    # Extract audio
    try:
        video = VideoFileClip(video_path)
        audio = video.audio
        if audio is not None:
            audio_array = audio.to_soundarray()
            audio_data = extract_audio_features(audio_array, audio.fps)
        video.close()
    except Exception as e:
        print(f"Audio extraction error: {e}")
        audio_data = None

    return {
        "frames": frames,
        "audio": audio_data,
        "metadata": metadata,
        "quality_metrics": frame_quality_metrics,
        "optical_flow": optical_flow_data,
        "temporal_features": temporal_features
    }

def advanced_audio_analysis(audio_data: Dict[str, Any], models: Dict[str, Any], device: torch.device) -> DeepfakeAnalysisResult:
    """Enhanced audio deepfake detection"""
    if audio_data is None:
        return DeepfakeAnalysisResult(
            score=0.0,
            label="NO_AUDIO",
            confidence=0.0,
            method="audio_analysis",
            anomalies=["No audio data available"]
        )

    anomalies = []
    scores = []

    # Wav2Vec2 Analysis
    wav2vec2_model = models["wav2vec2"].to(device)
    wav2vec2_processor = models["processors"]["wav2vec2"]

    inputs = wav2vec2_processor(
        audio_data["waveform"],
        sampling_rate=audio_data["sample_rate"],
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        wav2vec2_output = wav2vec2_model(**inputs)
        wav2vec2_score = torch.softmax(wav2vec2_output.logits, dim=-1)
        scores.append(wav2vec2_score.max().item())

    # Whisper Analysis
    whisper_model = models["whisper"].to(device)
    whisper_processor = models["processors"]["whisper"]

    inputs = whisper_processor(
        audio_data["waveform"],
        sampling_rate=audio_data["sample_rate"],
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        whisper_output = whisper_model(**inputs)
        whisper_score = torch.softmax(whisper_output.logits, dim=-1)
        scores.append(whisper_score.max().item())

    # Analyze spectral features
    spectral_anomalies = analyze_spectral_features(audio_data)
    anomalies.extend(spectral_anomalies)

    # Analyze temporal patterns
    temporal_anomalies = analyze_temporal_patterns(audio_data)
    anomalies.extend(temporal_anomalies)

    # Calculate final score
    final_score = np.mean(scores)

    return DeepfakeAnalysisResult(
        score=float(final_score),
        label="REAL" if final_score > 0.7 else "FAKE",
        confidence=float(np.std(scores)),
        method="audio_analysis",
        anomalies=anomalies
    )

def advanced_image_analysis(image: np.ndarray, models: Dict[str, Any], device: torch.device) -> DeepfakeAnalysisResult:
    """Enhanced image deepfake detection"""
    anomalies = []
    scores = []

    # CLIP Analysis
    clip_model = models["clip"].to(device)
    clip_processor = models["processors"]["clip"]

    inputs = clip_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        clip_output = clip_model.get_image_features(**inputs)
        clip_score = analyze_clip_features(clip_output)
        scores.append(clip_score)

    # InstructBLIP Analysis
    instructblip_model = models["instructblip"].to(device)
    instructblip_processor = models["processors"]["instructblip"]

    prompt = "Analyze this image for signs of digital manipulation or inconsistencies."
    inputs = instructblip_processor(images=image, text=prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = instructblip_model.generate(**inputs)
        analysis = instructblip_processor.decode(outputs[0], skip_special_tokens=True)
        instructblip_score = analyze_instructblip_response(analysis)
        scores.append(instructblip_score)

    # Face analysis
    face_anomalies = analyze_face_authenticity(image, models["face_detector"], models["face_mesh"])
    anomalies.extend(face_anomalies)

    # Image quality analysis
    quality_metrics = analyze_image_quality(image)
    quality_score = evaluate_quality_metrics(quality_metrics)
    scores.append(quality_score)

    if quality_metrics["compression_level"] > 0.8:
        anomalies.append("High compression artifacts detected")

    if quality_metrics["noise_level"] > 0.7:
        anomalies.append("Suspicious noise patterns detected")

    # Calculate final score
    final_score = np.mean(scores)

    return DeepfakeAnalysisResult(
        score=float(final_score),
        label="REAL" if final_score > 0.7 else "FAKE",
        confidence=float(np.std(scores)),
        method="image_analysis",
        anomalies=anomalies
    )

In [7]:
def create_deepfake_detection_tools(models: Dict[str, Any], device: torch.device) -> List[Tool]:
    """Create LangChain tools for deepfake detection"""
    tools = [
        Tool(
            name="analyze_video",
            func=lambda x: advanced_video_analysis(x, models, device),
            description="Analyzes video content for signs of manipulation"
        ),
        Tool(
            name="analyze_audio",
            func=lambda x: advanced_audio_analysis(x, models, device),
            description="Analyzes audio content for signs of manipulation"
        ),
        Tool(
            name="analyze_image",
            func=lambda x: advanced_image_analysis(x, models, device),
            description="Analyzes image content for signs of manipulation"
        ),
        Tool(
            name="semantic_analysis",
            func=lambda x: semantic_consistency_analysis(x, models["llms"]),
            description="Analyzes semantic consistency across modalities"
        )
    ]
    return tools

In [8]:
def create_detection_agent(tools: List[Tool], llm: ChatOpenAI) -> AgentExecutor:
    """Create LangChain agent for orchestrating deepfake detection"""
    # Custom output parser for detection agent
    class DeepfakeDetectionOutputParser:
        def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
            try:
                # Parse the LLM output for action and input
                if "Final Answer:" in llm_output:
                    return AgentFinish(
                        return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                        log=llm_output,
                    )

                # Extract action and action input
                action_match = re.search(r"Action: (.*?)\nAction Input: (.*)", llm_output, re.DOTALL)
                if not action_match:
                    raise ValueError("Could not parse LLM output: " + llm_output)

                action = action_match.group(1).strip()
                action_input = action_match.group(2).strip()

                return AgentAction(tool=action, tool_input=action_input, log=llm_output)
            except Exception as e:
                raise ValueError(f"Could not parse LLM output: {llm_output}") from e

    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("""
            You are an expert deepfake detection system. Your goal is to analyze content across multiple modalities
            to determine authenticity. Consider all available evidence and patterns including:

            1. Visual elements: inconsistencies, artifacts, unnatural patterns
            2. Audio characteristics: synthetic artifacts, unnatural transitions
            3. Semantic coherence: logical consistency across modalities
            4. Temporal patterns: synchronization, continuity
            5. Biometric features: facial landmarks, expressions, movements

            Available tools:
            {tools}

            Process:
            1. Analyze the input using appropriate tools
            2. Evaluate evidence across modalities
            3. Make a final determination on authenticity

            Format your response as:
            Action: [tool name]
            Action Input: [tool input]
            Observation: [result]
            ... (repeat for additional tools as needed)
            Final Answer: [detailed analysis and verdict]
            """),
        HumanMessagePromptTemplate.from_template("{input}")
    ])

    return AgentExecutor.from_agent_and_tools(
        agent=LLMSingleActionAgent(
            llm_chain=LLMChain(llm=llm, prompt=prompt),
            output_parser=DeepfakeDetectionOutputParser(),
            stop=["Observation:", "Final Answer:"],
            allowed_tools=[tool.name for tool in tools]
        ),
        tools=tools,
        verbose=True
    )

def create_detection_graph() -> StateGraph:
    """Create LangGraph workflow for deepfake detection"""

    # Define graph nodes
    def preprocess(state):
        """Preprocess input data"""
        input_data = state["input"]
        processed_data = enhanced_video_preprocessing(input_data)
        return {**state, "processed_data": processed_data}

    def analyze_modalities(state):
        """Analyze individual modalities"""
        processed_data = state["processed_data"]
        models = state["models"]
        device = state["device"]

        results = {
            "video": advanced_video_analysis(processed_data, models, device),
            "audio": advanced_audio_analysis(processed_data.get("audio"), models, device),
            "image": advanced_image_analysis(processed_data["frames"][0], models, device)
        }
        return {**state, "modality_results": results}

    def cross_modal_analysis(state):
        """Perform cross-modal consistency analysis"""
        results = state["modality_results"]
        processed_data = state["processed_data"]
        models = state["models"]

        cross_modal_score = analyze_cross_modal_consistency(results, processed_data, models)
        return {**state, "cross_modal_score": cross_modal_score}

    def generate_report(state):
        """Generate final analysis report"""
        results = state["modality_results"]
        cross_modal_score = state["cross_modal_score"]
        processed_data = state["processed_data"]

        report = generate_comprehensive_report(results, cross_modal_score, processed_data)
        return {**state, "final_report": report}

    # Create workflow graph
    workflow = StateGraph(nodes=[
        ("preprocess", preprocess),
        ("analyze_modalities", analyze_modalities),
        ("cross_modal_analysis", cross_modal_analysis),
        ("generate_report", generate_report)
    ])

    # Define edges
    workflow.add_edge("preprocess", "analyze_modalities")
    workflow.add_edge("analyze_modalities", "cross_modal_analysis")
    workflow.add_edge("cross_modal_analysis", "generate_report")
    workflow.add_edge("generate_report", END)

    return workflow

def analyze_cross_modal_consistency(
    results: Dict[str, DeepfakeAnalysisResult],
    processed_data: Dict[str, Any],
    models: Dict[str, Any]
) -> float:
    """Analyze consistency across different modalities"""
    scores = []

    # Audio-Visual Synchronization
    if results["audio"].score > 0 and results["video"].score > 0:
        sync_score = analyze_av_sync(processed_data["frames"], processed_data["audio"])
        scores.append(sync_score)

    # Semantic Consistency
    semantic_score = analyze_semantic_consistency(results, processed_data, models["llms"])
    scores.append(semantic_score)

    # Temporal Coherence
    temporal_score = analyze_temporal_coherence(processed_data["temporal_features"])
    scores.append(temporal_score)

    # Biometric Consistency
    bio_score = analyze_biometric_consistency(processed_data["frames"], models)
    scores.append(bio_score)

    return float(np.mean(scores))

def analyze_semantic_consistency(
    results: Dict[str, DeepfakeAnalysisResult],
    processed_data: Dict[str, Any],
    llms: Dict[str, ChatOpenAI]
) -> float:
    """Analyze semantic consistency using LLMs"""
    prompt = ChatPromptTemplate.from_template("""
        Analyze the consistency between different modalities in the content:

        Video Analysis: {video_analysis}
        Audio Analysis: {audio_analysis}
        Image Analysis: {image_analysis}

        Consider:
        1. Do the modalities tell a coherent story?
        2. Are there logical contradictions?
        3. Do temporal patterns align?
        4. Is the emotional content consistent?

        Rate the consistency from 0 to 1, where 1 is perfectly consistent.
        Provide detailed reasoning for your rating.

        Output format:
        Score: [0-1]
        Reasoning: [detailed explanation]
    """)

    chain = LLMChain(llm=llms["gpt4"], prompt=prompt)

    response = chain.run({
        "video_analysis": results["video"].dict(),
        "audio_analysis": results["audio"].dict(),
        "image_analysis": results["image_analysis"].dict()
    })

    # Extract score from response
    score_match = re.search(r"Score: (0\.\d+|1\.0)", response)
    if score_match:
        return float(score_match.group(1))
    return 0.5  # Default score if parsing fails

def generate_comprehensive_report(
    results: Dict[str, DeepfakeAnalysisResult],
    cross_modal_score: float,
    processed_data: Dict[str, Any]
) -> MultimodalAnalysisReport:
    """Generate detailed analysis report"""
    # Calculate final verdict
    scores = [
        results["video"].score,
        results["audio"].score,
        results["image"].score,
        cross_modal_score
    ]

    weights = [0.4, 0.2, 0.2, 0.2]  # Adjusted weights for each modality
    final_score = sum(s * w for s, w in zip(scores, weights))

    # Compile evidence
    evidence = []
    for modality, result in results.items():
        evidence.extend([
            {
                "type": modality,
                "description": anomaly,
                "confidence": result.confidence,
                "method": result.method
            }
            for anomaly in result.anomalies
        ])

    # Generate report
    return MultimodalAnalysisReport(
        case_id=f"DFD-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
        file_info=processed_data["metadata"],
        video_analysis=results["video"],
        audio_analysis=results["audio"],
        image_analysis=results["image"],
        multimodal_score=float(final_score),
        verdict="AUTHENTIC" if final_score > 0.7 else "MANIPULATED",
        evidence=evidence,
        metadata={
            "processing_time": datetime.now().isoformat(),
            "models_used": list(results.keys()),
            "cross_modal_score": cross_modal_score,
            "confidence_distribution": {
                modality: result.confidence
                for modality, result in results.items()
            }
        }
    )

def run_deepfake_detection(file_path: str, mode: str = "all") -> MultimodalAnalysisReport:
    """Main function to run enhanced deepfake detection"""
    try:
        # Initialize environment
        env = setup_enhanced_detection_environment()

        # Create detection tools
        tools = create_deepfake_detection_tools(env["models"], env["device"])

        # Create detection agent
        agent = create_detection_agent(tools, env["models"]["llms"]["gpt4"])

        # Create detection workflow
        workflow = create_detection_graph()

        # Initialize state
        initial_state = {
            "input": file_path,
            "mode": mode,
            "models": env["models"],
            "device": env["device"]
        }

        # Run workflow
        final_state = workflow.run(initial_state)

        return final_state["final_report"]

    except Exception as e:
        print(f"Error in deepfake detection: {str(e)}")
        raise

In [9]:
if __name__ == "__main__":
    # Example usage
    file_path = "path/to/your/content.mp4"
    report = run_deepfake_detection(file_path)

    # Print formatted report
    print("\nDeepfake Detection Report:")
    print(json.dumps(report.dict(), indent=2, default=str))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/22.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/104k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/9.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

config.json:   0%|          | 0.00/2.42k [00:00<?, ?B/s]




pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-large-v3 and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/549 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

qformer_tokenizer/tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

qformer_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

qformer_tokenizer/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

qformer_tokenizer/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

(…)former_tokenizer/special_tokens_map.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]




Error in deepfake detection: Can't load tokenizer for 'superb/wav2vec2-base-superb-ks'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'superb/wav2vec2-base-superb-ks' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.


OSError: Can't load tokenizer for 'superb/wav2vec2-base-superb-ks'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'superb/wav2vec2-base-superb-ks' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.