In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# NOTE(review): kagglehub.login() presumably asks for Kaggle credentials so the
# private sources requested in the next cell can be fetched -- confirm against
# the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition, dataset and model sources this notebook was built
# against. Each call's return value is kept in a *_path variable (presumably a
# local directory path -- confirm against the kagglehub documentation).
# None of these variables is read again in the visible code: main() opens its
# inputs from hard-coded /kaggle/input/deception-data/... paths instead.
gemini_long_context_path = kagglehub.competition_download('gemini-long-context')
kane0068_deception_data_path = kagglehub.dataset_download('kane0068/deception-data')
google_gemini_1_5_flash_api_api_gemini_1_5_flash_1_path = kagglehub.model_download('google/gemini-1.5-flash-api/Api/gemini-1.5-flash/1')

print('Data source import complete.')


# 🕵️‍♂️ MediaAnalyzer: Truth in the Age of Misinformation

## 🤖 Powered by Google Gemini 1.5
## 📊 Multi-Modal Media Verification
## 🛡️ Deception Detection Platform

# **MediaAnalyzer: The Truth Detection Revolution**
- In today's digital age, we're drowning in information. But how much of it can we truly trust?
- Misinformation is everywhere. Fake videos, doctored texts, misleading audio clips - they're not just annoying, they're dangerous. Individuals, businesses, and entire societies are at risk.
- Meet MediaAnalyzer - the world's most advanced AI-powered media verification platform. We don't just detect deception; we dissect it.


## **1. Video Analysis**

**Our advanced video analysis goes beyond the surface level. We track:**

- Exact transcriptions with speaker identification
- Behavioral micro-signals
- Physiological response tracking
- Deception probability assessment

## **2. Text Verification**

**Textual content isn't safe from scrutiny. MediaAnalyzer:**

- Identifies factual inaccuracies
- Highlights grammatical errors
- Reveals logical inconsistencies
- Provides precise corrections

## **3. Audio Forensics**

**Even audio isn't beyond our reach. We provide:**

- Complete transcriptions
- Statement verifications
- Contextual insights
- Speaker bias detection

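- While traditional AI models struggle with 32,000 tokens, Gemini 1.5 processes up to 2 million tokens (on Gemini 1.5 Pro; the Gemini 1.5 Flash model used in this notebook accepts up to 1 million). Two million tokens is equivalent to: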

    - 100,000 lines of code
    - 10 years of text messages
    - 16 complete novels

  
- Powered by Google's Gemini AI, our platform uses multiple sophisticated analysis layers. We don't just scan - we understand.

- Complete transparency with detailed token usage reports: see exactly how many input and output tokens each analysis prompt consumed, down to the last token.

# **In a world of deepfakes, misinformation, and digital manipulation, MediaAnalyzer is your guardian of truth.**

In [None]:
import os
import time
import json
from typing import List, Dict, Any, Tuple ,Optional
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient

class MediaAnalyzer:
    """Multi-step Gemini analysis of video, text and audio files.

    Each ``analyze_*`` method opens a chat session on the configured model,
    sends a fixed sequence of prompts (the media file is attached to the first
    prompt only) and returns a dict mapping an upper-cased step name to the
    non-empty lines of the model's reply. Failures are reported as a dict with
    an ``"error"`` key instead of being raised.

    Token counts reported with each response are accumulated in
    ``token_stats`` and can be rendered with ``generate_report``.
    """

    # Seconds to sleep between File API polls while an upload is processing.
    _POLL_INTERVAL_SECONDS = 2

    def __init__(self, api_key: str):
        """Configure the Gemini client and zero the usage counters.

        Args:
            api_key: API key handed to ``genai.configure``.
        """
        genai.configure(api_key=api_key)
        self.generation_config = {
            "temperature": 0.1,
            "top_p": 1.0,
            "top_k": 32,
            "max_output_tokens": 4096,
        }
        self.model = genai.GenerativeModel(
            model_name="gemini-1.5-flash",
            generation_config=self.generation_config,
        )

        # Local file path -> File object returned by its last successful upload.
        self.uploaded_files = {}
        # Number of responses whose usage metadata has been recorded; kept in
        # step with token_stats['summary']['total_prompts'].
        self.total_prompts = 0
        self.token_stats = {
            "summary": {
                "total_prompts": 0,
                "total_input_tokens": 0,
                "total_output_tokens": 0,
                "total_tokens": 0
            },
            "details": []
        }

    def _track_token_usage(self, response):
        """Record the token counts attached to one model response.

        Best-effort bookkeeping: a response without a ``usage_metadata``
        attribute is ignored, and any unexpected error is printed rather than
        raised so that accounting can never abort an analysis.

        Args:
            response: Object returned by ``chat.send_message``.
        """
        try:
            if not hasattr(response, 'usage_metadata'):
                return

            usage = response.usage_metadata
            input_tokens = getattr(usage, 'prompt_token_count', 0)
            output_tokens = getattr(usage, 'candidates_token_count', 0)
            total_tokens = getattr(usage, 'total_token_count', 0)

            # Running totals across every tracked response.
            summary = self.token_stats['summary']
            summary['total_prompts'] += 1
            summary['total_input_tokens'] += input_tokens
            summary['total_output_tokens'] += output_tokens
            summary['total_tokens'] += total_tokens

            # Per-response breakdown, in the order the prompts were sent.
            self.token_stats['details'].append({
                'input_tokens': input_tokens,
                'output_tokens': output_tokens,
                'total_tokens': total_tokens
            })

            if summary['total_prompts'] > 0:
                summary['average_tokens_per_prompt'] = (
                    summary['total_tokens'] / summary['total_prompts']
                )

            # The attribute used to stay at 0 forever; mirror the summary count.
            self.total_prompts = summary['total_prompts']

            print(f"Token Usage - Input: {input_tokens}, Output: {output_tokens}, Total: {total_tokens}")

        except Exception as e:
            print(f"Error tracking token usage: {e}")

    def _validate_file(self, file_path: str, expected_mime_type: str) -> bool:
        """Check that ``file_path`` is a regular file of the expected type.

        The type check is based on the file extension only (.mp4, .txt, .mp3);
        file contents are not inspected.

        Args:
            file_path: Path of the file to check.
            expected_mime_type: MIME type the extension must map to.

        Returns:
            True if the file exists and its extension maps to
            ``expected_mime_type``; False otherwise (a message is printed).
        """
        # isfile() rather than exists(): a directory that happens to be named
        # "clip.mp4" must not be accepted as an uploadable file.
        if not os.path.isfile(file_path):
            print(f"Error: File not found - {file_path}")
            return False

        mime_map = {
            '.mp4': 'video/mp4',
            '.txt': 'text/plain',
            '.mp3': 'audio/mpeg'
        }
        extension = os.path.splitext(file_path)[1].lower()

        if mime_map.get(extension) != expected_mime_type:
            print(f"Error: Invalid file type. Expected {expected_mime_type}")
            return False

        return True

    def _upload_file_with_retry(self, file_path: str, mime_type: str, max_retries: int = 3) -> Optional["genai.types.File"]:
        """Validate and upload a file, retrying with exponential backoff.

        Args:
            file_path: Local path of the file to upload.
            mime_type: MIME type sent with the upload; also used to validate
                the file extension.
            max_retries: Maximum number of upload attempts.

        Returns:
            The uploaded File object, or None if validation failed or every
            attempt raised.
        """
        if not self._validate_file(file_path, mime_type):
            return None

        for attempt in range(max_retries):
            try:
                print(f"Upload attempt {attempt + 1}/{max_retries}")
                file = genai.upload_file(file_path, mime_type=mime_type)
            except Exception as e:
                print(f"Upload attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
                else:
                    print("Max retries reached. Upload failed.")
                    return None
            else:
                # Short grace period before the file is used; analyze_video
                # additionally polls the processing state.
                time.sleep(2)
                self.uploaded_files[file_path] = file
                return file

        # Only reached when max_retries <= 0 (no attempt was made).
        return None

    def _wait_until_processed(self, uploaded_file):
        """Poll the File API until ``uploaded_file`` leaves the PROCESSING state.

        Args:
            uploaded_file: File object whose ``name`` identifies the upload.

        Returns:
            The most recently fetched File object.

        Raises:
            ValueError: If the file ends up in the FAILED state.
        """
        started = time.time()
        current = genai.get_file(uploaded_file.name)
        while current.state.name == "PROCESSING":
            elapsed = time.time() - started
            print(f"\rProcessing... {elapsed:.1f}s", end="", flush=True)
            time.sleep(self._POLL_INTERVAL_SECONDS)
            current = genai.get_file(current.name)

        # Leaving PROCESSING does not imply success; a failed file must not be
        # reported as ready and sent to the model.
        if current.state.name == "FAILED":
            print()  # finish the in-place progress line
            raise ValueError(f"File processing failed for {current.name}")
        return current

    @staticmethod
    def _underscored_key(prompt_name: str) -> str:
        """Return ``prompt_name`` upper-cased with spaces replaced by underscores."""
        return prompt_name.replace(" ", "_").upper()

    def _run_prompt_sequence(self, media_file, analysis_prompts, key_for) -> Dict[str, Any]:
        """Send a prompt sequence through one chat session and collect replies.

        Args:
            media_file: Uploaded file attached to the first prompt only; later
                prompts rely on the chat history.
            analysis_prompts: Ordered ``(step name, prompt text)`` pairs.
            key_for: Callable turning a step name into its result-dict key.

        Returns:
            Dict mapping each step's key to the parsed lines of its reply.
        """
        chat = self.model.start_chat()
        combined_analysis = {}

        for index, (prompt_name, prompt) in enumerate(analysis_prompts):
            print(f"Processing {prompt_name}...")

            message = [prompt, media_file] if index == 0 else prompt
            response = chat.send_message(message)

            self._track_token_usage(response)

            combined_analysis[key_for(prompt_name)] = self._parse_response(response.text)
            print(f"{prompt_name} analysis complete.")

        return combined_analysis

    def analyze_media(self, file_path: str, media_type: str) -> Dict[str, Any]:
        """Upload a file and run the analysis matching ``media_type``.

        Args:
            file_path: Local path of the media file.
            media_type: One of "video", "text" or "audio".

        Returns:
            The analysis dict of the matching ``analyze_*`` method, or a dict
            with an "error" key if the type is unsupported, the upload failed,
            or an exception was raised.
        """
        # media type -> (MIME type used for upload, analysis method)
        handlers = {
            "video": ("video/mp4", self.analyze_video),
            "text": ("text/plain", self.analyze_text),
            "audio": ("audio/mpeg", self.analyze_audio),
        }

        if media_type not in handlers:
            return {"error": f"Unsupported media type: {media_type}"}

        mime_type, analyze = handlers[media_type]

        try:
            file_obj = self._upload_file_with_retry(file_path, mime_type)
            if not file_obj:
                return {"error": "File upload failed"}

            return analyze(file_obj)

        except Exception as e:
            return {
                "error": f"Analysis error: {str(e)}",
                "file_path": file_path,
                "media_type": media_type
            }

    def analyze_video(self, video_file: "genai.types.File") -> Dict[str, Any]:
        """Analyze video for potential deception indicators.

        Waits until the uploaded video has finished processing, then runs the
        five-step prompt sequence below.

        Args:
            video_file: Uploaded video File object.

        Returns:
            Dict keyed by the upper-cased step name (spaces are kept, e.g.
            "FINAL ASSESSMENT"; the text and audio analyses use underscores
            instead, and both forms are kept as-is so existing reports stay
            comparable), or ``{"error": ...}`` on any failure, including a
            video whose processing failed.
        """
        try:
            # Verify file is ready and processed
            print("Processing video...")
            self._wait_until_processed(video_file)
            print("\nVideo processing complete!")

            # Sequence of analysis prompts
            analysis_prompts = [
                ("Transcription", """
                Analyze this video and provide:
                1. Exact transcription of all spoken words with timestamps
                2. Speaker identification (name or description)
                3. Basic scene description

                Format as:
                TIMESTAMP: [SPEAKER]: [SPOKEN TEXT]
                """),
                ("Behavioral Analysis", """
                Based on the video, analyze these specific behavioral indicators:

                1. Eye Movement Patterns:
                - Direction of gaze
                - Blink rate
                - Eye contact duration
                - Pupil dilation

                2. Facial Expressions:
                - Micro-expressions
                - Emotion displays
                - Asymmetrical expressions
                - Timing of expressions

                3. Body Language:
                - Hand gestures
                - Body positioning
                - Tension indicators
                - Movement patterns

                Provide timestamps and detailed descriptions for each observation.
                """),
                ("Statement Analysis", """
                Analyze the truthfulness of statements in the video:

                For each major statement:
                1. Note the exact statement and timestamp
                2. Analyze for:
                   - Internal consistency
                   - Logical coherence
                   - Verifiable facts
                   - Contradictions
                3. If false, provide the correct information
                4. Rate confidence in analysis (0-100%)

                Focus especially on:
                - Qualifying language
                - Changes in detail level
                - Emotional congruence
                - Response latency
                """),
                ("Physiological Analysis", """
                Analyze physiological indicators of potential deception:

                1. Voice Analysis:
                - Pitch changes
                - Speech rate variations
                - Voice tremors
                - Volume changes

                2. Physical Responses:
                - Breathing patterns
                - Swallowing frequency
                - Facial color changes
                - Muscle tension

                Provide specific timestamps and descriptions.
                """),
                ("Final Assessment", """
                Based on all previous analyses, provide:
                1. Overall deception probability (0-100%)
                2. List of definitely true statements
                3. List of potentially deceptive statements
                4. Confidence rating in the analysis
                5. Any limitations or caveats
                """)
            ]

            return self._run_prompt_sequence(video_file, analysis_prompts, str.upper)

        except Exception as e:
            error_msg = f"Analysis error: {str(e)}"
            print(error_msg)
            return {"error": error_msg}

    def analyze_text(self, text_file: "genai.types.File") -> Dict[str, Any]:
        """Analyze a text file for incorrect statements and provide corrections.

        Args:
            text_file: Uploaded text File object.

        Returns:
            Dict with keys "CONTENT_ANALYSIS", "VERIFICATION" and
            "CONTEXTUAL_INSIGHTS", or ``{"error": ...}`` on any failure.
        """
        try:
            # Analysis prompts with increasing depth
            analysis_prompts = [
                ("Content Analysis", """
                Please analyze this text document and:
                1. Identify any factually incorrect statements
                2. Point out grammatical errors
                3. Highlight logical inconsistencies
                4. Provide corrections for each issue found

                Format your response as:
                ISSUE TYPE: [Original Text] -> [Correction]
                EXPLANATION: [Why this needs correction]
                """),
                ("Verification", """
                For each correction suggested, please:
                1. Rate confidence in the correction (0-100%)
                2. Provide source or reasoning for the correction
                3. Note if any alternative interpretations are possible
                """),
                ("Contextual Insights", """
                Provide additional context about:
                1. The writing style
                2. Potential author biases
                3. Cultural or historical context that might influence the text
                4. Any subtle nuances or implied meanings
                """)
            ]

            return self._run_prompt_sequence(text_file, analysis_prompts, self._underscored_key)

        except Exception as e:
            return {"error": str(e)}

    def _parse_response(self, response_text: str) -> List[str]:
        """Split a model reply into its stripped, non-empty lines.

        Args:
            response_text: Raw reply text.

        Returns:
            The non-empty lines with surrounding whitespace removed, or a
            single-element placeholder list when the reply is empty or
            contains only whitespace.
        """
        if not response_text:
            return ["No data available"]

        stripped = (line.strip() for line in response_text.split('\n'))
        lines = [line for line in stripped if line]

        if not lines:
            return ["No meaningful data detected"]

        return lines

    def analyze_audio(self, audio_file: "genai.types.File") -> Dict[str, Any]:
        """Analyze audio file for incorrect statements and provide corrections.

        Args:
            audio_file: Uploaded audio File object.

        Returns:
            Dict with keys "TRANSCRIPTION", "ERROR_ANALYSIS" and
            "QUALITY_ASSURANCE", or ``{"error": ...}`` on any failure.
        """
        try:
            # Analysis prompts with increasing depth
            analysis_prompts = [
                ("Transcription", """
                Please analyze this audio file and provide:
                1. Complete transcription with timestamps
                2. Identify any incorrect statements or claims
                3. Note any unclear or ambiguous passages

                Format as:
                TIMESTAMP: [SPEAKER]: [SPOKEN TEXT]
                ISSUES: [List any problems found]
                """),
                ("Error Analysis", """
                For each identified issue, please provide:
                1. The incorrect statement
                2. The correct information
                3. Confidence level in the correction (0-100%)
                4. Context or explanation for why it's incorrect
                """),
                ("Quality Assurance", """
                Please verify:
                1. Are there any cultural or contextual factors that might affect interpretation?
                2. Could any statements be correct in different contexts?
                3. Are there any ambiguous statements that need clarification?
                4. Provide insights into speaker's tone, emphasis, and potential biases
                """)
            ]

            return self._run_prompt_sequence(audio_file, analysis_prompts, self._underscored_key)

        except Exception as e:
            return {"error": str(e)}

    def generate_report(self) -> str:
        """Generate a detailed token usage report.

        Returns:
            A multi-line string with the accumulated totals followed by one
            section per tracked response, in the order they were recorded.
        """
        summary = self.token_stats['summary']
        sections = [
            "\n"
            "TOKEN USAGE REPORT\n"
            "=================\n"
            f"Total Prompts: {summary['total_prompts']}\n"
            f"Total Input Tokens: {summary['total_input_tokens']:,}\n"
            f"Total Output Tokens: {summary['total_output_tokens']:,}\n"
            f"Total Tokens: {summary['total_tokens']:,}\n"
            # The average only exists once at least one response was tracked.
            f"Average Tokens per Prompt: {summary.get('average_tokens_per_prompt', 0):.2f}\n"
            "\n"
            "Detailed Token Breakdown:\n"
        ]

        for i, detail in enumerate(self.token_stats['details'], 1):
            sections.append(
                f"\nPrompt {i}:\n"
                f"  Input Tokens: {detail['input_tokens']:,}\n"
                f"  Output Tokens: {detail['output_tokens']:,}\n"
                f"  Total Tokens: {detail['total_tokens']:,}\n"
            )

        return "".join(sections)



def main():
    """Run the video, text and audio analyses and save one report per file.

    Reads the Gemini API key from Kaggle secrets, analyses each configured
    media file, writes every successful analysis as JSON to
    ``<media_type>_analysis_report_<timestamp>.txt`` in the current directory,
    then prints a per-file summary and the accumulated token usage report.

    Any exception is caught, reported with a traceback, and not re-raised.
    """
    try:
        print("Initializing system...")
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("GEMINI_API_KEY")

        analyzer = MediaAnalyzer(api_key)
        print("Initialization complete!")

        # Define media files to process
        media_files = {
            "video": "/kaggle/input/deception-data/video.mp4",
            "text": "/kaggle/input/deception-data/fake_text.txt",
            "audio": "/kaggle/input/deception-data/academic_misconceptions.mp3"
        }

        results = {}

        # Process each media file
        for media_type, file_path in media_files.items():
            if not os.path.exists(file_path):
                # Record the miss so it shows up in the summary instead of the
                # media type silently disappearing from the output.
                message = f"File not found - {file_path}"
                print(f"\nSkipping {media_type}: {message}")
                results[media_type] = {"error": message}
                continue

            print(f"\nProcessing {media_type}...")
            start_time = time.time()

            analysis = analyzer.analyze_media(file_path, media_type)
            results[media_type] = analysis

            if "error" in analysis:
                print(f"Analysis failed: {analysis['error']}")
                continue

            # Generate and save report
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            report_filename = f"{media_type}_analysis_report_{timestamp}.txt"

            with open(report_filename, "w", encoding='utf-8') as f:
                # The file is UTF-8, so keep non-ASCII transcript text readable
                # instead of escaping it to \uXXXX sequences.
                json.dump(analysis, f, indent=2, ensure_ascii=False)

            print(f"Analysis completed in {time.time() - start_time:.2f} seconds")
            print(f"Report saved to: {report_filename}")

        # Print summary
        print("\nAnalysis Summary:")
        for media_type, analysis in results.items():
            status = "Success" if "error" not in analysis else f"Failed: {analysis['error']}"
            print(f"{media_type.capitalize()}: {status}")

        # Token usage is tracked for every prompt; surface it to the user.
        print(analyzer.generate_report())

    except Exception as e:
        print(f"\nCritical error: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Wall-clock timing of the complete run, including initialization.
    start_time = time.time()
    main()
    end_time = time.time()
    total_elapsed = end_time - start_time
    print(f"\nTotal execution time: {total_elapsed:.2f} seconds")

In [None]:
import os
import json
from rich.console import Console
from rich.table import Table
############################################################
def print_report(filename):
    """Pretty-print one JSON report from /kaggle/working using Rich.

    The file is expected to hold a JSON object mapping section names to a
    list, a dict or a scalar. Lists are shown as a single-column table, dicts
    as a Key/Value table, and anything else as plain text. Problems reading or
    interpreting the file are printed in red; nothing is raised for them.

    Args:
        filename: Name of the report file inside /kaggle/working.
    """
    # Local import keeps this function self-contained. Report text is model
    # output and may contain square brackets, which Rich would otherwise parse
    # as markup tags (dropping text, or raising on an unmatched closing tag).
    from rich.markup import escape

    file_path = os.path.join('/kaggle/working', filename)
    console = Console()

    # Read first and close the file before doing any rendering.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = json.load(file)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        console.print(f"[red]Error reading {escape(filename)}: {escape(str(e))}[/red]")
        return

    if not isinstance(content, dict):
        console.print(
            f"[red]Error reading {escape(filename)}: expected a JSON object, "
            f"got {type(content).__name__}[/red]"
        )
        return

    console.rule(f"[bold blue]{escape(filename.upper())}[/bold blue]")

    for section, details in content.items():
        section_title = escape(str(section))
        console.print(f"\n[bold green]{section_title}[/bold green]")

        if isinstance(details, list):
            table = Table(show_header=False)
            for item in details:
                table.add_row(escape(str(item)))
            console.print(table)

        elif isinstance(details, dict):
            table = Table(title=section_title)
            table.add_column("Key")
            table.add_column("Value")
            for key, value in details.items():
                table.add_row(escape(str(key)), escape(str(value)))
            console.print(table)

        else:
            console.print(escape(str(details)))
##########################################################################################

# Gather every .txt entry of the working directory, then render each one.
output_files = []
for entry in os.listdir('/kaggle/working'):
    if entry.endswith('.txt'):
        output_files.append(entry)

for filename in output_files:
    print_report(filename)