<a href="https://colab.research.google.com/github/Kavish1504/SceneSense/blob/main/SceneSense.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Video Understanding AI System
A complete implementation using pre-trained models for video captioning and Q&A
"""

import cv2
import numpy as np
from PIL import Image
import base64
from io import BytesIO
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
import torch

class VideoUnderstandingSystem:
    """Describe videos and answer questions about them using BLIP models.

    Frames are sampled from the video with OpenCV, each frame is passed to a
    BLIP captioning or BLIP VQA model, and the per-frame outputs are
    aggregated into a human-readable text report.
    """

    def __init__(self):
        """Initialize the video understanding system with BLIP models"""
        print("Loading BLIP models...")

        # Processor/model pair used for unconditional image captioning.
        self.caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

        # Processor/model pair used for visual question answering.
        self.vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        self.vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

        # Run on the GPU when torch reports one, otherwise on the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.caption_model.to(self.device)
        self.vqa_model.to(self.device)
        print(f"Models loaded on {self.device}")

    def extract_frames(self, video_path, num_frames=8, method='uniform'):
        """Sample frames from a video and return them as RGB PIL images.

        Args:
            video_path: Path passed to ``cv2.VideoCapture``.
            num_frames: Number of sampling positions spread across the video.
                Coerced to ``int``. When the video has fewer frames than this,
                each available position is read only once.
            method: Sampling strategy. Only ``'uniform'`` is implemented.

        Returns:
            A ``(frames, duration)`` tuple. ``frames`` is a list of PIL images
            (empty when nothing could be read); ``duration`` is the frame
            count divided by the reported FPS, or 0 when either is not
            positive.

        Raises:
            ValueError: If ``method`` is not ``'uniform'``.
        """
        if method != 'uniform':
            # Previously an unknown method silently produced zero frames.
            raise ValueError(f"Unsupported frame extraction method: {method!r}")

        # UI sliders may deliver floats; np.linspace needs an integer count.
        num_frames = int(num_frames)

        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            duration = total_frames / fps if fps > 0 and total_frames > 0 else 0

            if total_frames > 0:
                # Extract frames uniformly across the video. np.unique drops
                # repeated positions so short videos are not decoded twice.
                frame_indices = np.unique(
                    np.linspace(0, total_frames - 1, num_frames, dtype=int)
                )
                for idx in frame_indices:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                    ret, frame = cap.read()
                    if ret:
                        # OpenCV decodes to BGR; PIL and BLIP expect RGB.
                        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frames.append(Image.fromarray(frame_rgb))
        finally:
            # Always free the capture handle, even if decoding raised.
            cap.release()

        return frames, duration

    def generate_frame_captions(self, frames):
        """Generate one caption per frame with the BLIP captioning model.

        Args:
            frames: Iterable of images accepted by the caption processor.

        Returns:
            List of caption strings, in the same order as ``frames``.
        """
        captions = []
        for frame in frames:
            # Unconditional captioning
            inputs = self.caption_processor(frame, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.caption_model.generate(**inputs, max_length=50)

            caption = self.caption_processor.decode(outputs[0], skip_special_tokens=True)
            captions.append(caption)

        return captions

    def answer_frame_questions(self, frames, question):
        """Ask the BLIP VQA model the same question about every frame.

        Args:
            frames: Iterable of images accepted by the VQA processor.
            question: Question text applied to each frame.

        Returns:
            List of answer strings, in the same order as ``frames``.
        """
        answers = []
        for frame in frames:
            # Process image and question for VQA
            inputs = self.vqa_processor(frame, question, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.vqa_model.generate(**inputs, max_length=50)

            answer = self.vqa_processor.decode(outputs[0], skip_special_tokens=True)
            answers.append(answer)

        return answers

    def summarize_video(self, captions, duration):
        """Build a text summary from per-frame captions.

        Consecutive identical captions are collapsed to a single entry.

        Args:
            captions: List of caption strings in frame order.
            duration: Video duration in seconds, shown in the header.

        Returns:
            The formatted summary. When ``captions`` is empty, the header is
            followed by a notice instead of a description.
        """
        summary = f"📹 Video Summary (Duration: {duration:.1f}s)\n\n"

        # Remove duplicate consecutive captions
        unique_captions = [
            cap for i, cap in enumerate(captions)
            if i == 0 or cap != captions[i - 1]
        ]

        if not unique_captions:
            # No frames were captioned (e.g. unreadable video); indexing
            # unique_captions[0] below would raise IndexError.
            return summary + "⚠️ No frames could be analyzed, so no description is available."

        parts = [summary, "Key Observations:\n"]
        parts.extend(f"{i}. {caption}\n" for i, caption in enumerate(unique_captions, 1))

        # Generate overall description
        parts.append("\n🎬 Overall Description:\n")
        parts.append(f"This video appears to show {unique_captions[0].lower()}")

        if len(unique_captions) > 1:
            parts.append(
                f", transitioning through scenes including {', '.join(unique_captions[1:]).lower()}"
            )

        parts.append(".")
        return "".join(parts)

    def answer_question(self, video_path, question, num_frames=8):
        """Answer a question about a video by majority vote over frames.

        Args:
            video_path: Path of the video to analyze.
            question: Question asked about every sampled frame.
            num_frames: Number of frames to sample.

        Returns:
            A formatted report listing each frame's answer and the most
            common answer. When no frames could be read, the report says so
            instead of raising.
        """
        from collections import Counter

        print(f"Analyzing video to answer: {question}")

        # Extract frames (the duration is not used in Q&A mode)
        frames, _ = self.extract_frames(video_path, num_frames)

        # Generate answers using VQA model
        answers = self.answer_frame_questions(frames, question)

        # Aggregate results
        parts = [
            f"🤔 Question: {question}\n\n",
            f"📊 Analysis of {len(frames)} frames:\n\n",
        ]

        if not answers:
            # Counter.most_common(1)[0] would raise IndexError on no answers.
            parts.append("⚠️ No frames could be read from the video, so the question cannot be answered.")
            return "".join(parts)

        parts.extend(f"Frame {i}: {answer}\n" for i, answer in enumerate(answers, 1))

        # Find most common answer
        answer_counts = Counter(answers)
        most_common_answer = answer_counts.most_common(1)[0][0]
        parts.append("\n✅ Most Consistent Answer:\n")
        parts.append(f"{most_common_answer.capitalize()}\n\n")

        if len(answer_counts) > 1:
            parts.append(
                f"💡 Note: The answer varied across frames. The most common answer appeared in {answer_counts[most_common_answer]} out of {len(frames)} frames.\n"
            )
            others = [ans for ans in answer_counts if ans != most_common_answer]
            parts.append(f"\nOther answers observed: {', '.join(others)}")

        return "".join(parts)

    def process_video(self, video_path, task="describe", question=None, num_frames=8):
        """Run the requested task on a video and return a text report.

        Args:
            video_path: Path of the video to analyze.
            task: ``"describe"`` for a caption-based summary or ``"qa"`` to
                answer ``question``.
            question: Question text; required (non-blank) when task is "qa".
            num_frames: Number of frames to sample.

        Returns:
            The report string. Invalid input and processing failures are
            reported as message strings rather than raised, so the result can
            be shown directly in a UI.
        """
        import traceback

        if task not in ("describe", "qa"):
            # Previously an unknown task fell through and returned None.
            return f"⚠️ Unknown task '{task}'. Choose 'describe' or 'qa'."

        if task == "qa" and not (question and str(question).strip()):
            return "⚠️ Please provide a question for Q&A mode"

        try:
            if task == "describe":
                print(f"Extracting {num_frames} frames...")
                frames, duration = self.extract_frames(video_path, num_frames)

                print("Generating captions...")
                captions = self.generate_frame_captions(frames)

                return self.summarize_video(captions, duration)

            return self.answer_question(video_path, question, num_frames)

        except Exception as e:
            # Top-level boundary for the UI: report the failure as text and
            # include the real traceback under the "Stack trace" label.
            return f"Error processing video: {str(e)}\n\nStack trace: {traceback.format_exc()}"


# Create Gradio Interface
def create_interface():
    """Build and return the Gradio Blocks app for the video system.

    A single VideoUnderstandingSystem is created up front and shared by all
    button clicks. The returned Blocks object is not launched here.
    """

    system = VideoUnderstandingSystem()

    def process_wrapper(video, task, question, num_frames):
        # Forward the UI values to the system once a video has been provided.
        if video is not None:
            return system.process_video(video, task, question, num_frames)
        return "⚠️ Please upload a video file"

    # Static Markdown shown above and below the controls.
    intro_markdown = """
        # 🎥 Video Understanding AI System
        ### Upload a video and get AI-powered descriptions or ask questions about it

        **Features:**
        - 📝 Automatic video summarization
        - 🤔 Video Question Answering
        - 🎯 Frame-by-frame analysis
        """

    footer_markdown = """
        ### 💡 Tips:
        - Use **describe** mode for general video understanding
        - Use **qa** mode with specific questions like "What color is the car?" or "How many people are visible?"
        - More frames = better understanding but slower processing
        - Works best with short videos (< 1 minute)

        ### 🔧 Technical Details:
        - **Captioning Model**: BLIP Image Captioning
        - **VQA Model**: BLIP Visual Question Answering
        - **Frame Extraction**: OpenCV with uniform sampling
        - **Processing**: CPU/GPU (automatic detection)

        ### 🆕 What's Fixed:
        - Now uses separate models for captioning vs. question answering
        - VQA model properly answers questions instead of repeating them
        - Aggregates answers across frames for more reliable results
        """

    # Assemble the layout: inputs in the left column, result on the right.
    with gr.Blocks(title="Video Understanding AI System", theme=gr.themes.Soft()) as demo:
        gr.Markdown(intro_markdown)

        with gr.Row():
            with gr.Column(scale=1):
                video_box = gr.Video(label="Upload Video")

                task_radio = gr.Radio(
                    choices=["describe", "qa"],
                    value="describe",
                    label="Task",
                    info="Choose 'describe' for summary or 'qa' for questions",
                )

                question_box = gr.Textbox(
                    label="Question (for Q&A mode)",
                    placeholder="e.g., What color is the liquid? What is the person doing?",
                    lines=2,
                )

                frames_slider = gr.Slider(
                    minimum=4,
                    maximum=16,
                    value=8,
                    step=1,
                    label="Number of Frames to Analyze",
                    info="More frames = more detail but slower processing",
                )

                run_button = gr.Button("🚀 Process Video", variant="primary")

            with gr.Column(scale=1):
                result_box = gr.Textbox(
                    label="Result",
                    lines=20,
                    placeholder="Results will appear here...",
                )

        gr.Markdown(footer_markdown)

        # Wire the button: the four inputs map onto process_wrapper's arguments.
        click_inputs = [video_box, task_radio, question_box, frames_slider]
        run_button.click(fn=process_wrapper, inputs=click_inputs, outputs=result_box)

    return demo


# Main execution
if __name__ == "__main__":
    # Startup banner.
    banner_rule = "=" * 60
    print(banner_rule, "VIDEO UNDERSTANDING AI SYSTEM - FIXED VERSION", banner_rule, sep="\n")
    print("\nInitializing...")

    # Build the Gradio app and serve it with a public share link.
    demo = create_interface()
    demo.launch(share=True)

    # Closing hints for the user, printed once launch() returns.
    for closing_line in (
        "\n✅ System ready! Open the link above to use the interface.",
        "📝 Upload a video and start analyzing!",
    ):
        print(closing_line)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


VIDEO UNDERSTANDING AI SYSTEM - FIXED VERSION

Initializing...
Loading BLIP models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Models loaded on cpu


  with gr.Blocks(title="Video Understanding AI System", theme=gr.themes.Soft()) as demo:


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d1929b32b28815ca1f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



✅ System ready! Open the link above to use the interface.
📝 Upload a video and start analyzing!
