In [None]:
# Presumably a kernel smoke test; prints a fixed greeting.
print("Hello World")

In [None]:
pip install transformers torch torchvision openai

## How It Works
1. Extracts 8 evenly spaced frames from a short (about 5-second) video.
2. Encodes the frames using BLIP-2’s Vision Transformer (ViT).
3. Aggregates the embeddings with mean pooling, giving one vector per frame.
4. Sends the embeddings, serialized as text, together with the question to ChatGPT.
5. ChatGPT generates an answer based on the video context.

In [None]:
import os

import cv2
import openai  # For ChatGPT API calls
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# OpenAI API key: read from the OPENAI_API_KEY environment variable so the secret is never
# stored in the notebook. If the variable is unset this is None and the API call will fail
# with an authentication error.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")  # Or use 'blip2-flan-t5-xl' for T5-based
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Image Preprocessing Pipeline
# Normalization statistics are taken from the checkpoint's own image processor instead of
# hard-coded ImageNet values, so frames are scaled the way the vision encoder was trained.
# Bicubic resampling is used to match the BLIP image processor's resize filter.
_image_processor = processor.image_processor
transform = transforms.Compose([
    transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=_image_processor.image_mean, std=_image_processor.image_std),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Number of frame positions to sample between the first and last frame.

    Returns:
        A tensor on `device` of shape (n, 3, 224, 224), where n <= num_frames is the number
        of sampled frames that could actually be decoded. Each frame is preprocessed with
        the module-level `transform`.

    Raises:
        ValueError: If num_frames is smaller than 1.
        RuntimeError: If the video cannot be opened, reports no frames, or none of the
            sampled frames can be decoded.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            raise RuntimeError(f"Video reports no frames: {video_path}")

        frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                # Best effort: skip positions the decoder cannot read.
                continue
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
            frames.append(transform(Image.fromarray(frame)))
    finally:
        # Release the capture handle even if seeking or decoding raised.
        cap.release()

    if not frames:
        raise RuntimeError(f"No frames could be decoded from: {video_path}")

    return torch.stack(frames).to(device)  # Shape: (n, 3, 224, 224)

def get_video_embedding(video_path, num_frames=8, pool_frames=False):
    """
    Embeds sampled video frames with BLIP-2's vision encoder (ViT).

    Args:
        video_path: Path to the video file.
        num_frames: Number of frames to sample from the video.
        pool_frames: If True, additionally average over the frame axis so the whole
            video is represented by a single vector.

    Returns:
        Tensor of shape (n_frames, hidden_dim) with one mean-pooled vector per frame,
        or of shape (hidden_dim,) when pool_frames is True.
    """
    frames = extract_video_frames(video_path, num_frames=num_frames)
    # The frames are already resized and normalized by `transform`, so they are fed to the
    # vision encoder directly. Passing them through `processor` as well would rescale and
    # normalize them a second time.
    pixel_values = frames.to(dtype=model.dtype)
    with torch.no_grad():
        token_embeds = model.vision_model(pixel_values=pixel_values).last_hidden_state
        # last_hidden_state is (n_frames, n_tokens, hidden_dim): dim=1 is the token axis,
        # so this pools within each frame, not across frames.
        video_embeds = token_embeds.mean(dim=1)
        if pool_frames:
            video_embeds = video_embeds.mean(dim=0)
    return video_embeds

def generate_chatgpt_response(question, video_embeds):
    """
    Uses video embeddings as context and queries ChatGPT.

    Works with both the openai>=1.0 client interface and the legacy (<1.0)
    `openai.ChatCompletion` interface, which was removed in 1.0.

    Args:
        question: Natural-language question about the video.
        video_embeds: Tensor of video embeddings; it is serialized into the prompt as a
            nested list of floats.

    Returns:
        The text content of the first completion choice.
    """
    # NOTE(review): the embeddings are sent as plain text. A text-only chat model has no
    # decoder for ViT features, so answers are unlikely to be grounded in the video, and
    # the serialized numbers make the prompt very large - confirm this design is intended.
    video_context = video_embeds.detach().cpu().tolist()  # Convert tensor to list for API
    prompt = f"Answer this question based on the given video embeddings:\n\n{question}"

    request = {
        "model": "gpt-4-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful AI that answers video-related questions."},
            {"role": "user", "content": prompt},
            {"role": "user", "content": f"Video embedding: {video_context}"},
        ],
        "max_tokens": 100,
    }

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: requests go through a client object and responses are typed objects.
        # A None api_key makes the client fall back to the OPENAI_API_KEY environment variable.
        client = openai.OpenAI(api_key=openai.api_key)
        response = client.chat.completions.create(**request)
        return response.choices[0].message.content

    # openai<1.0: legacy module-level interface with dict-style responses.
    response = openai.ChatCompletion.create(**request)
    return response["choices"][0]["message"]["content"]

# Example usage: embed the sample clip, then ask ChatGPT a question about it.
video_path, question = "sample_video.mp4", "What is happening in this video?"

video_embeds = get_video_embedding(video_path)
answer = generate_chatgpt_response(question, video_embeds)

# Question and answer are printed on separate lines.
print(f"Q: {question}", f"A: {answer}", sep="\n")


## 2. Fine-Tuning BLIP-2 for Video Question Answering
**Objective:** Teach BLIP-2 to process egocentric video frames and answer driving-related questions.

In [None]:
# Example of one dataset record. The `...` stands for the omitted frame paths
# (frame_3.jpg to frame_7.jpg); the dataset file itself is loaded with json.load and
# indexed by position, so it must hold a list of such records.
{
    "video_id": "example_video.mp4",
    "frames": ["frame_1.jpg", "frame_2.jpg", ..., "frame_8.jpg"],
    "question": "Is the car allowed to turn right at this intersection?",
    "answer": "No, because there is a no-right-turn sign.",
}

In [None]:
import torch
import json
from transformers import Blip2Processor, Blip2ForConditionalGeneration, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Checkpoint, processor, and model used for fine-tuning.
model_name = "Salesforce/blip2-opt-2.7b"

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Load Your Dataset
class DrivingVideoQADataset(Dataset):
    """
    Question-answering samples built from pre-extracted video frames.

    The JSON file must contain a list of records, each with the keys "frames" (list of
    image paths), "question", and "answer".

    BLIP-2 consumes one image per text sequence, so the frames of a sample are tiled into
    a single grid image. Text is padded/truncated to a fixed length so that the default
    collator can stack samples into a batch.

    Args:
        json_file: Path to the JSON file with the records.
        processor: BLIP-2 processor used to encode the image and the text.
        max_length: Fixed token length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, json_file, processor, max_length=64):
        with open(json_file, "r") as f:
            self.data = json.load(f)
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _tile_frames(frames):
        """Pastes the frames, in row-major order, into one near-square grid image."""
        count = len(frames)
        cols = 1
        while cols * cols < count:
            cols += 1
        rows = -(-count // cols)  # ceiling division
        width, height = frames[0].size
        grid = Image.new("RGB", (cols * width, rows * height))
        for i, frame in enumerate(frames):
            if frame.size != (width, height):
                frame = frame.resize((width, height))
            grid.paste(frame, ((i % cols) * width, (i // cols) * height))
        return grid

    def __getitem__(self, idx):
        """
        Returns a dict of unbatched tensors: pixel_values, input_ids, attention_mask, labels.

        Raises:
            ValueError: If the record lists no frames.
        """
        sample = self.data[idx]
        frames = []
        for frame_path in sample["frames"]:
            # Close each file handle as soon as the pixels are copied out.
            with Image.open(frame_path) as img:
                frames.append(img.convert("RGB"))
        if not frames:
            raise ValueError(f"Sample {idx} has no frames")

        image = self._tile_frames(frames)
        # The language model is trained as a causal LM, so the answer must be part of the
        # input sequence and the labels must line up with input_ids token for token.
        text = f"Question: {sample['question']} Answer: {sample['answer']}"
        inputs = self.processor(
            images=image,
            text=text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        labels = inputs["input_ids"].clone()
        labels[inputs["attention_mask"] == 0] = -100  # no loss on padding positions
        # NOTE(review): question tokens (and any image placeholder tokens the processor
        # inserts) still contribute to the loss; mask them too if answer-only loss is wanted.
        inputs["labels"] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}  # Remove batch dim

# Initialize Dataset
dataset = DrivingVideoQADataset("driving_video_qa.json", processor)

# Hold out roughly 10% of the samples for evaluation so the eval loss is not measured on
# the training data. The generator is seeded to make the split reproducible. With fewer
# than two samples no split is possible and the full dataset is used for both.
eval_size = max(1, len(dataset) // 10) if len(dataset) > 1 else 0
if eval_size:
    train_dataset, eval_dataset = torch.utils.data.random_split(
        dataset,
        [len(dataset) - eval_size, eval_size],
        generator=torch.Generator().manual_seed(42),
    )
else:
    train_dataset = eval_dataset = dataset

# Newer transformers releases name this argument `eval_strategy`; older ones only know
# `evaluation_strategy`. Pick whichever the installed version defines.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Training Configuration
training_args = TrainingArguments(
    output_dir="./blip2_driving",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir="./logs",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    **{_eval_strategy_key: "epoch"},
)

# Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()