In [3]:
from ultralytics import YOLO

def download_and_load_yolo(
    weights='yolov8n.pt',
    url='https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt',
):
    """Load a YOLO model, fetching the weights by hand if loading fails.

    Args:
        weights: Name or path of the weights file handed to ``YOLO``. It is
            also the destination of the manual download.
        url: Location the weights are fetched from when ``YOLO`` raises
            ``FileNotFoundError``.

    Returns:
        The loaded ``YOLO`` model.
    """
    try:
        print("Attempting to load YOLO model...")
        model = YOLO(weights)
    except FileNotFoundError:
        print("Model not found locally, downloading...")
        # Alternative download method. torch is imported here because this
        # cell does not import it at the top.
        import torch
        torch.hub.download_url_to_file(url, weights)
        print("Download completed, loading model...")
        model = YOLO(weights)

    # Reported for both paths so the log always ends with a clear outcome.
    print("Model loaded successfully!")
    return model

# Load the YOLO model once and keep it in the notebook-level name `model`.
model = download_and_load_yolo()

Attempting to load YOLO model...
Model loaded successfully!


In [10]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
import supervision as sv
from collections import defaultdict
import tkinter as tk
from tkinter import ttk
import datetime
import argparse
import os

In [12]:
class TimeSelector:
    """Tk dialog for picking a start and end time inside a video.

    The two entry fields are pre-filled with ``00:00:00`` and the duration of
    the first video. Call :meth:`get_times` to show the dialog and read the
    selection back in seconds.
    """

    def __init__(self, video_path1, video_path2):
        """Read the duration of ``video_path1`` and build the dialog.

        Args:
            video_path1: Video whose duration is shown and used as the default
                end time.
            video_path2: Kept for interface compatibility; it is not read here.

        Raises:
            ValueError: If ``video_path1`` cannot be opened.
        """
        # Probe the video before any window exists, so a bad path does not
        # leave an orphaned Tk root behind.
        self.duration = self._probe_duration(video_path1)

        self.root = tk.Tk()
        self.root.title("Video Time Selector")
        # The default close-button handler destroys the root, which would make
        # the destroy() call in get_times() fail. Treat closing like "OK".
        self.root.protocol("WM_DELETE_WINDOW", self.root.quit)

        # Variables to store selected times (bound explicitly to this root)
        self.start_time = tk.StringVar(self.root, value="00:00:00")
        self.end_time = tk.StringVar(self.root, value=self._format_time(self.duration))

        self._create_widgets()

    @staticmethod
    def _probe_duration(video_path):
        """Return the duration of ``video_path`` in whole seconds.

        Returns 0 when OpenCV reports no usable frame rate or frame count.

        Raises:
            ValueError: If the file cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Error opening video file: {video_path}")
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            cap.release()

        # Guard the division: a zero frame rate would raise ZeroDivisionError.
        if fps <= 0 or frame_count <= 0:
            return 0
        return int(frame_count / fps)

    def _format_time(self, seconds):
        """Convert seconds to a zero-padded HH:MM:SS string.

        Hours are not wrapped at 24, so the result always round-trips through
        :meth:`_parse_time`. Negative input is clamped to ``00:00:00``.
        """
        total = max(0, int(seconds))
        hours, remainder = divmod(total, 3600)
        minutes, secs = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    def _parse_time(self, time_str):
        """Convert ``HH:MM:SS`` (also ``MM:SS`` or ``SS``) to seconds.

        Raises:
            ValueError: If there are more than three fields or a field is not
                an integer.
        """
        parts = time_str.strip().split(':')
        if not 1 <= len(parts) <= 3:
            raise ValueError(f"Invalid time {time_str!r}; expected HH:MM:SS")
        seconds = 0
        for part in parts:
            # Each field to the left is worth 60 times the one after it.
            seconds = seconds * 60 + int(part)
        return seconds

    def _create_widgets(self):
        """Lay out the labels, entry fields and OK button on a grid."""
        # Start time selection
        ttk.Label(self.root, text="Start Time (HH:MM:SS):").grid(row=0, column=0, padx=5, pady=5)
        ttk.Entry(self.root, textvariable=self.start_time).grid(row=0, column=1, padx=5, pady=5)

        # End time selection
        ttk.Label(self.root, text="End Time (HH:MM:SS):").grid(row=1, column=0, padx=5, pady=5)
        ttk.Entry(self.root, textvariable=self.end_time).grid(row=1, column=1, padx=5, pady=5)

        # Duration display
        ttk.Label(self.root, text=f"Video Duration: {self._format_time(self.duration)}").grid(
            row=2, column=0, columnspan=2, pady=10)

        # OK button: leaves mainloop(); get_times() then tears the window down
        ttk.Button(self.root, text="OK", command=self.root.quit).grid(
            row=3, column=0, columnspan=2, pady=10)

    def get_times(self):
        """Show the dialog and return the selected ``(start, end)`` in seconds.

        Blocks until the user presses OK or closes the window.

        Raises:
            ValueError: If either field does not hold a valid time string.
        """
        self.root.mainloop()
        try:
            # Read the values while the Tk application is still alive.
            start_text = self.start_time.get()
            end_text = self.end_time.get()
        finally:
            self.root.destroy()
        return (self._parse_time(start_text), self._parse_time(end_text))

class MultiViewObjectTracker:
    """Detect objects in two views with YOLO and pair them via SIFT features.

    Detections of the same class are compared across the two frames with a
    FLANN matcher. The best pair for each object in the first view is reported
    together with a depth value derived from ``depth_constant``.
    """

    # Lowe's ratio-test threshold applied to each pair of nearest neighbours.
    LOWE_RATIO = 0.7
    # Minimum number of keypoints / good matches needed to consider a pair.
    MIN_FEATURES = 4
    # Default numerator of the depth formula; overridden per instance in __init__.
    depth_constant = 1000

    class _Detections:
        """Plain container for the detections of one frame."""

        def __init__(self, boxes, confidences, class_ids):
            self.xyxy = boxes
            self.confidence = confidences
            self.class_id = class_ids

    def __init__(self, yolo_model="yolov8n.pt", depth_constant=1000):
        """Create the detector, the SIFT extractor and the FLANN matcher.

        Args:
            yolo_model: Weights name or path passed to ``YOLO``.
            depth_constant: Numerator of ``depth = depth_constant / distance``.
                This should be calibrated based on your setup.
        """
        # Initialize YOLO model for object detection
        self.model = YOLO(yolo_model)
        self.depth_constant = depth_constant
        # Initialize SIFT detector
        self.sift = cv2.SIFT_create()
        # FLANN matcher
        FLANN_INDEX_KDTREE = 1
        index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
        search_params = dict(checks=50)
        self.flann = cv2.FlannBasedMatcher(index_params, search_params)

    def detect_objects(self, frame):
        """Detect objects using YOLO.

        Returns:
            An object with ``xyxy``, ``confidence`` and ``class_id`` arrays
            taken from the first result of ``self.model(frame)``. The arrays
            are empty when the result carries no box data.
        """
        results = self.model(frame)[0]
        boxes = results.boxes

        if hasattr(boxes, 'xyxy'):
            return self._Detections(
                boxes.xyxy.cpu().numpy(),
                boxes.conf.cpu().numpy(),
                boxes.cls.cpu().numpy().astype(int),
            )

        # No box data on the result: return empty arrays
        return self._Detections(np.empty((0, 4)), np.empty(0), np.empty(0, dtype=int))

    def extract_object_features(self, frame, bbox):
        """Extract SIFT features for the region of ``frame`` inside ``bbox``.

        Returns:
            ``(keypoints, descriptors)`` computed on the cropped region, or
            ``(None, None)`` when the crop is empty or extraction fails.
            Keypoint coordinates are relative to the crop, not to ``frame``.
        """
        try:
            x1, y1, x2, y2 = map(int, bbox)
            # A negative start index would wrap around in NumPy slicing and
            # select the wrong region, so clamp to the frame origin.
            x1, y1 = max(x1, 0), max(y1, 0)
            obj_img = frame[y1:y2, x1:x2]
            if obj_img.size == 0:
                return None, None
            gray = cv2.cvtColor(obj_img, cv2.COLOR_BGR2GRAY)
            keypoints, descriptors = self.sift.detectAndCompute(gray, None)
            return keypoints, descriptors
        except Exception as e:
            print(f"Error in feature extraction: {e}")
            return None, None

    def _mean_match_distance(self, kp1, desc1, kp2, desc2):
        """Return the mean keypoint distance over ratio-test survivors.

        Returns ``None`` when fewer than ``MIN_FEATURES`` matches survive.
        """
        matches = self.flann.knnMatch(desc1, desc2, k=2)

        # Apply Lowe's ratio test. Entries without two neighbours are skipped
        # so a single short entry does not discard the whole candidate pair.
        good_matches = [
            pair[0] for pair in matches
            if len(pair) == 2 and pair[0].distance < self.LOWE_RATIO * pair[1].distance
        ]
        if len(good_matches) < self.MIN_FEATURES:
            return None

        # NOTE(review): both points are in crop-local coordinates (see
        # extract_object_features), so this is not a frame-space disparity.
        # Confirm that the depth formula is meant to use this quantity.
        total_dist = 0.0
        for match in good_matches:
            pt1 = kp1[match.queryIdx].pt
            pt2 = kp2[match.trainIdx].pt
            total_dist += np.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)
        return total_dist / len(good_matches)

    def match_objects_between_views(self, frame1, frame2, detections1, detections2):
        """Match objects between two views and estimate depths.

        Args:
            frame1, frame2: Images of the first and second view.
            detections1, detections2: Objects exposing ``xyxy``, ``confidence``
                and ``class_id`` sequences, as returned by ``detect_objects``.

        Returns:
            A list of ``(bbox1, bbox2, depth, class_id)`` tuples, one per
            object of the first view that found a partner of the same class.
            ``depth`` is ``depth_constant / distance``, or 0 when the
            distance is 0.
        """
        depths = []

        if len(detections1.xyxy) == 0 or len(detections2.xyxy) == 0:
            return depths

        candidates = list(zip(detections2.xyxy,
                              detections2.confidence,
                              detections2.class_id))
        # Features of view-2 objects, computed on first use and then reused
        # for every view-1 object instead of re-running SIFT per pair.
        features2 = {}

        for bbox1, conf1, class_id1 in zip(detections1.xyxy,
                                           detections1.confidence,
                                           detections1.class_id):
            # Extract features for object in first view
            kp1, desc1 = self.extract_object_features(frame1, bbox1)
            if kp1 is None or len(kp1) < self.MIN_FEATURES:
                continue

            best_match = None
            min_dist = float('inf')

            # Compare with all objects of same class in second view
            for j, (bbox2, conf2, class_id2) in enumerate(candidates):
                if class_id1 != class_id2:
                    continue

                if j not in features2:
                    features2[j] = self.extract_object_features(frame2, bbox2)
                kp2, desc2 = features2[j]
                if kp2 is None or len(kp2) < self.MIN_FEATURES:
                    continue

                try:
                    avg_dist = self._mean_match_distance(kp1, desc1, kp2, desc2)
                except Exception as e:
                    print(f"Error in matching: {e}")
                    continue

                if avg_dist is not None and avg_dist < min_dist:
                    min_dist = avg_dist
                    best_match = bbox2

            if best_match is not None:
                # A zero distance would divide by zero; report depth 0 instead.
                depth = self.depth_constant / min_dist if min_dist > 0 else 0
                depths.append((bbox1, best_match, depth, class_id1))

        return depths
    
def _open_video_writer(output_path, fps, frame_size):
    """Open a ``cv2.VideoWriter`` with the first codec that actually opens.

    Returns:
        ``(writer, temp_output, final_output)`` where ``temp_output`` is the
        file being written and ``final_output`` is ``output_path`` with the
        codec's extension appended.

    Raises:
        ValueError: If no codec produces an opened writer.
    """
    # Try different codecs
    codecs = [
        ('XVID', '.avi'),
        ('MP4V', '.mp4'),
        ('MJPG', '.avi'),
        ('H264', '.mp4')
    ]

    for codec, ext in codecs:
        temp_output = f'temp_output{ext}'
        writer = None
        try:
            fourcc = cv2.VideoWriter_fourcc(*codec)
            writer = cv2.VideoWriter(temp_output, fourcc, fps, frame_size)
            # The constructor returns an object even on failure, so success
            # has to be judged by isOpened().
            if writer.isOpened():
                print(f"Successfully created video writer with codec: {codec}")
                return writer, temp_output, f"{output_path}{ext}"
        except Exception as e:
            print(f"Failed to create video writer with codec {codec}: {e}")
        if writer is not None:
            writer.release()

    raise ValueError("Could not initialize video writer with any codec")


def process_stereo_video(video1_path, video2_path, output_path, start_time=None, end_time=None, 
                        use_gui=False):
    """
    Process stereo video with time selection

    Both videos are read frame by frame from ``start_time``. Objects are
    detected and matched between the views, and the annotated frames are
    written side by side and shown in a preview window (press ``q`` to stop).

    Args:
        video1_path: Path of the first view; its frame rate, size and frame
            count drive the processing.
        video2_path: Path of the second view.
        output_path: Output path WITHOUT extension; the extension of the codec
            that could be opened is appended.
        start_time: Start offset in seconds (``None`` means 0).
        end_time: End offset in seconds (``None`` means the end of the video).
        use_gui: When true, ask for the times with ``TimeSelector`` instead.

    Raises:
        ValueError: If a video cannot be opened, the frame rate is unknown, or
            no video writer could be created. Errors while processing frames
            are printed and end the run early instead of being raised.
    """
    import shutil

    # Time selection
    if use_gui:
        time_selector = TimeSelector(video1_path, video2_path)
        start_time, end_time = time_selector.get_times()

    # The timestamp overlay and seek position need a number even when no
    # start time was given.
    start_seconds = start_time if start_time is not None else 0

    # Initialize video captures
    cap1 = cv2.VideoCapture(video1_path)
    cap2 = cv2.VideoCapture(video2_path)
    out = None
    temp_output = None

    try:
        if not cap1.isOpened() or not cap2.isOpened():
            raise ValueError("Error opening video files")

        # Get video properties. fps stays a float so rates such as 29.97 are
        # not truncated when converting seconds to frame indices.
        fps = cap1.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            raise ValueError("Could not determine the frame rate of the first video")
        width = int(cap1.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap1.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap1.get(cv2.CAP_PROP_FRAME_COUNT))

        # Calculate frame positions
        start_frame = int(start_seconds * fps)
        end_frame = int(end_time * fps) if end_time is not None else total_frames
        if total_frames > 0:
            # Keeps the progress percentage meaningful for an oversized end_time.
            end_frame = min(end_frame, total_frames)

        # Set starting positions
        cap1.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        cap2.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        # Initialize tracker
        tracker = MultiViewObjectTracker()

        # Double width for side-by-side display
        out, temp_output, output_path = _open_video_writer(
            output_path, fps, (width * 2, height))

        # Progress bar setup
        total_frames_to_process = end_frame - start_frame
        current_frame = 0

        try:
            while cap1.isOpened() and cap2.isOpened():
                if current_frame >= total_frames_to_process:
                    break

                ret1, frame1 = cap1.read()
                ret2, frame2 = cap2.read()
                if not ret1 or not ret2:
                    break

                # The views are stacked horizontally and written at a fixed
                # size, so bring the second view to the first view's size.
                if frame2.shape[:2] != frame1.shape[:2]:
                    frame2 = cv2.resize(frame2, (frame1.shape[1], frame1.shape[0]))

                # Update progress
                current_frame += 1
                progress = (current_frame / total_frames_to_process) * 100
                print(f"\rProcessing: {progress:.1f}% complete", end="")

                # Process frames
                detections1 = tracker.detect_objects(frame1)
                detections2 = tracker.detect_objects(frame2)
                depth_matches = tracker.match_objects_between_views(frame1, frame2, 
                                                                  detections1, detections2)

                # Draw results
                for bbox1, bbox2, depth, class_id in depth_matches:
                    label = f"{tracker.model.names[class_id]}: {depth:.1f}m"

                    # Same box style and label in both views
                    for frame, bbox in ((frame1, bbox1), (frame2, bbox2)):
                        x1, y1, x2, y2 = map(int, bbox)
                        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        cv2.putText(frame, label, (x1, y1-10), 
                                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                # Add timestamp
                current_time = start_seconds + (current_frame / fps)
                timestamp = f"Time: {datetime.timedelta(seconds=int(current_time))}"
                for frame in (frame1, frame2):
                    cv2.putText(frame, timestamp, (10, 30), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

                # Combine frames
                combined_frame = np.hstack((frame1, frame2))

                # Write frame
                out.write(combined_frame)

                # Display frame
                cv2.imshow('Stereo Object Detection', cv2.resize(combined_frame, (1600, 600)))
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

        except Exception as e:
            # Best effort: keep whatever was written so far.
            print(f"\nError during processing: {e}")

    finally:
        # Cleanup (runs for setup failures too, so captures never leak)
        cap1.release()
        cap2.release()

        if out is not None:
            out.release()
            cv2.destroyAllWindows()

            # Verify output file
            if os.path.exists(temp_output) and os.path.getsize(temp_output) > 0:
                shutil.move(temp_output, output_path)
                print(f"\nProcessing complete! Output saved to: {output_path}")
            else:
                print("\nError: Output file is invalid or empty")

Successfully created video writer with codec: MJPG
Processing: 0.5% complete
0: 384x640 (no detections), 56.6ms
Speed: 3.1ms preprocess, 56.6ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 54.4ms
Speed: 1.6ms preprocess, 54.4ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)
Processing: 1.0% complete
0: 384x640 (no detections), 111.5ms
Speed: 4.1ms preprocess, 111.5ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 58.9ms
Speed: 2.2ms preprocess, 58.9ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)
Processing: 1.5% complete
0: 384x640 (no detections), 69.4ms
Speed: 2.0ms preprocess, 69.4ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 86.3ms
Speed: 1.8ms preprocess, 86.3ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)
Processing: 2.0% complete
0: 384x640 (no detections), 73.1ms
Speed: 2.5ms 