In [None]:
# Importing Libraries
import imageio.v3 as iio # pip install imageio[ffmpeg]
import matplotlib.pyplot as plt
import cv2
import numpy as np

## 2. Frame Extraction

In [None]:
def load_video_frames(video_path, frame_interval=10, display_frames=True):
    """Load frames from a video file at a fixed sampling interval.

    Args:
        video_path: Path of the video file; passed unchanged to `iio.imiter`.
        frame_interval: A frame is kept when its zero-based index is a
            multiple of this value. Defaults to 10.
        display_frames: When True, each kept frame is shown with matplotlib
            as soon as it is read. Defaults to True.

    Returns:
        list: The kept frames in reading order. If the video cannot be found
        or any other error occurs while reading, an error message is printed
        and an empty list is returned (frames collected so far are dropped),
        so the caller always receives a list.
    """
    frame_count = 0
    frames = []  # Sampled frames, in the order they were read

    try:
        print(f"Opening video file: {video_path}")

        for frame_index, frame in enumerate(iio.imiter(video_path)):
            # Total number of frames seen so far, reported after the loop
            frame_count = frame_index + 1

            if frame_index % frame_interval != 0:
                continue

            frames.append(frame)

            if display_frames:
                plt.imshow(frame)
                plt.title(f'Frame {frame_index}')
                plt.axis('off')
                plt.show()

    except FileNotFoundError:
        print(f"Error: Video file not found at {video_path}")
        # Same return type as the success path: a plain list
        return []
    except Exception as e:
        print(f"An error occurred while processing the video: {e}")
        return []

    print("\nFinished processing video.")
    print(f"Total frames iterated: {frame_count}.")

    return frames

# Sample every 10th frame of the bedroom video; per-frame plotting is switched off.
frames = load_video_frames('../videos/bedroomvideo.mp4', frame_interval=10, display_frames=False)

## 5. Essential / Fundamental Matrix Computation

Both matrices describe the epipolar geometry between two images of the same scene taken from different viewpoints. Epipolar geometry defines the constraint that corresponding points must satisfy. If you have a point x in the first image, its corresponding point x' in the second image must lie on a specific line called the epiline. Both matrices capture this relationship, but they differ based on camera calibration.

The Fundamental Matrix acts as a bridge between the 2D pixel coordinates of corresponding points in two different images, capturing the geometric constraints imposed by the 3D scene and the camera positions, even when you don't know the camera's exact internal details. It's a fundamental tool for tasks like finding correct feature matches, estimating camera motion, and ultimately reconstructing the 3D scene.

- Fundamental Matrix (F):
  - **What it is**: A 3x3 matrix that relates corresponding points between two images in pixel coordinates.
  - **Information encoded**: It contains information about the camera's relative rotation and translation (extrinsic parameters) and the intrinsic parameters (like focal length, principal point) of both cameras.
  - **Equation**: It satisfies the epipolar constraint: x'^T * F * x = 0, where x and x' are the homogeneous coordinates of the matching points in pixels.
  - **When to use**: Use the Fundamental Matrix when the cameras are uncalibrated, meaning you don't know their intrinsic parameters.
  - **Computation**: Requires at least 8 pairs of corresponding points (8-point algorithm) or 7 pairs (7-point algorithm). In practice, `cv2.findFundamentalMat` is run with a robust method such as RANSAC or LMedS, which uses many more points and tolerates outliers among them.

If you are recording video with a phone and have not performed a specific camera calibration procedure to find its intrinsic matrix (K), then using the Fundamental Matrix (F) is the recommended approach. You treat the camera as uncalibrated.

- Essential Matrix (E):
  - **What it is**: A 3x3 matrix that relates corresponding points between two images in normalized image coordinates (independent of camera intrinsics).
  - **Information encoded**: It contains only information about the camera's relative rotation (R) and translation (t), up to a scale factor. It does not include camera intrinsic information.
  - **Equation**: It satisfies the epipolar constraint in normalized coordinates: x_norm'^T * E * x_norm = 0.
  - **When to use**: Use the Essential Matrix when the cameras are calibrated, meaning you know their intrinsic parameters (focal length, principal point - often represented in a camera matrix K).
  - **Computation**: Requires at least 5 pairs of corresponding points (using the 5-point algorithm), though robust methods in OpenCV use more points.
  - **Relation to F**: E = K'^T * F * K, where K and K' are the camera intrinsic matrices for the two views. If the camera is the same for both views, K' = K.

In [None]:
def compute_fundamental_matrix(pts1, pts2, method=cv2.FM_RANSAC, ransacReprojThreshold=1.0, confidence=0.999, maxIters=2000):
    """
    Estimate the Fundamental Matrix (F) for two matched point sets and keep only the inlier matches.

    Args:
        pts1 (np.ndarray): N points from image 1, shape (N, 1, 2), dtype=float32/64.
        pts2 (np.ndarray): N points from image 2, shape (N, 1, 2), dtype=float32/64.
        method (int, optional): Estimation method forwarded to cv2.findFundamentalMat
                                (e.g., cv2.FM_RANSAC, cv2.FM_LMEDS). Defaults to cv2.FM_RANSAC.
        ransacReprojThreshold (float, optional): RANSAC reprojection threshold (pixels). Defaults to 1.0.
        confidence (float, optional): RANSAC confidence level. Defaults to 0.999.
        maxIters (int, optional): Maximum RANSAC iterations. Defaults to 2000.

    Returns:
        tuple: (F, pts1_inliers, pts2_inliers, status_mask)
            - F (np.ndarray or None): The matrix returned by cv2.findFundamentalMat.
            - pts1_inliers (np.ndarray or None): Rows of pts1 whose mask entry equals 1.
            - pts2_inliers (np.ndarray or None): Rows of pts2 whose mask entry equals 1.
            - status_mask (np.ndarray or None): The mask returned by cv2.findFundamentalMat.
        All four entries are None when there are fewer than 8 points in either set,
        when OpenCV returns no result, or when an exception is raised during estimation.
    """
    failure = (None, None, None, None)

    # Both point sets must exist and contain at least 8 entries each
    have_enough_points = (
        pts1 is not None
        and pts2 is not None
        and len(pts1) >= 8
        and len(pts2) >= 8
    )
    if not have_enough_points:
        print("Error: Not enough points to compute Fundamental Matrix.")
        return failure

    try:
        F, mask = cv2.findFundamentalMat(
            pts1,
            pts2,
            method=method,
            ransacReprojThreshold=ransacReprojThreshold,
            confidence=confidence,
            maxIters=maxIters,
        )

        if F is None or mask is None:
            print("Warning: findFundamentalMat returned None.")
            return failure

        # One boolean selector, applied to both point sets so they stay aligned
        is_inlier = mask.ravel() == 1
        pts1_inliers = pts1[is_inlier]
        pts2_inliers = pts2[is_inlier]

        print(f"Fundamental Matrix computed. Inliers: {len(pts1_inliers)} / {len(pts1)}")

        return F, pts1_inliers, pts2_inliers, mask

    except cv2.error as e:
        print(f"OpenCV Error during findFundamentalMat: {e}")
        return failure
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return failure

In [None]:
# Driver code
# 1. Build synthetic matched points (replace with your actual data).
#    A dedicated, seeded generator keeps this demo reproducible across
#    kernel restarts without touching NumPy's global random state.
demo_rng = np.random.default_rng(42)
num_points = 150
pts1_matched = demo_rng.random((num_points, 1, 2), dtype=np.float32) * 500
translation_demo = np.array([30, 10], dtype=np.float32).reshape(1, 1, 2)
noise_demo = demo_rng.standard_normal((num_points, 1, 2), dtype=np.float32) * 3
pts2_matched = pts1_matched + translation_demo + noise_demo

# Add some outliers: overwrite 20% of the image-2 points with random positions
outlier_indices = demo_rng.choice(num_points, size=int(num_points * 0.2), replace=False)
pts2_matched[outlier_indices] = demo_rng.random((len(outlier_indices), 1, 2), dtype=np.float32) * 500

# 2. Call the function
fundamental_matrix, pts1_inliers, pts2_inliers, inlier_mask = compute_fundamental_matrix(
    pts1_matched,
    pts2_matched,
    ransacReprojThreshold=1.5, # Example threshold adjustment
    confidence=0.99
)

# 3. Check the result and use the outputs
if fundamental_matrix is not None:
    print("Successfully computed F and filtered inliers.")
    print(f"Shape of pts1_inliers: {pts1_inliers.shape}")
    print(f"Shape of pts2_inliers: {pts2_inliers.shape}")

    # Now you can pass 'fundamental_matrix', 'pts1_inliers', 'pts2_inliers'
    # to the next stage (e.g., camera pose estimation)

else:
    print("Failed to compute Fundamental Matrix or find sufficient inliers.")

## 6. Camera Pose Estimation

The goal of this stage is to determine the relative motion between the two camera views where you matched features. This motion is described by:

- **Rotation (R)**: A 3x3 matrix describing how the camera orientation changed between the two shots.
- **Translation (t)**: A 3x1 vector describing how the camera position changed between the two shots. Note that the translation vector t can only be determined up to a scale factor. This means you know the direction of motion but not the absolute distance moved.

- Input Matrix: cv2.recoverPose technically requires the Essential Matrix (E) as its main input, not the Fundamental Matrix (F). It also needs the corresponding inlier points from the previous stage (pts1_inliers, pts2_inliers) and the camera intrinsic matrix (K).
- Handling the Uncalibrated Case (Starting with F): Since you computed F (because the phone camera was treated as uncalibrated), you need a way to get E to use recoverPose. The relationship is E = K^T * F * K. But K is unknown!
  - The Workaround: You need to assume a plausible K matrix. A common approach is:
  - Set the principal point (cx, cy) to the image center (e.g., width/2, height/2).
  - Estimate or guess the focal length (fx, fy). Sometimes fx = fy = image_width is used as a starting guess, or a typical value for phone cameras (e.g., 500-1000 pixels) might be assumed.
  - Compute E: Calculate E = K_assumed.T @ F @ K_assumed.
  - Use recoverPose: Call cv2.recoverPose using this computed E, the same assumed K, and your inlier points.
  - Important: You must document this assumption about K in your report. The accuracy of the recovered R and t depends on how close the assumed K is to the true intrinsics, and t remains only a direction: its scale cannot be recovered from two views, whatever K is used.
- Chirality Problem: Mathematically, decomposing the E matrix yields four possible solutions for the rotation (R) and translation (t). However, only one of these solutions is physically correct – the one where the reconstructed 3D points lie in front of both cameras.
  - How recoverPose Solves It: The cv2.recoverPose function handles this automatically! It takes your inlier points (pts1_inliers, pts2_inliers) and the K matrix, triangulates the points for each of the four possible (R, t) combinations, and counts how many points end up in front of both camera views. It then returns the R and t corresponding to the hypothesis with the most positive depth points, effectively resolving the chirality ambiguity. Your report should explain this concept.

In [None]:
def estimate_camera_pose(pts1, pts2, F, K_assumed):
    """
    Recover the relative camera motion (rotation and translation) between two views.

    The Fundamental Matrix is turned into an Essential Matrix with the assumed
    intrinsics (E = K^T F K) and passed to cv2.recoverPose together with the
    inlier correspondences.

    Args:
        pts1 (np.ndarray): Inlier points from image 1, shape (N, 1, 2) or (N, 2).
        pts2 (np.ndarray): Inlier points from image 2, shape (N, 1, 2) or (N, 2).
        F (np.ndarray): The 3x3 Fundamental Matrix computed previously.
        K_assumed (np.ndarray): The assumed 3x3 camera intrinsic matrix, used both
                           to build E and as the camera matrix given to recoverPose.

    Returns:
        tuple: (R, t, points_valid_mask)
            - R (np.ndarray or None): The 3x3 estimated Rotation matrix.
            - t (np.ndarray or None): The 3x1 estimated Translation vector (up to scale).
            - points_valid_mask (np.ndarray or None): Mask returned by cv2.recoverPose;
              its non-zero entries are the points counted as consistent with the pose.
        All three entries are None when an input is missing, when pts1 holds fewer
        than 5 points, or when an exception is raised during estimation.
    """
    failure = (None, None, None)

    if any(item is None for item in (F, K_assumed, pts1, pts2)) or len(pts1) < 5:
        print("Error: Insufficient input for pose estimation.")
        return failure

    # Flatten both point sets to (N, 2) float64 before handing them to recoverPose
    pts1_rp, pts2_rp = (
        np.reshape(points, (-1, 2)).astype(np.float64) for points in (pts1, pts2)
    )

    try:
        # Step 1: Essential Matrix from F and the assumed intrinsics
        E = (K_assumed.T @ F) @ K_assumed
        print("Computed Essential Matrix (E) from F and assumed K.")

        # Step 2: recoverPose returns (count, R, t, mask); the count is recomputed below
        _, R, t, mask_pose = cv2.recoverPose(E, pts1_rp, pts2_rp, K_assumed)

        if mask_pose is None:
            num_valid_points = 0
        else:
            num_valid_points = cv2.countNonZero(mask_pose)
        print(f"Pose recovered. Number of points consistent with the pose: {num_valid_points} / {len(pts1_rp)}")

        # Only a warning: the pose is still returned so the caller can decide what to do
        if num_valid_points < 5:
            print("Warning: Very few points consistent with the recovered pose. Result might be unreliable.")

        return R, t, mask_pose

    except cv2.error as e:
        print(f"OpenCV Error during recoverPose: {e}")
        return failure
    except Exception as e:
        print(f"An unexpected error occurred during pose estimation: {e}")
        return failure

In [None]:
# Driver code

# 1. Take the outputs of the fundamental-matrix stage
#    (fundamental_matrix, pts1_inliers, pts2_inliers from compute_fundamental_matrix).
F_computed = fundamental_matrix
pts1_in = pts1_inliers
pts2_in = pts2_inliers

# 2. Assumed intrinsic matrix K (example values only!)
#    You MUST justify your choice/estimation of K in your report.
image_width, image_height = 640, 480  # Example image size in pixels
assumed_cx = image_width / 2   # principal point at the image centre
assumed_cy = image_height / 2
assumed_focal = image_width    # A common starting guess if unknown
K_assumed = np.array(
    [
        [assumed_focal, 0, assumed_cx],
        [0, assumed_focal, assumed_cy],
        [0, 0, 1],
    ],
    dtype=np.float64,
)
print("\nUsing Assumed K matrix:\n", K_assumed)

# 3. Estimate the pose, unless the previous stage produced nothing to work with
if F_computed is None or pts1_in is None or pts2_in is None:
    print("\nSkipping pose estimation due to missing F or inlier points from previous stage.")
else:
    R_estimated, t_estimated, pose_mask = estimate_camera_pose(pts1_in, pts2_in, F_computed, K_assumed)

    # 4. Report the outcome
    if R_estimated is None or t_estimated is None:
        print("\nFailed to estimate camera pose.")
    else:
        # R and t describe the relative motion between the two camera views.
        print("\nSuccessfully estimated camera pose.")
        print("Rotation (R):\n", R_estimated)
        print("Translation (t) (up to scale):\n", t_estimated)

## 7. 3D Point Triangulation and Scene Visualisation