In [23]:
import os
import glob
import scipy.io
import cv2
import matplotlib.pyplot as plt
import numpy as np
import h5py # Import h5py

In [27]:
# Define paths (adjust these to your actual paths)
# Root of the TVSum data as laid out on the author's machine (absolute Windows path).
# NOTE(review): later cells rebind VIDEO_DIR / ANNOTATION_DIR against a different root
# (TVPARM_BASE_DIR), so which directories are active depends on cell execution order.
TVPARM_DATA_DIR = r"C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\data\tvsum"
VIDEO_DIR = os.path.join(TVPARM_DATA_DIR, 'ydata-tvsum50-video') # This directory will contain individual MP4 videos
ANNOTATION_DIR = os.path.join(TVPARM_DATA_DIR, 'ydata-tvsum50-matlab') # This directory contains ydata-tvsum50.mat


In [29]:
# 1. Check if directories exist
# Prints True/False for each configured directory; it only reports and does not
# stop the notebook when a directory is missing.
print(f"Video directory exists: {os.path.exists(VIDEO_DIR)}")
print(f"Annotation directory exists: {os.path.exists(ANNOTATION_DIR)}")

Video directory exists: True
Annotation directory exists: True


In [31]:
# 2. List some video files (Assuming individual video files are in ydata-tvsum50-video)
# Collects every *.mp4 directly inside VIDEO_DIR (no recursion) and prints the
# base names of the first five matches. The loop index `i` is not used.
video_files = glob.glob(os.path.join(VIDEO_DIR, '*.mp4'))
print(f"\nFound {len(video_files)} video files. First 5:")
for i, vf in enumerate(video_files[:5]):
    print(f"- {os.path.basename(vf)}")


Found 50 video files. First 5:
- -esJrBWj2d8.mp4
- 0tmA_C6XwfM.mp4
- 37rzWOQsNIw.mp4
- 3eYKfiOEJNs.mp4
- 4wU_LUjG5Ic.mp4


In [13]:
# 3. List some annotation files
# Collects every *.mat directly inside ANNOTATION_DIR and prints up to five base
# names. The loop index `i` is not used.
annotation_files = glob.glob(os.path.join(ANNOTATION_DIR, '*.mat'))
print(f"\nFound {len(annotation_files)} annotation files. First 5:")
for i, af in enumerate(annotation_files[:5]):
    print(f"- {os.path.basename(af)}")



Found 1 annotation files. First 5:
- ydata-tvsum50.mat


In [25]:
# 00_data_exploration.ipynb

import os
import glob
import scipy.io
import cv2
import matplotlib.pyplot as plt
import numpy as np
import h5py # Import h5py

# Root of the extracted 'ydata-tvsum50-v1_1' archive (absolute path on the author's machine).
# This cell rebinds VIDEO_DIR and ANNOTATION_DIR, replacing any values set by earlier cells.
TVPARM_BASE_DIR = r'C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1'

# Now, define the paths to the video and annotation directories relative to the base
VIDEO_DIR = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-video')
ANNOTATION_DIR = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-matlab')

# 1. Check if directories exist
print(f"Video directory exists: {os.path.exists(VIDEO_DIR)}")
print(f"Annotation directory exists: {os.path.exists(ANNOTATION_DIR)}")

# 2. List some video files (Assuming individual video files are in ydata-tvsum50-video)
# Only *.mp4 files directly inside VIDEO_DIR are matched; the loop index `i` is not used.
video_files = glob.glob(os.path.join(VIDEO_DIR, '*.mp4'))
print(f"\nFound {len(video_files)} video files. First 5:")
for i, vf in enumerate(video_files[:5]):
    print(f"- {os.path.basename(vf)}")

Video directory exists: True
Annotation directory exists: True

Found 0 video files. First 5:


In [27]:
# 3. Load and inspect the main annotation file (ydata-tvsum50.mat)
main_annotation_file = os.path.join(ANNOTATION_DIR, 'ydata-tvsum50.mat')

# Filled in below with one entry per video. None means the field is absent from
# the file or loading failed before it was reached.
all_gt_scores = None
all_n_frames = None
all_change_points = None  # optional: skipped when the file has no such field
all_video_titles = None
all_user_scores = None


def _load_tvsum_field(h5_file, group, key):
    """Return a list with one NumPy array per video for ``group[key]``.

    Returns None when ``key`` is not in ``group``. Object-dtype datasets (HDF5
    object references) are dereferenced here, while the file is still open, so
    the returned arrays stay usable after the file has been closed.
    """
    if key not in group:
        return None
    raw = group[key][()]
    if raw.dtype == object:
        return [h5_file[ref][()] for ref in raw.flatten()]
    # Not a reference table: hand back the rows as stored.
    return list(raw)


def _decode_matlab_string(char_codes):
    """Join an array of character codes into a str, skipping zero codes."""
    return "".join(chr(int(code)) for code in np.asarray(char_codes).flatten() if code != 0)


if os.path.exists(main_annotation_file):
    print(f"\nFound main annotation file: {os.path.basename(main_annotation_file)}")
    print(f"\nInspecting main annotation: {os.path.basename(main_annotation_file)}")

    # Use h5py to open the .mat file
    with h5py.File(main_annotation_file, 'r') as f:
        print("Keys in .mat file (using h5py):")
        for key in f.keys():
            print(f"- {key}")

        if 'tvsum50' in f:
            tvsum_group = f['tvsum50'] # Access the 'tvsum50' Group
            print(f"\nType of f['tvsum50']: {type(tvsum_group)}")

            print("Keys inside the 'tvsum50' group:")
            for key in tvsum_group.keys():
                print(f"- {key}")

            try:
                # Everything is read into memory inside the `with` block; the
                # exploration code further down never touches the file handle.
                all_gt_scores = _load_tvsum_field(f, tvsum_group, 'gt_score')
                all_n_frames = _load_tvsum_field(f, tvsum_group, 'nframes')
                all_change_points = _load_tvsum_field(f, tvsum_group, 'change_points')
                all_user_scores = _load_tvsum_field(f, tvsum_group, 'user_anno')

                title_arrays = _load_tvsum_field(f, tvsum_group, 'title')
                if title_arrays is not None:
                    all_video_titles = [_decode_matlab_string(chars) for chars in title_arrays]

                for field_name, values in (
                    ('gt_score', all_gt_scores),
                    ('nframes', all_n_frames),
                    ('change_points', all_change_points),
                    ('user_anno', all_user_scores),
                ):
                    if values is None:
                        print(f"Field '{field_name}' is not present in this file.")
                    else:
                        print(f"Loaded '{field_name}'. Entries: {len(values)}")

                if all_video_titles is not None:
                    print(f"Loaded 'title'. First 5: {all_video_titles[:5]}")

            except Exception as e:
                print(f"Error loading datasets from 'tvsum50' group: {e}")
                print("Could not load annotation data from expected structure.")
        else:
            print("Error: 'tvsum50' key not found in the .mat file. Check file integrity or source.")
else:
    print(f"\nAnnotation file not found: {main_annotation_file}")

# Now, we process the loaded data if successful. Only gt_score and nframes are
# required; change_points and user_anno are shown when available.
if all_gt_scores is not None and all_n_frames is not None:
    print("\nAnnotation data loaded successfully. Proceeding with exploration.")

    # Let's inspect data for the first video (index 0)
    video_idx = 0
    if video_idx < len(all_gt_scores):
        print(f"\n--- Data for video index {video_idx} ---")

        gt_score = np.asarray(all_gt_scores[video_idx])

        print(f"gt_score shape for video {video_idx}: {gt_score.shape}")
        print(f"gt_score (first 10 values) for video {video_idx}:\n{gt_score.flatten()[:10]}")

        plt.figure(figsize=(12, 4))
        plt.plot(gt_score.flatten())
        plt.title(f"Ground Truth Importance Score for Video {video_idx} (Title: {all_video_titles[video_idx] if all_video_titles else 'N/A'})")
        plt.xlabel("Segment Index")
        plt.ylabel("Importance Score")
        plt.grid(True)
        plt.show()

        # Each nframes entry is itself a small array; take its single value.
        n_frames = np.asarray(all_n_frames[video_idx]).flatten()[0]
        print(f"Total frames in video {video_idx}: {int(n_frames)}")

        if all_change_points is not None and video_idx < len(all_change_points):
            change_points = np.asarray(all_change_points[video_idx])
            print(f"Change points shape for video {video_idx}: {change_points.shape}")
            print(f"First 10 change points for video {video_idx}:\n{change_points.flatten()[:10]}")
            print(f"Number of segments (change points): {len(change_points)}") # Or len(gt_score)
        else:
            print("No 'change_points' data available for this file; skipping segment boundaries.")

        if all_user_scores and video_idx < len(all_user_scores):
            user_score = all_user_scores[video_idx]
            print(f"user_score shape for video {video_idx}: {user_score.shape}")
            # NOTE(review): assumes axis 0 indexes annotators — confirm against the printed shape.
            print(f"Number of users who annotated this video: {user_score.shape[0]}")
            print(f"First user's scores (first 10 values):\n{user_score[0, :10].flatten()}")

    else:
        print(f"Video index {video_idx} is out of bounds for the loaded data.")

else:
    print("\nFailed to load all required annotation data (gt_score, nframes).")




Failed to load all required annotation data (gt_score, nframes, change_points).


In [11]:
import pandas as pd

# Assuming the file is in 'data/tvsum/'
# Read the tab-separated annotation file without a header row, so the columns are
# labelled 0, 1, 2. In the preview printed below, column 0 holds a video id and
# column 2 a comma-separated list of integer scores.
# NOTE(review): column 1 shows the value 'VT' in the preview — presumably a category code; confirm.
anno_df = pd.read_csv(r'C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\data\tvsum\ydata-tvsum50-anno.tsv', sep='\t', header=None)
print(anno_df.head())

             0   1                                                  2
0  AwmHb44_ouw  VT  4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,...
1  AwmHb44_ouw  VT  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...
2  AwmHb44_ouw  VT  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,...
3  AwmHb44_ouw  VT  4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,...
4  AwmHb44_ouw  VT  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...


In [23]:
import os

# Define the absolute base directory where you extracted 'ydata-tvsum50-v1_1'
# Please ensure this path matches your system exactly
TVPARM_BASE_DIR = r'C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1'

# Now, define the paths to the video and annotation directories relative to the base
# (this rebinds VIDEO_DIR / ANNOTATION_DIR for every cell executed afterwards)
VIDEO_DIR = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-video')
ANNOTATION_DIR = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-matlab')

# The main annotation file path will then correctly point to the .mat file
# main_annotation_file = os.path.join(ANNOTATION_DIR, 'ydata-tvsum50.mat') # This line can remain as is if you have it

In [29]:
import os
import cv2
import numpy as np
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm # For progress bars

# --- Configuration ---
# Set the base directory for your TVSum dataset
TVPARM_BASE_DIR = r'C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1'
VIDEO_DIR = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-video')
# Relative path: resolved against the kernel's current working directory.
FEATURES_SAVE_DIR = 'data/extracted_features' # Where to save processed features

# Ensure the save directory exists
os.makedirs(FEATURES_SAVE_DIR, exist_ok=True)

# Choose your CNN model (ResNet-50 is a good balance)
# pretrained=True means it uses weights trained on a large image dataset (ImageNet)
# NOTE(review): newer torchvision releases replace `pretrained=` with `weights=` —
# confirm against the installed version before changing.
model = models.resnet50(pretrained=True)
# We want features, not classifications, so remove the last classification layer
# (keeps every child module except the final one)
model = torch.nn.Sequential(*(list(model.children())[:-1]))
model.eval() # Set model to evaluation mode (no learning updates, turn off dropout etc.)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# Define image transformations needed for the CNN
# Frames are resized, centre-cropped to 224x224 and normalised with the ImageNet mean/std.
preprocess = transforms.Compose([
    transforms.ToPILImage(), # ndarray -> PIL image; no colour conversion here (the caller converts BGR to RGB first)
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(), # Convert to PyTorch Tensor, scales to [0,1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # ImageNet normalization
])

def extract_features_from_video(video_path, frame_sampling_rate=15):
    """
    Extracts features from a video using a pre-trained CNN.

    Relies on the module-level `model`, `preprocess` and `device` objects.

    Args:
        video_path (str): Path to the video file.
        frame_sampling_rate (int): Process every 'n' frames to reduce data size.
                                   e.g., 15 means process every 15th frame
                                   (frames 0, 15, 30, ...). Must be >= 1.
    Returns:
        np.ndarray: Array of features, shape (num_sampled_frames, feature_dim).
                    Returns None if video cannot be opened, and an empty array
                    if the video opened but yielded no frames.
    Raises:
        ValueError: If frame_sampling_rate is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError mid-decode.
    if frame_sampling_rate < 1:
        raise ValueError(f"frame_sampling_rate must be >= 1, got {frame_sampling_rate}")

    cap = cv2.VideoCapture(video_path)
    features_list = []

    try:
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return None

        frame_count = 0
        with torch.no_grad(): # Don't compute gradients; just forward pass
            while True:
                ret, frame = cap.read()
                if not ret:
                    break # No more frames

                if frame_count % frame_sampling_rate == 0:
                    # Convert BGR (OpenCV default) to RGB (PyTorch/PIL default)
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    # Preprocess the frame and add a batch dimension: (1, C, H, W)
                    input_tensor = preprocess(frame_rgb).unsqueeze(0).to(device)

                    # Forward pass, then drop the singleton dimensions and move to CPU
                    feature = model(input_tensor)
                    features_list.append(feature.squeeze().cpu().numpy())

                frame_count += 1
    finally:
        # Always free the decoder, even if preprocessing or the model raised.
        cap.release()

    if features_list:
        return np.array(features_list)
    return np.array([]) # Return empty array if no frames processed

def process_all_tvsum_videos(video_dir, save_dir, frame_sampling_rate=15):
    """
    Processes all videos in the TVSum video directory and saves their features.

    Args:
        video_dir (str): Directory scanned (non-recursively) for files ending in '.mp4'.
        save_dir (str): Directory receiving one '<video stem>_features.npy' per video.
        frame_sampling_rate (int): Forwarded to extract_features_from_video.
                                   Existing feature files are reused as-is, whatever
                                   rate they were produced with.
    """
    # Sorted so progress and warnings appear in a stable order between runs.
    video_files = sorted(f for f in os.listdir(video_dir) if f.endswith('.mp4'))

    print(f"Found {len(video_files)} videos to process...")
    for video_name in tqdm(video_files, desc="Extracting Features"):
        video_path = os.path.join(video_dir, video_name)
        # Strip only the extension; str.replace would also rewrite a '.mp4'
        # appearing elsewhere in the file name.
        video_stem, _ = os.path.splitext(video_name)
        save_path = os.path.join(save_dir, f"{video_stem}_features.npy")

        if os.path.exists(save_path):
            continue # Skip if already processed

        features = extract_features_from_video(video_path, frame_sampling_rate=frame_sampling_rate)
        if features is not None and features.size > 0:
            np.save(save_path, features)
        else:
            print(f"Warning: No features extracted for {video_name}.")

# Entry point: extract features for every video in VIDEO_DIR into FEATURES_SAVE_DIR.
# Videos whose feature file already exists are skipped by process_all_tvsum_videos.
if __name__ == '__main__':
    print(f"Starting feature extraction. Videos from: {VIDEO_DIR}, saving to: {FEATURES_SAVE_DIR}")
    process_all_tvsum_videos(VIDEO_DIR, FEATURES_SAVE_DIR)
    print("Feature extraction complete.")



Using device: cuda
Starting feature extraction. Videos from: C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1\ydata-tvsum50-video, saving to: data/extracted_features
Found 50 videos to process...


Extracting Features: 100%|█████████████████████████████████████████████████████████████| 50/50 [14:16<00:00, 17.13s/it]

Feature extraction complete.





In [43]:
import os
import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import re

# --- Configuration (match feature_extraction.py) ---
TVPARM_BASE_DIR = r'C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1'
# Full path of the annotation file read by TVSumDataset.
MATLAB_ANNOTATION_FILE = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-matlab', 'ydata-tvsum50.mat')
# Relative path: resolved against the kernel's current working directory.
FEATURES_SAVE_DIR = 'data/extracted_features' # Where you saved processed features

class TVSumDataset(Dataset):
    """In-memory dataset pairing each video's saved features with per-frame importance scores."""

    def __init__(self, features_dir, annotation_file, sample_rate=15):
        """
        Load every video that has both an annotation entry and a feature file.

        The ground-truth score vector is stretched or compressed proportionally
        onto the number of sampled frames; no 'change_points' data is used.

        Args:
            features_dir (str): Directory holding '<youtube id>_features.npy' files.
            annotation_file (str): Path to ydata-tvsum50.mat.
            sample_rate (int): Frame sampling rate used during feature extraction.
                               Stored on the instance; the alignment itself does
                               not read it.
        """
        self.features_dir = features_dir
        self.sample_rate = sample_rate
        # One dict per video with keys 'features', 'importance_scores', 'video_name'.
        self.data = []

        self.video_annotations = self._load_annotations(annotation_file)

        # YouTube ids that have a feature file on disk.
        available_ids = set()
        for entry in os.listdir(features_dir):
            if entry.endswith('_features.npy'):
                available_ids.add(entry.replace('_features.npy', ''))

        print("Preparing dataset (loading features and aligning GT with approximation)...")
        for youtube_id, anno in tqdm(self.video_annotations.items(), desc="Loading Annotations"):
            # Annotated videos without extracted features are skipped silently.
            if youtube_id not in available_ids:
                continue

            feature_path = os.path.join(features_dir, f"{youtube_id}_features.npy")

            try:
                features = np.load(feature_path)
                frame_importance = self._spread_scores(anno['gt_score'].flatten(), features.shape[0])
                self.data.append({
                    'features': torch.tensor(features, dtype=torch.float32),
                    'importance_scores': torch.tensor(frame_importance, dtype=torch.float32),
                    'video_name': youtube_id
                })
            except Exception as e:
                print(f"Error processing video {youtube_id}: {e}. Skipping.")

        print(f"Finished dataset preparation. Total videos loaded: {len(self.data)}")

    @staticmethod
    def _spread_scores(gt_score, num_sampled_frames):
        """Map a 1-D score vector proportionally onto `num_sampled_frames` slots (float32)."""
        frame_importance = np.zeros(num_sampled_frames, dtype=np.float32)
        num_scores = len(gt_score)
        if num_scores == 0:
            return frame_importance

        slots_per_score = num_sampled_frames / num_scores
        for position, score in enumerate(gt_score):
            first = int(round(position * slots_per_score))
            last = int(round((position + 1) * slots_per_score))

            first = max(0, min(first, num_sampled_frames))
            last = max(0, min(last, num_sampled_frames))

            if first < last:
                frame_importance[first:last] = score
            elif first < num_sampled_frames:
                # Empty range: this score still lands on one slot, so later
                # scores overwrite earlier ones that share it.
                frame_importance[first] = score
        return frame_importance

    def _load_annotations(self, annotation_file):
        """
        Read the 'tvsum50' group of the .mat file.

        Returns a dict keyed by YouTube id (taken from the 'video' field); each
        value holds 'title', 'gt_score' and 'user_anno'. Raises RuntimeError
        when the 'tvsum50' key is missing.
        """
        def as_text(char_codes):
            # Character-code array -> str, skipping zero codes.
            return "".join(chr(c) for c in char_codes.flatten() if c != 0)

        annotations = {}
        with h5py.File(annotation_file, 'r') as f:
            if 'tvsum50' not in f:
                raise RuntimeError("Key 'tvsum50' not found in the .mat file. Check file integrity.")

            tvsum_group = f['tvsum50']
            video_title_refs = tvsum_group['title'][()]
            user_anno_refs = tvsum_group['user_anno'][()]
            gt_score_refs = tvsum_group['gt_score'][()]
            video_id_refs = tvsum_group['video'][()]

            for row in range(len(video_title_refs)):
                video_title = as_text(f[video_title_refs[row, 0]][()])

                youtube_id_char_array = f[video_id_refs[row, 0]][()]
                youtube_id = as_text(youtube_id_char_array)
                if not youtube_id:
                    print(f"Warning: Skipping annotation for '{video_title}' - extracted YouTube ID is empty. Raw ID data: {youtube_id_char_array}")
                    continue

                annotations[youtube_id] = {
                    'title': video_title,
                    'gt_score': f[gt_score_refs[row, 0]][()],
                    'user_anno': f[user_anno_refs[row, 0]][()],
                }
        return annotations

    def __len__(self):
        """Number of videos held in memory."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the dict stored for video `idx`."""
        return self.data[idx]

# This collate_fn remains the same
def collate_fn(batch):
    """
    Zero-pad a list of dataset items to a common length and order them longest-first.

    Returns (padded_features, padded_importance_scores, lengths, video_names):
    features are (batch, max_len, feature_dim), scores are (batch, max_len),
    `lengths` is a tensor of the true sequence lengths in descending order, and
    `video_names` follows the same order.
    """
    seq_lens = [sample['features'].shape[0] for sample in batch]
    longest = max(seq_lens)
    feature_dim = batch[0]['features'].shape[1]

    # Decide the final (descending-length) order first, then fill rows directly in it.
    lengths, perm_idx = torch.tensor(seq_lens).sort(descending=True)

    padded_features = torch.zeros(len(batch), longest, feature_dim)
    padded_importance_scores = torch.zeros(len(batch), longest)
    video_names = []

    for row, source in enumerate(perm_idx.tolist()):
        sample = batch[source]
        valid = seq_lens[source]
        padded_features[row, :valid, :] = sample['features']
        padded_importance_scores[row, :valid] = sample['importance_scores']
        video_names.append(sample['video_name'])

    return padded_features, padded_importance_scores, lengths, video_names

# Smoke test: build the dataset, show the first item, then pull a single padded
# batch through a DataLoader using collate_fn.
if __name__ == '__main__':
    print("--- Testing TVSumDataset (with approximate segment mapping) ---")
    tvsum_dataset = TVSumDataset(FEATURES_SAVE_DIR, MATLAB_ANNOTATION_FILE)

    if len(tvsum_dataset) > 0:
        first_video_data = tvsum_dataset[0]
        print(f"\nFirst video features shape: {first_video_data['features'].shape}")
        print(f"First video importance scores shape: {first_video_data['importance_scores'].shape}")
        print(f"First video name: {first_video_data['video_name']}")

        # shuffle=True, so the videos in the printed batch differ between runs.
        train_dataloader = DataLoader(tvsum_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

        print("\nIterating through a batch from DataLoader:")
        for i, (features, scores, lengths, names) in enumerate(train_dataloader):
            print(f"Batch {i+1}:")
            print(f"  Features shape (padded): {features.shape}")
            print(f"  Scores shape (padded): {scores.shape}")
            print(f"  Original lengths: {lengths}")
            print(f"  Video Names: {names}")
            if i == 0: break  # only the first batch is inspected
    else:
        print("Dataset is empty. Please ensure feature_extraction.py ran successfully and paths are correct.")

--- Testing TVSumDataset (with approximate segment mapping) ---
Preparing dataset (loading features and aligning GT with approximation)...


Loading Annotations: 100%|█████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 32.44it/s]


Finished dataset preparation. Total videos loaded: 50

First video features shape: torch.Size([707, 2048])
First video importance scores shape: torch.Size([707])
First video name: AwmHb44_ouw

Iterating through a batch from DataLoader:
Batch 1:
  Features shape (padded): torch.Size([2, 417, 2048])
  Scores shape (padded): torch.Size([2, 417])
  Original lengths: tensor([417, 376])
  Video Names: ['LRw_obCPUt0', 'XkqCExn6_Us']


In [33]:
import h5py
import os

# --- Configuration (Use your exact path to ydata-tvsum50.mat) ---
TVPARM_BASE_DIR = r'C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1'
# Full path of the .mat file whose HDF5 structure is printed by this cell.
MATLAB_ANNOTATION_FILE = os.path.join(TVPARM_BASE_DIR, 'ydata-tvsum50-matlab', 'ydata-tvsum50.mat')

def print_hdf5_structure(name, obj, indent=''):
    """
    Print one line for `obj` and, for groups, walk into every member.

    Args:
        name (str): Label printed for this node.
        obj: An h5py Group or Dataset. Anything that is not a Group is labelled 'Dataset'.
        indent (str): Prefix for this level; two spaces are added per nesting level.
    """
    print(f"{indent}- {name} ({'Group' if isinstance(obj, h5py.Group) else 'Dataset'})")

    if isinstance(obj, h5py.Group):
        # Depth-first descent, one extra indent step per level.
        for member in obj.keys():
            print_hdf5_structure(member, obj[member], indent + '  ')
        return

    if not isinstance(obj, h5py.Dataset):
        return

    try:
        # For datasets, print shape and dtype
        print(f"{indent}  Shape: {obj.shape}, Dtype: {obj.dtype}")
    except Exception as e:
        print(f"{indent}  (Could not get shape/dtype: {e})")

# Entry point: print the layout of the 'tvsum50' group, then report every object
# in the file whose path contains 'change_points'.
if __name__ == '__main__':
    if not os.path.exists(MATLAB_ANNOTATION_FILE):
        print(f"Error: .mat file not found at {MATLAB_ANNOTATION_FILE}")
    else:
        print(f"Inspecting structure of: {MATLAB_ANNOTATION_FILE}\n")
        try:
            with h5py.File(MATLAB_ANNOTATION_FILE, 'r') as f:
                if 'tvsum50' in f:
                    tvsum_group = f['tvsum50']
                    print_hdf5_structure('tvsum50', tvsum_group)
                else:
                    print("Top-level key 'tvsum50' not found.")

                # Also, search for 'change_points' specifically in the entire file.
                # visititems already walks the whole file, so it is called once
                # rather than once per top-level key.
                change_point_paths = []

                def _collect_change_points(name, obj):
                    if 'change_points' in name:
                        change_point_paths.append(name)
                    # Returning None keeps the traversal going.

                f.visititems(_collect_change_points)

                found_change_points = bool(change_point_paths)
                for path in change_point_paths:
                    print(f"Found 'change_points' at: {path}")
                if not found_change_points:
                    print("No 'change_points' entry found anywhere in the file.")

        except Exception as e:
            print(f"An error occurred while inspecting the HDF5 file: {e}")

Inspecting structure of: C:\Users\Omen\OneDrive\Documents\AI\AgenticAIWorkspace\video_summarization_project\ydata-tvsum50-v1_1\ydata-tvsum50-matlab\ydata-tvsum50.mat

- tvsum50 (Group)
  - category (Dataset)
    Shape: (50, 1), Dtype: object
  - gt_score (Dataset)
    Shape: (50, 1), Dtype: object
  - length (Dataset)
    Shape: (50, 1), Dtype: object
  - nframes (Dataset)
    Shape: (50, 1), Dtype: object
  - title (Dataset)
    Shape: (50, 1), Dtype: object
  - user_anno (Dataset)
    Shape: (50, 1), Dtype: object
  - video (Dataset)
    Shape: (50, 1), Dtype: object


In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import os # Import os for saving models

# Import your defined Dataset and Model
# Ensure FEATURES_SAVE_DIR and MATLAB_ANNOTATION_FILE are correctly set in dataset.py
# NOTE(review): these imports only resolve when dataset.py and model.py exist on the
# import path. The recorded run of this cell ended with
# "ModuleNotFoundError: No module named 'dataset'", and VideoSummarizerLSTM is not
# defined anywhere in the cells shown here.
from dataset import TVSumDataset, FEATURES_SAVE_DIR, MATLAB_ANNOTATION_FILE, collate_fn
from model import VideoSummarizerLSTM

# --- Hyperparameters (You can tune these!) ---
FEATURE_DIM = 2048       # ResNet-50 output feature dimension
HIDDEN_DIM = 512         # LSTM hidden state size
NUM_LSTM_LAYERS = 2      # Number of LSTM layers
DROPOUT_RATE = 0.5       # Dropout for regularization
BIDIRECTIONAL = True     # Use Bidirectional LSTM (often performs better)
BATCH_SIZE = 4           # How many videos to process at once (adjust based on GPU memory)
LEARNING_RATE = 0.0001   # Learning rate for the optimizer
NUM_EPOCHS = 50          # How many times to loop through the entire dataset
SPLIT_RATIO = 0.8        # Fraction of videos used for training; the rest is the validation set

# --- Device Configuration ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Model Saving Configuration ---
# Relative path: checkpoints land under the kernel's current working directory.
MODEL_SAVE_DIR = 'checkpoints'
os.makedirs(MODEL_SAVE_DIR, exist_ok=True) # Ensure the directory exists

def train_model(seed=None):
    """
    Train VideoSummarizerLSTM on the TVSum features and write checkpoints.

    Uses the module-level hyperparameters (FEATURE_DIM, HIDDEN_DIM, BATCH_SIZE,
    NUM_EPOCHS, SPLIT_RATIO, ...), `device` and MODEL_SAVE_DIR. The best model by
    validation loss is saved after any improving epoch, and a periodic checkpoint
    is saved every 10 epochs.

    Args:
        seed (int | None): When given, seeds the train/validation split so the
            same videos land in each subset on every run. None keeps the split
            drawn from the global RNG. Batch shuffling is not affected.
    Returns:
        None. Returns early (after printing an error) when the dataset is empty
        or too small to produce a training split.
    """
    # 1. Load Dataset
    print("Loading TVSum Dataset...")
    full_dataset = TVSumDataset(FEATURES_SAVE_DIR, MATLAB_ANNOTATION_FILE)

    # Check if dataset is empty - crucial check!
    if len(full_dataset) == 0:
        print("Error: Dataset is empty. Cannot start training.")
        print("Please ensure feature_extraction.py ran successfully and paths in dataset.py are correct.")
        return

    # Split into training and validation sets
    train_size = int(SPLIT_RATIO * len(full_dataset))
    val_size = len(full_dataset) - train_size
    if train_size == 0 or val_size == 0:
        # An empty subset would fail later inside the DataLoader or on the loss average.
        print(f"Error: {len(full_dataset)} video(s) cannot be split with SPLIT_RATIO={SPLIT_RATIO}.")
        return

    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size], **split_kwargs)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    print(f"Training on {len(train_dataset)} videos, Validating on {len(val_dataset)} videos.")

    # 2. Initialize Model, Loss Function, and Optimizer
    model = VideoSummarizerLSTM(FEATURE_DIM, HIDDEN_DIM, num_layers=NUM_LSTM_LAYERS,
                                dropout=DROPOUT_RATE, bidirectional=BIDIRECTIONAL).to(device)

    # Loss function: Mean Squared Error is common for importance score prediction
    criterion = nn.MSELoss()

    # Optimizer: Adam is a good general-purpose choice
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    def masked_loss(predicted_scores, importance_scores, lengths):
        """MSE over real time steps only.

        Multiplying by the mask and then taking the mean still divides by the
        padded element count, so the loss would shrink as padding grows.
        Selecting the valid positions first makes the mean independent of padding.
        Assumes predicted_scores has the same (batch, time) shape as
        importance_scores — TODO confirm against VideoSummarizerLSTM.
        """
        time_steps = torch.arange(predicted_scores.size(1), device=predicted_scores.device)
        mask = time_steps.unsqueeze(0) < lengths.to(predicted_scores.device).unsqueeze(1)
        return criterion(predicted_scores[mask], importance_scores[mask])

    # 3. Training Loop
    print("\nStarting training...")
    best_val_loss = float('inf') # To save the best model

    for epoch in range(NUM_EPOCHS):
        model.train() # Set model to training mode
        total_train_loss = 0
        train_batches = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} (Train)")

        for features, importance_scores, lengths, _ in train_batches:
            features, importance_scores = features.to(device), importance_scores.to(device)

            # Zero the gradients (clear previous gradients)
            optimizer.zero_grad()

            # Forward pass, loss on real (unpadded) positions, backward pass, update
            predicted_scores = model(features, lengths)
            loss = masked_loss(predicted_scores, importance_scores, lengths)
            loss.backward()
            optimizer.step()

            # Weight each batch by its number of videos so the epoch average is per video
            total_train_loss += loss.item() * features.size(0)

            train_batches.set_postfix(loss=loss.item())

        avg_train_loss = total_train_loss / len(train_dataset)
        print(f"Epoch {epoch+1} Training Loss: {avg_train_loss:.4f}")

        # 4. Validation Phase (Evaluate on unseen data after each epoch)
        model.eval() # Set model to evaluation mode (turns off dropout, batch norm updates etc.)
        total_val_loss = 0
        with torch.no_grad(): # No gradient computation needed for validation (saves memory/time)
            val_batches = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS} (Validation)")
            for features, importance_scores, lengths, _ in val_batches:
                features, importance_scores = features.to(device), importance_scores.to(device)

                predicted_scores = model(features, lengths)
                loss = masked_loss(predicted_scores, importance_scores, lengths)

                total_val_loss += loss.item() * features.size(0)
                val_batches.set_postfix(loss=loss.item())

        avg_val_loss = total_val_loss / len(val_dataset)
        print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")

        # Save model checkpoint if validation loss improves
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model_save_path = os.path.join(MODEL_SAVE_DIR, "summarizer_model_best_val_loss.pth")
            torch.save(model.state_dict(), model_save_path)
            print(f"Model saved to {model_save_path} (New best validation loss: {best_val_loss:.4f})")

        # You can also save models periodically (e.g., every 10 epochs)
        if (epoch + 1) % 10 == 0:
            model_save_path_periodic = os.path.join(MODEL_SAVE_DIR, f"summarizer_model_epoch_{epoch+1}.pth")
            torch.save(model.state_dict(), model_save_path_periodic)
            print(f"Model saved to {model_save_path_periodic}")


    print("\nTraining complete!")

# Entry point: run the full training loop with the hyperparameters defined above.
if __name__ == '__main__':
    train_model()

ModuleNotFoundError: No module named 'dataset'