In [1]:
!pip install -q opencv-python h5py tqdm

In [1]:
import av
import cv2
import numpy as np
import h5py
from pathlib import Path
import concurrent.futures
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from typing import List
from pathlib import Path
import zarr

In [None]:
%cd ../gameboyprep/data/

In [None]:
%cd VideoGen/data/

In [4]:
# Collect the longplay MP4 paths as './data/gameboy_longplays_mp4/<file name>' strings.
# glob() yields entries in filesystem order, so the names are sorted to make
# slices such as mp4_files[:5] pick the same videos on every run and machine.
MP4_DIR = './data/gameboy_longplays_mp4'
mp4_files = [
    f'{MP4_DIR}/{name}'
    for name in sorted(entry.name for entry in Path(MP4_DIR).glob('*.mp4'))
]

In [None]:
mp4_files

In [5]:
def check_video_frames(video_path):
    """
    Report whether a video's container metadata exposes a usable frame count.

    Args:
        video_path (str): Path of the video file to open with PyAV.

    Returns:
        tuple: ``(True, frame_count)`` when the first video stream reports a
        positive frame count, ``(False, frame_count)`` when it reports none
        (PyAV uses 0 for "unknown"), and ``(False, None)`` when the file cannot
        be opened or has no video stream.
    """
    try:
        with av.open(video_path) as container:
            frame_count = container.streams.video[0].frames
    except Exception:
        # Best-effort probe: any failure to open or read the stream counts as
        # "no accessible frame count". Unlike a bare `except`, this still lets
        # KeyboardInterrupt / SystemExit stop a long scan.
        return False, None
    return frame_count > 0, frame_count

def scan_videos(directory):
    """
    Recursively probe every MP4 under ``directory`` and print a report of the
    files for which ``check_video_frames`` signals failure.

    Args:
        directory (str): Root directory searched with the ``**/*.mp4`` pattern.
    """
    candidates = [*Path(directory).glob("**/*.mp4")]
    print(f"Found {len(candidates)} MP4 files")

    # Keep only the paths whose probe returned a falsy first element.
    unreadable = [
        candidate
        for candidate in tqdm(candidates, desc="Scanning videos")
        if not check_video_frames(str(candidate))[0]
    ]

    print("\nVideos without accessible frame counts:")
    for candidate in unreadable:
        print(f"- {candidate}")

    print(f"\nTotal: {len(unreadable)}/{len(candidates)} videos have inaccessible frame counts")

In [None]:
# Report every MP4 under the longplay directory for which the frame-count probe fails.
scan_videos("./data/gameboy_longplays_mp4/")

In [4]:
def decode_video_to_hdf5_chunked(
    video_path: str,
    output_dir: str,
    target_size: tuple = (256, 256),
    chunk_size: int = 16,
    dataset_name: str = "video_frames",
    compression: str = "gzip",
    compression_opts: int = 4
):
    """
    Decode one MP4 file to grayscale frames and stream them into a per-video HDF5 file.

    Frames are decoded with PyAV, converted to grayscale, resized to
    ``target_size`` and appended ``chunk_size`` frames at a time to a resizable
    dataset of shape ``(num_frames, height, width, 1)``, so at most one batch of
    processed frames is buffered in this function.

    The output file is ``{longplay_id}_{video_id}.h5`` inside ``output_dir``,
    where the two ids are the first two '_'-separated fields of the input
    file name. An existing file with that name is overwritten.

    Args:
        video_path (str): Path to the input MP4 file.
        output_dir (str): Directory where the per-video HDF5 file will be created.
        target_size (tuple): ``(width, height)`` handed to ``cv2.resize``.
        chunk_size (int): Number of frames to read and write at a time (batch size);
            also the length of an HDF5 chunk along the frame axis.
        dataset_name (str): Name of the dataset inside the HDF5 file.
        compression (str): HDF5 compression algorithm (e.g., 'gzip', 'lzf', or None).
        compression_opts (int): Compression level (if using 'gzip', 1-9). Ignored
            for 'lzf' and None, which accept no options.

    Returns:
        str: Path of the HDF5 file that was written.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 or the file name does not
            contain at least two underscores.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # pathlib rather than os.path: the notebook's import cell provides Path but not os.
    file_name = Path(video_path).name
    name_fields = file_name.split('_', maxsplit=2)
    if len(name_fields) < 3:
        raise ValueError(
            f"Expected a file name like '<longplay_id>_<video_id>_<rest>', got {file_name!r}"
        )
    longplay_id, video_id = name_fields[:2]
    hdf5_out_path = str(Path(output_dir) / f"{longplay_id}_{video_id}.h5")

    # h5py refuses compression_opts when no filter is selected and for 'lzf',
    # so only forward the option to filters that take one.
    filter_kwargs = {"compression": compression}
    if compression not in (None, "lzf"):
        filter_kwargs["compression_opts"] = compression_opts

    def process_frame(frame):
        # RGB decode -> single-channel grayscale -> resize to target_size.
        rgb = frame.to_ndarray(format='rgb24')
        gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
        return cv2.resize(gray, target_size)

    def append_batch(h5_file, dset, frames):
        # Stack to (n, height, width) and add the trailing channel axis.
        batch = np.stack(frames, axis=0)[..., np.newaxis]
        if dset is None:
            # Created lazily so the frame shape comes from real decoded data.
            frame_shape = batch.shape[1:]
            dset = h5_file.create_dataset(
                dataset_name,
                shape=(0,) + frame_shape,
                maxshape=(None,) + frame_shape,  # unlimited along the frame axis
                dtype=batch.dtype,
                chunks=(chunk_size,) + frame_shape,
                **filter_kwargs
            )
        start = dset.shape[0]
        dset.resize(start + batch.shape[0], axis=0)
        dset[start:] = batch
        return dset

    with av.open(video_path) as container, h5py.File(hdf5_out_path, "w") as f:
        stream = container.streams.video[0]

        # PyAV reports 0 frames when the container does not record a count;
        # tqdm then gets total=None and shows an open-ended counter.
        with tqdm(
            total=stream.frames or None,
            desc=f"Processing {longplay_id}_{video_id}",
            unit="frames",
            leave=False,
            position=1
        ) as pbar:
            dset = None
            batch_frames = []

            for frame in container.decode(video=0):
                batch_frames.append(process_frame(frame))

                if len(batch_frames) == chunk_size:
                    dset = append_batch(f, dset, batch_frames)
                    pbar.update(len(batch_frames))
                    batch_frames = []

            # write any remaining frames
            if batch_frames:
                dset = append_batch(f, dset, batch_frames)
                pbar.update(len(batch_frames))

        # dset stays None when the video yielded no frames; the file is then empty.
        if dset is not None:
            dset.attrs["video_path"] = str(video_path)

            # original video properties
            dset.attrs["original_width"] = stream.width
            dset.attrs["original_height"] = stream.height
            # Rate and duration can be missing from the container metadata; skip
            # them instead of failing after the whole video has been decoded.
            if stream.average_rate is not None:
                dset.attrs["fps"] = float(stream.average_rate)
            if stream.duration is not None and stream.time_base is not None:
                dset.attrs["duration_seconds"] = float(stream.duration * stream.time_base)
            dset.attrs["total_frames"] = stream.frames

            # processing parameters
            dset.attrs["target_size"] = target_size
            # HDF5 attributes cannot hold None, so record the absence as text.
            dset.attrs["compression"] = compression if compression is not None else "none"
            if "compression_opts" in filter_kwargs:
                dset.attrs["compression_level"] = compression_opts
            dset.attrs["chunk_size"] = chunk_size

    return hdf5_out_path

In [5]:
def decode_videos_in_parallel_chunked(
    video_paths: List[str],
    output_dir: str,
    target_size: tuple = (256, 256),
    chunk_size: int = 16,
    max_workers: int = 4
):
    """
    Decodes multiple MP4 files in parallel, each to its own HDF5 file using chunked writes.

    A failure on one video is printed and does not stop the remaining videos.

    Args:
        video_paths (List[str]): List of MP4 paths to decode.
        output_dir (str): Directory where per-video HDF5 files will be placed;
            created (including parents) if missing.
        target_size (tuple): Frame size forwarded to ``decode_video_to_hdf5_chunked``.
        chunk_size (int): Number of frames to read/write at a time for each video.
        max_workers (int): Number of parallel processes.
    """
    # pathlib rather than os.makedirs: the notebook's import cell provides Path but not os.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # NOTE(review): worker processes must be able to import the submitted function;
    # with a "spawn" start method this may fail for notebook-defined functions — confirm
    # on the target platform.
    with tqdm(total=len(video_paths), desc="Processing videos", unit="video", position=0) as pbar:
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to its input path for error reporting.
            futures = {
                executor.submit(decode_video_to_hdf5_chunked, path, output_dir, target_size, chunk_size): path
                for path in video_paths
            }

            for future in concurrent.futures.as_completed(futures):
                path = futures[future]
                try:
                    future.result()
                except Exception as exc:
                    print(f"\nError decoding {path}: {exc}")
                finally:
                    # Count the video as processed whether it succeeded or failed.
                    pbar.update(1)

In [None]:
# Convert the first five entries of mp4_files, 16 frames per batch, with up to
# four worker processes; target_size is left at the function's default.
decode_videos_in_parallel_chunked(
    mp4_files[:5],
    output_dir="data/longplay_h5_files",
    chunk_size=16,
    max_workers=4
)

In [5]:
def inspect_hdf5_with_viz(file_path, num_sample_frames=5):
    """
    Print metadata for the first dataset of an HDF5 file and plot evenly spaced frames.

    Args:
        file_path (str): Path to the HDF5 file, opened read-only.
        num_sample_frames (int): Number of frames to show, spread evenly from the
            first to the last frame (default: 5).
    """
    with h5py.File(file_path, 'r') as f:
        # Get dataset name (assuming one dataset)
        dataset_name = list(f.keys())[0]
        dataset = f[dataset_name]

        print(f"Dataset: {dataset_name}")
        print(f"Shape: {dataset.shape}")
        print(f"Dtype: {dataset.dtype}")
        print("\nAttributes:")
        for key, value in dataset.attrs.items():
            print(f"- {key}: {value}")

        num_frames = dataset.shape[0]
        if num_frames == 0:
            # Nothing to index; dataset[0] below would raise.
            print("\nDataset contains no frames.")
            return

        # Visualize sample frames
        indices = np.linspace(0, num_frames-1, num_sample_frames, dtype=int)

        # squeeze=False always returns a 2-D axes array, so a single sample
        # frame works the same way as several.
        fig, axes = plt.subplots(1, num_sample_frames, figsize=(20, 4), squeeze=False)
        for ax, idx in zip(axes[0], indices):
            frame = dataset[idx]
            if frame.shape[-1] == 1:  # If single channel
                frame = frame.squeeze()  # Remove channel dimension
            ax.imshow(frame, cmap='gray')
            ax.axis('off')
            ax.set_title(f'Frame {idx}')

        plt.tight_layout()
        plt.show()

        # Show frame statistics; read each frame from disk once.
        first_frame = dataset[0]
        last_frame = dataset[-1]
        print("\nFrame Statistics:")
        print(f"First Frame - Min: {np.min(first_frame)}, Max: {np.max(first_frame)}, Mean: {np.mean(first_frame):.2f}")
        print(f"Last Frame - Min: {np.min(last_frame)}, Max: {np.max(last_frame)}, Mean: {np.mean(last_frame):.2f}")

In [None]:
!ls

In [None]:
# Print metadata and preview frames for 'longplays.h5' in the current working directory.
# NOTE(review): no cell in this notebook writes a file with this name (the decoder
# writes '<longplay_id>_<video_id>.h5' files) — confirm the intended path.
inspect_hdf5_with_viz('longplays.h5')

In [9]:
def inspect_zarr_with_viz(zarr_path, num_sample_frames=5):
    """
    Inspect a Zarr array containing video frames and visualize sample frames.

    The store is opened read-only, so a mistyped path raises instead of
    creating a new, empty store on disk.

    Parameters:
    -----------
    zarr_path : str
        Path to input Zarr store; it must contain an array named 'frames'
    num_sample_frames : int
        Number of sample frames to visualize (default: 5)
    """
    # zarr.open defaults to mode='a' (create if missing); inspection must not write.
    root = zarr.open(zarr_path, mode='r')
    dataset = root['frames']  # Assuming 'frames' is the dataset name

    # Print basic information
    print("Dataset: frames")
    print(f"Shape: {dataset.shape}")
    print(f"Dtype: {dataset.dtype}")
    print(f"Chunks: {dataset.chunks}")
    print("\nAttributes:")
    for key, value in dataset.attrs.items():
        print(f"- {key}: {value}")

    num_frames = dataset.shape[0]
    if num_frames == 0:
        # Nothing to index; dataset[0] below would raise.
        print("\nDataset contains no frames.")
        return

    # Visualize sample frames
    indices = np.linspace(0, num_frames-1, num_sample_frames, dtype=int)

    # squeeze=False always returns a 2-D axes array, which also covers
    # the num_sample_frames == 1 case.
    fig, axes = plt.subplots(1, num_sample_frames, figsize=(20, 4), squeeze=False)

    for ax, idx in zip(axes[0], indices):
        frame = dataset[idx]
        if frame.shape[-1] == 1:  # If single channel
            frame = frame.squeeze()  # Remove channel dimension
        ax.imshow(frame, cmap='gray')
        ax.axis('off')
        ax.set_title(f'Frame {idx}')

    plt.tight_layout()
    plt.show()

    # Show frame statistics; load each frame into memory once.
    first_frame = dataset[0]
    last_frame = dataset[-1]
    print("\nFrame Statistics:")
    print(f"First Frame - Min: {np.min(first_frame)}, Max: {np.max(first_frame)}, "
          f"Mean: {np.mean(first_frame):.2f}")
    print(f"Last Frame - Min: {np.min(last_frame)}, Max: {np.max(last_frame)}, "
          f"Mean: {np.mean(last_frame):.2f}")

    # Add memory usage information (uncompressed sizes)
    nbytes = dataset.nbytes
    print(f"\nMemory Usage:")
    print(f"Total size: {nbytes / (1024**2):.2f} MB")
    print(f"Chunk size: {np.prod(dataset.chunks) * dataset.dtype.itemsize / 1024:.2f} KB")

    # Add compression information
    if dataset.compressor:
        print("\nCompression Info:")
        print(f"Compressor: {dataset.compressor}")

In [None]:
%pwd

In [None]:
# Print metadata and preview frames for the 'frames' array of one Zarr store
# (path is relative to the current working directory).
inspect_zarr_with_viz('longplay_zarr_files/100_0.zarr')

In [None]:
%cd VideoGen/data/

In [None]:
# NOTE(review): this cell had no body under the `with` statements, which is a
# SyntaxError. Presumably it was meant to compare the original file with its
# rewritten copy — confirm; for now it prints the layout of each side.
with h5py.File('./longplay_h5_files/0_0.h5', 'r') as f_old, \
        h5py.File('./longplay_h5_files/0_0.new.h5', 'r') as f_new:
    for name in f_old:
        old_item = f_old[name]
        new_item = f_new.get(name)  # None when the copy lacks this entry
        print(f"{name}:")
        print(f"  old: shape={getattr(old_item, 'shape', None)}, chunks={getattr(old_item, 'chunks', None)}")
        print(f"  new: shape={getattr(new_item, 'shape', None)}, chunks={getattr(new_item, 'chunks', None)}")

In [9]:
def rechunk_dataset(filename, dataset_name, new_chunk_size, block_size=4096):
    """
    Rewrite one dataset of an HDF5 file in place with a different chunk length
    along its first axis.

    The data is copied into a temporary sibling dataset (``dataset_name + '_temp'``)
    that keeps the original dtype, shape, maxshape, compression settings and
    attributes; the original is then deleted and the copy takes over its name.

    NOTE: deleting a dataset generally does not shrink an HDF5 file; repack the
    file (e.g. with the ``h5repack`` tool) to reclaim the space.

    Args:
        filename (str): Path to the HDF5 file, opened in read/write mode.
        dataset_name (str): Name of the dataset to rechunk.
        new_chunk_size (int): New chunk length along the first axis; the other
            axes are chunked at their full extent.
        block_size (int): Number of first-axis entries copied per step; lower it
            to reduce peak memory use (default: 4096).
    """
    with h5py.File(filename, 'r+') as f:
        # Get original dataset
        old_dset = f[dataset_name]
        shape = old_dset.shape

        # Create temporary dataset name
        temp_name = dataset_name + '_temp'
        if temp_name in f:
            # Left over from an interrupted earlier run; it would block create_dataset.
            del f[temp_name]

        # Create new dataset with desired chunk size
        new_dset = f.create_dataset(
            temp_name,
            shape=shape,
            maxshape=old_dset.maxshape,  # keep the dataset resizable if it was
            dtype=old_dset.dtype,
            chunks=(new_chunk_size,) + shape[1:],  # Assuming first dim is frames
            compression=old_dset.compression,
            compression_opts=old_dset.compression_opts
        )

        # Copy data block by block so the whole dataset is never in memory at once
        for start in tqdm(range(0, shape[0], block_size)):
            stop = min(start + block_size, shape[0])
            new_dset[start:stop] = old_dset[start:stop]

        # Copy attributes
        for key, value in old_dset.attrs.items():
            new_dset.attrs[key] = value
        if "chunk_size" in new_dset.attrs:
            # The decoder in this notebook records its chunk length under this
            # key; keep it in sync with the new layout.
            new_dset.attrs["chunk_size"] = new_chunk_size

        # Delete old dataset
        del f[dataset_name]

        # Rename new dataset to original name: link it under the old name,
        # then drop the temporary link.
        f[dataset_name] = f[temp_name]
        del f[temp_name]

# Usage example:
# rechunk_dataset('video.h5', '/frames', new_chunk_size=1)

In [None]:
# Rewrite 'video_frames' in 101_0.h5 with a chunk length of 1 along the first axis.
rechunk_dataset(filename='longplay_h5_files/101_0.h5', dataset_name='video_frames', new_chunk_size=1)