# 1. Analyzing Locally Hosted Data

In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

# Define the paths
base_path = "/home/ndelafuente/D-MAE/data/data"
train_path = os.path.join(base_path, "train")
val_path = os.path.join(base_path, "val")

# Subfolders
image_folder = "images"
depth_folder = "depth"
compressed_depth_folder = "compressed_depth"

def calculate_statistics(image_files, depth_files, compressed_depth_files):
    # Initialize lists to store statistics
    image_stats = {"min": [], "max": [], "mean": [], "std": []}
    depth_stats = {"min": [], "max": [], "mean": [], "std": []}
    compressed_depth_stats = {"min": [], "max": [], "mean": [], "std": []}
    
    # Iterate through files and compute statistics
    for image_file, depth_file, compressed_depth_file in tqdm(zip(image_files, depth_files, compressed_depth_files)):
        # Load image and compute statistics
        image = np.array(Image.open(image_file)) / 255.0  # Normalize to [0, 1]
        image_stats["min"].append(np.min(image))
        image_stats["max"].append(np.max(image))
        image_stats["mean"].append(np.mean(image))
        image_stats["std"].append(np.std(image))
        
        # Load depth map and compute statistics
        depth = np.load(depth_file)
        depth_stats["min"].append(np.min(depth))
        depth_stats["max"].append(np.max(depth))
        depth_stats["mean"].append(np.mean(depth))
        depth_stats["std"].append(np.std(depth))
        
        # Load compressed depth map and compute statistics
        compressed_depth = np.load(compressed_depth_file)
        compressed_depth_array = compressed_depth[compressed_depth.files[0]]  # Access the first array in the .npz file
        compressed_depth_stats["min"].append(np.min(compressed_depth_array))
        compressed_depth_stats["max"].append(np.max(compressed_depth_array))
        compressed_depth_stats["mean"].append(np.mean(compressed_depth_array))
        compressed_depth_stats["std"].append(np.std(compressed_depth_array))
    
    # Aggregate statistics
    image_stats_agg = {k: {"min": np.min(v), "max": np.max(v), "mean": np.mean(v), "std": np.std(v)} for k, v in image_stats.items()}
    depth_stats_agg = {k: {"min": np.min(v), "max": np.max(v), "mean": np.mean(v), "std": np.std(v)} for k, v in depth_stats.items()}
    compressed_depth_stats_agg = {k: {"min": np.min(v), "max": np.max(v), "mean": np.mean(v), "std": np.std(v)} for k, v in compressed_depth_stats.items()}
    
    return image_stats_agg, depth_stats_agg, compressed_depth_stats_agg

# Now run the updated code
image_stats, depth_stats, compressed_depth_stats = calculate_statistics(image_files, depth_files, compressed_depth_files)

# Print the statistics
print("Image Statistics:", image_stats)
print("Depth Map Statistics:", depth_stats)
print("Compressed Depth Map Statistics:", compressed_depth_stats)



118287it [17:09, 114.86it/s]


Image Statistics: {'min': {'min': 0.0, 'max': 0.4196078431372549, 'mean': 0.0007729621391109726, 'std': 0.00867470330872324}, 'max': {'min': 0.3411764705882353, 'max': 1.0, 'mean': 0.9978616316546145, 'std': 0.0184084239945548}, 'mean': {'min': 0.005642723952326028, 'max': 0.9856017777777778, 'mean': 0.4415595511481634, 'std': 0.11892355725563003}, 'std': {'min': 0.02401428636566994, 'max': 0.48944016276365343, 'mean': 0.249455558874881, 'std': 0.051816131122659426}}
Depth Map Statistics: {'min': {'min': 0.0, 'max': 6.2383575, 'mean': 0.19016321, 'std': 0.4069841}, 'max': {'min': 2.9486177, 'max': 50.807907, 'mean': 13.444314, 'std': 3.7953913}, 'mean': {'min': 0.012939928, 'max': 11.1360235, 'mean': 4.6975307, 'std': 1.4317963}, 'std': {'min': 0.15645814, 'max': 11.423761, 'mean': 3.4330912, 'std': 0.99024653}}
Compressed Depth Map Statistics: {'min': {'min': 0.0, 'max': 6.2383575, 'mean': 0.19016321, 'std': 0.4069841}, 'max': {'min': 2.9486177, 'max': 50.807907, 'mean': 13.444314, 's

# 2. Loading and Analyzing Data from Hugging Face

In [5]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset("neildlf/depth_coco")

# Check the structure of the dataset
print(dataset)

# Function to visualize data from the Hugging Face dataset
def visualize_huggingface_data(dataset, split="train", index=0):
    # Get the data
    data = dataset[split][index]

    # Convert the images and depth maps to numpy arrays for visualization
    image = np.array(data["image"])
    depth = np.array(data["depth"])
    compressed_depth = np.array(data["compressed_depth"])
    
    # Plotting the image and depth maps
    plt.figure(figsize=(15, 5))
    
    # Display image
    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.title('Image')
    plt.axis('off')
    
    # Display depth map
    plt.subplot(1, 3, 2)
    plt.imshow(depth, cmap='gray')
    plt.title('Depth Map')
    plt.axis('off')
    
    # Display compressed depth map
    plt.subplot(1, 3, 3)
    plt.imshow(compressed_depth, cmap='gray')
    plt.title('Compressed Depth Map')
    plt.axis('off')
    
    plt.show()

# Example of visualizing data from Hugging Face dataset
visualize_huggingface_data(dataset, split="train", index=0)

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 31/31 [10:50<00:00, 20.99s/files]
Downloading data: 100%|██████████| 326M/326M [00:05<00:00, 56.1MB/s] 
Downloading data: 100%|██████████| 324M/324M [00:05<00:00, 54.7MB/s] 
Generating train split: 100%|██████████| 118287/118287 [00:42<00:00, 2777.67 examples/s]
Generating validation split: 100%|██████████| 5000/5000 [00:01<00:00, 2815.56 examples/s]

DatasetDict({
    train: Dataset({
        features: ['image'],
        num_rows: 118287
    })
    validation: Dataset({
        features: ['image'],
        num_rows: 5000
    })
})





KeyError: 'depth'