**Getting familiar with the images**

In [17]:
import pandas as pd
import os
from PIL import Image
from collections import Counter
from typing import Dict, Tuple

In [13]:
def count_images(folder: str) -> int:
    """Count the image files stored under a folder, including its subfolders.

    A file is treated as an image when its name ends with ``.jpg``, ``.png``
    or ``.jpeg`` (case-sensitive) -- the same extension set used by the other
    helpers in this notebook, so their counts stay comparable.

    Args:
        folder (str): Path to the root folder to scan recursively.

    Returns:
        int: Number of matching files. ``os.walk`` yields nothing for a path
        it cannot list, so a missing folder results in 0 rather than an error.
    """
    image_extensions = (".jpg", ".png", ".jpeg")
    return sum(
        1
        for _root, _dirs, files in os.walk(folder)
        for file in files
        if file.endswith(image_extensions)
    )

# Relative path to the raw training images (relative to the notebook's working directory).
folder_path = "../data/raw/train_images/"
# Report how many image files count_images finds under that folder.
print(f"Total images: {count_images(folder_path)}")

Total images: 23501


In [18]:
def analyze_image_dimensions(folder: str, limit: int = 100) -> Dict[Tuple[int, int], int]:
    """Tally the pixel dimensions of up to ``limit`` images found under a folder.

    Files are visited in ``os.walk`` order and the first ``limit`` files whose
    name ends with ``.jpg``, ``.png`` or ``.jpeg`` (case-sensitive) are opened
    with PIL. This is a "first N found" sample, not a random one.

    Args:
        folder (str): Path to the root folder to scan recursively.
        limit (int): Maximum number of images to open. Values <= 0 open nothing.

    Returns:
        Dict[Tuple[int, int], int]: Maps each ``img.size`` tuple (width, height
        per PIL's convention) to the number of sampled images with that size.
        Empty when no image was processed.
    """
    image_extensions = (".jpg", ".png", ".jpeg")
    dimensions = Counter()

    # Nothing to sample: avoid touching the filesystem at all.
    if limit <= 0:
        return {}

    processed = 0
    for root, _, files in os.walk(folder):
        for file in files:
            if not file.endswith(image_extensions):
                continue
            img_path = os.path.join(root, file)
            # The context manager closes the file handle as soon as the size is read.
            with Image.open(img_path) as img:
                dimensions[img.size] += 1
            processed += 1
            # Stop walking as soon as the sample is full.
            if processed >= limit:
                return dict(dimensions)

    return dict(dimensions)

In [19]:
# Relative path to the raw training images (relative to the notebook's working directory).
folder_path = "../data/raw/train_images/"
# Inspect at most 500 images and tally how many share each size tuple.
dimensions = analyze_image_dimensions(folder_path, limit=500)  
print("Image dimensions and counts:", dimensions)

Image dimensions and counts: {(1280, 720): 500}


In [20]:
import os
from typing import Tuple

def calculate_space(folder: str) -> Tuple[float, float, int]:
    """Measure the disk space used by the image files under a folder.

    Every file whose name ends with ``.jpg``, ``.png`` or ``.jpeg`` is
    included, at any depth below ``folder``.

    Args:
        folder (str): Path to the root folder to scan recursively.

    Returns:
        Tuple[float, float, int]: ``(total_size_mb, average_size_kb, image_count)``
        where the total is in megabytes (bytes / 1024 / 1024), the average is
        in kilobytes per image, and the count is the number of matching files.
        When no image is found the average is the integer ``0``.
    """
    image_suffixes = (".jpg", ".png", ".jpeg")

    # One entry per image: its size on disk in bytes.
    sizes_in_bytes = [
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _dirnames, filenames in os.walk(folder)
        for name in filenames
        if name.endswith(image_suffixes)
    ]

    image_count = len(sizes_in_bytes)
    total_size = sum(sizes_in_bytes)

    total_size_mb = total_size / (1024 * 1024)
    if image_count > 0:
        average_size_kb = total_size / image_count / 1024
    else:
        # Guard against dividing by zero when the folder holds no images.
        average_size_kb = 0

    return total_size_mb, average_size_kb, image_count

# Example usage: measure the raw training-image folder and print a short summary.
folder_path = "../data/raw/train_images/"
total_size_mb, average_size_kb, image_count = calculate_space(folder_path)
# Sizes are printed with two decimals; the count is printed as-is.
print(f"Total size: {total_size_mb:.2f} MB")
print(f"Average image size: {average_size_kb:.2f} KB")
print(f"Total number of images: {image_count}")


Total size: 14510.45 MB
Average image size: 632.26 KB
Total number of images: 23501


In [21]:
from PIL import Image
import os
import matplotlib.pyplot as plt

def plot_image(folder: str):
    """Display the first image found under a folder.

    The folder tree is walked with ``os.walk``; the first file whose name ends
    with ``.jpg``, ``.png`` or ``.jpeg`` is opened and shown with matplotlib,
    axes hidden and the file name as the title. If no such file exists,
    nothing is drawn.

    Args:
        folder (str): Path to the folder containing images.
    """
    image_suffixes = (".jpg", ".png", ".jpeg")

    for dirpath, _dirnames, filenames in os.walk(folder):
        # First matching file name in this directory, or None if there is none.
        file = next((name for name in filenames if name.endswith(image_suffixes)), None)
        if file is None:
            continue

        with Image.open(os.path.join(dirpath, file)) as picture:
            plt.imshow(picture)
            plt.axis("off")  # Hide the axes: only the picture matters here
            plt.title(f"Image: {file}")
            plt.show()
        return  # A single image is enough

# Example usage: show one image from the raw training-image folder.
folder_path = "../data/raw/train_images/"
plot_image(folder_path)


ModuleNotFoundError: No module named 'matplotlib'