# GPU-Accelerated Image Processing Demo

This notebook demonstrates GPU acceleration for image processing tasks, including:
- Gaussian blur
- Edge detection
- Image resizing

Make sure to enable a GPU runtime for this notebook before running it:
- Click "Runtime" > "Change runtime type"
- Select "GPU" under Hardware accelerator
- Click "Save"

In [None]:
# Install OpenCV; it is imported below as `cv2` and used by the CPU implementations.
!pip install opencv-python

In [None]:
# Import necessary libraries
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import urllib.request
from PIL import Image

# Select the compute device used by the rest of the notebook and report it
print("=== GPU AVAILABILITY CHECK ===")
if not torch.cuda.is_available():
    # CPU-only session: the *_gpu functions fall back to their CPU versions
    device = torch.device("cpu")
    print("❌ No GPU available. Running on CPU only.")
    print("Note: To enable GPU, go to Runtime > Change runtime type > Hardware accelerator > GPU")
else:
    # Use CUDA device 0 and show its name plus current memory usage in MB
    device = torch.device("cuda:0")
    print(f"✅ GPU is available: {torch.cuda.get_device_name(0)}")
    print(f"Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    print(f"Memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

def print_separator():
    """Print a rule of 70 '=' characters with a blank line before and after it."""
    rule = "=" * 70
    print(f"\n{rule}\n")

## Download and Load Sample Image

First, let's download a sample image to use for our processing tasks.

In [None]:
def download_sample_image():
    """Download the sample image unless it is already on disk.

    The image is first written to a temporary ``.part`` file and only renamed
    to its final name once the download has finished, so an interrupted
    download never leaves a truncated ``sample_image.jpg`` that a later run
    would mistake for a complete file.

    Returns:
        The local path of the image (``"sample_image.jpg"``).

    Raises:
        OSError: If the download fails (includes ``urllib.error.URLError``
            and timeouts).
    """
    image_path = "sample_image.jpg"
    if os.path.exists(image_path):
        return image_path

    print("Downloading sample image...")
    url = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg/1280px-Everest_North_Face_toward_Base_Camp_Tibet_Luca_Galuzzi_2006.jpg"
    # Send an explicit, descriptive User-Agent: Wikimedia's User-Agent policy
    # asks for one, and requests with a generic library default may be refused.
    request = urllib.request.Request(
        url, headers={"User-Agent": "gpu-image-processing-demo/1.0 (notebook example)"}
    )
    partial_path = image_path + ".part"
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            data = response.read()
        with open(partial_path, "wb") as out_file:
            out_file.write(data)
        # The rename happens only after the whole file has been written
        os.replace(partial_path, image_path)
    finally:
        # No-op after a successful rename; removes leftovers after a failure
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Image downloaded to {image_path}")
    return image_path

def load_image(path):
    """Load an image from disk as an RGB array and as a PyTorch tensor.

    Args:
        path: Path of the image file to read with OpenCV.

    Returns:
        A tuple ``(img, img_tensor)``: ``img`` is the array returned by OpenCV
        after BGR-to-RGB conversion; ``img_tensor`` is the same data as a
        float tensor with the channel axis moved first and values divided
        by 255.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # without this check the error would surface inside cvtColor instead.
        raise OSError(f"Could not read image file: {path!r}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV decodes to BGR order

    # Channel-last array -> channel-first float tensor, normalized to [0, 1]
    img_tensor = torch.from_numpy(img).float().permute(2, 0, 1) / 255.0

    return img, img_tensor

# Fetch the sample picture (skipped when already on disk) and decode it
image_path = download_sample_image()
img_np, img_tensor = load_image(image_path)

# Preview the decoded RGB array without axis decorations
plt.figure(figsize=(12, 8))
plt.gca().imshow(img_np)
plt.gca().set_title("Sample Image")
plt.gca().set_axis_off()
plt.show()

print(f"Image loaded: {image_path}")
print(f"Image shape: {img_np.shape}")

## 1. Gaussian Blur

Let's implement Gaussian blur on both CPU and GPU and compare the performance.

In [None]:
def gaussian_blur_cpu(img_tensor, kernel_size=15, sigma=5.0):
    """Blur an image with OpenCV's ``GaussianBlur`` and time the call.

    Args:
        img_tensor: CPU tensor in channel-first layout. It is scaled by 255
            and converted to uint8, so values are expected to lie in [0, 1].
        kernel_size: Side length of the square kernel passed to OpenCV.
        sigma: Gaussian standard deviation passed to OpenCV.

    Returns:
        A tuple ``(result, cpu_time)``: the blurred image as a channel-first
        float tensor divided by 255, and the elapsed seconds of the
        ``GaussianBlur`` call alone (the conversions are not timed).
    """
    # Channel-first float -> channel-last uint8 for OpenCV. Round instead of
    # truncating so a value that lands just below an integer after the float
    # round trip does not lose a whole intensity level; clip to avoid
    # wrap-around on out-of-range input.
    pixels = img_tensor.permute(1, 2, 0).numpy() * 255.0
    img_np = np.clip(np.rint(pixels), 0, 255).astype(np.uint8)

    # perf_counter is monotonic and high resolution, unlike time.time()
    start_time = time.perf_counter()
    blurred = cv2.GaussianBlur(img_np, (kernel_size, kernel_size), sigma)
    cpu_time = time.perf_counter() - start_time

    # Convert back to a channel-first tensor in [0, 1]
    result = torch.from_numpy(blurred).float().permute(2, 0, 1) / 255.0

    return result, cpu_time

def gaussian_blur_gpu(img_tensor, kernel_size=15, sigma=5.0):
    """Blur an image on the GPU with a depthwise convolution and time it.

    Falls back to ``gaussian_blur_cpu`` when the selected device is not CUDA.

    Args:
        img_tensor: Channel-first image tensor (a batch axis is added here).
        kernel_size: Odd, positive side length of the square Gaussian kernel.
        sigma: Standard deviation of the Gaussian.

    Returns:
        A tuple ``(result, gpu_time)``: the blurred image as a channel-first
        CPU tensor, and the elapsed seconds of the convolution alone.

    Raises:
        ValueError: On the CUDA path, if ``kernel_size`` is not a positive
            odd integer.
    """
    if device.type != "cuda":
        # Fall back to CPU if GPU not available
        return gaussian_blur_cpu(img_tensor, kernel_size, sigma)

    if kernel_size < 1 or kernel_size % 2 == 0:
        # An even size would give kernel_size + 1 taps below and make the
        # reshape to (kernel_size, kernel_size) fail with an obscure error.
        raise ValueError(f"kernel_size must be a positive odd integer, got {kernel_size}")

    # Move to the GPU and add the batch axis expected by conv2d
    img_gpu = img_tensor.to(device).unsqueeze(0)
    channels = img_gpu.shape[1]

    # Normalized 1D Gaussian sampled at integer offsets around the centre
    half = kernel_size // 2
    x = torch.arange(-half, half + 1, dtype=torch.float32, device=device)
    kernel_1d = torch.exp(-x**2 / (2 * sigma**2))
    kernel_1d = kernel_1d / kernel_1d.sum()

    # Separable Gaussian -> 2D kernel, replicated once per input channel so
    # that groups=channels blurs every channel independently
    kernel_2d = torch.outer(kernel_1d, kernel_1d)
    kernel_2d = kernel_2d.view(1, 1, kernel_size, kernel_size).repeat(channels, 1, 1, 1)

    # CUDA work is queued asynchronously: drain the transfer and kernel
    # construction first so that only the convolution is timed.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    # NOTE: conv2d zero-pads the border, whereas OpenCV's GaussianBlur uses a
    # reflected border by default, so the outputs differ near the image edges.
    blurred = torch.nn.functional.conv2d(
        img_gpu, kernel_2d, padding=half, groups=channels
    )
    torch.cuda.synchronize()  # Ensure GPU operations complete
    gpu_time = time.perf_counter() - start_time

    # Drop the batch axis and move the result back to the CPU
    result = blurred.squeeze(0).cpu()

    return result, gpu_time

# Run both implementations on the sample image and report their timings
print("\nApplying Gaussian Blur...")
cpu_blur, cpu_time = gaussian_blur_cpu(img_tensor)
gpu_blur, gpu_time = gaussian_blur_gpu(img_tensor)
print(f"  CPU time: {cpu_time:.4f} seconds")
print(f"  GPU time: {gpu_time:.4f} seconds")
if device.type == "cuda":
    print(f"  Speedup: {cpu_time/gpu_time:.2f}x")

# Channel-first tensors -> channel-last arrays for imshow
original_np = img_tensor.permute(1, 2, 0).numpy()
cpu_blur_np = cpu_blur.permute(1, 2, 0).numpy()
gpu_blur_np = gpu_blur.permute(1, 2, 0).numpy()

# Original and both blurred versions side by side
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

axes[0].imshow(original_np)
axes[0].set_title("Original")
axes[0].set_axis_off()

axes[1].imshow(cpu_blur_np)
axes[1].set_title(f"CPU Blur: {cpu_time:.4f}s")
axes[1].set_axis_off()

axes[2].imshow(gpu_blur_np)
axes[2].set_title(f"GPU Blur: {gpu_time:.4f}s")
axes[2].set_axis_off()

# The speedup is only meaningful when the GPU path actually ran on CUDA
if device.type == "cuda":
    fig.suptitle(f"Gaussian Blur - Speedup: {cpu_time/gpu_time:.2f}x")
else:
    fig.suptitle("Gaussian Blur")
fig.tight_layout()
plt.show()

## 2. Sobel Edge Detection

Now let's implement Sobel edge detection on both CPU and GPU.

In [None]:
def sobel_edge_detection_cpu(img_tensor):
    """Compute a Sobel edge map with OpenCV and time it.

    Args:
        img_tensor: CPU tensor in channel-first RGB layout. It is scaled by
            255 and converted to uint8, so values are expected to lie in
            [0, 1].

    Returns:
        A tuple ``(result, cpu_time)``: the min-max normalized gradient
        magnitude as a single-channel float tensor (leading axis of size 1)
        divided by 255, and the elapsed seconds for the Sobel filters,
        magnitude and normalization (grayscale conversion is not timed).
    """
    # Channel-first float -> channel-last uint8 for OpenCV. Round instead of
    # truncating so the float round trip cannot drop an intensity level;
    # clip to avoid wrap-around on out-of-range input.
    pixels = img_tensor.permute(1, 2, 0).numpy() * 255.0
    img_np = np.clip(np.rint(pixels), 0, 255).astype(np.uint8)
    img_gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)

    # perf_counter is monotonic and high resolution, unlike time.time()
    start_time = time.perf_counter()

    # Horizontal and vertical derivatives, kept in float64 to preserve sign
    sobel_x = cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)
    sobel_y = cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)

    # Gradient magnitude
    magnitude = np.hypot(sobel_x, sobel_y)

    # Stretch to the full 0..255 range
    magnitude = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    cpu_time = time.perf_counter() - start_time

    # Convert back to tensor (single channel)
    result = torch.from_numpy(magnitude).float().unsqueeze(0) / 255.0

    return result, cpu_time

def sobel_edge_detection_gpu(img_tensor):
    """Compute a Sobel edge map on the GPU and time it.

    Falls back to ``sobel_edge_detection_cpu`` when the selected device is
    not CUDA.

    Args:
        img_tensor: Channel-first image tensor with 3 channels (weighted
            into grayscale with 0.299 / 0.587 / 0.114).

    Returns:
        A tuple ``(result, gpu_time)``: the gradient magnitude divided by its
        maximum, as a single-channel CPU tensor (leading axis of size 1), and
        the elapsed seconds for padding, both convolutions, magnitude and
        normalization (grayscale conversion is not timed).
    """
    if device.type != "cuda":
        # Fall back to CPU if GPU not available
        return sobel_edge_detection_cpu(img_tensor)

    # Move to the GPU and add the batch axis expected by conv2d
    img_gpu = img_tensor.to(device).unsqueeze(0)

    # Weighted channel sum -> grayscale of shape (1, 1, H, W)
    gray_weights = torch.tensor([0.299, 0.587, 0.114], device=device).view(1, 3, 1, 1)
    img_gray = torch.sum(img_gpu * gray_weights, dim=1, keepdim=True)

    # Sobel kernels shaped as conv2d weights (out_channels, in_channels, 3, 3)
    sobel_x = torch.tensor(
        [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32, device=device
    ).view(1, 1, 3, 3)
    sobel_y = torch.tensor(
        [[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=torch.float32, device=device
    ).view(1, 1, 3, 3)

    # CUDA work is queued asynchronously: drain the transfer and grayscale
    # conversion first so they are not counted in the timed section.
    torch.cuda.synchronize()
    start_time = time.perf_counter()

    # Reflect the border instead of zero-padding. Zero padding makes the
    # image frame look like a strong edge, and those spurious responses can
    # dominate the max used for normalization below.
    padded = torch.nn.functional.pad(img_gray, (1, 1, 1, 1), mode="reflect")
    grad_x = torch.nn.functional.conv2d(padded, sobel_x)
    grad_y = torch.nn.functional.conv2d(padded, sobel_y)

    # Gradient magnitude
    magnitude = torch.sqrt(grad_x**2 + grad_y**2)

    # Scale to [0, 1]; the floor on the divisor keeps a perfectly flat image
    # (all-zero gradients) from producing NaNs via 0 / 0.
    magnitude = magnitude / magnitude.max().clamp_min(1e-12)

    torch.cuda.synchronize()  # Ensure GPU operations complete
    gpu_time = time.perf_counter() - start_time

    # Drop the batch axis and move the result back to the CPU
    result = magnitude.squeeze(0).cpu()

    return result, gpu_time

# Run both implementations on the sample image and report their timings
print("\nApplying Sobel Edge Detection...")
cpu_edge, cpu_time = sobel_edge_detection_cpu(img_tensor)
gpu_edge, gpu_time = sobel_edge_detection_gpu(img_tensor)
print(f"  CPU time: {cpu_time:.4f} seconds")
print(f"  GPU time: {gpu_time:.4f} seconds")
if device.type == "cuda":
    print(f"  Speedup: {cpu_time/gpu_time:.2f}x")

# Colour original as a channel-last array; edge maps squeezed to 2D
original_np = img_tensor.permute(1, 2, 0).numpy()
cpu_edge_np = cpu_edge.squeeze().numpy()
gpu_edge_np = gpu_edge.squeeze().numpy()

# Original and both edge maps side by side
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

axes[0].imshow(original_np)
axes[0].set_title("Original")
axes[0].set_axis_off()

axes[1].imshow(cpu_edge_np, cmap='gray')
axes[1].set_title(f"CPU Edge Detection: {cpu_time:.4f}s")
axes[1].set_axis_off()

axes[2].imshow(gpu_edge_np, cmap='gray')
axes[2].set_title(f"GPU Edge Detection: {gpu_time:.4f}s")
axes[2].set_axis_off()

# The speedup is only meaningful when the GPU path actually ran on CUDA
if device.type == "cuda":
    fig.suptitle(f"Sobel Edge Detection - Speedup: {cpu_time/gpu_time:.2f}x")
else:
    fig.suptitle("Sobel Edge Detection")
fig.tight_layout()
plt.show()

## 3. Image Resizing

Finally, let's compare CPU and GPU performance for image resizing.

In [None]:
def image_resize_cpu(img_tensor, scale_factor=0.25):
    """Resize an image with OpenCV bicubic interpolation and time it.

    Args:
        img_tensor: CPU tensor in channel-first layout. It is scaled by 255
            and converted to uint8, so values are expected to lie in [0, 1].
        scale_factor: Multiplier applied to both height and width; the new
            dimensions are truncated to integers.

    Returns:
        A tuple ``(result, cpu_time)``: the resized image as a channel-first
        float tensor divided by 255, and the elapsed seconds for computing
        the target size and resizing (the conversions are not timed).
    """
    # Channel-first float -> channel-last uint8 for OpenCV. Round instead of
    # truncating so the float round trip cannot drop an intensity level;
    # clip to avoid wrap-around on out-of-range input.
    pixels = img_tensor.permute(1, 2, 0).numpy() * 255.0
    img_np = np.clip(np.rint(pixels), 0, 255).astype(np.uint8)

    # perf_counter is monotonic and high resolution, unlike time.time()
    start_time = time.perf_counter()

    # cv2.resize takes the target size as (width, height)
    h, w = img_np.shape[:2]
    new_h, new_w = int(h * scale_factor), int(w * scale_factor)
    resized = cv2.resize(img_np, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    cpu_time = time.perf_counter() - start_time

    # Convert back to a channel-first tensor in [0, 1]
    result = torch.from_numpy(resized).float().permute(2, 0, 1) / 255.0

    return result, cpu_time

def image_resize_gpu(img_tensor, scale_factor=0.25):
    """Resize an image on the GPU with bicubic interpolation and time it.

    Falls back to ``image_resize_cpu`` when the selected device is not CUDA.

    Args:
        img_tensor: Channel-first image tensor, expected to hold values in
            [0, 1] (the result is clamped to that range).
        scale_factor: Multiplier applied to both spatial dimensions.

    Returns:
        A tuple ``(result, gpu_time)``: the resized image as a channel-first
        CPU tensor, and the elapsed seconds for the interpolation and clamp.
    """
    if device.type != "cuda":
        # Fall back to CPU if GPU not available
        return image_resize_cpu(img_tensor, scale_factor)

    # Move tensor to GPU
    img_gpu = img_tensor.to(device)

    # Make sure the transfer has finished so it is not counted in the timing
    torch.cuda.synchronize()
    start_time = time.perf_counter()

    # Add the batch axis expected by interpolate
    img_gpu = img_gpu.unsqueeze(0)

    # Resize using PyTorch's interpolate function
    resized = torch.nn.functional.interpolate(
        img_gpu,
        scale_factor=scale_factor,
        mode='bicubic',
        align_corners=False
    )
    # Bicubic interpolation can overshoot the input range; clamp so the
    # result stays a valid [0, 1] image, matching the saturating uint8
    # output of the CPU path.
    resized = resized.clamp(0.0, 1.0)

    torch.cuda.synchronize()  # Ensure GPU operations complete
    gpu_time = time.perf_counter() - start_time

    # Drop the batch axis and move the result back to the CPU
    result = resized.squeeze(0).cpu()

    return result, gpu_time

# Run both implementations on the sample image and report their timings
print("\nPerforming Image Resizing...")
cpu_resize, cpu_time = image_resize_cpu(img_tensor, scale_factor=0.25)
gpu_resize, gpu_time = image_resize_gpu(img_tensor, scale_factor=0.25)
print(f"  CPU time: {cpu_time:.4f} seconds")
print(f"  GPU time: {gpu_time:.4f} seconds")
if device.type == "cuda":
    print(f"  Speedup: {cpu_time/gpu_time:.2f}x")

# Channel-first tensors -> channel-last arrays for imshow
original_np = img_tensor.permute(1, 2, 0).numpy()
cpu_resize_np = cpu_resize.permute(1, 2, 0).numpy()
gpu_resize_np = gpu_resize.permute(1, 2, 0).numpy()

# Original and both resized versions side by side; titles show width x height
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

axes[0].imshow(original_np)
axes[0].set_title(f"Original ({original_np.shape[1]}x{original_np.shape[0]})")
axes[0].set_axis_off()

axes[1].imshow(cpu_resize_np)
axes[1].set_title(f"CPU Resize: {cpu_time:.4f}s ({cpu_resize_np.shape[1]}x{cpu_resize_np.shape[0]})")
axes[1].set_axis_off()

axes[2].imshow(gpu_resize_np)
axes[2].set_title(f"GPU Resize: {gpu_time:.4f}s ({gpu_resize_np.shape[1]}x{gpu_resize_np.shape[0]})")
axes[2].set_axis_off()

# The speedup is only meaningful when the GPU path actually ran on CUDA
if device.type == "cuda":
    fig.suptitle(f"Image Resizing - Speedup: {cpu_time/gpu_time:.2f}x")
else:
    fig.suptitle("Image Resizing")
fig.tight_layout()
plt.show()

## Performance Summary

Let's summarize the performance improvements we observed with GPU acceleration.

In [None]:
print_separator()
print("IMAGE PROCESSING PERFORMANCE SUMMARY")
print_separator()

# The benchmark cells above all store their timings in the same two names
# (cpu_time / gpu_time), so only the last operation's values survive. Measure
# each operation again here to get one timing per operation; index [1] picks
# the elapsed time out of each (result, seconds) tuple.
operations = ["Gaussian Blur", "Edge Detection", "Image Resizing"]
cpu_time_blur = gaussian_blur_cpu(img_tensor)[1]
gpu_time_blur = gaussian_blur_gpu(img_tensor)[1]
cpu_time_edge = sobel_edge_detection_cpu(img_tensor)[1]
gpu_time_edge = sobel_edge_detection_gpu(img_tensor)[1]
cpu_time_resize = image_resize_cpu(img_tensor)[1]
gpu_time_resize = image_resize_gpu(img_tensor)[1]
cpu_times = [cpu_time_blur, cpu_time_edge, cpu_time_resize]
gpu_times = [gpu_time_blur, gpu_time_edge, gpu_time_resize]

# Calculate speedups
if device.type == "cuda":
    # Guard against a zero reading from a coarse timer
    speedups = [cpu / gpu if gpu > 0 else float("inf") for cpu, gpu in zip(cpu_times, gpu_times)]
    avg_speedup = sum(speedups) / len(speedups)

    # Grouped bar chart: one CPU bar and one GPU bar per operation
    plt.figure(figsize=(10, 6))
    bar_width = 0.35
    index = np.arange(len(operations))

    plt.bar(index - bar_width/2, cpu_times, bar_width, label='CPU', color='blue')
    plt.bar(index + bar_width/2, gpu_times, bar_width, label='GPU', color='green')

    # Put each speedup label directly on top of the taller bar of its group.
    # A fixed offset in seconds would push the label far outside the axes
    # when the timings are only a few milliseconds.
    for i, speedup in enumerate(speedups):
        plt.text(i, max(cpu_times[i], gpu_times[i]), f'{speedup:.1f}x',
                 ha='center', va='bottom', fontweight='bold')
    plt.margins(y=0.15)  # headroom so the labels stay inside the axes

    plt.xlabel('Operation')
    plt.ylabel('Time (seconds)')
    plt.title('CPU vs GPU Performance for Image Processing')
    plt.xticks(index, operations)
    plt.legend()
    plt.tight_layout()
    plt.show()

    print(f"Average speedup across all operations: {avg_speedup:.2f}x")
    print("\nObservations:")
    print(f"  • Gaussian Blur: {speedups[0]:.2f}x speedup")
    print(f"  • Edge Detection: {speedups[1]:.2f}x speedup")
    print(f"  • Image Resizing: {speedups[2]:.2f}x speedup")
    print("\nGPU acceleration is particularly effective for:")
    print("  • Operations that can be parallelized (like convolutions)")
    print("  • Processing larger images")
    print("  • Batch processing multiple images")
else:
    print("No GPU was available for this benchmark.")
    print("To see the benefits of GPU acceleration:")
    print("  1. Go to Runtime > Change runtime type")
    print("  2. Select 'GPU' under Hardware accelerator")
    print("  3. Click 'Save' and run this notebook again")
    print("\nWith a GPU, you would typically see:")
    print("  • 5-20x speedup for Gaussian blur")
    print("  • 10-30x speedup for edge detection")
    print("  • 3-15x speedup for image resizing")

print_separator()