# Oil Spill Detection - Data Preprocessing

This notebook implements the complete data preprocessing pipeline for the oil spill detection project.

## Objectives
1. Apply preprocessing techniques to satellite imagery
2. Implement noise reduction for SAR images
3. Normalize pixel values and enhance contrast
4. Create processed dataset ready for training
5. Generate comprehensive preprocessing statistics

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm
import json
import warnings
warnings.filterwarnings('ignore')

# Custom modules
from data.data_loader import OilSpillDataLoader
from data.preprocessor import OilSpillPreprocessor, create_preprocessing_report
from data.augmentation import OilSpillAugmentor

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Load Dataset and Initialize Preprocessor

In [None]:
# Load dataset
data_dir = "../data/raw"
loader = OilSpillDataLoader(data_dir)
dataset_info = loader.load_dataset_info()

print(f"Dataset loaded: {dataset_info['total_samples']} samples")

# Initialize preprocessor
preprocessor = OilSpillPreprocessor(target_size=(256, 256))

print("Preprocessor initialized with target size: (256, 256)")

## 2. Preprocessing Technique Comparison

In [None]:
# Load a sample image for preprocessing comparison
sample_image, sample_mask = loader.load_image_pair(0)

print(f"Sample image shape: {sample_image.shape}")
print(f"Sample mask shape: {sample_mask.shape}")
print(f"Image value range: [{sample_image.min():.3f}, {sample_image.max():.3f}]")
print(f"Mask value range: [{sample_mask.min():.3f}, {sample_mask.max():.3f}]")

In [None]:
# Compare different normalization methods
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Original image
axes[0, 0].imshow(sample_image)
axes[0, 0].set_title('Original Image')
axes[0, 0].axis('off')

# Different normalization methods
norm_methods = ['minmax', 'zscore', 'robust']
for i, method in enumerate(norm_methods):
    normalized = preprocessor.normalize_image(sample_image, method=method)
    axes[0, i+1].imshow(normalized)
    axes[0, i+1].set_title(f'{method.title()} Normalization')
    axes[0, i+1].axis('off')

# Histogram comparison
axes[1, 0].hist(sample_image.flatten(), bins=50, alpha=0.7, label='Original')
axes[1, 0].set_title('Original Histogram')
axes[1, 0].legend()

for i, method in enumerate(norm_methods):
    normalized = preprocessor.normalize_image(sample_image, method=method)
    axes[1, i+1].hist(normalized.flatten(), bins=50, alpha=0.7, label=method)
    axes[1, i+1].set_title(f'{method.title()} Histogram')
    axes[1, i+1].legend()

plt.tight_layout()
plt.show()

## 3. Noise Reduction Techniques

In [None]:
# Compare different noise reduction methods
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Original image
axes[0, 0].imshow(sample_image)
axes[0, 0].set_title('Original Image')
axes[0, 0].axis('off')

# Different noise reduction methods
filter_methods = ['gaussian', 'median', 'bilateral']
for i, method in enumerate(filter_methods):
    filtered = preprocessor.reduce_speckle_noise(sample_image, filter_type=method)
    axes[0, i+1].imshow(filtered)
    axes[0, i+1].set_title(f'{method.title()} Filter')
    axes[0, i+1].axis('off')

# Show difference maps
axes[1, 0].imshow(np.zeros_like(sample_image[:,:,0]), cmap='gray')
axes[1, 0].set_title('Reference')
axes[1, 0].axis('off')

for i, method in enumerate(filter_methods):
    filtered = preprocessor.reduce_speckle_noise(sample_image, filter_type=method)
    diff = np.abs(sample_image - filtered)
    axes[1, i+1].imshow(np.mean(diff, axis=2), cmap='hot')
    axes[1, i+1].set_title(f'{method.title()} Difference')
    axes[1, i+1].axis('off')

plt.tight_layout()
plt.show()

## 4. Contrast Enhancement Comparison

In [None]:
# Compare contrast enhancement methods
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Original image
axes[0, 0].imshow(sample_image)
axes[0, 0].set_title('Original Image')
axes[0, 0].axis('off')

# CLAHE enhancement
clahe_enhanced = preprocessor.enhance_contrast(sample_image, method='clahe')
axes[0, 1].imshow(clahe_enhanced)
axes[0, 1].set_title('CLAHE Enhanced')
axes[0, 1].axis('off')

# Histogram equalization
hist_enhanced = preprocessor.enhance_contrast(sample_image, method='histogram_eq')
axes[0, 2].imshow(hist_enhanced)
axes[0, 2].set_title('Histogram Equalized')
axes[0, 2].axis('off')

# Histograms
axes[1, 0].hist(sample_image.flatten(), bins=50, alpha=0.7, label='Original')
axes[1, 0].set_title('Original Histogram')
axes[1, 0].legend()

axes[1, 1].hist(clahe_enhanced.flatten(), bins=50, alpha=0.7, label='CLAHE', color='orange')
axes[1, 1].set_title('CLAHE Histogram')
axes[1, 1].legend()

axes[1, 2].hist(hist_enhanced.flatten(), bins=50, alpha=0.7, label='Hist Eq', color='green')
axes[1, 2].set_title('Hist Eq Histogram')
axes[1, 2].legend()

plt.tight_layout()
plt.show()

## 5. Complete Preprocessing Pipeline

In [None]:
# Demonstrate complete preprocessing pipeline
preprocessor.visualize_preprocessing_effects(sample_image)

## 6. Process Complete Dataset

In [None]:
# Calculate comprehensive preprocessing statistics
print("Calculating preprocessing statistics for the entire dataset...")
print("This may take a few minutes...")

stats = preprocessor.calculate_dataset_statistics(
    loader.image_paths, 
    loader.mask_paths
)

print("\nPreprocessing Statistics:")
print(json.dumps(stats, indent=2))

## 7. Create Processed Dataset

In [None]:
# Create directories for processed data
processed_dir = "../data/processed"
processed_images_dir = os.path.join(processed_dir, "images")
processed_masks_dir = os.path.join(processed_dir, "masks")

os.makedirs(processed_images_dir, exist_ok=True)
os.makedirs(processed_masks_dir, exist_ok=True)

print(f"Processing and saving {len(loader.image_paths)} image pairs...")

# Process and save all images
processed_count = 0
failed_count = 0

for i, (img_path, mask_path) in enumerate(tqdm(
    zip(loader.image_paths, loader.mask_paths),
    total=len(loader.image_paths),
    desc="Processing images"
)):
    try:
        # Load original image and mask
        image = cv2.imread(img_path)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        
        if image is None or mask is None:
            failed_count += 1
            continue
        
        # Convert BGR to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Normalize to [0, 1]
        image = image.astype(np.float32) / 255.0
        mask = mask.astype(np.float32) / 255.0
        
        # Apply preprocessing
        processed_image = preprocessor.preprocess_image(
            image,
            apply_noise_reduction=True,
            apply_contrast_enhancement=True,
            normalization_method='minmax'
        )
        
        processed_mask = preprocessor.preprocess_mask(mask, threshold=0.5)
        
        # Convert back to uint8 for saving
        processed_image_uint8 = (processed_image * 255).astype(np.uint8)
        processed_mask_uint8 = (processed_mask * 255).astype(np.uint8)
        
        # Save processed files
        base_name = os.path.splitext(os.path.basename(img_path))[0]
        
        processed_img_path = os.path.join(processed_images_dir, f"{base_name}.png")
        processed_mask_path = os.path.join(processed_masks_dir, f"{base_name}.png")
        
        cv2.imwrite(processed_img_path, cv2.cvtColor(processed_image_uint8, cv2.COLOR_RGB2BGR))
        cv2.imwrite(processed_mask_path, processed_mask_uint8)
        
        processed_count += 1
        
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        failed_count += 1
        continue

print(f"\nProcessing complete!")
print(f"Successfully processed: {processed_count} images")
print(f"Failed to process: {failed_count} images")
print(f"Success rate: {(processed_count/(processed_count+failed_count))*100:.1f}%")

## 8. Data Augmentation Preview

In [None]:
# Initialize augmentor and show augmentation examples
augmentor = OilSpillAugmentor(target_size=(256, 256))

# Load a sample for augmentation demonstration
sample_image, sample_mask = loader.load_image_pair(0)

# Show augmentation examples
augmentor.visualize_augmentations(
    sample_image,
    sample_mask,
    num_examples=5,
    transform_type='train'
)

## 9. Create Small Augmented Dataset

In [None]:
# Create a small augmented dataset for demonstration
augmented_dir = "../data/augmented_sample"

# Use first 10 images for augmentation demo
sample_images = loader.image_paths[:10]
sample_masks = loader.mask_paths[:10]

print(f"Creating augmented dataset with {len(sample_images)} base images...")

aug_stats = augmentor.create_augmented_dataset(
    sample_images,
    sample_masks,
    augmented_dir,
    augmentations_per_image=3,
    transform_type='train'
)

print("\nAugmentation Statistics:")
for key, value in aug_stats.items():
    print(f"{key}: {value}")

## 10. Preprocessing Quality Assessment

In [None]:
# Compare original vs processed images
fig, axes = plt.subplots(3, 4, figsize=(16, 12))

# Load and compare multiple samples
for i in range(4):
    try:
        # Load original
        original_image, original_mask = loader.load_image_pair(i)
        
        # Process
        processed_image = preprocessor.preprocess_image(original_image)
        processed_mask = preprocessor.preprocess_mask(original_mask)
        
        # Show original
        axes[0, i].imshow(original_image)
        axes[0, i].set_title(f'Original {i+1}')
        axes[0, i].axis('off')
        
        # Show processed
        axes[1, i].imshow(processed_image)
        axes[1, i].set_title(f'Processed {i+1}')
        axes[1, i].axis('off')
        
        # Show mask comparison
        mask_comparison = np.hstack([original_mask, processed_mask])
        axes[2, i].imshow(mask_comparison, cmap='gray')
        axes[2, i].set_title(f'Masks {i+1} (Orig|Proc)')
        axes[2, i].axis('off')
        
    except Exception as e:
        print(f"Error comparing sample {i}: {e}")
        continue

plt.tight_layout()
plt.show()

## 11. Save Results and Generate Report

In [None]:
# Create results directories
os.makedirs('../results/figures', exist_ok=True)
os.makedirs('../results/data', exist_ok=True)

# Save preprocessing statistics
preprocessor.save_preprocessing_stats('../results/data/preprocessing_stats.json')

# Save augmentation statistics
augmentor.save_augmentation_stats('../results/data/augmentation_stats.json')

# Create comprehensive preprocessing report
create_preprocessing_report(stats, '../results/figures')

print("All results saved to ../results/ directory")

## 12. Final Summary and Next Steps

In [None]:
# Generate final summary
print("=" * 70)
print("PREPROCESSING PIPELINE SUMMARY REPORT")
print("=" * 70)

print(f"\nüìä PROCESSING RESULTS:")
print(f"   ‚Ä¢ Original dataset: {dataset_info['total_samples']} samples")
print(f"   ‚Ä¢ Successfully processed: {processed_count} samples")
print(f"   ‚Ä¢ Processing success rate: {(processed_count/(processed_count+failed_count))*100:.1f}%")
print(f"   ‚Ä¢ Augmented samples created: {aug_stats['generated_augmentations']}")

print(f"\nüîß PREPROCESSING PIPELINE:")
print(f"   ‚Ä¢ Target image size: 256x256 pixels")
print(f"   ‚Ä¢ Noise reduction: Gaussian filtering")
print(f"   ‚Ä¢ Contrast enhancement: CLAHE")
print(f"   ‚Ä¢ Normalization: Min-Max scaling")
print(f"   ‚Ä¢ Mask binarization: Threshold = 0.5")

print(f"\nüìà DATASET STATISTICS:")
if stats:
    print(f"   ‚Ä¢ Mean pixel value: {stats['pixel_statistics']['mean']:.3f}")
    print(f"   ‚Ä¢ Pixel std deviation: {stats['pixel_statistics']['std']:.3f}")
    print(f"   ‚Ä¢ Average spill ratio: {stats['spill_statistics']['mean_spill_ratio']:.3f}")
    print(f"   ‚Ä¢ Samples with spills: {stats['spill_statistics']['samples_with_spill']}")

print(f"\nüìÅ OUTPUT DIRECTORIES:")
print(f"   ‚Ä¢ Processed images: ../data/processed/")
print(f"   ‚Ä¢ Augmented samples: ../data/augmented_sample/")
print(f"   ‚Ä¢ Results and figures: ../results/")

print(f"\n‚úÖ MILESTONE 1 COMPLETION STATUS:")
print(f"   ‚úì Data Collection: Complete")
print(f"   ‚úì Data Exploration: Complete")
print(f"   ‚úì Data Preprocessing: Complete")
print(f"   ‚úì Data Augmentation: Complete")

print(f"\nüéØ NEXT STEPS (MILESTONE 2):")
print(f"   ‚Ä¢ Implement U-Net model architecture")
print(f"   ‚Ä¢ Set up training pipeline")
print(f"   ‚Ä¢ Configure loss functions and metrics")
print(f"   ‚Ä¢ Begin model training")

print(f"\nüéâ MILESTONE 1 SUCCESSFULLY COMPLETED!")
print("=" * 70)

# Create milestone completion file
milestone_status = {
    "milestone_1": {
        "status": "COMPLETED",
        "completion_date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        "modules_completed": [
            "Data Collection",
            "Data Exploration (EDA)",
            "Data Preprocessing",
            "Data Augmentation"
        ],
        "deliverables": {
            "processed_dataset": f"{processed_count} images",
            "augmented_samples": f"{aug_stats['generated_augmentations']} samples",
            "preprocessing_pipeline": "Implemented and tested",
            "documentation": "Complete with statistics and visualizations"
        },
        "next_milestone": "Model Development and Training"
    }
}

with open('../results/data/milestone_1_completion.json', 'w') as f:
    json.dump(milestone_status, f, indent=2)

print("\nMilestone completion status saved to ../results/data/milestone_1_completion.json")