# Oil Spill Detection - Data Exploration

This notebook performs comprehensive exploratory data analysis (EDA) on the oil spill detection dataset.

## Objectives
1. Load and examine the dataset structure
2. Analyze image properties and class distribution
3. Visualize sample data
4. Calculate dataset statistics
5. Identify potential data quality issues

In [None]:
# Import required libraries
import sys
import os
# Fixed import path to correctly reference src directory
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Import the project's data modules. The primary path relies on the `src`
# directory having been appended to sys.path above. If that import fails,
# fall back to loading the two source files directly by their relative path.
def _load_module_from_file(module_name, file_path):
    """Load and return the module stored at *file_path* as *module_name*.

    Follows the importlib "importing a source file directly" recipe: the
    module is registered in ``sys.modules`` before it is executed, and is
    removed again if execution fails so no half-initialised module lingers.

    Raises:
        ImportError: if no import spec/loader can be built for *file_path*.
        Any exception raised while executing the module (for example
        FileNotFoundError when the file does not exist) is propagated.
    """
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {file_path}")

    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a partially executed module importable by name.
        sys.modules.pop(module_name, None)
        raise
    return module


try:
    from data.data_loader import OilSpillDataLoader, create_sample_dataset
    from data.preprocessor import OilSpillPreprocessor
    print("✅ Custom modules imported successfully!")
except ImportError as e:
    # Report enough context to diagnose a wrong working directory.
    print(f"❌ Import error: {e}")
    print("Current working directory:", os.getcwd())
    print("Python path:", sys.path)
    print("\nTrying alternative import...")

    # Paths are relative to the current working directory, matching the
    # sys.path entry built from os.getcwd() above.
    data_loader_module = _load_module_from_file("data_loader", "../src/data/data_loader.py")
    OilSpillDataLoader = data_loader_module.OilSpillDataLoader
    create_sample_dataset = data_loader_module.create_sample_dataset

    preprocessor_module = _load_module_from_file("preprocessor", "../src/data/preprocessor.py")
    OilSpillPreprocessor = preprocessor_module.OilSpillPreprocessor
    print("✅ Alternative import successful!")

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("📚 All libraries loaded successfully!")

## 1. Dataset Loading and Initial Inspection

In [None]:
# Locate the dataset, falling back to a generated sample dataset when the
# expected folder is missing, then load and print the dataset summary.
# Update this path to point at your dataset folder (with train/val/test).
data_dir = "../dataset"

if os.path.exists(data_dir):
    print(f"✅ Dataset found at {data_dir}")
else:
    # Explain what was expected before substituting a demo dataset.
    for message in (
        f"❌ Dataset not found at {data_dir}",
        "Please update the data_dir path to point to your dataset folder",
        "Expected structure:",
        "dataset/",
        "├── train/",
        "├── val/",
        "├── test/",
        "└── label_colors",
    ):
        print(message)

    # Create sample dataset for demonstration and use it from here on.
    sample_dir = "../data/raw"
    print(f"\nCreating sample dataset at {sample_dir} for demonstration...")
    create_sample_dataset(sample_dir, num_samples=50)
    data_dir = sample_dir
    print("✅ Sample dataset created!")

# Build the loader on whichever directory was selected above.
print("\n🔄 Loading dataset...")
loader = OilSpillDataLoader(data_dir)
dataset_info = loader.load_dataset_info()

print("\n📊 Dataset Information:")
print("=" * 50)

# These entries are skipped in the generic listing; the label colours are
# printed separately below.
hidden_keys = {'sample_image_paths', 'sample_mask_paths', 'label_colors_info'}
for info_key, info_value in dataset_info.items():
    if info_key in hidden_keys:
        continue
    print(f"{info_key}: {info_value}")

# Show label colors info if available
if 'label_colors_info' in dataset_info:
    print("\n🎨 Label Colors Info:")
    print(dataset_info['label_colors_info'])

## 2. Sample Data Visualization

In [None]:
# Show a grid of sample data. Classification datasets get a 3x4 grid of
# labelled training images; any other dataset type is treated as segmentation
# and gets image / mask / overlay rows for up to six samples.
print("🖼️ Visualizing sample data...")

if dataset_info['dataset_type'] == 'classification':
    # Best-effort: a loader failure is reported instead of aborting the notebook.
    try:
        train_images, train_labels = loader.load_classification_data('train', target_size=(256, 256))

        fig, axes = plt.subplots(3, 4, figsize=(16, 12))

        # Hide every axis up front so unused grid slots stay blank.
        for ax in axes.flat:
            ax.axis('off')

        # Show first 12 samples
        for i in range(min(12, len(train_images))):
            ax = axes.flat[i]
            ax.imshow(train_images[i])
            label_text = "Oil Spill" if train_labels[i] == 1 else "Clean Water"
            ax.set_title(f'Sample {i+1}: {label_text}')

        plt.suptitle('Sample Images from Training Set', fontsize=16)
        plt.tight_layout()
        plt.show()

        print(f"✅ Loaded {len(train_images)} training samples")
        print(f"📊 Label distribution: {np.bincount(train_labels)}")

    except Exception as e:
        print(f"❌ Error loading classification data: {e}")
        print("This might be because your dataset structure is different than expected.")

else:
    # Segmentation: row 0 = image, row 1 = mask, row 2 = overlay.
    fig, axes = plt.subplots(3, 6, figsize=(18, 9))

    # Hide every axis up front so columns without a sample (fewer than six
    # samples, or a sample that failed to load) stay blank, consistent with
    # the classification grid above.
    for ax in axes.flat:
        ax.axis('off')

    num_samples = min(6, dataset_info['total_samples'])

    for i in range(num_samples):
        try:
            # Load image and mask
            image, mask = loader.load_image_pair(i)

            # Original image
            axes[0, i].imshow(image)
            axes[0, i].set_title(f'Image {i+1}')

            # Mask
            axes[1, i].imshow(mask, cmap='gray')
            axes[1, i].set_title(f'Mask {i+1}')

            # Overlay: saturate the red channel wherever the mask is set.
            # Use the full-scale value for the image dtype so the overlay is
            # visible on integer (0-255) images as well as float images.
            # NOTE(review): assumes mask is 2-D (H, W) and image is (H, W, C) - confirm
            overlay = image.copy()
            red_value = 255 if np.issubdtype(overlay.dtype, np.integer) else 1.0
            overlay[:, :, 0] = np.where(mask > 0.5, red_value, overlay[:, :, 0])
            axes[2, i].imshow(overlay)
            axes[2, i].set_title(f'Overlay {i+1}')

        except Exception as e:
            # Skip the failing sample but keep rendering the others.
            print(f"Error loading sample {i}: {e}")
            continue

    plt.tight_layout()
    plt.show()

## 3. Dataset Statistics Analysis

In [None]:
# Compute summary statistics. Classification datasets are analysed per split
# and collected in `all_stats`; other dataset types use the loader's own
# sample statistics.
print("📈 Calculating dataset statistics...")

if dataset_info['dataset_type'] == 'classification':
    # Maps split name -> dict of statistics for that split.
    all_stats = {}

    for split in ('train', 'val', 'test'):
        try:
            images, labels = loader.load_classification_data(split, target_size=(256, 256))

            # Nothing to measure for an empty split.
            if len(images) == 0:
                continue

            image_array = np.array(images)
            sample_total = len(images)
            oil_total = np.sum(labels)

            split_stats = {
                'num_samples': sample_total,
                'num_oil_spill': oil_total,
                'num_clean_water': len(labels) - oil_total,
                'mean_intensity': np.mean(image_array),
                'std_intensity': np.std(image_array),
                'min_intensity': np.min(image_array),
                'max_intensity': np.max(image_array),
            }
            all_stats[split] = split_stats

            oil_pct = split_stats['num_oil_spill'] / split_stats['num_samples'] * 100
            clean_pct = split_stats['num_clean_water'] / split_stats['num_samples'] * 100

            print(f"\n📊 {split.upper()} SET STATISTICS:")
            print(f"   Total samples: {split_stats['num_samples']}")
            print(f"   Oil spill samples: {split_stats['num_oil_spill']} ({oil_pct:.1f}%)")
            print(f"   Clean water samples: {split_stats['num_clean_water']} ({clean_pct:.1f}%)")
            print(f"   Mean intensity: {split_stats['mean_intensity']:.4f}")
            print(f"   Intensity std: {split_stats['std_intensity']:.4f}")

        except Exception as e:
            # A failing split is reported and skipped; the others still run.
            print(f"❌ Error analyzing {split} set: {e}")
            continue

else:
    # Segmentation statistics come from the loader, over at most 50 samples.
    analysis_count = min(50, dataset_info['total_samples'])
    stats = loader.get_sample_statistics(num_samples=analysis_count)

    print("Dataset Statistics:")
    print(f"Samples analyzed: {stats['sample_count']}")
    print(f"Average oil spill ratio: {stats['average_spill_ratio']:.4f}")
    print(f"Spill ratio std: {stats['spill_ratio_std']:.4f}")
    print(f"Min spill ratio: {stats['min_spill_ratio']:.4f}")
    print(f"Max spill ratio: {stats['max_spill_ratio']:.4f}")
    print(f"\nImage shapes (first 5): {stats['image_shapes'][:5]}")

## 4. Data Distribution Visualization

In [None]:
# Plot class and intensity distributions per split for classification
# datasets. Requires the per-split `all_stats` dict produced by the statistics
# cell; plotting is skipped when it is missing or empty.
print("📊 Creating data distribution visualizations...")

is_classification = dataset_info['dataset_type'] == 'classification'
# `all_stats` exists but is empty when every split failed to load or was
# empty; plotting that would produce blank charts and a zero-sum pie.
has_split_stats = bool(locals().get('all_stats'))

if is_classification and has_split_stats:
    # Create comprehensive visualization for classification
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Per-split series, all in the same split order.
    splits = list(all_stats.keys())
    oil_counts = [all_stats[split]['num_oil_spill'] for split in splits]
    clean_counts = [all_stats[split]['num_clean_water'] for split in splits]
    sample_counts = [all_stats[split]['num_samples'] for split in splits]
    mean_intensities = [all_stats[split]['mean_intensity'] for split in splits]
    std_intensities = [all_stats[split]['std_intensity'] for split in splits]

    # Class distribution across splits (grouped bars)
    x = np.arange(len(splits))
    width = 0.35

    axes[0, 0].bar(x - width/2, oil_counts, width, label='Oil Spill', alpha=0.8)
    axes[0, 0].bar(x + width/2, clean_counts, width, label='Clean Water', alpha=0.8)
    axes[0, 0].set_xlabel('Dataset Split')
    axes[0, 0].set_ylabel('Number of Samples')
    axes[0, 0].set_title('Class Distribution Across Splits')
    axes[0, 0].set_xticks(x)
    axes[0, 0].set_xticklabels(splits)
    axes[0, 0].legend()

    # Overall class distribution pie chart
    total_oil = sum(oil_counts)
    total_clean = sum(clean_counts)

    axes[0, 1].pie([total_oil, total_clean],
                   labels=['Oil Spill', 'Clean Water'],
                   autopct='%1.1f%%',
                   startangle=90)
    axes[0, 1].set_title('Overall Class Distribution')

    # Mean intensity by split
    axes[0, 2].bar(splits, mean_intensities, alpha=0.7, color='orange')
    axes[0, 2].set_title('Mean Intensity by Split')
    axes[0, 2].set_ylabel('Mean Intensity')

    # Sample count by split
    axes[1, 0].bar(splits, sample_counts, alpha=0.7, color='green')
    axes[1, 0].set_title('Sample Count by Split')
    axes[1, 0].set_ylabel('Number of Samples')

    # Class balance: share of oil-spill samples per split, with a 50% reference line
    balance_ratios = [oil / total for oil, total in zip(oil_counts, sample_counts)]
    axes[1, 1].plot(splits, balance_ratios, marker='o', linewidth=2, markersize=8)
    axes[1, 1].axhline(y=0.5, color='r', linestyle='--', alpha=0.7, label='Perfect Balance')
    axes[1, 1].set_title('Class Balance Across Splits')
    axes[1, 1].set_ylabel('Oil Spill Ratio')
    axes[1, 1].legend()
    axes[1, 1].set_ylim(0, 1)

    # Intensity standard deviation
    axes[1, 2].bar(splits, std_intensities, alpha=0.7, color='purple')
    axes[1, 2].set_title('Intensity Std by Split')
    axes[1, 2].set_ylabel('Std Intensity')

    plt.tight_layout()
    plt.show()

elif is_classification:
    print("⚠️ Visualization not available - no split statistics were computed")

else:
    print("⚠️ Visualization not available - using sample data or segmentation dataset")

## 5. Summary and Recommendations

In [None]:
# Print the end-of-notebook report: dataset overview, recommendations that
# depend on the dataset type, and the suggested next steps.
banner = "=" * 60
print(banner)
print("DATA EXPLORATION SUMMARY REPORT")
print(banner)

print("\n📊 DATASET OVERVIEW:")
print(f"   • Dataset type: {dataset_info['dataset_type']}")
print(f"   • Total samples: {dataset_info.get('total_samples', 'N/A')}")

is_classification = dataset_info['dataset_type'] == 'classification'

if is_classification:
    print(f"   • Training samples: {dataset_info.get('train_samples', 0)}")
    print(f"   • Validation samples: {dataset_info.get('val_samples', 0)}")
    print(f"   • Test samples: {dataset_info.get('test_samples', 0)}")

    # Class details are optional; the distribution is only shown with classes.
    class_names = dataset_info.get('classes')
    if class_names:
        print(f"   • Classes: {class_names}")
        class_distribution = dataset_info.get('class_distribution')
        if class_distribution:
            print(f"   • Class distribution: {class_distribution}")

print("\n🎯 RECOMMENDATIONS:")
if is_classification:
    total_samples = dataset_info.get('total_samples', 0)
    # More than 100 samples is treated as an adequate dataset size.
    size_message = (
        f"   • ✅ Dataset size is adequate for training ({total_samples} samples)"
        if total_samples > 100
        else f"   • ⚠️ Dataset is small ({total_samples} samples) - consider data augmentation"
    )
    print(size_message)

    for tip in (
        "   • Use data augmentation: rotation, flipping, brightness/contrast adjustment",
        "   • Apply normalization and preprocessing",
        "   • Consider transfer learning with pre-trained models",
    ):
        print(tip)
else:
    for tip in (
        "   • Segmentation dataset detected",
        "   • Use U-Net or similar architecture for pixel-wise classification",
    ):
        print(tip)

for closing_line in (
    "\n✅ MILESTONE 1 STATUS: DATA EXPLORATION COMPLETE",
    "\n🚀 NEXT STEPS:",
    "   1. Run preprocessing notebook (02_preprocessing.ipynb)",
    "   2. Implement data augmentation",
    "   3. Prepare data loaders for training",
    banner,
):
    print(closing_line)

## 6. Save Results

In [None]:
# Persist the exploration results under ../results/ (relative to the current
# working directory): the dataset summary always, and the per-split
# classification statistics when any were computed.
for results_dir in ('../results/figures', '../results/data'):
    os.makedirs(results_dir, exist_ok=True)

# Save dataset information as a single-row CSV (one column per info key).
dataset_info_df = pd.DataFrame([dataset_info])
dataset_info_df.to_csv('../results/data/dataset_info.csv', index=False)

# Save statistics only when the statistics cell actually produced some.
# `all_stats` can exist but be empty (every split failed or was empty);
# writing that would create an empty CSV and falsely report it as saved.
if locals().get('all_stats'):
    # Transpose so each row is a split and each column a statistic.
    stats_df = pd.DataFrame(all_stats).T
    stats_df.to_csv('../results/data/dataset_statistics.csv')
    print("📁 Classification statistics saved")

print("📁 Results saved to ../results/ directory")
print("\n🎉 DATA EXPLORATION COMPLETE!")
print("Next step: Run preprocessing notebook (02_preprocessing.ipynb)")