# Phase 1: Dataset Preparation & Environment Setup
Outdoor Object Detection & Face Recognition System

This notebook handles:
- Environment setup and dependency installation
- Dataset download (LFW, WiderFace, RTTS, BDD100K)
- Preprocessing (resize to 640x640, train/val/test split)
- Data augmentation (fog, rain, low-light, motion blur)
- Dataset statistics and verification

- **Runtime**: GPU (T4) recommended for faster processing
- **Storage**: Results saved to Google Drive

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive so files written under it outlive
# the Colab runtime.
drive.mount('/content/drive')

# Root folder on Drive for this project; later cells read PROJECT_DIR.
PROJECT_DIR = '/content/drive/MyDrive/computer_vision'
os.makedirs(PROJECT_DIR, exist_ok=True)  # no-op when the folder already exists
print(f"Project directory: {PROJECT_DIR}")

In [None]:
%cd /content
!rm -rf computer_vision_expirement
!git clone https://github.com/Ib-Programmer/computer_vision_expirement.git
%cd computer_vision_expirement
!pip install -q -r requirements.txt
!pip install -q gdown

## 1.1 Dataset Download

In [None]:
# Run the repository's dataset download script. The script path is relative,
# so the working directory is first set to the root of the cloned repo.
%cd /content/computer_vision_expirement
!python scripts/download_datasets.py

## 1.2 Preprocessing

In [None]:
# Run the repository's preprocessing script. The script path is relative,
# so the working directory is first set to the root of the cloned repo.
%cd /content/computer_vision_expirement
!python scripts/preprocess_data.py

## 1.3 Augmentation Preview (Visual Only)
Augmentations (fog, rain, low-light, motion blur, combined) are applied **on-the-fly during training** in Phase 3.
Because no augmented copies are written to disk, this saves ~35GB of storage; it also tends to give better results, since each epoch sees a different random augmentation of every image.

Below we generate a small visual preview to verify the augmentations look correct.

In [None]:
%cd /content/computer_vision_expirement
!pip install -q albumentations

import cv2
import numpy as np
import matplotlib.pyplot as plt
import albumentations as A
import glob

# Define outdoor augmentations (same ones used in Phase 3 training)
augmentations = {
    'fog': A.RandomFog(fog_coef_lower=0.3, fog_coef_upper=0.7, alpha_coef=0.08, p=1.0),
    'rain': A.RandomRain(slant_lower=-10, slant_upper=10, drop_length=20, drop_width=1,
                         drop_color=(200, 200, 200), blur_value=3, brightness_coefficient=0.7, p=1.0),
    'low_light': A.RandomBrightnessContrast(brightness_limit=(-0.5, -0.2), contrast_limit=(-0.3, 0.0), p=1.0),
    'motion_blur': A.MotionBlur(blur_limit=(7, 15), p=1.0),
    'combined': A.Compose([
        A.OneOf([
            A.RandomFog(fog_coef_lower=0.2, fog_coef_upper=0.5, alpha_coef=0.08, p=1.0),
            A.RandomRain(slant_lower=-10, slant_upper=10, drop_length=20, drop_width=1,
                         drop_color=(200, 200, 200), blur_value=3, brightness_coefficient=0.7, p=1.0),
        ], p=0.5),
        A.OneOf([
            A.RandomBrightnessContrast(brightness_limit=(-0.4, -0.1), contrast_limit=(-0.2, 0.0), p=1.0),
            A.MotionBlur(blur_limit=(7, 12), p=1.0),
        ], p=0.5),
    ]),
}

# Find sample images from preprocessed datasets
PROJECT_DIR = '/content/drive/MyDrive/computer_vision'
sample_paths = glob.glob(f'{PROJECT_DIR}/datasets/*_processed/train/*.jpg')[:3]
if not sample_paths:
    sample_paths = glob.glob(f'/content/computer_vision_expirement/datasets/*_processed/train/*.jpg')[:3]

if sample_paths:
    aug_names = list(augmentations.keys())
    fig, axes = plt.subplots(len(sample_paths), len(aug_names) + 1, figsize=(20, 4 * len(sample_paths)))
    if len(sample_paths) == 1:
        axes = [axes]

    for row, img_path in enumerate(sample_paths):
        img = cv2.imread(img_path)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        axes[row][0].imshow(img_rgb)
        axes[row][0].set_title('Original', fontsize=10)
        axes[row][0].axis('off')

        for col, aug_name in enumerate(aug_names, 1):
            transform = augmentations[aug_name]
            if isinstance(transform, A.Compose):
                aug_img = transform(image=img_rgb)['image']
            else:
                t = A.Compose([transform])
                aug_img = t(image=img_rgb)['image']
            axes[row][col].imshow(aug_img)
            axes[row][col].set_title(aug_name, fontsize=10)
            axes[row][col].axis('off')

    plt.suptitle('Augmentation Preview (applied on-the-fly during Phase 3 training)', fontsize=14)
    plt.tight_layout()
    plt.savefig(f'{PROJECT_DIR}/results/augmentation_preview.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Preview saved. Full augmentations run on-the-fly in Phase 3.")
else:
    print("No preprocessed images found. Run preprocessing first.")

## 1.4 Dataset Statistics

In [None]:
# Run the repository's dataset statistics script. The script path is relative,
# so the working directory is first set to the root of the cloned repo.
%cd /content/computer_vision_expirement
!python scripts/dataset_stats.py

## 1.5 Save to Google Drive

In [None]:
import shutil
import subprocess

# Relies on `os` and PROJECT_DIR from the first (Drive mount) cell.
src = '/content/computer_vision_expirement/datasets'
dst = f'{PROJECT_DIR}/datasets'
# The copy is staged next to the final location and renamed only after it
# finishes, so an interrupted run never leaves a half-copied `dst` that the
# "already exist" check below would mistake for a complete dataset.
staging = f'{dst}.partial'

if os.path.exists(dst):
    print(f"Datasets already exist at {dst}, skipping copy")
elif not os.path.isdir(src):
    raise FileNotFoundError(
        f"Source datasets not found at {src}. Run the download and preprocessing cells first."
    )
else:
    print("Copying datasets to Google Drive...")
    if os.path.exists(staging):
        # Leftover from a previous interrupted copy; start over cleanly.
        shutil.rmtree(staging)
    shutil.copytree(src, staging)
    os.rename(staging, dst)
    print("Done!")

# Check Drive usage
result = subprocess.run(['du', '-sh', PROJECT_DIR], capture_output=True, text=True)
if result.returncode == 0:
    print(f"\nDrive usage: {result.stdout.strip()}")
else:
    # du failed; show its error output rather than an empty usage value.
    print(f"\nDrive usage: unavailable ({result.stderr.strip()})")

print("\nPhase 1 Complete! Preprocessed data saved to Google Drive.")
print(f"Location: {PROJECT_DIR}")
print("\nNote: Augmentations are applied on-the-fly during Phase 3 training.")
print("No augmented images stored = ~35GB saved!")

## Next Steps
- Open **Phase2_Image_Enhancement.ipynb** to evaluate enhancement models
- Datasets are saved in Google Drive and will persist across sessions