# Phytospatial Pipeline Introduction

This notebook demonstrates a complete workflow using `phytospatial` to process hyperspectral imagery and extract individual tree crown statistics.

In [None]:
import pandas as pd
from pathlib import Path
import logging

# Import phytospatial modules
from phytospatial import raster
from phytospatial import loaders
from phytospatial import vector
from phytospatial import extract

# Route INFO-level (and above) log records to the notebook output,
# each line prefixed with its timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)

## 1. Configuration
Define your input and output paths here.

In [None]:
# All paths are relative to the notebook's working directory.

# Input directory handed to raster.convert_envi_to_geotiff in the preprocessing step.
RAW_HDR_DIR = "./data/input_hdrs"
# Output directory of the conversion; later scanned for "*.tif" files during extraction.
PROCESSED_TIF_DIR = "./data/output_tifs"
# Tree crown polygons, read by loaders.load_crowns.
CROWNS_SHP = "./data/crowns.shp"
# Destination of the final table written with DataFrame.to_parquet.
OUTPUT_PARQUET = "./data/results/final_spectral_data.parquet"
# NOTE(review): TARGET_CRS is not referenced by any cell visible in this notebook —
# confirm whether it should be passed to the loader/converter or removed.
TARGET_CRS = "EPSG:32619" # UTM Zone 19N

## 2. Preprocessing
Convert the raw ENVI rasters (located via their `.hdr` header files) to Cloud-Optimized GeoTIFFs for efficient processing.

In [None]:
# Convert the contents of RAW_HDR_DIR into GeoTIFFs under PROCESSED_TIF_DIR;
# the extraction step below picks up every "*.tif" found in that directory.
# NOTE(review): presumably creates PROCESSED_TIF_DIR if missing and is safe to
# re-run — confirm against the phytospatial.raster documentation.
raster.convert_envi_to_geotiff(RAW_HDR_DIR, PROCESSED_TIF_DIR)

## 3. Load & Validate Vectors
Load the tree crown polygons. The loader will automatically filter out invalid geometries and warn you about corrupt data.

In [None]:
# Read the crown polygons; the "species" attribute is used as the species column.
crowns = loaders.load_crowns(
    CROWNS_SHP,
    species_col="species",
)
print(f"Loaded {len(crowns)} valid tree crowns.")

## 4. Spectral Extraction
Iterate through the processed rasters and extract pixel statistics for each tree.

In [None]:
# Sort the matches: Path.glob yields files in a filesystem-dependent order,
# and sorting keeps the row order of the results reproducible between runs.
tif_files = sorted(Path(PROCESSED_TIF_DIR).glob("*.tif"))
all_results = []

for tif_path in tif_files:
    print(f"Processing raster: {tif_path.name}")

    # One extractor per raster, configured with the band names to report
    # and asked to return the raw pixels.
    extractor = extract.BlockExtractor(
        str(tif_path),
        band_names=["Red", "Green", "Blue"],
        return_raw_pixels=True
    )

    # process_crowns is consumed lazily; collect every record it yields.
    # The try/finally guarantees the extractor is closed even if extraction
    # fails part-way through, so the raster handle is not leaked.
    try:
        for stats in extractor.process_crowns(crowns):
            all_results.append(stats)
    finally:
        extractor.close()

## 5. Save Results
Save the extracted data to Parquet for high-performance analysis.

In [None]:
if all_results:
    df_results = pd.DataFrame(all_results)

    # to_parquet does not create missing parent directories, so make sure the
    # output folder exists before writing (a no-op when it is already there).
    Path(OUTPUT_PARQUET).parent.mkdir(parents=True, exist_ok=True)

    df_results.to_parquet(OUTPUT_PARQUET, index=False, engine='pyarrow', compression='snappy')
    print(f"Saved {len(df_results)} trees to {OUTPUT_PARQUET}")
else:
    # Nothing was collected (no rasters found or no crowns produced records).
    print("No results extracted.")