# Setup and Monitoring

The following code generates the ambient Greenspace indicator.

### Google Cloud Setup

There are a few steps to set up Google Cloud for use with Earth Engine and Cloud Storage.

1. **Install Google Cloud SDK**
For example - on macOS with MacPorts

```sudo port install google-cloud-sdk```

2. **Authenticate with Google Cloud**
The following command needs to be run in a terminal and will open a browser window for authentication

```gcloud auth login```

3. **Create a Google Cloud Project**

Go to [console.cloud.google.com](https://console.cloud.google.com/)
Create a new project or select an existing one
Note your project ID

4. **Enable Required APIs**

Enable the Earth Engine API in the project
Enable the Cloud Storage API in the project

5. **Register for Earth Engine**

Visit code.earthengine.google.com and register your Google account

6. **Create a Cloud Storage Bucket**

In Cloud Console, go to Cloud Storage and "Create bucket"
The following code expects a bucket named 'a-h-a-h'


In [None]:
from pathlib import Path  # For handling file paths in a cross-platform way
import pandas as pd  # For data manipulation and analysis
import geopandas as gpd  # For geospatial data handling
import requests  # For making HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
import re  # For regular expressions
import requests  # Duplicate import (consider removing)
import time  # For time-related functions
from typing import Optional  # For type hints
import io  # For in-memory I/O operations
import os  # For operating system interfaces
import tempfile  # For temporary file creation
import zipfile  # For ZIP file handling
import urllib.request  # For URL retrieval
import ee  # For Google Earth Engine API
import subprocess  # For running subprocesses

## Task Monitoring / Cancelation


In [None]:
# Monitor task progress: list the Earth Engine tasks via the CLI
result = subprocess.run(['earthengine', 'task', 'list'], capture_output=True, text=True)
print(result.stdout)

# Both streams are captured, so a failing CLI call would otherwise show
# nothing but an empty listing; print its error output as well.
if result.returncode != 0:
    print(result.stderr)

In [None]:
# Show the task list from the top and stop at the line of a known task ID
result = subprocess.run(['earthengine', 'task', 'list'], capture_output=True, text=True)

for line in result.stdout.split('\n'):
    if '2Q4Y2CEVXZ5UAQWDZ4T6SIJ5' not in line:
        print(line)
        continue
    # Marker task reached: this line and everything after it is not printed
    break

In [None]:
# Cancel every task whose state is RUNNING or READY; leave the rest alone
for task in ee.batch.Task.list():
    if task.state not in ('RUNNING', 'READY'):
        continue
    task.cancel()

# Create UK Boundaries

If the boundary shapefiles are not already in your Earth Engine assets, run the following code to create them.

```python
# Existing boundaries that the Northern Ireland zones are appended to.
# (This must be named boundaries_all: it is referenced below for the CRS and
# the concat, and boundaries_all_UK is only created after the merge.)
boundaries_all = gpd.read_parquet(Path("data") / "boundary" / "LSOA_DZ_SDZ_21_22.parquet")

zip_url = "https://www.nisra.gov.uk/files/nisra/publications/geography-sdz2021-esri-shapefile.zip"

# Download and unpack the Northern Ireland SDZ 2021 shapefile inside a
# temporary directory that is removed once the layer has been read
with tempfile.TemporaryDirectory() as tmp_dir:
    zip_path = os.path.join(tmp_dir, "sdz2021.zip")
    urllib.request.urlretrieve(zip_url, zip_path)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(tmp_dir)

    # Read the shapefile while the extracted files still exist
    NI_boundaries = gpd.read_file(os.path.join(tmp_dir, "SDZ2021.shp"))

# Align the NI layer with the existing one: same ID column, geometry only
NI_boundaries = NI_boundaries.rename(columns={"SDZ2021_cd": "LSOA_DZ_SDZ_21_22"})
NI_boundaries = NI_boundaries[["LSOA_DZ_SDZ_21_22", "geometry"]].copy()

NI_boundaries = NI_boundaries.to_crs(boundaries_all.crs)

boundaries_all_UK = gpd.GeoDataFrame(
    pd.concat([boundaries_all, NI_boundaries], ignore_index=True),
    geometry="geometry",
    crs=boundaries_all.crs
)

# --- Reproject to WGS84 (required for GEE) ---
boundaries_gee = boundaries_all_UK.to_crs('EPSG:4326')

# --- Export to shapefile ---
# parents=True so the export also works when data/raw_data does not exist yet
output_dir = Path('./data/raw_data/gee_upload')
output_dir.mkdir(parents=True, exist_ok=True)

shp_path = output_dir / 'boundaries_uk.shp'
boundaries_gee.to_file(shp_path, driver='ESRI Shapefile')

print(f'Saved to {shp_path}')

# --- Upload all shapefile components to GCS ---
bucket = 'a-h-a-h'
gcs_folder = 'gee_uploads'

shp_extensions = ['.shp', '.shx', '.dbf', '.prj', '.cpg']

for ext in shp_extensions:
    local_file = shp_path.with_suffix(ext)
    # Not every sidecar file is necessarily written; upload the ones present
    if local_file.exists():
        gcs_path = f'gs://{bucket}/{gcs_folder}/{local_file.name}'
        print(f'Uploading {local_file.name}...')
        subprocess.run(['gsutil', 'cp', str(local_file), gcs_path], check=True)

print('All files uploaded to GCS')

# --- Ingest into Earth Engine ---
asset_id = 'projects/ndvi-inspire/assets/boundaries_uk'
gcs_shp = f'gs://{bucket}/{gcs_folder}/boundaries_uk.shp'

cmd = [
    'earthengine',
    'upload',
    'table',
    f'--asset_id={asset_id}',
    gcs_shp
]

print(f'Running: {" ".join(cmd)}')
result = subprocess.run(cmd, capture_output=True, text=True)

# The upload command's output is captured, so print both streams for review
print('stdout:', result.stdout)
print('stderr:', result.stderr)
```

In [15]:
# Load the UK boundaries from the shapefile prepared for the GEE upload
shp_path = Path('data/raw_data/gee_upload/boundaries_uk.shp')

if shp_path.exists():
    # Read the layer and reproject it to OSGB (EPSG:27700) in one step
    boundaries_all_UK = gpd.read_file(shp_path).to_crs(epsg=27700)
else:
    raise FileNotFoundError(f"Shapefile not found: {shp_path}")

print(f"Loaded {len(boundaries_all_UK)} features; CRS: {boundaries_all_UK.crs}")

Loaded 43914 features; CRS: EPSG:27700


In [4]:
# Authenticate, then initialize the Earth Engine client for the project
ee.Authenticate()
ee.Initialize(project='ndvi-inspire')

# Set the same project for the earthengine command-line tool
# (its output is captured into `result` and not displayed)
result = subprocess.run(
    ['earthengine', 'set_project', 'ndvi-inspire'],
    capture_output=True,
    text=True,
)

# 2025

In [None]:
# Cloud masking for Sentinel-2
def mask_s2_clouds(image):
    """Apply an SCL-based cloud mask and attach two pixel-counter bands.

    Pixels whose SCL class is 3, 8, 9 or 10 are masked out. The returned
    image also carries 'total_pixels' (constant 1, not masked) and
    'valid_pixels' (constant 1, masked like the image).
    """
    scl = image.select('SCL')

    # A pixel is kept only when its SCL class is none of the rejected ones
    keep = scl.neq(3)
    for rejected_class in (8, 9, 10):
        keep = keep.And(scl.neq(rejected_class))

    # NOTE(review): the constant image is not tied to the scene extent here;
    # confirm 'total_pixels' counts only scenes that actually cover a pixel.
    ones = ee.Image.constant(1)
    counters = [
        ones.rename('total_pixels'),
        ones.updateMask(keep).rename('valid_pixels'),
    ]

    return image.updateMask(keep).addBands(counters)


def add_indices(image):
    """Add NDVI, EVI and Fractional Vegetation Cover (FVC) bands to an image.

    Parameters
    ----------
    image : ee.Image
        Image providing the bands B8 (NIR), B4 (red) and B2 (blue).

    Returns
    -------
    ee.Image
        The input image with 'NDVI', 'EVI' and 'FVC' bands appended.
    """
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')

    # EVI is computed on the band values divided by 10000
    nir = image.select('B8').divide(10000)
    red = image.select('B4').divide(10000)
    blue = image.select('B2').divide(10000)

    # EVI = 2.5 * (NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1)
    evi = nir.subtract(red).multiply(2.5).divide(
        nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    ).rename('EVI')

    # Fractional Vegetation Cover: NDVI scaled between the soil and the
    # full-vegetation end members, then squared.
    ndvi_soil = 0.2
    ndvi_veg = 0.86
    # Clamp BEFORE squaring: for NDVI below ndvi_soil the scaled value is
    # negative, and squaring it first would turn it into a positive cover
    # fraction (e.g. NDVI = -0.4 would give FVC of about 0.83).
    fvc = (
        ndvi.subtract(ndvi_soil)
        .divide(ndvi_veg - ndvi_soil)
        .clamp(0, 1)
        .pow(2)
        .rename('FVC')
    )

    return image.addBands([ndvi, evi, fvc])


# Main processing
start_date = '2025-04-01'
end_date = '2025-10-31'

# Scenes in the date window below the cloud-cover threshold, with the cloud
# mask applied and the index bands added per image
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
s2 = s2.filterDate(start_date, end_date)
s2 = s2.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
s2 = s2.map(mask_s2_clouds).map(add_indices)

# Median composite of the three index bands
composite = s2.select(['NDVI', 'EVI', 'FVC']).median()

# Counter bands summed across the collection
pixel_counts = s2.select(['total_pixels', 'valid_pixels']).sum()

# Combined reducer: start from the mean and fold in the remaining statistics
reducer = ee.Reducer.mean()
for extra_stat in (
    ee.Reducer.median(),
    ee.Reducer.stdDev(),
    ee.Reducer.max(),
    ee.Reducer.min(),
    ee.Reducer.count(),
):
    reducer = reducer.combine(extra_stat, sharedInputs=True)

# Boundary features used as reduction regions
boundaries = ee.FeatureCollection('projects/ndvi-inspire/assets/boundaries_uk')
print(f'Loaded {boundaries.size().getInfo()} features')

# Statistics per boundary feature
veg_stats = composite.reduceRegions(collection=boundaries, reducer=reducer, scale=10)
pixel_stats = pixel_counts.reduceRegions(
    collection=boundaries, reducer=ee.Reducer.sum(), scale=10
)

# Both tables are exported as CSV to the same GCS bucket
export_target = {'bucket': 'a-h-a-h', 'fileFormat': 'CSV'}

task1 = ee.batch.Export.table.toCloudStorage(
    collection=veg_stats,
    description='uk_veg_stats',
    fileNamePrefix='gee_exports/uk_veg_stats',
    **export_target,
)
task2 = ee.batch.Export.table.toCloudStorage(
    collection=pixel_stats,
    description='uk_pixel_counts',
    fileNamePrefix='gee_exports/uk_pixel_counts',
    **export_target,
)

for export_task in (task1, task2):
    export_task.start()

print('Export tasks started')

In [None]:
# Local folder for the GEE exports (created if it is missing)
output_dir = Path('data/raw_data/gee_exports')
output_dir.mkdir(parents=True, exist_ok=True)

# Copy the vegetation statistics and pixel count CSVs from Google Cloud Storage
for csv_name in ('uk_veg_stats.csv', 'uk_pixel_counts.csv'):
    subprocess.run(
        ['gsutil', 'cp', f'gs://a-h-a-h/gee_exports/{csv_name}', str(output_dir)],
        check=True,
    )

# Load both downloaded tables as pandas DataFrames
veg_stats = pd.read_csv(output_dir / 'uk_veg_stats.csv')
pixel_counts = pd.read_csv(output_dir / 'uk_pixel_counts.csv')


In [None]:
# Clean and merge
# Keep the area ID plus the six zonal statistics of each index
veg_stats = veg_stats[[
    'LSOA_DZ_SD',
    'EVI_count', 'EVI_max', 'EVI_mean', 'EVI_median', 'EVI_min', 'EVI_stdDev',
    'FVC_count', 'FVC_max', 'FVC_mean', 'FVC_median', 'FVC_min', 'FVC_stdDev',
    'NDVI_count', 'NDVI_max', 'NDVI_mean', 'NDVI_median', 'NDVI_min', 'NDVI_stdDev',
]]
# errors='ignore' keeps the cell usable if the export bookkeeping columns
# were already removed
pixel_counts = pixel_counts.drop(columns=['system:index', '.geo'], errors='ignore')
veg_stats = veg_stats.merge(pixel_counts, on='LSOA_DZ_SD', how='left')

# Merge veg_stats into boundaries_all_UK by matching area IDs.
# The boundaries carry the ID either under its full name or, when they were
# read back from the exported shapefile, under 'LSOA_DZ_SD'; joining on a
# column that is not present would raise a KeyError.
if "LSOA_DZ_SDZ_21_22" in boundaries_all_UK.columns:
    boundaries_all_UK = boundaries_all_UK.merge(
        veg_stats,
        left_on="LSOA_DZ_SDZ_21_22",
        right_on="LSOA_DZ_SD",
        how="left"
    ).drop(columns=["LSOA_DZ_SD"], errors="ignore")
else:
    # Shared key name: keep the column, it is the only ID the frame has
    boundaries_all_UK = boundaries_all_UK.merge(veg_stats, on="LSOA_DZ_SD", how="left")

out_dir = Path("data") / "raw_data" / "gee_exports"
out_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): this file name is spelled "Indicies", while the per-year
# function writes "Vegetation_Indices_<year>.parquet" - confirm which name
# downstream readers expect before renaming.
out_path = out_dir / "Vegetation_Indicies_2025.parquet"
boundaries_all_UK.to_parquet(out_path, index=False)
print("Saved geoparquet to", out_path)


# 2024

In [24]:
# Cloud masking for Sentinel-2
def mask_s2_clouds(image):
    """Apply an SCL-based cloud mask and attach two pixel-counter bands.

    Pixels whose SCL class is 3, 8, 9 or 10 are masked out. The returned
    image also carries 'total_pixels' (constant 1, not masked) and
    'valid_pixels' (constant 1, masked like the image).
    """
    scl = image.select('SCL')

    # A pixel is kept only when its SCL class is none of the rejected ones
    keep = scl.neq(3)
    for rejected_class in (8, 9, 10):
        keep = keep.And(scl.neq(rejected_class))

    # NOTE(review): the constant image is not tied to the scene extent here;
    # confirm 'total_pixels' counts only scenes that actually cover a pixel.
    ones = ee.Image.constant(1)
    counters = [
        ones.rename('total_pixels'),
        ones.updateMask(keep).rename('valid_pixels'),
    ]

    return image.updateMask(keep).addBands(counters)


def add_indices(image):
    """Add NDVI, EVI and Fractional Vegetation Cover (FVC) bands to an image.

    Parameters
    ----------
    image : ee.Image
        Image providing the bands B8 (NIR), B4 (red) and B2 (blue).

    Returns
    -------
    ee.Image
        The input image with 'NDVI', 'EVI' and 'FVC' bands appended.
    """
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')

    # EVI is computed on the band values divided by 10000
    nir = image.select('B8').divide(10000)
    red = image.select('B4').divide(10000)
    blue = image.select('B2').divide(10000)

    # EVI = 2.5 * (NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1)
    evi = nir.subtract(red).multiply(2.5).divide(
        nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    ).rename('EVI')

    # Fractional Vegetation Cover: NDVI scaled between the soil and the
    # full-vegetation end members, then squared.
    ndvi_soil = 0.2
    ndvi_veg = 0.86
    # Clamp BEFORE squaring: for NDVI below ndvi_soil the scaled value is
    # negative, and squaring it first would turn it into a positive cover
    # fraction (e.g. NDVI = -0.4 would give FVC of about 0.83).
    fvc = (
        ndvi.subtract(ndvi_soil)
        .divide(ndvi_veg - ndvi_soil)
        .clamp(0, 1)
        .pow(2)
        .rename('FVC')
    )

    return image.addBands([ndvi, evi, fvc])


# Main processing
start_date = '2024-04-01'
end_date = '2024-10-31'

# Scenes in the date window below the cloud-cover threshold, with the cloud
# mask applied and the index bands added per image
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
s2 = s2.filterDate(start_date, end_date)
s2 = s2.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 40))
s2 = s2.map(mask_s2_clouds).map(add_indices)

# Median composite of the three index bands
composite = s2.select(['NDVI', 'EVI', 'FVC']).median()

# Counter bands summed across the collection
pixel_counts = s2.select(['total_pixels', 'valid_pixels']).sum()

# Combined reducer: start from the mean and fold in the remaining statistics
reducer = ee.Reducer.mean()
for extra_stat in (
    ee.Reducer.median(),
    ee.Reducer.stdDev(),
    ee.Reducer.max(),
    ee.Reducer.min(),
    ee.Reducer.count(),
):
    reducer = reducer.combine(extra_stat, sharedInputs=True)

# Boundary features used as reduction regions
boundaries = ee.FeatureCollection('projects/ndvi-inspire/assets/boundaries_uk')
print(f'Loaded {boundaries.size().getInfo()} features')

# Statistics per boundary feature
veg_stats = composite.reduceRegions(collection=boundaries, reducer=reducer, scale=10)
pixel_stats = pixel_counts.reduceRegions(
    collection=boundaries, reducer=ee.Reducer.sum(), scale=10
)

# Both tables are exported as CSV to the same GCS bucket
export_target = {'bucket': 'a-h-a-h', 'fileFormat': 'CSV'}

task1 = ee.batch.Export.table.toCloudStorage(
    collection=veg_stats,
    description='uk_veg_stats_2024',
    fileNamePrefix='gee_exports/uk_veg_stats_2024',
    **export_target,
)
task2 = ee.batch.Export.table.toCloudStorage(
    collection=pixel_stats,
    description='uk_pixel_counts_2024',
    fileNamePrefix='gee_exports/uk_pixel_counts_2024',
    **export_target,
)

for export_task in (task1, task2):
    export_task.start()

print('Export tasks started')

Loaded 43914 features
Export tasks started


In [None]:
# Local folder for the GEE exports (created if it is missing)
output_dir = Path('data/raw_data/gee_exports')
output_dir.mkdir(parents=True, exist_ok=True)

# Copy the vegetation statistics and pixel count CSVs from Google Cloud Storage
for csv_name in ('uk_veg_stats_2024.csv', 'uk_pixel_counts_2024.csv'):
    subprocess.run(
        ['gsutil', 'cp', f'gs://a-h-a-h/gee_exports/{csv_name}', str(output_dir)],
        check=True,
    )

# Load both downloaded tables as pandas DataFrames
veg_stats = pd.read_csv(output_dir / 'uk_veg_stats_2024.csv')
pixel_counts = pd.read_csv(output_dir / 'uk_pixel_counts_2024.csv')


Copying gs://a-h-a-h/gee_exports/uk_veg_stats_2024.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1 files][283.0 MiB/283.0 MiB]                                                
Operation completed over 1 objects/283.0 MiB.                                    
Copying gs://a-h-a-h/gee_exports/uk_pixel_counts_2024.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1 files][272.1 MiB/272.1 MiB]                                                
Operation completed over 1 objects/272.1 MiB.                                    


# 2023

In [30]:
# Cloud masking for Sentinel-2
def mask_s2_clouds(image):
    """Apply an SCL-based cloud mask and attach two pixel-counter bands.

    Pixels whose SCL class is 3, 8, 9 or 10 are masked out. The returned
    image also carries 'total_pixels' (constant 1, not masked) and
    'valid_pixels' (constant 1, masked like the image).
    """
    scl = image.select('SCL')

    # A pixel is kept only when its SCL class is none of the rejected ones
    keep = scl.neq(3)
    for rejected_class in (8, 9, 10):
        keep = keep.And(scl.neq(rejected_class))

    # NOTE(review): the constant image is not tied to the scene extent here;
    # confirm 'total_pixels' counts only scenes that actually cover a pixel.
    ones = ee.Image.constant(1)
    counters = [
        ones.rename('total_pixels'),
        ones.updateMask(keep).rename('valid_pixels'),
    ]

    return image.updateMask(keep).addBands(counters)


def add_indices(image):
    """Add NDVI, EVI and Fractional Vegetation Cover (FVC) bands to an image.

    Parameters
    ----------
    image : ee.Image
        Image providing the bands B8 (NIR), B4 (red) and B2 (blue).

    Returns
    -------
    ee.Image
        The input image with 'NDVI', 'EVI' and 'FVC' bands appended.
    """
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')

    # EVI is computed on the band values divided by 10000
    nir = image.select('B8').divide(10000)
    red = image.select('B4').divide(10000)
    blue = image.select('B2').divide(10000)

    # EVI = 2.5 * (NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1)
    evi = nir.subtract(red).multiply(2.5).divide(
        nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    ).rename('EVI')

    # Fractional Vegetation Cover: NDVI scaled between the soil and the
    # full-vegetation end members, then squared.
    ndvi_soil = 0.2
    ndvi_veg = 0.86
    # Clamp BEFORE squaring: for NDVI below ndvi_soil the scaled value is
    # negative, and squaring it first would turn it into a positive cover
    # fraction (e.g. NDVI = -0.4 would give FVC of about 0.83).
    fvc = (
        ndvi.subtract(ndvi_soil)
        .divide(ndvi_veg - ndvi_soil)
        .clamp(0, 1)
        .pow(2)
        .rename('FVC')
    )

    return image.addBands([ndvi, evi, fvc])


# Main processing
start_date = '2023-04-01'
end_date = '2023-10-31'

# Scenes in the date window below the cloud-cover threshold, with the cloud
# mask applied and the index bands added per image
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
s2 = s2.filterDate(start_date, end_date)
s2 = s2.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 40))
s2 = s2.map(mask_s2_clouds).map(add_indices)

# Median composite of the three index bands
composite = s2.select(['NDVI', 'EVI', 'FVC']).median()

# Counter bands summed across the collection
pixel_counts = s2.select(['total_pixels', 'valid_pixels']).sum()

# Combined reducer: start from the mean and fold in the remaining statistics
reducer = ee.Reducer.mean()
for extra_stat in (
    ee.Reducer.median(),
    ee.Reducer.stdDev(),
    ee.Reducer.max(),
    ee.Reducer.min(),
    ee.Reducer.count(),
):
    reducer = reducer.combine(extra_stat, sharedInputs=True)

# Boundary features used as reduction regions
boundaries = ee.FeatureCollection('projects/ndvi-inspire/assets/boundaries_uk')
print(f'Loaded {boundaries.size().getInfo()} features')

# Statistics per boundary feature
veg_stats = composite.reduceRegions(collection=boundaries, reducer=reducer, scale=10)
pixel_stats = pixel_counts.reduceRegions(
    collection=boundaries, reducer=ee.Reducer.sum(), scale=10
)

# Both tables are exported as CSV to the same GCS bucket
export_target = {'bucket': 'a-h-a-h', 'fileFormat': 'CSV'}

task1 = ee.batch.Export.table.toCloudStorage(
    collection=veg_stats,
    description='uk_veg_stats_2023',
    fileNamePrefix='gee_exports/uk_veg_stats_2023',
    **export_target,
)
task2 = ee.batch.Export.table.toCloudStorage(
    collection=pixel_stats,
    description='uk_pixel_counts_2023',
    fileNamePrefix='gee_exports/uk_pixel_counts_2023',
    **export_target,
)

for export_task in (task1, task2):
    export_task.start()

print('Export tasks started')

Loaded 43914 features
Export tasks started


In [35]:
import subprocess

# Local folder for the GEE exports (created if it is missing)
output_dir = Path('data/raw_data/gee_exports')
output_dir.mkdir(parents=True, exist_ok=True)

# Copy the vegetation statistics and pixel count CSVs from Google Cloud Storage
for csv_name in ('uk_veg_stats_2023.csv', 'uk_pixel_counts_2023.csv'):
    subprocess.run(
        ['gsutil', 'cp', f'gs://a-h-a-h/gee_exports/{csv_name}', str(output_dir)],
        check=True,
    )

# Load both downloaded tables as pandas DataFrames
veg_stats = pd.read_csv(output_dir / 'uk_veg_stats_2023.csv')
pixel_counts = pd.read_csv(output_dir / 'uk_pixel_counts_2023.csv')


Copying gs://a-h-a-h/gee_exports/uk_veg_stats_2023.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1 files][283.0 MiB/283.0 MiB]                                                
Operation completed over 1 objects/283.0 MiB.                                    
Copying gs://a-h-a-h/gee_exports/uk_pixel_counts_2023.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

\ [1 files][272.1 MiB/272.1 MiB]                                                
Operation completed over 1 objects/272.1 MiB.                                    


# 2022

In [37]:
# Cloud masking for Sentinel-2
def mask_s2_clouds(image):
    """Apply an SCL-based cloud mask and attach two pixel-counter bands.

    Pixels whose SCL class is 3, 8, 9 or 10 are masked out. The returned
    image also carries 'total_pixels' (constant 1, not masked) and
    'valid_pixels' (constant 1, masked like the image).
    """
    scl = image.select('SCL')

    # A pixel is kept only when its SCL class is none of the rejected ones
    keep = scl.neq(3)
    for rejected_class in (8, 9, 10):
        keep = keep.And(scl.neq(rejected_class))

    # NOTE(review): the constant image is not tied to the scene extent here;
    # confirm 'total_pixels' counts only scenes that actually cover a pixel.
    ones = ee.Image.constant(1)
    counters = [
        ones.rename('total_pixels'),
        ones.updateMask(keep).rename('valid_pixels'),
    ]

    return image.updateMask(keep).addBands(counters)


def add_indices(image):
    """Add NDVI, EVI and Fractional Vegetation Cover (FVC) bands to an image.

    Parameters
    ----------
    image : ee.Image
        Image providing the bands B8 (NIR), B4 (red) and B2 (blue).

    Returns
    -------
    ee.Image
        The input image with 'NDVI', 'EVI' and 'FVC' bands appended.
    """
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')

    # EVI is computed on the band values divided by 10000
    nir = image.select('B8').divide(10000)
    red = image.select('B4').divide(10000)
    blue = image.select('B2').divide(10000)

    # EVI = 2.5 * (NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1)
    evi = nir.subtract(red).multiply(2.5).divide(
        nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    ).rename('EVI')

    # Fractional Vegetation Cover: NDVI scaled between the soil and the
    # full-vegetation end members, then squared.
    ndvi_soil = 0.2
    ndvi_veg = 0.86
    # Clamp BEFORE squaring: for NDVI below ndvi_soil the scaled value is
    # negative, and squaring it first would turn it into a positive cover
    # fraction (e.g. NDVI = -0.4 would give FVC of about 0.83).
    fvc = (
        ndvi.subtract(ndvi_soil)
        .divide(ndvi_veg - ndvi_soil)
        .clamp(0, 1)
        .pow(2)
        .rename('FVC')
    )

    return image.addBands([ndvi, evi, fvc])


# Main processing
start_date = '2022-04-01'
end_date = '2022-10-31'

# Scenes in the date window below the cloud-cover threshold, with the cloud
# mask applied and the index bands added per image
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
s2 = s2.filterDate(start_date, end_date)
s2 = s2.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 40))
s2 = s2.map(mask_s2_clouds).map(add_indices)

# Median composite of the three index bands
composite = s2.select(['NDVI', 'EVI', 'FVC']).median()

# Counter bands summed across the collection
pixel_counts = s2.select(['total_pixels', 'valid_pixels']).sum()

# Combined reducer: start from the mean and fold in the remaining statistics
reducer = ee.Reducer.mean()
for extra_stat in (
    ee.Reducer.median(),
    ee.Reducer.stdDev(),
    ee.Reducer.max(),
    ee.Reducer.min(),
    ee.Reducer.count(),
):
    reducer = reducer.combine(extra_stat, sharedInputs=True)

# Boundary features used as reduction regions
boundaries = ee.FeatureCollection('projects/ndvi-inspire/assets/boundaries_uk')
print(f'Loaded {boundaries.size().getInfo()} features')

# Statistics per boundary feature
veg_stats = composite.reduceRegions(collection=boundaries, reducer=reducer, scale=10)
pixel_stats = pixel_counts.reduceRegions(
    collection=boundaries, reducer=ee.Reducer.sum(), scale=10
)

# Both tables are exported as CSV to the same GCS bucket
export_target = {'bucket': 'a-h-a-h', 'fileFormat': 'CSV'}

task1 = ee.batch.Export.table.toCloudStorage(
    collection=veg_stats,
    description='uk_veg_stats_2022',
    fileNamePrefix='gee_exports/uk_veg_stats_2022',
    **export_target,
)
task2 = ee.batch.Export.table.toCloudStorage(
    collection=pixel_stats,
    description='uk_pixel_counts_2022',
    fileNamePrefix='gee_exports/uk_pixel_counts_2022',
    **export_target,
)

for export_task in (task1, task2):
    export_task.start()

print('Export tasks started')

Loaded 43914 features
Export tasks started


In [40]:
import subprocess

# Local folder for the GEE exports (created if it is missing)
output_dir = Path('data/raw_data/gee_exports')
output_dir.mkdir(parents=True, exist_ok=True)

# Copy the vegetation statistics and pixel count CSVs from Google Cloud Storage
for csv_name in ('uk_veg_stats_2022.csv', 'uk_pixel_counts_2022.csv'):
    subprocess.run(
        ['gsutil', 'cp', f'gs://a-h-a-h/gee_exports/{csv_name}', str(output_dir)],
        check=True,
    )

# Load both downloaded tables as pandas DataFrames
veg_stats = pd.read_csv(output_dir / 'uk_veg_stats_2022.csv')
pixel_counts = pd.read_csv(output_dir / 'uk_pixel_counts_2022.csv')


Copying gs://a-h-a-h/gee_exports/uk_veg_stats_2022.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

\ [1 files][283.0 MiB/283.0 MiB]                                                
Operation completed over 1 objects/283.0 MiB.                                    
Copying gs://a-h-a-h/gee_exports/uk_pixel_counts_2022.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

\ [1 files][272.1 MiB/272.1 MiB]                                                
Operation completed over 1 objects/272.1 MiB.                                    


# 2021

In [42]:
# Cloud masking for Sentinel-2
def mask_s2_clouds(image):
    """Apply an SCL-based cloud mask and attach two pixel-counter bands.

    Pixels whose SCL class is 3, 8, 9 or 10 are masked out. The returned
    image also carries 'total_pixels' (constant 1, not masked) and
    'valid_pixels' (constant 1, masked like the image).
    """
    scl = image.select('SCL')

    # A pixel is kept only when its SCL class is none of the rejected ones
    keep = scl.neq(3)
    for rejected_class in (8, 9, 10):
        keep = keep.And(scl.neq(rejected_class))

    # NOTE(review): the constant image is not tied to the scene extent here;
    # confirm 'total_pixels' counts only scenes that actually cover a pixel.
    ones = ee.Image.constant(1)
    counters = [
        ones.rename('total_pixels'),
        ones.updateMask(keep).rename('valid_pixels'),
    ]

    return image.updateMask(keep).addBands(counters)


def add_indices(image):
    """Add NDVI, EVI and Fractional Vegetation Cover (FVC) bands to an image.

    Parameters
    ----------
    image : ee.Image
        Image providing the bands B8 (NIR), B4 (red) and B2 (blue).

    Returns
    -------
    ee.Image
        The input image with 'NDVI', 'EVI' and 'FVC' bands appended.
    """
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')

    # EVI is computed on the band values divided by 10000
    nir = image.select('B8').divide(10000)
    red = image.select('B4').divide(10000)
    blue = image.select('B2').divide(10000)

    # EVI = 2.5 * (NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1)
    evi = nir.subtract(red).multiply(2.5).divide(
        nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    ).rename('EVI')

    # Fractional Vegetation Cover: NDVI scaled between the soil and the
    # full-vegetation end members, then squared.
    ndvi_soil = 0.2
    ndvi_veg = 0.86
    # Clamp BEFORE squaring: for NDVI below ndvi_soil the scaled value is
    # negative, and squaring it first would turn it into a positive cover
    # fraction (e.g. NDVI = -0.4 would give FVC of about 0.83).
    fvc = (
        ndvi.subtract(ndvi_soil)
        .divide(ndvi_veg - ndvi_soil)
        .clamp(0, 1)
        .pow(2)
        .rename('FVC')
    )

    return image.addBands([ndvi, evi, fvc])


# Main processing
start_date = '2021-04-01'
end_date = '2021-10-31'

# Scenes in the date window below the cloud-cover threshold, with the cloud
# mask applied and the index bands added per image
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
s2 = s2.filterDate(start_date, end_date)
s2 = s2.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 40))
s2 = s2.map(mask_s2_clouds).map(add_indices)

# Median composite of the three index bands
composite = s2.select(['NDVI', 'EVI', 'FVC']).median()

# Counter bands summed across the collection
pixel_counts = s2.select(['total_pixels', 'valid_pixels']).sum()

# Combined reducer: start from the mean and fold in the remaining statistics
reducer = ee.Reducer.mean()
for extra_stat in (
    ee.Reducer.median(),
    ee.Reducer.stdDev(),
    ee.Reducer.max(),
    ee.Reducer.min(),
    ee.Reducer.count(),
):
    reducer = reducer.combine(extra_stat, sharedInputs=True)

# Boundary features used as reduction regions
boundaries = ee.FeatureCollection('projects/ndvi-inspire/assets/boundaries_uk')
print(f'Loaded {boundaries.size().getInfo()} features')

# Statistics per boundary feature
veg_stats = composite.reduceRegions(collection=boundaries, reducer=reducer, scale=10)
pixel_stats = pixel_counts.reduceRegions(
    collection=boundaries, reducer=ee.Reducer.sum(), scale=10
)

# Both tables are exported as CSV to the same GCS bucket
export_target = {'bucket': 'a-h-a-h', 'fileFormat': 'CSV'}

task1 = ee.batch.Export.table.toCloudStorage(
    collection=veg_stats,
    description='uk_veg_stats_2021',
    fileNamePrefix='gee_exports/uk_veg_stats_2021',
    **export_target,
)
task2 = ee.batch.Export.table.toCloudStorage(
    collection=pixel_stats,
    description='uk_pixel_counts_2021',
    fileNamePrefix='gee_exports/uk_pixel_counts_2021',
    **export_target,
)

for export_task in (task1, task2):
    export_task.start()

print('Export tasks started')

Loaded 43914 features
Export tasks started


In [45]:
import subprocess

# Local folder for the GEE exports (created if it is missing)
output_dir = Path('data/raw_data/gee_exports')
output_dir.mkdir(parents=True, exist_ok=True)

# Copy the vegetation statistics and pixel count CSVs from Google Cloud Storage
for csv_name in ('uk_veg_stats_2021.csv', 'uk_pixel_counts_2021.csv'):
    subprocess.run(
        ['gsutil', 'cp', f'gs://a-h-a-h/gee_exports/{csv_name}', str(output_dir)],
        check=True,
    )

# Load both downloaded tables as pandas DataFrames
veg_stats = pd.read_csv(output_dir / 'uk_veg_stats_2021.csv')
pixel_counts = pd.read_csv(output_dir / 'uk_pixel_counts_2021.csv')


Copying gs://a-h-a-h/gee_exports/uk_veg_stats_2021.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1 files][283.1 MiB/283.1 MiB]                                                
Operation completed over 1 objects/283.1 MiB.                                    
Copying gs://a-h-a-h/gee_exports/uk_pixel_counts_2021.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1 files][272.1 MiB/272.1 MiB]                                                
Operation completed over 1 objects/272.1 MiB.                                    


# Create Output Files from CSV

In [None]:

def process_vegetation_data(year: int, boundaries_gdf, output_dir: Path = None):
    """
    Join one year's GEE vegetation exports onto the boundaries and save them.

    Reads 'uk_pixel_counts_<year>.csv' and 'uk_veg_stats_<year>.csv' from the
    export directory, left-joins them onto the boundaries via 'LSOA_DZ_SD'
    and writes 'Vegetation_Indices_<year>.parquet' into the same directory.

    Parameters
    ----------
    year : int
        Year whose export files are processed (e.g., 2021, 2022, 2023, 2024)
    boundaries_gdf : GeoDataFrame
        UK boundaries with an LSOA_DZ_SD column
    output_dir : Path, optional
        Directory holding the GEE exports; also receives the parquet file.
        Defaults to 'data/raw_data/gee_exports'

    Returns
    -------
    GeoDataFrame
        Boundaries merged with the vegetation indices and pixel counts
    """
    export_dir = Path('data/raw_data/gee_exports') if output_dir is None else output_dir

    # Load both export tables of the requested year
    pixel_counts = pd.read_csv(export_dir / f'uk_pixel_counts_{year}.csv')
    veg_indices = pd.read_csv(export_dir / f'uk_veg_stats_{year}.csv')

    # Area ID followed by every <index>_<statistic> column
    wanted_cols = ['LSOA_DZ_SD'] + [
        f'{index}_{stat}'
        for index in ('EVI', 'FVC', 'NDVI')
        for stat in ('count', 'max', 'mean', 'median', 'min', 'stdDev')
    ]
    veg_indices = veg_indices[wanted_cols]

    # Remove the export bookkeeping columns from the pixel counts, if present
    pixel_counts = pixel_counts.drop(columns=['system:index', '.geo'], errors='ignore')

    # Indices + pixel counts first, then everything onto the boundaries
    veg_indices = veg_indices.merge(pixel_counts, on='LSOA_DZ_SD', how='left')
    result = boundaries_gdf.merge(veg_indices, on='LSOA_DZ_SD', how='left')

    # Write the merged table next to the exports
    out_path = export_dir / f'Vegetation_Indices_{year}.parquet'
    result.to_parquet(out_path, index=False)
    print(f"Saved geoparquet to {out_path}")

    return result


# Process all years
years = [2021, 2022, 2023, 2024]
results = {}

# Shared export folder (created if it is missing)
output_dir = Path('data/raw_data/gee_exports')
output_dir.mkdir(parents=True, exist_ok=True)

for year in years:
    # Each year gets its own copy so the shared boundaries are not modified
    yearly_boundaries = boundaries_all_UK.copy()
    results[year] = process_vegetation_data(
        year=year,
        boundaries_gdf=yearly_boundaries,
        output_dir=output_dir,
    )

Saved geoparquet to data/raw_data/gee_exports/Vegetation_Indices_2021.parquet
Saved geoparquet to data/raw_data/gee_exports/Vegetation_Indices_2022.parquet
Saved geoparquet to data/raw_data/gee_exports/Vegetation_Indices_2023.parquet
Saved geoparquet to data/raw_data/gee_exports/Vegetation_Indices_2024.parquet


# 3 Year Average (2023-2025)

In [None]:
# Cloud masking for Sentinel-2
def mask_s2_clouds(image):
    """Mask out pixels whose SCL class is 3, 8, 9 or 10."""
    scl = image.select('SCL')

    # A pixel is kept only when its SCL class is none of the rejected ones
    keep = scl.neq(3)
    for rejected_class in (8, 9, 10):
        keep = keep.And(scl.neq(rejected_class))

    return image.updateMask(keep)


def add_ndvi(image):
    """Append an 'NDVI' band: the normalized difference of bands B8 and B4."""
    return image.addBands(
        image.normalizedDifference(['B8', 'B4']).rename('NDVI')
    )


# 3-year window configuration
years = [2023, 2024, 2025]
start_month = 6  # June
end_month = 9    # September

# Build collection across 3 years, June-September only
def get_summer_images(year):
    """Return the start_month..end_month Sentinel-2 images of one year.

    Parameters
    ----------
    year : int
        Calendar year of the window.

    Returns
    -------
    ee.ImageCollection
        COPERNICUS/S2_SR_HARMONIZED images inside the window whose
        CLOUDY_PIXEL_PERCENTAGE is below 40.
    """
    start_date = ee.Date.fromYMD(year, start_month, 1)
    # End of the window: first day of the month following end_month.
    # advance() is used instead of `end_month + 1` so that end_month = 12
    # rolls over into January rather than asking for a month 13.
    end_date = ee.Date.fromYMD(year, end_month, 1).advance(1, 'month')

    return (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
            .filterDate(start_date, end_date)
            .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 40)))

# One seasonal collection per year, chained into a single collection
s2_collections = list(map(get_summer_images, years))
s2 = ee.ImageCollection(s2_collections[0])
for col in s2_collections[1:]:
    s2 = s2.merge(col)

# Cloud-mask every image first, then append its NDVI band
s2 = s2.map(mask_s2_clouds)
s2 = s2.map(add_ndvi)

# Per-pixel NDVI summaries across the whole image stack:
# median, 75th percentile and 90th percentile
ndvi_median = (
    s2.select('NDVI')
    .median()
    .rename('NDVI_median')
)
ndvi_p75 = (
    s2.select('NDVI')
    .reduce(ee.Reducer.percentile([75]))
    .rename('NDVI_p75')
)
ndvi_p90 = (
    s2.select('NDVI')
    .reduce(ee.Reducer.percentile([90]))
    .rename('NDVI_p90')
)

# Stack the three summaries into one three-band image
composite = ndvi_median.addBands(ndvi_p75)
composite = composite.addBands(ndvi_p90)

# Zonal statistic applied to every composite band inside each boundary
reducer = ee.Reducer.median()

# Boundary polygons that act as the zones for the statistics
boundaries = ee.FeatureCollection('projects/ndvi-inspire/assets/boundaries_uk')

# getInfo() brings the server-side counts back to the client
total_features = boundaries.size().getInfo()
print(f'Loaded {total_features} features')
print(f'Processing {s2.size().getInfo()} images across {years[0]}-{years[-1]} (Jun-Sep)')

# Batching configuration: one export task per slice of `batch_size` features
batch_size = 500
year_range = f'{years[0]}_{years[-1]}'

boundaries_list = boundaries.toList(total_features)
num_batches = -(-total_features // batch_size)  # ceiling division
print(f'Splitting into {num_batches} batches of {batch_size}')

tasks = []

for i, start_idx in enumerate(range(0, total_features, batch_size)):
    # The final slice may be shorter than batch_size
    end_idx = min(start_idx + batch_size, total_features)

    batch = ee.FeatureCollection(boundaries_list.slice(start_idx, end_idx))

    # Reduce the composite bands over each feature in this slice
    ndvi_stats = composite.reduceRegions(collection=batch, reducer=reducer, scale=10)

    # Each slice is written to its own CSV object in the bucket
    task = ee.batch.Export.table.toCloudStorage(
        collection=ndvi_stats,
        description=f'uk_ndvi_3yr_summer_{year_range}_batch{i:03d}',
        bucket='a-h-a-h',
        fileNamePrefix=f'gee_exports/ndvi_3yr_summer_{year_range}/ndvi_stats_batch{i:03d}',
        fileFormat='CSV',
    )

    task.start()
    tasks.append(task)

    print(f'Started batch {i+1}/{num_batches} (features {start_idx}-{end_idx})')

print(f'\nStarted {len(tasks)} export tasks')

Loaded 43914 features
Processing 2071217 images across 2023-2025 (Jun-Sep)
Splitting into 88 batches of 500
Started batch 1/88 (features 0-500)
Started batch 2/88 (features 500-1000)
Started batch 3/88 (features 1000-1500)
Started batch 4/88 (features 1500-2000)
Started batch 5/88 (features 2000-2500)
Started batch 6/88 (features 2500-3000)
Started batch 7/88 (features 3000-3500)
Started batch 8/88 (features 3500-4000)
Started batch 9/88 (features 4000-4500)
Started batch 10/88 (features 4500-5000)
Started batch 11/88 (features 5000-5500)
Started batch 12/88 (features 5500-6000)
Started batch 13/88 (features 6000-6500)
Started batch 14/88 (features 6500-7000)
Started batch 15/88 (features 7000-7500)
Started batch 16/88 (features 7500-8000)
Started batch 17/88 (features 8000-8500)
Started batch 18/88 (features 8500-9000)
Started batch 19/88 (features 9000-9500)
Started batch 20/88 (features 9500-10000)
Started batch 21/88 (features 10000-10500)
Started batch 22/88 (features 10500-11000)

In [69]:
# Output directory
output_dir = Path('data/raw_data/gee_exports/ndvi_3yr_summer_2023_2025')
output_dir.mkdir(parents=True, exist_ok=True)

# Download all batch files from GCS. The top-level -m option lets gsutil copy
# the files in parallel, which gsutil itself recommends for multi-file copies.
# check=True makes a failed copy raise instead of continuing with stale files.
subprocess.run([
    'gsutil', '-m', 'cp',
    'gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/*.csv',
    str(output_dir)
], check=True)

# Combine all batch files (sorted so rows follow batch order)
csv_files = sorted(output_dir.glob('ndvi_stats_batch*.csv'))
print(f'Found {len(csv_files)} batch files')
if not csv_files:
    # Fail with a clear message; pd.concat on an empty list only reports
    # "No objects to concatenate".
    raise FileNotFoundError(f'No ndvi_stats_batch*.csv files found in {output_dir}')

dfs = [pd.read_csv(f) for f in csv_files]
combined = pd.concat(dfs, ignore_index=True)

# Save combined file
combined.to_csv(output_dir / 'ndvi_stats_combined_2023_2025.csv', index=False)
print(f'Combined {len(combined)} features to ndvi_stats_combined_2023_2025.csv')

Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch000.csv...
Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch001.csv...
Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch002.csv...
Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch003.csv...
| [4 files][  4.6 MiB/  4.6 MiB]   84.6 KiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch004.csv...
Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch005.csv...
Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_stats_batch006.csv...
Copying gs://a-h-a-h/gee_exports/ndvi_3yr_summer_2023_2025/ndvi_s

Found 88 batch files
Combined 43914 features to ndvi_stats_combined_2023_2025.csv


In [None]:
combined_ndvi = pd.read_csv(output_dir / 'ndvi_stats_combined_2023_2025.csv')

# Retain only the join key and the NDVI* statistic columns
ndvi_cols = [
    c for c in combined_ndvi.columns
    if c == 'LSOA_DZ_SD' or c.startswith('NDVI')
]
combined_ndvi_sub = combined_ndvi.loc[:, ndvi_cols].copy()

# Left join keeps every boundary row, including those without statistics
combined_gdf = boundaries_all_UK.merge(
    combined_ndvi_sub,
    how='left',
    on='LSOA_DZ_SD',
)

# Write the joined frame next to the other export outputs
out_path = Path('data/raw_data/gee_exports') / 'ndvi_3yr_summer_2023_2025.parquet'
combined_gdf.to_parquet(out_path, index=False)

print(f"Saved {out_path}")

Saved data/raw_data/gee_exports/ndvi_3yr_summer_2023_2025.parquet


# Create Outputs for Each Year

In [1]:
import geopandas as gpd
from pathlib import Path

# Read the per-year vegetation-index GeoParquet files from the export folder
gee_exports_dir = Path('data/raw_data/gee_exports')

veg_2021 = gpd.read_parquet(gee_exports_dir / 'Vegetation_Indices_2021.parquet')
veg_2022 = gpd.read_parquet(gee_exports_dir / 'Vegetation_Indices_2022.parquet')
veg_2023 = gpd.read_parquet(gee_exports_dir / 'Vegetation_Indices_2023.parquet')
veg_2024 = gpd.read_parquet(gee_exports_dir / 'Vegetation_Indices_2024.parquet')
veg_2025 = gpd.read_parquet(gee_exports_dir / 'Vegetation_Indices_2025.parquet')

In [5]:
# Write one attribute-only CSV per year; paths are relative, so the files
# land in the current working directory.
veg_by_year = {
    2021: veg_2021,
    2022: veg_2022,
    2023: veg_2023,
    2024: veg_2024,
    2025: veg_2025,
}

for year, veg_df in veg_by_year.items():
    # The geometry column is left out of the CSV export
    csv_df = veg_df.drop(columns=['geometry'])
    csv_path = f'Vegetation_Indices_{year}.csv'
    csv_df.to_csv(csv_path, index=False)
    print(f"Saved {csv_path}")

Saved Vegetation_Indices_2021.csv
Saved Vegetation_Indices_2022.csv
Saved Vegetation_Indices_2023.csv
Saved Vegetation_Indices_2024.csv
Saved Vegetation_Indices_2025.csv
