<a href="https://colab.research.google.com/github/ulfboge/temporal-landcover-vectorizer/blob/main/scripts/python/stratified_biomass_sampler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spatial Data Sampling and Integration

This notebook:
1. Processes multiple CSV files containing temporal spatial data
2. Creates a unified long-format table
3. Draws a stratified sample of pixels from the 2021 biomass data, taking the same number of pixels from each biomass interval
4. Integrates corresponding NDVI and NDFI values for all years using the same pixel_ids

## Setup
First, let's mount Google Drive and import required libraries.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the
# '/content/drive/MyDrive/...' paths used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Input CSVs and generated outputs both live under the mounted Drive folder
csv_dir = "/content/drive/MyDrive/earthengine/conversion/csv"
output_dir = "/content/drive/MyDrive/earthengine/conversion/output"

# Create the output folder (and any missing parents); a no-op if it exists
Path(output_dir).mkdir(parents=True, exist_ok=True)

## Process CSV Files
Read and transform CSV files into long format

In [None]:
def process_csv_to_long_format(file_path, target_years=('y2015', 'y2017', 'y2019', 'y2021')):
    """Read one wide-format CSV and return it as a long-format table.

    The value column of the result is named after the file: the part of the
    file name before the first underscore, upper-cased (for example
    ``biomass_export.csv`` -> ``BIOMASS``).

    Args:
        file_path: Path of the CSV file. It must contain a ``pixel_id``
            column plus one column per year named like ``y2015``.
        target_years: Year column names to keep. The same set is used for
            every data type. Names that are missing from the file are
            skipped silently.

    Returns:
        A DataFrame with the columns ``pixel_id``, ``year`` (a string such
        as ``'2015'``) and the value column named after the data type.

    Raises:
        KeyError: Raised by ``pd.melt`` when the file has no ``pixel_id``
            column.
    """
    # Extract the data type from the filename (e.g. NDVI, NDFI or BIOMASS)
    data_type = Path(file_path).stem.split('_')[0].upper()

    year_columns = list(target_years)
    wanted = {'pixel_id', *year_columns}

    # Load only the columns that are needed instead of reading the whole
    # file and discarding the rest afterwards
    df = pd.read_csv(file_path, usecols=lambda col: col in wanted)

    # Keep the caller's ordering of the years that are actually present
    present_years = [col for col in year_columns if col in df.columns]

    # Melt the dataframe to long format: one row per (pixel_id, year)
    long_df = pd.melt(
        df,
        id_vars=['pixel_id'],
        value_vars=present_years,
        var_name='year',
        value_name=data_type
    )

    # Convert 'y2015' to '2015'. Anchoring the pattern strips only the
    # leading prefix, and passing regex explicitly avoids depending on the
    # pandas-version-specific default.
    long_df['year'] = long_df['year'].str.replace(r'^y', '', regex=True)

    return long_df

# Process all CSV files.
# os.listdir returns names in arbitrary order, so iterate a sorted listing:
# this makes the run reproducible, in particular which file ends up in `dfs`
# when two CSVs share the same prefix.
dfs = {}
for file in sorted(os.listdir(csv_dir)):
    if not file.endswith('.csv'):
        continue
    file_path = os.path.join(csv_dir, file)
    # Dictionary key: text before the first underscore of the file name, upper-cased
    data_type = Path(file).stem.split('_')[0].upper()
    if data_type in dfs:
        # Make the overwrite visible instead of silently dropping a dataset
        print(f"Warning: {file} replaces previously loaded {data_type} data")
    dfs[data_type] = process_csv_to_long_format(file_path)
    print(f"Processed {file}")

## Sample Biomass Data from 2021
Sample up to 100 pixels from the 2021 biomass data: 20 from each of the five biomass intervals (fewer if an interval contains less than 20 pixels).

In [None]:
# Define intervals as (low, high) biomass bounds
intervals = [(5, 34), (35, 64), (65, 94), (95, 124), (125, 150)]
samples_per_interval = 20  # 20 samples per interval = 100 total

# Filter biomass data for 2021 (the 'year' column holds strings)
biomass_df = dfs['BIOMASS']
biomass_2021 = biomass_df[biomass_df['year'] == '2021']
biomass_values = biomass_2021['BIOMASS']
sampled_pixels = []

# Pair every interval with the lower bound of the interval that follows it
following_starts = [low for low, _ in intervals[1:]] + [None]

for (start, end), next_start in zip(intervals, following_starts):
    if next_start is not None and next_start - end == 1:
        # Adjacent integer bins such as (5, 34) and (35, 64): use a half-open
        # range so that fractional values (e.g. 34.5), if the data contains
        # any, do not fall into the gap between bins. For whole-number data
        # this selects exactly the same rows as `<= end`.
        below_upper = biomass_values < next_start
    else:
        # Last bin, or bins that are deliberately not adjacent: inclusive bound
        below_upper = biomass_values <= end

    # Filter data within interval for 2021
    interval_data = biomass_2021[(biomass_values >= start) & below_upper]

    # Sample from this interval; the fixed seed keeps the draw reproducible
    if len(interval_data) >= samples_per_interval:
        sampled = interval_data.sample(n=samples_per_interval, random_state=42)
    else:
        sampled = interval_data  # Take all available if less than needed
        print(f"Warning: Only {len(interval_data)} samples available for interval {start}-{end}")

    sampled_pixels.extend(sampled['pixel_id'].unique())

print(f"Total unique pixels sampled from 2021: {len(set(sampled_pixels))}")

## Integrate Data
Combine data from all sources using sampled pixel IDs

In [None]:
# Keep only the rows of each dataset that belong to a sampled pixel
filtered_dfs = {
    name: frame[frame['pixel_id'].isin(sampled_pixels)]
    for name, frame in dfs.items()
}

# Start from the biomass rows and left-join each index dataset that was loaded
join_keys = ['pixel_id', 'year']
result = filtered_dfs['BIOMASS'][join_keys + ['BIOMASS']]
for index_name in ('NDVI', 'NDFI'):
    if index_name not in filtered_dfs:
        continue
    index_values = filtered_dfs[index_name][join_keys + [index_name]]
    result = result.merge(index_values, on=join_keys, how='left')

# Order rows by pixel, then by year
result = result.sort_values(join_keys)

# Write the merged table to the output folder
output_path = os.path.join(output_dir, 'sampled_data.csv')
result.to_csv(output_path, index=False)

print(f"Results saved to: {output_path}")
print("\nFirst few rows of the result:")
display(result.head())

## Summary Statistics
Display summary of the sampled data

In [None]:
print("Summary of sampled data:")
print(f"Total unique pixels: {result['pixel_id'].nunique()}")
years_present = sorted(result['year'].unique())
print(f"Years covered: {years_present}")
print("\nValue ranges:")

# Report only the value columns that are present in the merged table
value_columns = [name for name in ('BIOMASS', 'NDVI', 'NDFI') if name in result.columns]
# (label printed, Series method called) for each statistic, in print order
stat_methods = (('Min', 'min'), ('Max', 'max'), ('Mean', 'mean'))

# Per-year statistics
for year in years_present:
    rows_for_year = result.loc[result['year'] == year]
    print(f"\nYear {year}:")
    for name in value_columns:
        series = rows_for_year[name]
        print(f"{name}:")
        for label, method in stat_methods:
            print(f"  {label}: {getattr(series, method)():.2f}")