<a href="https://colab.research.google.com/github/ulfboge/temporal-landcover-vectorizer/blob/main/scripts/python/stratified_biomass_sampler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spatial Data Sampling and Integration

This notebook:
1. Processes multiple CSV files containing temporal spatial data
2. Samples pixels from the 2021 biomass data, drawing the same number from each biomass interval
3. Creates two unified long-format tables using the sampled pixel IDs:
   - One without coordinates (original format)
   - One with coordinates (x_coord, y_coord)
4. Integrates corresponding NDVI and NDFI values for the years 2015, 2017, 2019, and 2021

## Setup
First, let's mount Google Drive and import required libraries.

In [None]:
# Mount Google Drive at /content/drive; the input and output folders configured
# below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Input and output locations on the mounted Drive - edit these to match your own folder layout
csv_dir = "/content/drive/MyDrive/earthengine/conversion/csv"
output_dir = "/content/drive/MyDrive/earthengine/conversion/output"

# Make sure the output folder exists before anything is written to it
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Year columns (named 'y<year>') that are carried into the long-format tables
target_years = [f'y{year}' for year in (2015, 2017, 2019, 2021)]

## Define Processing Functions
Define the functions that reshape each CSV file into long format (optionally keeping coordinates) and extract the 2021 biomass values used for sampling.

In [None]:
def process_csv_to_long_format(file_path, include_coords=False, years=None):
    """Reshape one wide per-pixel CSV into a long table (one row per pixel and year).

    The value column is named after the upper-cased part of the file name that
    precedes the first underscore (for example ``ndvi_series.csv`` -> ``NDVI``).

    Args:
        file_path: Path to a CSV containing a ``pixel_id`` column and one
            column per year named ``y<year>``.
        include_coords: When True, ``x_coord`` and ``y_coord`` are kept as
            identifier columns next to ``pixel_id``.
        years: Year column names to keep. When None, the notebook-level
            ``target_years`` list is used.

    Returns:
        DataFrame holding the identifier columns, a string ``year`` column
        with the leading ``y`` removed, and one value column named after the
        data type.
    """
    wanted_years = target_years if years is None else years

    # Data type (NDVI, NDFI, BIOMASS, ...) comes from the file name prefix
    data_type = Path(file_path).stem.split('_')[0].upper()

    df = pd.read_csv(file_path)

    # Keep only the requested year columns, in the order they appear in the CSV
    year_cols = [col for col in df.columns if col in wanted_years]

    id_vars = ['pixel_id']
    if include_coords:
        id_vars.extend(['x_coord', 'y_coord'])

    long_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=year_cols,
        var_name='year',
        value_name=data_type
    )

    # Strip only the leading 'y' ('y2015' -> '2015'); a plain replace of 'y'
    # would also delete any other 'y' that occurs later in the column name.
    long_df['year'] = long_df['year'].str.replace(r'^y', '', regex=True)

    return long_df

def get_2021_biomass_data(file_path):
    """Load a biomass CSV and return its 2021 values per pixel.

    The returned table has exactly two columns: ``pixel_id`` and ``BIOMASS``,
    where ``BIOMASS`` is the file's ``y2021`` column under a new name.
    """
    source = pd.read_csv(file_path)
    return pd.DataFrame({
        'pixel_id': source['pixel_id'],
        'BIOMASS': source['y2021'],
    })

## Sample from 2021 Biomass Data
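Sample up to 100 pixels from the 2021 biomass data: 20 from each of five biomass intervals. An interval with fewer than 20 pixels contributes all of its pixels and triggers a warning.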

In [None]:
# Locate the biomass CSV. The listing is sorted because os.listdir() returns
# entries in arbitrary order, so with several candidate files the choice would
# otherwise be able to change between runs.
biomass_file = None
for file in sorted(os.listdir(csv_dir)):
    if file.startswith('biomass') and file.endswith('.csv'):
        biomass_file = os.path.join(csv_dir, file)
        break

if not biomass_file:
    raise FileNotFoundError("No biomass CSV file found")

# Two-column table (pixel_id, BIOMASS) with the 2021 values used for sampling
biomass_2021_df = get_2021_biomass_data(biomass_file)

# Biomass intervals and the number of pixels to draw from each of them
intervals = [(5, 34), (35, 64), (65, 94), (95, 124), (125, 150)]
samples_per_interval = 20  # 20 samples per interval = 100 total
sampled_pixels = []

biomass_values = biomass_2021_df['BIOMASS']
for position, (start, end) in enumerate(intervals):
    # Every interval except the last extends up to (but excludes) the start of
    # the next one, so fractional values such as 34.5 are not lost in the gap
    # between the integer bounds. For whole-number data this selects the same
    # rows as start <= value <= end. The last interval keeps its inclusive end.
    in_interval = biomass_values >= start
    if position + 1 < len(intervals):
        in_interval = in_interval & (biomass_values < intervals[position + 1][0])
    else:
        in_interval = in_interval & (biomass_values <= end)
    interval_data = biomass_2021_df[in_interval]

    # Draw with a fixed seed; fall back to everything when too few rows exist
    if len(interval_data) >= samples_per_interval:
        sampled = interval_data.sample(n=samples_per_interval, random_state=42)
    else:
        sampled = interval_data  # Take all available if less than needed
        print(f"Warning: Only {len(interval_data)} samples available for interval {start}-{end}")

    sampled_pixels.extend(sampled['pixel_id'].unique())

print(f"Total unique pixels sampled from 2021 data: {len(set(sampled_pixels))}")

## Process All CSV Files
Process all CSV files and create two versions of each dataset - with and without coordinates.

In [None]:
# Build the long-format table of every CSV in two variants: with and without
# coordinates. Each file is parsed once; the coordinate-free variant is derived
# by dropping the two coordinate columns, which leaves the same rows in the
# same order as melting without them.
dfs = {'with_coords': {}, 'without_coords': {}}
for file in sorted(os.listdir(csv_dir)):  # sorted: os.listdir() order is arbitrary
    if not file.endswith('.csv'):
        continue
    file_path = os.path.join(csv_dir, file)
    data_type = Path(file).stem.split('_')[0].upper()
    if data_type in dfs['with_coords']:
        # Tables are keyed by file-name prefix, so a second file with the same
        # prefix replaces the first one - say so instead of doing it silently.
        print(f"Warning: {file} replaces a previously processed {data_type} file")
    with_coords = process_csv_to_long_format(file_path, include_coords=True)
    dfs['with_coords'][data_type] = with_coords
    dfs['without_coords'][data_type] = with_coords.drop(columns=['x_coord', 'y_coord'])
    print(f"Processed {file}")

## Filter and Merge Data
Create filtered datasets using the sampled pixel IDs and merge them to create final results.

In [None]:
# Restrict every long-format table (both variants) to the sampled pixels
filtered_dfs = {
    version: {
        data_type: table[table['pixel_id'].isin(sampled_pixels)]
        for data_type, table in dfs[version].items()
    }
    for version in ('with_coords', 'without_coords')
}

# Final tables, keyed by variant name
results = {}
merge_keys = ['pixel_id', 'year']
index_layers = ('NDVI', 'NDFI')

# Variant without coordinates: biomass rows, with each available index joined on
plain_tables = filtered_dfs['without_coords']
result_without_coords = plain_tables['BIOMASS'][['pixel_id', 'year', 'BIOMASS']]
for data_type in index_layers:
    if data_type not in plain_tables:
        continue
    index_values = plain_tables[data_type][['pixel_id', 'year', data_type]]
    result_without_coords = result_without_coords.merge(
        index_values, on=merge_keys, how='left'
    )
results['without_coords'] = result_without_coords.sort_values(merge_keys)

# Variant with coordinates: x/y are taken from the biomass table only
coord_tables = filtered_dfs['with_coords']
result_with_coords = coord_tables['BIOMASS'][
    ['pixel_id', 'x_coord', 'y_coord', 'year', 'BIOMASS']
]
for data_type in index_layers:
    if data_type not in coord_tables:
        continue
    index_values = coord_tables[data_type][['pixel_id', 'year', data_type]]
    result_with_coords = result_with_coords.merge(
        index_values, on=merge_keys, how='left'
    )
results['with_coords'] = result_with_coords.sort_values(merge_keys)

## Save Results and Display Summary
Save both versions of the results and display summary statistics.

In [None]:
# Destination files inside the output folder
output_path_without_coords = os.path.join(output_dir, 'sampled_data.csv')
output_path_with_coords = os.path.join(output_dir, 'sampled_data_with_coords.csv')

# Write each result table to its file, leaving out the row index
for version, destination in (
    ('without_coords', output_path_without_coords),
    ('with_coords', output_path_with_coords),
):
    results[version].to_csv(destination, index=False)

print("\nResults saved to:")
print(f"1. Without coordinates: {output_path_without_coords}")
print(f"2. With coordinates: {output_path_with_coords}")

# Summary statistics are reported from the table that includes coordinates
summary = results['with_coords']
print("\nSummary of sampled data:")
print(f"Total unique pixels: {summary['pixel_id'].nunique()}")
print(f"Years covered: {sorted(summary['year'].unique())}")
print("\nValue ranges by year:")
for year in sorted(summary['year'].unique()):
    print(f"\nYear {year}:")
    year_data = summary[summary['year'] == year]
    for column in ('BIOMASS', 'NDVI', 'NDFI'):
        # Indices whose CSV was not available have no column in the table
        if column not in year_data.columns:
            continue
        values = year_data[column]
        print(f"  {column}:")
        print(f"    Min: {values.min():.2f}")
        print(f"    Max: {values.max():.2f}")
        print(f"    Mean: {values.mean():.2f}")