<a href="https://colab.research.google.com/github/ulfboge/temporal-landcover-vectorizer/blob/main/scripts/python/stratified_biomass_sampler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spatial Data Sampling and Integration

This notebook:
 1. Processes multiple CSV files containing temporal spatial data
 2. Samples 2021 biomass data with an equal number of points per biomass-value interval and records the coordinates of the sampled points
 3. Creates a unified long-format table with coordinates (x_coord, y_coord)
 4. Integrates the corresponding NDVI and NDFI values for the target years (2015, 2017, 2019 and 2021)

## Setup
 First, let's mount Google Drive and import required libraries.

In [None]:
# Mount Google Drive at /content/drive so the CSV input folder and the output
# folder used below (both under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Input and output folders on the mounted Google Drive - edit these to match
# your own Drive layout
csv_dir = "/content/drive/MyDrive/earthengine/conversion/csv"
output_dir = "/content/drive/MyDrive/earthengine/conversion/output"

# Make sure the output folder exists before anything is written to it
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Year columns (named as in the CSV headers) that are carried through the analysis
target_years = [
    'y2015',
    'y2017',
    'y2019',
    'y2021',
]

## Define Processing Functions
Create a function that reshapes a CSV file into long format. The coordinate columns (x_coord, y_coord) are always kept as identifiers.

In [None]:
def process_csv_to_long_format(file_path, years=None):
    """Reshape one wide CSV (one column per year) into a long-format table.

    The value column of the result is named after the first underscore-separated
    token of the file name, upper-cased (e.g. ``ndvi_values.csv`` -> ``NDVI``).

    Args:
        file_path: Path to a CSV file containing ``x_coord`` and ``y_coord``
            columns plus year columns such as ``y2015``.
        years: Names of the year columns to keep. When omitted, the
            notebook-level ``target_years`` list is used.

    Returns:
        DataFrame with columns ``x_coord``, ``y_coord``, ``year`` (string
        without the ``y`` prefix) and one value column named after the data type.

    Raises:
        KeyError: If the CSV lacks one of the coordinate columns.
    """
    if years is None:
        # Fall back to the notebook configuration so existing calls keep working
        years = target_years

    # Extract the data type from filename (NDVI, NDFI, or biomass)
    data_type = Path(file_path).stem.split('_')[0].upper()

    df = pd.read_csv(file_path)

    # Coordinates are always carried along as identifier columns
    id_vars = ['x_coord', 'y_coord']
    missing = [col for col in id_vars if col not in df.columns]
    if missing:
        # Fail with a message naming the file instead of an opaque melt error
        raise KeyError(f"{file_path} is missing coordinate column(s): {missing}")

    # Keep only the requested year columns, in the order they appear in the file
    year_cols = [col for col in df.columns if col in years]

    # Melt the dataframe to long format
    long_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=year_cols,
        var_name='year',
        value_name=data_type,
    )

    # Convert year format from 'y2015' to '2015'; regex=False keeps the
    # replacement literal regardless of the pandas version's default
    long_df['year'] = long_df['year'].str.replace('y', '', regex=False)

    return long_df

# Function to get 2021 biomass data for initial sampling

In [None]:
def get_2021_biomass_data(file_path):
    """Load the biomass CSV and return coordinates plus the 2021 values.

    The ``y2021`` column is exposed under the name ``BIOMASS``; every other
    year column is dropped.
    """
    wanted_columns = ['x_coord', 'y_coord', 'y2021']
    raw_table = pd.read_csv(file_path)
    subset = raw_table.loc[:, wanted_columns]
    subset = subset.rename(columns={'y2021': 'BIOMASS'})
    return subset

## Sample from 2021 Biomass Data
Sample up to 40 points from each of the five biomass intervals of the 2021 data (200 points in total) and record their coordinates.

# Find the biomass file. Candidates are sorted so that the choice is
# deterministic when several files match (os.listdir gives no order guarantee).
biomass_file = None
for file in sorted(os.listdir(csv_dir)):
    if file.startswith('biomass') and file.endswith('.csv'):
        biomass_file = os.path.join(csv_dir, file)
        break

if not biomass_file:
    raise FileNotFoundError("No biomass CSV file found")

# Get 2021 biomass data for sampling
biomass_2021_df = get_2021_biomass_data(biomass_file)

# Inclusive (lower, upper) biomass bounds of each sampling interval.
# NOTE(review): the bounds leave gaps (e.g. between 34 and 35), so non-integer
# biomass values falling there are never sampled - confirm the data is integer-valued.
intervals = [(5, 34), (35, 64), (65, 94), (95, 124), (125, 150)]
samples_per_interval = 40  # up to 40 samples per interval = 200 total over 5 intervals
sampled_coords = []  # Sampled (x_coord, y_coord) tuples


for start, end in intervals:
    # Filter data within interval (both bounds inclusive)
    interval_data = biomass_2021_df[
        biomass_2021_df['BIOMASS'].between(start, end)
    ]

    # Sample from this interval; the fixed seed keeps the draw reproducible
    if len(interval_data) >= samples_per_interval:
        sampled = interval_data.sample(n=samples_per_interval, random_state=42)
    else:
        sampled = interval_data  # Take all available if less than needed
        print(f"Warning: Only {len(interval_data)} samples available for interval {start}-{end}")

    # Collect the coordinate pairs in row order without a per-row .loc lookup
    sampled_coords.extend(zip(sampled['x_coord'], sampled['y_coord']))

print(f"Total coordinate pairs sampled from 2021 data: {len(set(sampled_coords))}")

## Process All CSV Files
Process all CSV files and create datasets with coordinates.

In [None]:
# Long-format table for every CSV in the input folder, keyed by data type
# (the upper-cased first token of the file name)
dfs = {'with_coords': {}}
for file in os.listdir(csv_dir):
    if not file.endswith('.csv'):
        continue  # ignore anything that is not a CSV
    data_type = Path(file).stem.split('_')[0].upper()
    file_path = os.path.join(csv_dir, file)
    dfs['with_coords'][data_type] = process_csv_to_long_format(file_path)
    print(f"Processed {file}")

## Filter and Merge Data
Create filtered datasets using the sampled coordinates and merge them to create final results.

In [None]:
# Keep only the rows of each dataset whose coordinates were sampled
filtered_dfs = {}

for data_type, df in dfs['with_coords'].items():
    # Build one (x, y) tuple per row and test it against the sampled pairs
    coord_pairs = df[['x_coord', 'y_coord']].apply(tuple, axis=1)
    keep_mask = coord_pairs.isin(sampled_coords)
    filtered_dfs[data_type] = df[keep_mask]

# Biomass is the base table; the index datasets are attached to it by
# coordinate and year
merge_keys = ['x_coord', 'y_coord', 'year']
result_with_coords = filtered_dfs['BIOMASS'][merge_keys + ['BIOMASS']]
for data_type in ('NDVI', 'NDFI'):
    if data_type not in filtered_dfs:
        continue  # this index was not among the processed files
    index_table = filtered_dfs[data_type][merge_keys + [data_type]]
    result_with_coords = result_with_coords.merge(
        index_table,
        on=merge_keys,
        how='left',
    )

results = {'with_coords': result_with_coords.sort_values(merge_keys)}

## Save Results and Display Summary
Save the results and display summary statistics.

In [None]:
# Write the merged table to the output folder
output_path = os.path.join(output_dir, 'sampled_data_with_coords.csv')
final_table = results['with_coords']
final_table.to_csv(output_path, index=False)

print(f"\nResults saved to: {output_path}")

# Overview of what was sampled
print("\nSummary of sampled data:")
n_pairs = final_table[['x_coord', 'y_coord']].apply(tuple, axis=1).nunique()
print(f"Total coordinate pairs: {n_pairs}")
years_present = sorted(final_table['year'].unique())
print(f"Years covered: {years_present}")

# Min / max / mean of every available value column, one year at a time
print("\nValue ranges by year:")
for year in years_present:
    print(f"\nYear {year}:")
    year_data = final_table[final_table['year'] == year]
    for column in ('BIOMASS', 'NDVI', 'NDFI'):
        if column not in year_data.columns:
            continue  # dataset was not part of the merge
        values = year_data[column]
        print(f"  {column}:")
        print(f"    Min: {values.min():.2f}")
        print(f"    Max: {values.max():.2f}")
        print(f"    Mean: {values.mean():.2f}")