<a href="https://colab.research.google.com/github/ulfboge/temporal-landcover-vectorizer/blob/main/scripts/python/stratified_biomass_sampler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spatial Data Sampling and Integration

This notebook:
1. Processes multiple CSV files containing temporal spatial data
2. Creates a unified long-format table
3. Samples biomass data in equal intervals
4. Integrates corresponding NDVI and NDFI values

## Setup
First, let's mount Google Drive and import required libraries.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Set the working directory
csv_dir = "/content/drive/MyDrive/earthengine/conversion/csv"
output_dir = "/content/drive/MyDrive/earthengine/conversion/output"
os.makedirs(output_dir, exist_ok=True)

## Process CSV Files
Read and transform CSV files into long format

In [3]:
def process_csv_to_long_format(file_path):
    # Extract the data type from filename (NDVI, NDFI, or biomass)
    data_type = Path(file_path).stem.split('_')[0].upper()

    # Read CSV
    df = pd.read_csv(file_path)

    # Identify year columns (starting with 'y')
    year_cols = [col for col in df.columns if col.startswith('y')]

    # Melt the dataframe to long format
    long_df = pd.melt(
        df,
        id_vars=['pixel_id', 'x_coord', 'y_coord'],
        value_vars=year_cols,
        var_name='year',
        value_name=data_type
    )

    # Convert year format from 'y2013' to '2013'
    long_df['year'] = long_df['year'].str.replace('y', '')

    return long_df

# Process all CSV files
dfs = {}
for file in os.listdir(csv_dir):
    if file.endswith('.csv'):
        file_path = os.path.join(csv_dir, file)
        data_type = Path(file).stem.split('_')[0].upper()
        dfs[data_type] = process_csv_to_long_format(file_path)
        print(f"Processed {file}")

Processed biomass_Area_6_vectorized.csv
Processed biomass_Area_5_vectorized.csv
Processed biomass_Area_7_vectorized.csv
Processed biomass_Area_8_vectorized.csv


## Sample Biomass Data
Sample 100 points from biomass data with equal distribution across intervals

In [4]:
# Define intervals
intervals = [(5, 34), (35, 64), (65, 94), (95, 124), (125, 150)]
samples_per_interval = 20  # 20 samples per interval = 100 total

# Filter and sample biomass data
biomass_df = dfs['BIOMASS']
sampled_pixels = []

for start, end in intervals:
    # Filter data within interval
    interval_data = biomass_df[
        (biomass_df['BIOMASS'] >= start) &
        (biomass_df['BIOMASS'] <= end)
    ]

    # Sample from this interval
    if len(interval_data) >= samples_per_interval:
        sampled = interval_data.sample(n=samples_per_interval, random_state=42)
    else:
        sampled = interval_data  # Take all available if less than needed
        print(f"Warning: Only {len(interval_data)} samples available for interval {start}-{end}")

    sampled_pixels.extend(sampled['pixel_id'].unique())

print(f"Total unique pixels sampled: {len(set(sampled_pixels))}")

Total unique pixels sampled: 100


## Integrate Data
Combine data from all sources using sampled pixel IDs

In [5]:
# Filter each dataset for sampled pixels
filtered_dfs = {}
for data_type, df in dfs.items():
    filtered_dfs[data_type] = df[df['pixel_id'].isin(sampled_pixels)]

# Merge all datasets
result = filtered_dfs['BIOMASS'][['pixel_id', 'x_coord', 'y_coord', 'year', 'BIOMASS']]
for data_type in ['NDVI', 'NDFI']:
    if data_type in filtered_dfs:
        result = result.merge(
            filtered_dfs[data_type][['pixel_id', 'year', data_type]],
            on=['pixel_id', 'year'],
            how='left'
        )

# Sort the results
result = result.sort_values(['pixel_id', 'year'])

# Save the results
output_path = os.path.join(output_dir, 'sampled_data.csv')
result.to_csv(output_path, index=False)

print(f"Results saved to: {output_path}")
print("\nFirst few rows of the result:")
display(result.head())

Results saved to: /content/drive/MyDrive/earthengine/conversion/output/sampled_data.csv

First few rows of the result:


Unnamed: 0,pixel_id,x_coord,y_coord,year,BIOMASS
14258,14259,16.004385,-8.260009,2013,62
2882752,14259,16.004385,-8.260009,2015,28
5751246,14259,16.004385,-8.260009,2017,61
8619740,14259,16.004385,-8.260009,2019,56
30283,30284,16.122064,-8.282467,2013,114


## Summary Statistics
Display summary of the sampled data

In [6]:
print("Summary of sampled data:")
print(f"Total unique pixels: {result['pixel_id'].nunique()}")
print(f"Years covered: {sorted(result['year'].unique())}")
print("\nValue ranges:")
for column in ['BIOMASS', 'NDVI', 'NDFI']:
    if column in result.columns:
        print(f"{column}:")
        print(f"  Min: {result[column].min():.2f}")
        print(f"  Max: {result[column].max():.2f}")
        print(f"  Mean: {result[column].mean():.2f}")

Summary of sampled data:
Total unique pixels: 100
Years covered: ['2013', '2015', '2017', '2019']

Value ranges:
BIOMASS:
  Min: 3.00
  Max: 206.00
  Mean: 77.27
