In [83]:
# MapBiomas Soil (beta): Script 04a. Environmental covariates - get GEE data
# Alessandro Samuel-Rosa & Taciara Zborowski Horst
# 2024 CC-BY

# Import necessary libraries
import os
import pandas as pd

# Working directory: the parent folder of the directory the notebook is executed from
src_dir = os.getcwd()
work_dir = os.path.dirname(src_dir)

# Both TXT files share the same layout. Field separator: tab. Decimal separator: comma
read_options = dict(sep='\t', decimal=',', low_memory=False)

# Soil data: '03-febr-data.txt' in the 'data' folder
file_path = os.path.join(work_dir, 'data', '03-febr-data.txt')
soildata_df = pd.read_csv(file_path, **read_options)
print(soildata_df.shape)

# Coordinates: '04a-febr-data.txt' in the 'data' folder
file_path = os.path.join(work_dir, 'data', '04a-febr-data.txt')
previous_df = pd.read_csv(file_path, **read_options)
print(previous_df.shape)

# Check whether the data changed between the two files. Only the columns below take part in the
# comparison. DataFrame.equals() returns True when both selections are equal, False otherwise.
key_columns = ['dataset_id', 'observacao_id', 'coord_x', 'coord_y', 'data_coleta_ano']
current_keys = soildata_df[key_columns]
previous_keys = previous_df[key_columns]
same_data = current_keys.equals(previous_keys)
print("The DataFrames are identical." if same_data else "The DataFrames are different.")

# Remove the temporaries so that only soildata_df is carried over to the next cells
del current_keys, previous_keys, previous_df, same_data, key_columns, read_options
# 17 606 layers

(17606, 61)
(17606, 112)
The DataFrames are different.


In [84]:
# Keep only the soil layers that have both geographic coordinates (coord_x and coord_y)
has_coordinates = soildata_df['coord_x'].notnull() & soildata_df['coord_y'].notnull()

# An event is identified by the columns below. Rows repeating the same combination of values are
# duplicates: the first occurrence is kept, and the others are removed
event_columns = ['dataset_id', 'observacao_id', 'coord_x', 'coord_y', 'data_coleta_ano']
soildata_xy = soildata_df.loc[has_coordinates, event_columns].drop_duplicates()

# Compare the number of events with the expected number
target = 11312
n_events = soildata_xy.shape[0]
print(
  'There should be', target, 'events:', target == n_events,
  '\nThere are', n_events, 'events')

There should be 11312 events: True 
There are 11312 events


In [85]:
# Import necessary libraries
import ee
import geemap

# Initialize the Earth Engine API
# NOTE(review): presumably requires Earth Engine credentials to be already set up on this
# machine (e.g. via ee.Authenticate()) - confirm
ee.Initialize()

# Convert DataFrame to Earth Engine Feature Collection
# coord_y is passed as the latitude and coord_x as the longitude of each event in soildata_xy.
# The resulting soildata_fc is the set of points sampled in the following cells.
soildata_fc = geemap.df_to_ee(soildata_xy, latitude = 'coord_y', longitude = 'coord_x')
# Alternative kept for quick tests: use a random sample of 100 events instead of all events
# soildata_fc = geemap.df_to_ee(soildata_xy.sample(n=100), latitude = 'coord_y', longitude = 'coord_x')

In [86]:
# Soil Grids 250m v2.0
# This takes about 30 minutes to run

def sample_soilgrids(points_fc, asset_name):
    """Sample one SoilGrids 250m v2.0 asset at a set of points.

    Args:
        points_fc: Earth Engine feature collection with the points to be sampled.
        asset_name: Name of the asset under 'projects/soilgrids-isric/', e.g. 'clay_mean'.

    Returns:
        pandas DataFrame, as returned by geemap.ee_to_df, with the properties of the sampled
        points and the values extracted from the asset at a scale of 250 m.
    """
    asset_image = ee.Image("projects/soilgrids-isric/" + asset_name)
    sampled_fc = geemap.extract_values_to_points(points_fc, asset_image, scale = 250)
    return geemap.ee_to_df(sampled_fc)

# Sample each soil property at the location of the events
bdod_mean = sample_soilgrids(soildata_fc, 'bdod_mean')  # bulk density
clay_mean = sample_soilgrids(soildata_fc, 'clay_mean')  # clay content
sand_mean = sample_soilgrids(soildata_fc, 'sand_mean')  # sand content
soc_mean = sample_soilgrids(soildata_fc, 'soc_mean')    # soil organic carbon
cfvo_mean = sample_soilgrids(soildata_fc, 'cfvo_mean')  # coarse fragments volume

# Merge dataframes into soilgrids_df
# The columns that identify an event are used as merge keys
merge_columns = ['dataset_id', 'observacao_id', 'coord_x', 'coord_y', 'data_coleta_ano']
soilgrids_df = bdod_mean
for layer_df in (clay_mean, sand_mean, soc_mean, cfvo_mean):
    soilgrids_df = soilgrids_df.merge(layer_df, on = merge_columns)

# Rename columns
# Replace dash with underscores and remove _mean from column names. Both are literal text
# replacements, so regex=False is set explicitly instead of relying on the pandas default, which
# differs between pandas versions
soilgrids_df.columns = soilgrids_df.columns.str.replace('-', '_', regex=False)
soilgrids_df.columns = soilgrids_df.columns.str.replace('_mean', '', regex=False)

# Drop columns with 30_60cm, 60_100cm or 100_200cm in the name
deep_layers = ('30_60cm', '60_100cm', '100_200cm')
deep_columns = [
    name for name in soilgrids_df.columns if any(layer in name for layer in deep_layers)]
soilgrids_df = soilgrids_df.drop(columns = deep_columns)

# Check if number of rows of soilgrids_df is the same as that of soildata_xy
# If the number of rows is the same, the merge was successful
target = soildata_xy.shape[0]
print(
  'There should be', target, 'events:',
  target == soilgrids_df.shape[0], '\nThere are', soilgrids_df.shape[0], 'events')

There should be 11312 events: True 
There are 11312 events


In [87]:
from datetime import datetime

# MapBiomas LULC Collection 7.1
# Sampling all events in a single request ended with "Computation timed out.", so the events are
# sampled in batches and the results are concatenated.

# Import the MapBiomas Collection 7.1
collection = 'projects/mapbiomas-workspace/public/collection7_1/mapbiomas_collection71_integration_v1'
image = ee.Image(collection)

# Extract the land cover information from the MapBiomas Collection 7.1
# Only the columns that identify an event are sent to Earth Engine, which also makes this cell
# independent of any covariate columns already merged into soildata_xy
event_columns = ['dataset_id', 'observacao_id', 'coord_x', 'coord_y', 'data_coleta_ano']
# NOTE(review): the batch size is a guess - reduce it if a request still times out
batch_size = 1000
events_df = soildata_xy[event_columns]
sampled_batches = []
for start in range(0, len(events_df), batch_size):
    batch_fc = geemap.df_to_ee(
        events_df.iloc[start:start + batch_size], latitude = 'coord_y', longitude = 'coord_x')
    batch_fc = geemap.extract_values_to_points(batch_fc, image, scale = 30)
    sampled_batches.append(geemap.ee_to_df(batch_fc))
# pd.concat aligns the columns by name, so a batch lacking a column gets missing values there
mapbiomas_df = pd.concat(sampled_batches, ignore_index=True)

# Rename columns
# Remove classification_ prefix from column names (literal text, hence regex=False)
mapbiomas_df.columns = mapbiomas_df.columns.str.replace('classification_', '', regex=False)

# Get LULC class at the year of sampling (data_coleta_ano)
# Each column in the MapBiomas dataset represents a year. The column name is the year of the
# classification. The value is the class code. We need to extract the class code for the year of
# sampling, which is stored in the data_coleta_ano column.
# Step 1: Identify the columns named after a year (four digits)
year_columns = [
    name for name in mapbiomas_df.columns if len(str(name)) == 4 and str(name).isdigit()]
# Step 2: Write the year of sampling in the same format as the column names. Going through a
# number makes years stored as float (e.g. 1990.0) match the column '1990'. Missing years become
# an empty string, which matches no column
sampling_year = pd.to_numeric(mapbiomas_df['data_coleta_ano'], errors='coerce')
year_key = sampling_year.map(lambda year: str(int(year)) if pd.notna(year) else '')
# Step 3: Pick, for each event, the class code in the column of its year of sampling. Events
# sampled in a year without a column (before the first or after the last classified year, or with
# unknown year) keep a missing class code
lulc_code = pd.Series(float('nan'), index=mapbiomas_df.index, dtype='float64')
for year in year_columns:
    sampled_in_year = year_key == year
    lulc_code.loc[sampled_in_year] = pd.to_numeric(
        mapbiomas_df.loc[sampled_in_year, year], errors='coerce')
# Step 4: Convert the class codes to strings without decimal places. A missing class code is set
# to '0' (no data), which is reclassified as 'unknown' below
lulc = lulc_code.map(lambda code: str(int(code)) if pd.notna(code) else '0')

# Drop the columns with years as column names: they are no longer needed
mapbiomas_df = mapbiomas_df.drop(columns = year_columns)

# Reclassify the land cover classes
forest_codes = ['1', '3', '4', '5', '49']
nonforest_codes = ['10', '11', '12', '32', '29', '50', '13']
pasture_codes = ['15']
agriculture_codes = ['14', '18', '19', '39', '20', '40', '62', '41', '36', '46', '47', '48', '21']
forestry_codes = ['9']
nonvegetation_codes = ['22', '23', '24', '30', '25', '26', '33', '31', '27']
unknown_codes = ['0']
lulc_classes = {
    'forest': forest_codes,
    'nonforest': nonforest_codes,
    'pasture': pasture_codes,
    'agriculture': agriculture_codes,
    'forestry': forestry_codes,
    'nonvegetation': nonvegetation_codes,
    'unknown': unknown_codes,
}
code_to_class = {code: name for name, codes in lulc_classes.items() for code in codes}
# Class codes that are not listed above are kept as they are
mapbiomas_df['lulc'] = lulc.map(code_to_class).fillna(lulc)

# Print summary of the land cover classes
print('\nDistribution of land cover/land use classes:')
print(mapbiomas_df['lulc'].value_counts())

# Check if number of rows of mapbiomas_df is the same as that of soildata_xy
# If the number of rows is the same, the sampling was successful
target = soildata_xy.shape[0]
print(
  '\nThere should be', target, 'events:',
  target == mapbiomas_df.shape[0], '\nThere are', mapbiomas_df.shape[0], 'events')

Exception: Computation timed out.

In [82]:
# Merge data sampled from SoilGrids and MapBiomas into soildata_xy
if soilgrids_df.shape[0] != soildata_xy.shape[0] or mapbiomas_df.shape[0] != soildata_xy.shape[0]:
    raise ValueError('The number of rows of soilgrids_df, mapbiomas_df, and soildata_xy must be the same.')

# Merge dataframes into soildata_xy
# The merge starts from the key columns only. This way, running this cell again does not merge the
# covariates into a soildata_xy that already contains them, which would produce duplicated columns
# with _x and _y suffixes.
# validate='one_to_one' makes pandas raise an error if an event appears more than once in any of
# the data frames, instead of silently multiplying rows.
merge_columns = ['dataset_id', 'observacao_id', 'coord_x', 'coord_y', 'data_coleta_ano']
soildata_xy = (
    soildata_xy[merge_columns]
    .merge(soilgrids_df, on = merge_columns, validate = 'one_to_one')
    .merge(mapbiomas_df, on = merge_columns, validate = 'one_to_one')
)

# Check if number of rows of soildata_xy is the same as that of soilgrids_df and mapbiomas_df
# If the number of rows is the same, the merge was successful
target = soilgrids_df.shape[0]
print(
  '\nThere should be', target, 'events:',
  target == soildata_xy.shape[0], '\nThere are', soildata_xy.shape[0], 'events')

ValueError: The number of rows of soilgrids_df, mapbiomas_df, and soildata_xy must be the same.