# Samples in areas of stable LULC

Alessandro Samuel-Rosa, Taciara Zborowski Horst

2025-06-20

This script samples the MapBiomas land use and land cover (LULC) classification dataset (Collection 9) and identifies the soil samples that fall within areas of stable LULC. We define an area of stable LULC as one whose land use or land cover class has not changed for at least 20 years.

License: MIT

In [1]:
# Import necessary libraries
import os
import pandas as pd

# Identify working directory, saving the path to a variable
# (work_dir is the parent of the current working directory)
src_dir = os.getcwd()
work_dir = os.path.dirname(src_dir)

# Read the soil data processed in the previous script
# folder: data
# file: 40_soildata_soc.txt
# separator: tab
file_path = os.path.join(work_dir, 'data', '40_soildata_soc.txt')
soildata = pd.read_csv(file_path, sep='\t', low_memory=False)
print(soildata.shape)

# Print the first 5 rows of the data frame
# (head() is used so that the output matches this description instead of
# dumping the whole frame)
print(soildata.head())

(12666, 7)
                                      id    coord_x    coord_y  year  \
0                    ctb0003-sm-dnos-001 -53.794645 -29.651271  2009   
1                    ctb0003-sm-dnos-002 -53.793987 -29.650564  2009   
2                    ctb0003-sm-dnos-003 -53.793993 -29.650232  2009   
3                    ctb0003-sm-dnos-004 -53.794345 -29.650311  2009   
4                    ctb0003-sm-dnos-005 -53.792947 -29.650838  2009   
...                                  ...        ...        ...   ...   
12661                  ctb0717-22-XYREP4 -72.650107  -7.048733  1985   
12662  ctb0053-RO_567_INDEFORMADA-XYREP1 -62.640199 -12.960279  2015   
12663  ctb0053-RO_567_INDEFORMADA-XYREP2 -62.639774 -12.959962  2015   
12664  ctb0053-RO_567_INDEFORMADA-XYREP3 -62.640601 -12.959971  2015   
12665  ctb0053-RO_567_INDEFORMADA-XYREP4 -62.640242 -12.959482  2015   

       soc_stock_g_m2  IFN_index  YEAR_index  
0                1341          0           0  
1                2246         

In [None]:
# (this takes a couple of minutes to run)

# Import necessary libraries
import ee
import geemap

# Start the Earth Engine session for the project workspace
# (uncomment ee.Authenticate() the first time the notebook runs on a machine)
# ee.Authenticate()
ee.Initialize(project='mapbiomas-solos-workspace')

# Turn the soil data frame into an Earth Engine FeatureCollection of points,
# using coord_x as longitude and coord_y as latitude
soildata_fc = geemap.df_to_ee(soildata, longitude='coord_x', latitude='coord_y')
n_soil_features = soildata_fc.size().getInfo()
print(n_soil_features, 'features')

# Function to split a feature collection (sampling points) into chunks
def split_sampling_points(fc, chunk_size):
    """Split a feature collection into consecutive subsets.

    Parameters
    ----------
    fc : ee.FeatureCollection
        Collection of sampling points to split.
    chunk_size : int
        Maximum number of features per subset.

    Returns
    -------
    list of ee.FeatureCollection
        One collection per slice of ``chunk_size`` features; the last one
        holds the remainder.
    """
    features = fc.toList(fc.size())
    # The number of features is fetched client-side to drive the Python loop
    n_features = features.size().getInfo()
    subsets = []
    for start in range(0, n_features, chunk_size):
        subset = features.slice(start, start + chunk_size)
        subsets.append(ee.FeatureCollection(subset))
    return subsets

# Process the sample points in subsets of 1000 points each;
# smaller requests are needed to avoid timeout errors
chunk_size = 1000
soildata_chunks = split_sampling_points(soildata_fc, chunk_size)
n_soil_chunks = len(soildata_chunks)
print(n_soil_chunks, 'chunks')

12666 features
13 chunks


In [3]:
# (this takes about 10 minutes to run)

# MapBiomas LULC Collection

# Import the MapBiomas Collection 9.0
collection = 'projects/mapbiomas-public/assets/brazil/lulc/collection9/mapbiomas_collection90_integration_v1'
mapbiomas_image = ee.Image(collection)

# Initialize an empty list to store DataFrames
dataframes = []

# Loop over each subset and sample the image at the points (30 m scale)
for chunk in soildata_chunks:
    sampled_points = geemap.extract_values_to_points(chunk, mapbiomas_image, scale=30)
    sampled_df = geemap.ee_to_df(sampled_points)
    dataframes.append(sampled_df)

# Concatenate all DataFrames into a single DataFrame
mapbiomas_df = pd.concat(dataframes, ignore_index=True)

# Rename columns
# Remove classification_ prefix from column names, leaving the year only
# (regex=False: the prefix is a literal string, not a pattern)
mapbiomas_df.columns = mapbiomas_df.columns.str.replace('classification_', '', regex=False)

# Save the sampled data to a tab-separated file.
# The intermediate LULC table gets its own file name: the final cell of this
# notebook writes the stacked soil data to '41_soildata_soc.txt', and sharing
# that path would leave a table with a different schema there whenever the
# notebook is only partially run.
output_file = os.path.join(work_dir, 'data', '41_soildata_lulc.txt')
mapbiomas_df.to_csv(output_file, sep='\t', index=False)
# Print the shape of the final DataFrame
print(mapbiomas_df.shape)
# Print the first 5 rows of the final DataFrame
print(mapbiomas_df.head())

(12666, 46)
   IFN_index  YEAR_index  1985  1986  1987  1988  1989  1990  1991  1992  ...  \
0          0           0    21    21    21    21    21    21    21    21  ...   
1          0           0    15    15    15    21    21    21    21    21  ...   
2          0           0    21    21    21    21    21    21    21    21  ...   
3          0           0    21    21    21    21    21    21    21    21  ...   
4          0           0    12    12    12    12    12    12    12    12  ...   

   2019  2020  2021  2022  2023    coord_x    coord_y                   id  \
0    21    21    21    21     3 -53.794645 -29.651271  ctb0003-sm-dnos-001   
1    12    12    12    12    12 -53.793987 -29.650564  ctb0003-sm-dnos-002   
2    12    12    12    12    12 -53.793993 -29.650232  ctb0003-sm-dnos-003   
3     3     3     3     3     3 -53.794345 -29.650311  ctb0003-sm-dnos-004   
4    12    12    12    12    12 -53.792947 -29.650838  ctb0003-sm-dnos-005   

   soc_stock_g_m2  year  
0     

In [4]:
# Check if there are soil samples with no information on LULC

# LULC information is stored in columns named '1985' through '2023';
# a sample is flagged only when every one of those columns is null
lulc_by_year = mapbiomas_df.loc[:, '1985':'2023']
all_years_missing = lulc_by_year.isnull().all(axis=1)
no_lulc = mapbiomas_df[all_years_missing]
# Shape of the DataFrame with no LULC information
print(no_lulc.shape)
# First 5 rows of the DataFrame with no LULC information
print(no_lulc.head())

(0, 46)
Empty DataFrame
Columns: [IFN_index, YEAR_index, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, coord_x, coord_y, id, soc_stock_g_m2, year]
Index: []

[0 rows x 46 columns]


In [14]:
# Every feature has LULC information, so the analysis can move on.

# Select the features sampled after 2004 ('year' > 2004)
# NOTE(review): the LULC columns start at 1985, so a sample from 2004 would
# also have a full 20-year window (1985-2004) - confirm that excluding 2004
# is intended.
sampled_after_2004 = mapbiomas_df['year'] > 2004
recent_features = mapbiomas_df.loc[sampled_after_2004].copy()
print(recent_features.shape)

# For each feature, retrieve the LULC information for the 19 previous years plus the sampling year
def get_lulc_20_years(row):
    """Return the LULC codes of the 20-year window ending at the sampling year.

    Parameters
    ----------
    row : pandas.Series
        One feature, holding a 'year' entry (the sampling year) and the LULC
        codes under labels made of the year digits (e.g. '1985').

    Returns
    -------
    list
        Non-null LULC codes for the years ``year - 19`` through ``year``, in
        the order the year labels appear in the row. Years without a matching
        label are simply absent, so the list may hold fewer than 20 values.
    """
    last_year = int(row['year'])
    first_year = last_year - 19
    # Year labels are taken from the row itself (not from a global DataFrame),
    # so the function depends only on its argument; years are compared as
    # integers rather than as strings.
    year_cols = [
        col for col in row.index
        if isinstance(col, str) and col.isdigit() and first_year <= int(col) <= last_year
    ]
    return row[year_cols].dropna().tolist()

# Collect the 20-year LULC sequence of every feature
lulc_windows = recent_features.apply(get_lulc_20_years, axis=1)
recent_features['lulc_20_years'] = lulc_windows

# Flag the features whose LULC sequence holds a single distinct class
# (an empty sequence is never considered stable)
recent_features['lulc_20_years_same'] = recent_features['lulc_20_years'].apply(
    lambda codes: bool(codes) and len(set(codes)) == 1
)

# For the features flagged as stable, look up the LULC class of the sampling year
def get_lulc_sampling_year(row):
    """Return the LULC code of the sampling year for a stable feature.

    Parameters
    ----------
    row : pandas.Series
        One feature with 'year', 'lulc_20_years_same' and the yearly LULC
        codes stored under labels such as '2009'.

    Returns
    -------
    The value stored under the sampling-year label, or None when the feature
    is not flagged as stable or the row has no label for that year.
    """
    if not row['lulc_20_years_same']:
        return None
    sampling_col = str(int(row['year']))
    if sampling_col in row:
        return row[sampling_col]
    return None

# Store the sampling-year LULC class (None for features that are not stable)
sampling_year_lulc = recent_features.apply(get_lulc_sampling_year, axis=1)
recent_features['lulc_sampling_year'] = sampling_year_lulc

print(recent_features.shape)
print(recent_features.head())

(4790, 46)
(4790, 49)
   IFN_index  YEAR_index  1985  1986  1987  1988  1989  1990  1991  1992  ...  \
0          0           0    21    21    21    21    21    21    21    21  ...   
1          0           0    15    15    15    21    21    21    21    21  ...   
2          0           0    21    21    21    21    21    21    21    21  ...   
3          0           0    21    21    21    21    21    21    21    21  ...   
4          0           0    12    12    12    12    12    12    12    12  ...   

   2022  2023    coord_x    coord_y                   id  soc_stock_g_m2  \
0    21     3 -53.794645 -29.651271  ctb0003-sm-dnos-001            1341   
1    12    12 -53.793987 -29.650564  ctb0003-sm-dnos-002            2246   
2    12    12 -53.793993 -29.650232  ctb0003-sm-dnos-003            2340   
3     3     3 -53.794345 -29.650311  ctb0003-sm-dnos-004            3933   
4    12    12 -53.792947 -29.650838  ctb0003-sm-dnos-005            2738   

   year                           

In [15]:
# Retain the features whose LULC class is identical across the whole 20-year window
is_stable = recent_features['lulc_20_years_same']
recent_features_same_lulc = recent_features.loc[is_stable].copy()

# Shape of the DataFrame of stable features
print(recent_features_same_lulc.shape)

# First 5 rows of the DataFrame of stable features
print(recent_features_same_lulc.head())

(2785, 49)
    IFN_index  YEAR_index  1985  1986  1987  1988  1989  1990  1991  1992  \
4           0           0    12    12    12    12    12    12    12    12   
6           0           0    12    12    12    12    12    12    12    12   
7           0           0    12    12    12    12    12    12    12    12   
9           0           0     3     3     3     3     3     3     3     3   
10          0           0     3     3     3     3     3     3     3     3   

    ...  2022  2023    coord_x    coord_y                   id  \
4   ...    12    12 -53.792947 -29.650838  ctb0003-sm-dnos-005   
6   ...    12    12 -53.793272 -29.649962  ctb0003-sm-dnos-007   
7   ...    12    12 -53.793633 -29.648468  ctb0003-sm-dnos-008   
9   ...     3     3 -53.793019 -29.647970  ctb0003-sm-dnos-010   
10  ...     3     3 -53.792816 -29.648040  ctb0003-sm-dnos-011   

    soc_stock_g_m2  year                                      lulc_20_years  \
4             2738  2009  [12, 12, 12, 12, 12, 12,

In [16]:
# Number of stable features per LULC class of the sampling year, ordered by LULC code
class_frequency = recent_features_same_lulc['lulc_sampling_year'].value_counts()
lulc_counts = class_frequency.sort_index()
print(lulc_counts)

lulc_sampling_year
3.0     973
4.0     366
6.0      17
9.0      97
11.0     14
12.0    450
15.0    546
20.0     25
21.0    130
24.0     15
25.0      4
29.0     56
30.0      1
33.0     10
39.0     46
41.0     27
46.0      2
48.0      1
49.0      5
Name: count, dtype: int64


In [18]:
# Keep only those features where the LULC class in the sampling year is:
# 3: Forest Formation
# 4: Savanna Formation
# 6: Wetland Forest
# 11: Wetland
# 12: Grassland
# 29: Rocky Outcrop
# 49: Wooded Sandbank Vegetation

# Keep LULC values as real numbers for comparison
lulc_to_keep = [3.0, 4.0, 6.0, 11.0, 12.0, 29.0, 49.0]

# The copy is taken after the filter so that the result is an independent frame
is_selected_class = recent_features_same_lulc['lulc_sampling_year'].isin(lulc_to_keep)
recent_features_same_lulc = recent_features_same_lulc[is_selected_class].copy()

# Print the shape of the DataFrame after keeping the selected LULC classes
print(recent_features_same_lulc.shape)

(1881, 49)


In [20]:
# Number of remaining features per LULC class of the sampling year, ordered by LULC code
selected_class_frequency = recent_features_same_lulc['lulc_sampling_year'].value_counts()
lulc_class_counts = selected_class_frequency.sort_index()
print(lulc_class_counts)

lulc_sampling_year
3.0     973
4.0     366
6.0      17
11.0     14
12.0    450
29.0     56
49.0      5
Name: count, dtype: int64


In [21]:
# Number of remaining features per sampling year, in chronological order
year_frequency = recent_features_same_lulc['year'].value_counts()
sampling_year_counts = year_frequency.sort_index()
print(sampling_year_counts)

year
2005     24
2006     47
2007     55
2008      6
2009    435
2010     64
2011     86
2012     21
2013    231
2014    502
2015    213
2016    165
2017     20
2018      2
2023     10
Name: count, dtype: int64


In [None]:
# Plot the points on a map using geemap

# Import necessary libraries for mapping
import folium
import geemap

# Create the map centered on Brazil
Map = geemap.Map(center=[-15, -55], zoom=4)
Map.add_basemap('SATELLITE')

# Selected stable LULC classes (MapBiomas Collection 9): (code, label, color).
# The color mapping and the legend are all derived from this single table so
# they cannot drift apart. Codes are floats, like the values in the data.
stable_lulc_classes = [
    (3.0, "Forest Formation", "#1f8d49"),
    (4.0, "Savanna Formation", "#7dc975"),
    (6.0, "Wetland Forest", "#007785"),
    (11.0, "Wetland", "#519799"),
    (12.0, "Grassland", "#d6bc74"),
    (29.0, "Rocky Outcrop", "#ffaa5f"),
    (49.0, "Wooded Sandbank Vegetation", "#02d659"),
]
lulc_code_to_color = {code: color for code, _, color in stable_lulc_classes}
legend_labels = [label for _, label, _ in stable_lulc_classes]
legend_colors = [color for _, _, color in stable_lulc_classes]

# Make sure the LULC code column is float so it matches the keys above
recent_features_same_lulc = recent_features_same_lulc.assign(
    lulc_sampling_year=recent_features_same_lulc['lulc_sampling_year'].astype(float)
)

# Only the coordinates and the popup fields are sent to the map
columns_to_keep = ["coord_x", "coord_y", "lulc_sampling_year", "id", "year", "soc_stock_g_m2"]
recent_features_map = recent_features_same_lulc[columns_to_keep].reset_index(drop=True)

# Plot points, color by LULC code, show only selected info in popup, no clustering
# NOTE(review): the recorded run of this cell warns that the color 'lightred'
# is invalid, a color that is not listed above - check whether color_map is
# actually honored by add_points_from_xy.
Map.add_points_from_xy(
    recent_features_map,
    x="coord_x",
    y="coord_y",
    color_column="lulc_sampling_year",
    color_map=lulc_code_to_color,
    layer_name="Stable LULC Sampling Year",
    marker_cluster=False,
    info_columns=["lulc_sampling_year", "id", "year", "soc_stock_g_m2"]
)

# Legend restricted to the selected stable LULC classes
Map.add_legend(
    title='Stable LULC Classes (MapBiomas)',
    labels=legend_labels,
    colors=legend_colors
)
Map

The provided color (lightred) is invalid. Using the default black color.
'#lightred' is not in web format. Need 3 or 6 hex digit.


Map(center=[-15, -55], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(ch…

## Distance from anthropogenic areas

In [26]:
# Reduce the data frame to the columns needed downstream:
# id coord_x coord_y year soc_stock_g_m2 lulc_sampling_year IFN_index YEAR_index
columns_to_keep = [
    "id", "coord_x", "coord_y", "year", "soc_stock_g_m2", "lulc_sampling_year", "IFN_index", "YEAR_index"
]
# Select the columns and renumber the rows from zero in one step
recent_features_cleaned = recent_features_same_lulc.loc[:, columns_to_keep].reset_index(drop=True)
# Shape of the cleaned DataFrame
print(recent_features_cleaned.shape)
# First 5 rows of the cleaned DataFrame
print(recent_features_cleaned.head())


(1881, 8)
                    id    coord_x    coord_y  year  soc_stock_g_m2  \
0  ctb0003-sm-dnos-005 -53.792947 -29.650838  2009            2738   
1  ctb0003-sm-dnos-007 -53.793272 -29.649962  2009            3299   
2  ctb0003-sm-dnos-008 -53.793633 -29.648468  2009            2311   
3  ctb0003-sm-dnos-010 -53.793019 -29.647970  2009            3109   
4  ctb0003-sm-dnos-011 -53.792816 -29.648040  2009            1480   

   lulc_sampling_year  IFN_index  YEAR_index  
0                12.0          0           0  
1                12.0          0           0  
2                12.0          0           0  
3                 3.0          0           0  
4                 3.0          0           0  


In [27]:
# Convert DataFrame to Earth Engine Feature Collection
stable_fc = geemap.df_to_ee(recent_features_cleaned, latitude='coord_y', longitude='coord_x')
print(stable_fc.size().getInfo(), 'features')

# Split the sample points into subsets of 1000 points each
# This is necessary to avoid timeout errors
# split_sampling_points() is reused from the cell that chunks soildata_fc;
# it is not redefined here so that the notebook keeps a single definition.
chunk_size = 1000
stable_chunks = split_sampling_points(stable_fc, chunk_size)
print(len(stable_chunks), 'chunks')

1881 features
2 chunks


In [None]:
# (this takes a few seconds to run)

# MapBiomas Degradation

# Load the MapBiomas Degradation Collection Beta (summed edges) as an image
collection = 'projects/mapbiomas-workspace/SOLOS/COVARIAVEIS/MB_DEGRADATION_BETA_SUMMED_EDGES'
edges_image = ee.Image(collection)

# Sample the image at the points of every chunk (30 m scale),
# producing one DataFrame per chunk
dataframes = [
    geemap.ee_to_df(geemap.extract_values_to_points(chunk, edges_image, scale=30))
    for chunk in stable_chunks
]
# Stack the per-chunk DataFrames
mapbiomas_degradation_df = pd.concat(dataframes, ignore_index=True)
# Drop the 'edge_sum_' prefix from the column names
renamed_columns = mapbiomas_degradation_df.columns.str.replace('edge_sum_', '')
mapbiomas_degradation_df.columns = renamed_columns

# Shape of the sampled DataFrame
print(mapbiomas_degradation_df.shape)
# First 5 rows of the sampled DataFrame
print(mapbiomas_degradation_df.head())

(1881, 47)
   IFN_index  YEAR_index    coord_x    coord_y  1985  1986  1987  1988  1989  \
0          0           0 -53.792947 -29.650838     5     5     5     5     5   
1          0           0 -53.793272 -29.649962     6     6     6     6     6   
2          0           0 -53.793633 -29.648468     6     6     6     6     6   
3          0           0 -53.793019 -29.647970     4     4     4     4     4   
4          0           0 -53.792816 -29.648040     3     5     3     3     3   

   1990  ...  2018  2019  2020  2021  2022  2023                   id  \
0     5  ...     5     2     2     2     2     2  ctb0003-sm-dnos-005   
1     6  ...     6     6     6     6     6     6  ctb0003-sm-dnos-007   
2     6  ...     6     6     6     6     6     6  ctb0003-sm-dnos-008   
3     4  ...     4     3     3     4     4     4  ctb0003-sm-dnos-010   
4     3  ...     3     3     3     3     3     3  ctb0003-sm-dnos-011   

   lulc_sampling_year  soc_stock_g_m2  year  
0                  12  

In [31]:
# Look up, for each feature, the degradation value of its sampling year
def get_degradation_sampling_year(row):
    """Return the value stored under the label of the row's sampling year.

    Parameters
    ----------
    row : pandas.Series
        One feature with a 'year' entry and yearly values stored under
        labels such as '2009'.

    Returns
    -------
    The value found under the sampling-year label, or None when the row has
    no such label.
    """
    sampling_col = str(int(row['year']))
    if sampling_col not in row:
        return None
    return row[sampling_col]

# Attach the sampling-year degradation value to every feature
degradation_at_sampling = mapbiomas_degradation_df.apply(get_degradation_sampling_year, axis=1)
mapbiomas_degradation_df['degradation_sampling_year'] = degradation_at_sampling

# Drop the features without a degradation value for the sampling year
has_degradation = mapbiomas_degradation_df['degradation_sampling_year'].notnull()
stable_degradation = mapbiomas_degradation_df.loc[has_degradation].copy()
# Shape of the DataFrame of stable features with degradation information
print(stable_degradation.shape)
# First 5 rows of the DataFrame of stable features with degradation information
print(stable_degradation.head())

(1881, 48)
   IFN_index  YEAR_index    coord_x    coord_y  1985  1986  1987  1988  1989  \
0          0           0 -53.792947 -29.650838     5     5     5     5     5   
1          0           0 -53.793272 -29.649962     6     6     6     6     6   
2          0           0 -53.793633 -29.648468     6     6     6     6     6   
3          0           0 -53.793019 -29.647970     4     4     4     4     4   
4          0           0 -53.792816 -29.648040     3     5     3     3     3   

   1990  ...  2019  2020  2021  2022  2023                   id  \
0     5  ...     2     2     2     2     2  ctb0003-sm-dnos-005   
1     6  ...     6     6     6     6     6  ctb0003-sm-dnos-007   
2     6  ...     6     6     6     6     6  ctb0003-sm-dnos-008   
3     4  ...     3     3     4     4     4  ctb0003-sm-dnos-010   
4     3  ...     3     3     3     3     3  ctb0003-sm-dnos-011   

   lulc_sampling_year  soc_stock_g_m2  year  degradation_sampling_year  
0                  12           

In [35]:
# Keep only the features whose degradation value in the sampling year is zero
# (features with a value greater than 0 are discarded)
has_zero_degradation = stable_degradation['degradation_sampling_year'] == 0
stable_degradation = stable_degradation[has_zero_degradation].copy()
# Print the shape of the DataFrame after discarding features with degradation values greater than 0
print(stable_degradation.shape)
# Print the first 5 rows of the DataFrame after discarding features with degradation values greater than 0
print(stable_degradation.head())

(586, 48)
     IFN_index  YEAR_index    coord_x    coord_y  1985  1986  1987  1988  \
43           0           0 -53.807537 -29.652779     0     0     0     0   
78           0           0 -53.807366 -29.653931     1     1     1     1   
259          0           0 -53.583600 -29.826100     0     0     0     5   
269          0           0 -53.935600 -29.700500     4     4     3     0   
270          0           0 -53.935300 -29.703100     1     1     0     0   

     1989  1990  ...  2019  2020  2021  2022  2023                   id  \
43      0     0  ...     0     0     0     0     0  ctb0003-sm-dnos-063   
78      1     1  ...     1     1     1     1     1  ctb0003-sm-dnos-113   
259     5     5  ...     0     0     0     0     0      ctb0006-obs_126   
269     0     1  ...     4     4     4     4     4      ctb0006-obs_151   
270     0     0  ...     0     0     1     1     1      ctb0006-obs_152   

     lulc_sampling_year  soc_stock_g_m2  year  degradation_sampling_year  
43     

In [36]:
# Reduce the DataFrame to the columns needed downstream
degradation_columns = [
    'id', 'coord_x', 'coord_y', 'year', 'soc_stock_g_m2',
    'lulc_sampling_year', 'degradation_sampling_year', 'IFN_index', 'YEAR_index',
]
# Select the columns and renumber the rows from zero in one step
stable_degradation_cleaned = stable_degradation[degradation_columns].reset_index(drop=True)
# Shape of the cleaned DataFrame
print(stable_degradation_cleaned.shape)
# First 5 rows of the cleaned DataFrame
print(stable_degradation_cleaned.head())

(586, 9)
                    id    coord_x    coord_y  year  soc_stock_g_m2  \
0  ctb0003-sm-dnos-063 -53.807537 -29.652779  2009            3629   
1  ctb0003-sm-dnos-113 -53.807366 -29.653931  2009            3542   
2      ctb0006-obs_126 -53.583600 -29.826100  2014            4704   
3      ctb0006-obs_151 -53.935600 -29.700500  2014            2614   
4      ctb0006-obs_152 -53.935300 -29.703100  2014            2631   

   lulc_sampling_year  degradation_sampling_year  IFN_index  YEAR_index  
0                  12                          0          0           0  
1                   3                          0          0           0  
2                  12                          0          0           0  
3                   3                          0          0           0  
4                   3                          0          0           0  


In [37]:
# Plot the points on a new geemap map, built the same way as the previous one
# Create the map centered on Brazil
Map = geemap.Map(center=[-15, -55], zoom=4)
Map.add_basemap('SATELLITE')

# Selected stable LULC classes: (code, label, color).
# The color mapping and the legend are all derived from this single table so
# they cannot drift apart. Codes are floats, like the values in the data.
stable_lulc_classes = [
    (3.0, "Forest Formation", "#1f8d49"),
    (4.0, "Savanna Formation", "#7dc975"),
    (6.0, "Wetland Forest", "#007785"),
    (11.0, "Wetland", "#519799"),
    (12.0, "Grassland", "#d6bc74"),
    (29.0, "Rocky Outcrop", "#ffaa5f"),
    (49.0, "Wooded Sandbank Vegetation", "#02d659"),
]
lulc_code_to_color = {code: color for code, _, color in stable_lulc_classes}
legend_labels = [label for _, label, _ in stable_lulc_classes]
legend_colors = [color for _, _, color in stable_lulc_classes]

# Make sure the LULC code column is float so it matches the keys above
stable_degradation_cleaned = stable_degradation_cleaned.assign(
    lulc_sampling_year=stable_degradation_cleaned['lulc_sampling_year'].astype(float)
)

# Plot points, color by LULC code, show only selected info in popup, no clustering
# NOTE(review): the recorded run of this cell warns that the color 'lightred'
# is invalid, a color that is not listed above - check whether color_map is
# actually honored by add_points_from_xy.
Map.add_points_from_xy(
    stable_degradation_cleaned,
    x="coord_x",
    y="coord_y",
    color_column="lulc_sampling_year",
    color_map=lulc_code_to_color,
    layer_name="Stable Degradation Sampling Year",
    marker_cluster=False,
    info_columns=["lulc_sampling_year", "id", "year", "soc_stock_g_m2", "degradation_sampling_year"]
)

# Legend restricted to the selected stable LULC classes
Map.add_legend(
    title='Stable LULC Classes (MapBiomas)',
    labels=legend_labels,
    colors=legend_colors
)
# Display the map
Map

The provided color (lightred) is invalid. Using the default black color.
'#lightred' is not in web format. Need 3 or 6 hex digit.


Map(center=[-15, -55], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(ch…

## Replicate points in stable LULC


In [39]:
# Build 19 replicas of stable_degradation_cleaned. Replica number k (1..19)
# has its year moved back by k - 1 and its id suffixed with "-TREP<k>".
# NOTE(review): the offsets run from 0 to 18, so the first replica (-TREP1)
# keeps the original sampling year and the oldest replica is sampling year - 18.
# Confirm this is intended rather than offsets 1 to 19.
stable_degradation_copies = [
    stable_degradation_cleaned.assign(
        year=stable_degradation_cleaned['year'] - offset,
        id=stable_degradation_cleaned['id'].astype(str) + f"-TREP{offset + 1}",
    )
    for offset in range(19)
]
# Stack the replicas into a single DataFrame
stable_degradation_all_years = pd.concat(stable_degradation_copies, ignore_index=True)
# Shape of the DataFrame with all years
print(stable_degradation_all_years.shape)
# First 5 rows of the DataFrame with all years
print(stable_degradation_all_years.head())

(11134, 9)
                          id    coord_x    coord_y  year  soc_stock_g_m2  \
0  ctb0003-sm-dnos-063-TREP1 -53.807537 -29.652779  2009            3629   
1  ctb0003-sm-dnos-113-TREP1 -53.807366 -29.653931  2009            3542   
2      ctb0006-obs_126-TREP1 -53.583600 -29.826100  2014            4704   
3      ctb0006-obs_151-TREP1 -53.935600 -29.700500  2014            2614   
4      ctb0006-obs_152-TREP1 -53.935300 -29.703100  2014            2631   

   lulc_sampling_year  degradation_sampling_year  IFN_index  YEAR_index  
0                12.0                          0          0           0  
1                 3.0                          0          0           0  
2                12.0                          0          0           0  
3                 3.0                          0          0           0  
4                 3.0                          0          0           0  


## Merge stable LULC points with original points

In [40]:
# Reduce the replicated points to the same columns as the original soil data
soildata_columns = ['id', 'coord_x', 'coord_y', 'year', 'soc_stock_g_m2', 'IFN_index', 'YEAR_index']
# Select the columns and renumber the rows from zero in one step
stable_degradation_all_years = stable_degradation_all_years[soildata_columns].reset_index(drop=True)
# Shape of the cleaned DataFrame
print(stable_degradation_all_years.shape)
# First 5 rows of the cleaned DataFrame
print(stable_degradation_all_years.head())

(11134, 7)
                          id    coord_x    coord_y  year  soc_stock_g_m2  \
0  ctb0003-sm-dnos-063-TREP1 -53.807537 -29.652779  2009            3629   
1  ctb0003-sm-dnos-113-TREP1 -53.807366 -29.653931  2009            3542   
2      ctb0006-obs_126-TREP1 -53.583600 -29.826100  2014            4704   
3      ctb0006-obs_151-TREP1 -53.935600 -29.700500  2014            2614   
4      ctb0006-obs_152-TREP1 -53.935300 -29.703100  2014            2631   

   IFN_index  YEAR_index  
0          0           0  
1          0           0  
2          0           0  
3          0           0  
4          0           0  


In [44]:
# Append the replicated stable points to the original soil data,
# then order the rows by 'id' and renumber them from zero
frames_to_stack = [soildata, stable_degradation_all_years]
stacked_data = (
    pd.concat(frames_to_stack, ignore_index=True)
    .sort_values(by='id')
    .reset_index(drop=True)
)
# Shape of the stacked DataFrame
print(stacked_data.shape)
# First 5 rows of the stacked DataFrame
print(stacked_data.head())

(23800, 7)
                    id    coord_x    coord_y  year  soc_stock_g_m2  IFN_index  \
0  ctb0003-sm-dnos-001 -53.794645 -29.651271  2009            1341          0   
1  ctb0003-sm-dnos-002 -53.793987 -29.650564  2009            2246          0   
2  ctb0003-sm-dnos-003 -53.793993 -29.650232  2009            2340          0   
3  ctb0003-sm-dnos-004 -53.794345 -29.650311  2009            3933          0   
4  ctb0003-sm-dnos-005 -53.792947 -29.650838  2009            2738          0   

   YEAR_index  
0           0  
1           0  
2           0  
3           0  
4           0  


In [None]:
# Save the stacked DataFrame (original plus replicated points) to a tab-separated file
# (pandas is already imported in the first cell, so it is not re-imported here)
output_file = os.path.join(work_dir, 'data', '41_soildata_soc.txt')
stacked_data.to_csv(output_file, sep='\t', index=False)

## Export to Google Earth Engine

In [48]:
# Destination asset for the exported table
fc_path = 'projects/mapbiomas-workspace/SOLOS/AMOSTRAS/ORIGINAIS/2025-06-20-organic-carbon-stock-gram-per-square-meter'

# Turn the stacked DataFrame into an Earth Engine FeatureCollection of points,
# using coord_x as longitude and coord_y as latitude
stable_fc = geemap.df_to_ee(stacked_data, longitude='coord_x', latitude='coord_y')

# Create the batch export task and start it
export_settings = {
    'collection': stable_fc,
    'description': 'ASR Field SOC Stock Data',
    'assetId': fc_path,
}
task = ee.batch.Export.table.toAsset(**export_settings)
task.start()