In [1]:
# Release tag of this run; it is interpolated into the input/model file paths
# read by the cells below (feature list, overlaid points, model, training table).
version = 'v20250521'

In [2]:
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import copy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold,cross_val_predict
from skmap.misc import find_files, GoogleSheet, ttprint
import joblib
import numpy as np

# `prop` names the column holding the raw soil property in the training table.
prop = 'soc'
# `tgt` names the model target column; the training cell fills it with np.log1p(train[prop]).
tgt = 'soc_log1p'

## organize covariates

In [3]:
# Names of the selected covariates: a header-less text file, one name per line.
covs = pd.read_csv(f'./metric/feature_selected_soc_{version}.txt', header=None)[0].tolist()

# Overlaid points that will receive predictions.
df = pd.read_parquet(f'./material/agg.pnt.tmp_overlaid_{version}.pq')
print(df.shape)

# Keep only rows where every selected covariate is present.
dff = df.dropna(subset=covs, how='any')
print(dff.shape)

# Keep only ids that still have at least 3 rows after the NaN drop.
# A vectorised per-row group size selects exactly the rows that
# groupby('id').filter(lambda x: len(x) >= 3) would keep (same order), but
# avoids calling a Python function once per id, which is slow with many ids.
rows_per_id = dff.groupby('id')['id'].transform('size')
dff = dff[rows_per_id >= 3].reset_index(drop=True)
print(dff.shape)

(2232873, 84)
(2216143, 84)
(2216142, 84)


## fit the model

In [4]:
# Load the saved model for this version and set how many parallel jobs it uses.
model = joblib.load(f'./model/model_rf.soc_ccc_{version}.joblib')
model.n_jobs = 90

# Training table (all points are used): add the log1p-transformed target column,
# then drop every row that misses a covariate or the target.
train = (
    pd.read_parquet(f'./material/soc.topsoil_organized_{version}.pq')
    .assign(**{tgt: lambda frame: np.log1p(frame[prop])})
    .dropna(subset=[*covs, tgt], how='any')
    .reset_index(drop=True)
)

# Fit on the selected covariates against the transformed target.
model.fit(train[covs], train[tgt])

# Cast a deep copy of the fitted model with the project-local `trees_rf` helpers,
# leaving `model` itself untouched for the point predictions.
# NOTE(review): presumably the cast model's predict() returns per-tree node-level
# outputs (they are later padded with pad_leaf_outputs_to_array) - confirm in trees_rf.
import copy
import warnings
from trees_rf import cast_node_rf, cast_tree_rf, pad_leaf_outputs_to_array

# Silence the "X has feature names, but ..." warning for the rest of the session.
warnings.filterwarnings("ignore", message="X has feature names, but")

ttprint('casting the model')
model_copy = copy.deepcopy(model)
modeln = cast_node_rf(model_copy, train[covs], train[tgt])
ttprint('finish casting')

[13:11:51] casting the model
[13:12:06] finish casting


## get prediction and uncertainty estimation for each point

In [5]:
# Point prediction: the model was fitted on a log1p target, so expm1 maps the
# output back to the original scale before it is stored in the `pred` column.
ttprint('start prediction')
y_pred = np.expm1(model.predict(dff[covs]))
dff['pred'] = y_pred
ttprint('finish prediction')

[13:12:06] start prediction
[13:12:07] finish prediction


In [6]:
# Compute the per-point prediction spread in chunks of rows, so only one chunk
# of node-level outputs is held in memory at a time.
chunk_size = 10000
n_rows = len(dff)
num_chunks = int(np.ceil(n_rows / chunk_size))

pred_std_chunks = []

ttprint(f'In total {num_chunks} chunks')

# Walk over the start offset of every chunk; the last chunk may be shorter.
for i, start in enumerate(range(0, n_rows, chunk_size)):
    end = min(start + chunk_size, n_rows)
    ttprint(f'chunk {i + 1}/{num_chunks} ({start}:{end})--------')

    chunk = dff.iloc[start:end]

    ttprint('start predicting and padding')
    node_preds_chunk = modeln.predict(chunk[covs])
    # Pad the outputs into one array (NaN as filler), then back-transform with expm1.
    nodes_chunk = np.expm1(pad_leaf_outputs_to_array(node_preds_chunk, pad_value=np.nan))

    ttprint('start calculating std')
    # NaN-aware std, so the padding values do not contribute.
    std_chunk = np.nanstd(nodes_chunk.T, axis=0)
    pred_std_chunks.append(std_chunk)

ttprint('finish all-------------------------------')

# Stitch the per-chunk results back together in row order.
dff['pred_std'] = np.concatenate(pred_std_chunks)



[13:12:07] In total 222 chunks
[13:12:07] chunk 1/222 (0:10000)--------
[13:12:07] start predicting and padding
[13:12:58] start calculating std
[13:12:59] chunk 2/222 (10000:20000)--------
[13:12:59] start predicting and padding
[13:13:49] start calculating std
[13:13:50] chunk 3/222 (20000:30000)--------
[13:13:50] start predicting and padding
[13:14:40] start calculating std
[13:14:41] chunk 4/222 (30000:40000)--------
[13:14:41] start predicting and padding
[13:15:32] start calculating std
[13:15:33] chunk 5/222 (40000:50000)--------
[13:15:33] start predicting and padding
[13:16:23] start calculating std
[13:16:24] chunk 6/222 (50000:60000)--------
[13:16:24] start predicting and padding
[13:17:15] start calculating std
[13:17:16] chunk 7/222 (60000:70000)--------
[13:17:16] start predicting and padding
[13:18:06] start calculating std
[13:18:07] chunk 8/222 (70000:80000)--------
[13:18:07] start predicting and padding
[13:18:57] start calculating std
[13:18:58] chunk 9/222 (80000

In [7]:
# Persist the prediction frame. The path is built from `version` (as the input
# paths are) instead of a hard-coded tag, so a new release cannot silently
# overwrite or mislabel the output of an older one.
# NOTE(review): the file is written in Parquet format although its name ends in
# ".gpkg"; the name is kept so existing readers of this path keep working -
# consider renaming to ".pq" together with those readers.
dff.to_parquet(f'./material/pnt_agg.predicted_all.{version}.gpkg')

In [8]:
# Show the prediction frame as the cell output (pandas abbreviates the repr of
# large frames to the first/last rows and columns).
dff

Unnamed: 0,id,nuts0,50km,100km,200km,id_50km,id_100km,id_200km,time,wv_mcd19a2v061.seasconv.m.yearly_p50_1km_s_YYYY0101_YYYY1231_go_epsg.4326_v20230619,...,max.curv.bareearth_ensemble_m_120m_s_20000101_20221231_eu_epsg.3035_v20240501,CHELSA_cmi_min_1981-2010_V.2.1,soil.moisture_s1.clms.qr.4.p0.95_m_1km_20140101_20241231_eu_epsg3035_v20250211,CHELSA_rsds_1981-2010_min_V.2.1,twi.bareearth_ensemble_m_960m_s_20000101_20221231_eu_epsg.3035_v20240501,min.curv.bareearth_ensemble_m_120m_s_20000101_20221231_eu_epsg.3035_v20240501,lon,lat,pred,pred_std
0,322,DE,1,0,0,280.0,93.0,32.0,2009,1268.0,...,-11.0,-501.0,95.0,1975.0,1225.0,-66.0,4.046952e+06,3.192086e+06,16.066618,7.319217
1,323,DE,1,0,0,280.0,93.0,32.0,2009,1254.0,...,82.0,-418.0,92.0,1970.0,1209.0,-176.0,4.046952e+06,3.191086e+06,24.059321,30.668980
2,324,DE,1,0,0,280.0,93.0,32.0,2009,1219.0,...,49.0,-369.0,89.0,2012.0,1369.0,-25.0,4.046952e+06,3.190086e+06,32.421245,28.815367
3,325,DE,1,0,0,280.0,93.0,32.0,2009,1245.0,...,69.0,-360.0,88.0,2088.0,1231.0,4.0,4.046952e+06,3.189086e+06,38.163748,34.862160
4,326,DE,1,0,0,280.0,93.0,32.0,2009,1262.0,...,19.0,-368.0,94.0,2133.0,1197.0,-40.0,4.046952e+06,3.188086e+06,41.198004,54.757160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2216137,861775,ES,1,0,0,264.0,83.0,27.0,2018,1749.0,...,217.0,-1622.0,84.0,7292.0,1441.0,-8.0,3.745821e+06,1.857326e+06,19.966059,15.115372
2216138,861776,ES,1,0,0,264.0,83.0,27.0,2018,1758.0,...,22.0,-1629.0,87.0,7261.0,1394.0,-15.0,3.745821e+06,1.856326e+06,13.368139,11.533063
2216139,861777,ES,1,0,0,264.0,83.0,27.0,2018,1753.0,...,2.0,-1627.0,87.0,7251.0,1310.0,-26.0,3.745821e+06,1.855326e+06,16.748508,24.245417
2216140,861778,ES,1,0,0,264.0,83.0,27.0,2018,1713.0,...,39.0,-1623.0,78.0,7253.0,1264.0,-39.0,3.745821e+06,1.854326e+06,14.711147,25.263575
