In [1]:
import numpy as np
import matplotlib.pyplot as plt
import multiprocess as mp
import glob
import time
from tqdm import tqdm
import os
import sys
import pandas as pd
from eumap.misc import find_files, nan_percentile, GoogleSheet, ttprint
from eumap.raster import read_rasters, save_rasters
from eumap.mapper import SpaceOverlay
import geopandas as gpd
from pathlib import Path
from minio import Minio
import rasterio
import pyproj
from shapely.geometry import Point
import requests
import warnings
warnings.filterwarnings('default')

# os.environ['PROJ_LIB'] = '/opt/conda/share/proj'
folder = '/mnt/primus/xuemeng_tmp_harbour/soc'

# /home/opengeohub/.local/bin

### check if what is need to be overlayed

In [2]:
from shapely.geometry import Point
from geopandas import gpd

# def get_data_to_be_overlayed(whole=False):
#     df4326 = gpd.read_file(f'{folder}/data/soil_overlay.4326.gpkg')
#     df3035 = gpd.read_file(f'{folder}/data/soil_overlay.3035.gpkg')
    
#     keys = ['sample_id', 'lat', 'lon', 'time', 'hzn_top', 'hzn_btm', 'ref']
#     if whole:
#         return df4326,df3035
#     else:
#         new4326 = gpd.read_file(f'{folder_path}/data/soil_overlay.4326_v2.gpkg')
#         merge4326 = pd.merge(df4326, new4326, on=keys, how='outer', indicator=True)
#         different_4326 = merge4326[merge4326['_merge'] != 'both']
        
#         new3035 = gpd.read_file(f'{folder_path}/data/soil_overlay.3035_v2.gpkg')
#         merge3035 = pd.merge(df3035, new3035, on=keys, how='outer', indicator=True)
#         different_3035 = merge3035[merge3035['_merge'] != 'both']
#         return different_4326,different_3035

# df4326, df3035 = get_data_to_be_overlayed(whole=True)     

# # create gpkg
added_covar = 1 # wether overlay from scratch or from overlayed version

if added_covar:
    df = pd.read_csv(f'{folder}/data/test_covar_overlayed.csv', low_memory=False)
else:
    df = pd.read_csv(f'{folder}/data/000_soil.full_qa.controlled.csv', low_memory=False)

    
geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]

df4326 = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
df3035 = df4326.to_crs("EPSG:3035")


### generate overlay links
- read in the files specified by Google Sheet
- convert the files into readable linkes for overlay

In [3]:
# read in potential usable overlay files
key_file = '/mnt/inca/soc_eu_model/gaia-319808-913d36b5fca4.json'
url = 'https://docs.google.com/spreadsheets/d/1eIoPAvWM5jrhLrr25jwguAIR0YxOh3f5-CdXwpcOIz8/edit#gid=0'

gsheet = GoogleSheet(key_file, url)
covar = gsheet.covar




In [4]:
# function to generate file paths by year, and check if the urls are valid
def generate_overlay_path(row,year,filt=None):
            
    # determine if static variable
    if row['temporal resolution'] == 'static':
        return [row['path']],[row['path']]
    
    if row['temporal resolution'] == 'long term':
        perc_list = row['perc'].split(',')
        output_paths = [row['path'].replace('{perc}', perc) for perc in perc_list]
        return output_paths, output_paths
        
    # determine if the year is ahead of the availibility of the variable
    if year>int(row['end year']):
        year = int(row['end year'])
    
    # determine if it's an annual variable or (bi)monthly variable
    if '{start_m}' not in row['path']:
        output_paths = [row['path'].replace('{year}',f'{int(year)}')]
    else:
        output_paths = []
        start_list = row['start_m'].split(', ')
        end_list = row['end_m'].split(', ')
        output_paths = [row['path'].replace('{year}',f'{int(year)}').replace('{start_m}',start_list[i]).replace('{end_m}',end_list[i]) for i in range(len(end_list))]
    
    if '{perc}' in row['path']:
        perc_list = row['perc'].split(',')
        output_paths = [p.replace('{perc}', perc) for p in output_paths for perc in perc_list]
        
    if (row['leap year'] == '1') & (year%4==0):
        output_paths = [p.replace('0228', '0229') if '0228' in p else p for p in output_paths]
    
    return output_paths, [i.replace(str(int(year)),'{year}') for i in output_paths]
    
def check_path(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        # Check if the status code is not 200 (OK). You might want to specifically check for 404 or other error codes.
        if response.status_code == 404:
            print(f"{url} returned HTTP 404 Not Found")
            return url
        elif response.status_code != 200:
            print(f"{url} returned HTTP {response.status_code}")
            return url
        return None  # URL is fine (HTTP 200), or you might want to handle redirections (HTTP 3xx) separately if needed.
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {str(e)}")
        return url
    
# # check function validity
# # generate paths
# paths = []
# for index,row in covar.iterrows():
#     paths.extend(generate_overlay_path(row,2000))
    
pathl = []
namel = []
year = 2000
for index,row in covar.iterrows():
    if row['need update in overlay']=='1':
        paths, names = generate_overlay_path(row, year)
        pathl.extend(paths)
        namel.extend(names)
    
for i in pathl:
    check_path(i)

In [5]:
print(len(df3035.columns))
dropl = []
for i in namel:
    if i in df3035.columns:
        dropl.append(i)
df3035 = df3035.drop(columns=dropl)
print(len(df3035.columns))

461
461


#### mend overlay on long term covars

In [11]:
path_stem = [i.split('/')[-1][0:-4] for i in pathl]
namel = [i.split('/')[-1][0:-4] for i in namel]
name_mapping = dict(zip(path_stem,namel))
    
df_overlay = df3035
        
ttprint(f'start overlaying for static, size: {len(df_overlay)}, column num: {len(pathl)}')
pathl = [Path(ii) for ii in pathl]
dfo = SpaceOverlay(df_overlay, fn_layers=pathl, max_workers=90, verbose=False)
temp = dfo.run()

[14:20:58] start overlaying for static, size: 394643, column num: 4


### overlay year by year
- divide soil data by year
- overlay the soil data in each year with corresponding covariates

In [6]:
import warnings
warnings.filterwarnings("ignore")

# epsg 3035 overlay
co3035 = covar.loc[covar['epsg']=='3035']

# for year in np.arange(2000,2023,1):
for year in np.arange(2000, 2024, 1):
    pathl = []
    namel = []
    for index,row in co3035.iterrows():
        if row['need update in overlay']=='1':
            paths, names = generate_overlay_path(row, year)
            pathl.extend(paths)
            namel.extend(names)
            # path3035.extend(generate_overlay_path(row,year))
    for iii in pathl:
        check_path(iii)
    path_stem = [i.split('/')[-1][0:-4] for i in pathl]
    namel = [i.split('/')[-1][0:-4] for i in namel]
    name_mapping = dict(zip(path_stem,namel))
    
    df_overlay = df3035.loc[df3035['time']==year]
    if len(df_overlay)==0:
        print(f'no data for year {year}')
        continue
        
    ttprint(f'start overlaying for year {str(int(year))}, size: {len(df_overlay)}, column num: {len(pathl)}')
    pathl = [Path(ii) for ii in pathl]
    dfo = SpaceOverlay(df_overlay, fn_layers=pathl, max_workers=90, verbose=False)
    temp = dfo.run()
    
    temp = temp.rename(columns=name_mapping)
    temp=temp.drop(columns=['overlay_id'])
    # temp = pd.read_csv(f'/mnt/inca/soc_eu_model/overlay_intermediate/dft_{str(int(tt))}_3035.csv',index=False)
    temp.to_csv(f'{folder}/overlay_intermediate/dft_{str(int(year))}.mend_3035.csv',index=False)
    ttprint(f'finish overlaying for year {str(int(year))}')

[12:12:51] start overlaying for year 2000, size: 26304, column num: 31
[12:13:28] finish overlaying for year 2000
[12:13:28] start overlaying for year 2001, size: 12777, column num: 31
[12:13:54] finish overlaying for year 2001
[12:13:54] start overlaying for year 2002, size: 11718, column num: 31
[12:14:18] finish overlaying for year 2002
[12:14:18] start overlaying for year 2003, size: 13497, column num: 31
[12:14:45] finish overlaying for year 2003
[12:14:45] start overlaying for year 2004, size: 7649, column num: 31
[12:15:01] finish overlaying for year 2004
[12:15:01] start overlaying for year 2005, size: 12987, column num: 31
[12:15:27] finish overlaying for year 2005
[12:15:27] start overlaying for year 2006, size: 13075, column num: 31
[12:15:50] finish overlaying for year 2006
[12:15:50] start overlaying for year 2007, size: 18561, column num: 31
[12:16:19] finish overlaying for year 2007
[12:16:19] start overlaying for year 2008, size: 11983, column num: 31
[12:17:01] finish 

### assemble the overlayed annual datasets
- read in the overlayed soil data (with covariates) for each year
- combine them into a whole dataset

In [14]:
## read in 3035 datasets
mended = '.mend'
tl = []
for year in np.arange(2000,2023,1):
    temp = pd.read_csv(f'{folder}/overlay_intermediate/dft_{str(int(year))}{mended}_3035.csv',low_memory=False)
    print(f'{year}, {len(temp.columns)}, {len(temp)}')
    tl.append(temp)

df3035 = pd.concat(tl)
print(f'whole 3035, cols:{len(df3035.columns)}, size:{len(df3035)}')


2000, 491, 26304
2001, 491, 12777
2002, 491, 11718
2003, 491, 13497
2004, 491, 7649
2005, 491, 12987
2006, 491, 13075
2007, 491, 18561
2008, 491, 11983
2009, 491, 25942
2010, 491, 12330
2011, 491, 15374
2012, 491, 26062
2013, 491, 21428
2014, 491, 16922
2015, 491, 33628
2016, 491, 16177
2017, 491, 15843
2018, 491, 42918
2019, 491, 15088
2020, 491, 10200
2021, 491, 10886
2022, 491, 3294
whole 3035, cols:491, size:394643


In [25]:
col = 'cropland.extent_glad.interpolate_p_30m_s_{year}0101_{year}1231_eu_epsg.3035_v20240604'
for year in np.arange(2000,2023,1):
    temp = df3035.loc[df3035['time']==year]
    aaa = temp[col].isna().sum()/len(temp)
    print(aaa, year, temp[col].min())

0.7101201338199513 2000 100.0
0.8049620411677233 2001 100.0
0.6483188257381806 2002 100.0
0.4095724975920575 2003 100.0
0.7522551967577461 2004 25.0
0.7871717871717872 2005 50.0
0.6450478011472275 2006 25.0
0.6639189698830882 2007 100.0
0.6091129099557707 2008 25.0
0.5784442217253875 2009 50.0
0.5128953771289537 2010 25.0
0.5467022245349291 2011 100.0
0.6755429360755122 2012 25.0
0.4654657457532201 2013 50.0
0.6197848954024348 2014 25.0
0.578684429641965 2015 100.0
0.6435062125239538 2016 25.0
0.4782553809253298 2017 50.0
0.5796868446805536 2018 25.0
0.7391304347826086 2019 100.0
0.7472549019607844 2020 100.0
0.6805989344111704 2021 100.0
0.8269581056466302 2022 100.0


In [26]:
dff = df3035
meta_list = ['id', 'lat', 'lon', 'time', 'hzn_top', 'hzn_btm', 'ref', 'nuts0', 'oc',
       'ph_h2o', 'ph_cacl2', 'bulk_density', 'clay', 'silt', 'sand', 'caco3',
       'N', 'K', 'P', 'CEC', 'EC', 'oc_qa', 'N_qa', 'caco3_qa',
       'bulk_density_qa', 'clay_qa', 'silt_qa', 'sand_qa', 'ph_h2o_qa',
       'ph_cacl2_qa', 'P_qa', 'K_qa', 'EC_qa', 'CEC_qa', 'geometry']
### check covariates availability
drop_list = []
for col in dff.columns:
    if col in meta_list:
        continue
    if (dff[col].isna().sum()/len(dff))>0.02:
        
        if 'longterm' not in col:
            print(col, dff[col].isna().sum())
            drop_list.append(col)
        
print(f'remove covariates with more than 2% data unavailable')
dff = dff.drop(columns = drop_list)

lcv_wilderness_li2022.human.footprint_p_1km_s0..0cm_{year}_v16022022 17059
fgd_chelsa_m_1km_s_19810101_20101231_eu_epsg.3035_v20240531 264869
fcf_chelsa_m_1km_s_19810101_20101231_eu_epsg.3035_v20240531 198538
lgd_chelsa_m_1km_s_19810101_20101231_eu_epsg.3035_v20240531 264869
wv_mcd19a2v061.seasconv_m_1km_s_{year}0101_{year}0131_eu_epsg.3035_v20230619 99338
wv_mcd19a2v061.seasconv_m_1km_s_{year}0201_{year}0228_eu_epsg.3035_v20230619 99686
wv_mcd19a2v061.seasconv_m_1km_s_{year}0301_{year}0331_eu_epsg.3035_v20230619 110373
wv_mcd19a2v061.seasconv_m_1km_s_{year}0501_{year}0531_eu_epsg.3035_v20230619 72025
wv_mcd19a2v061.seasconv_m_1km_s_{year}0401_{year}0430_eu_epsg.3035_v20230619 113298
wv_mcd19a2v061.seasconv_m_1km_s_{year}0701_{year}0731_eu_epsg.3035_v20230619 91872
wv_mcd19a2v061.seasconv_m_1km_s_{year}0801_{year}0831_eu_epsg.3035_v20230619 47493
wv_mcd19a2v061.seasconv_m_1km_s_{year}0601_{year}0630_eu_epsg.3035_v20230619 16928
wv_mcd19a2v061.seasconv_sd_1km_s_{year}1101_{year}1130_eu_

In [29]:
dff = dff.drop(columns=['geometry'])
dff.to_csv(f'{folder}/data/001_covar_overlayed.csv',index=False)

### Assign spatial blocking ID

In [17]:
# create a tiling system first
from eumap.parallel import TilingProcessing
from pathlib import Path
import rasterio
from shapely.geometry import box
import numpy as np
import pandas as pd
import geopandas as gpd

# raster_layer_fn = f'http://192.168.1.30:8333/ai4sh-landmasked/ndvi/ndvi_glad.landast.ard2.seasconv.m.yearly_p75_30m_s_20220101_20221231_eu_epsg.3035_v20231127.tif'
# ds = rasterio.open(raster_layer_fn)
# tiles_size = ds.transform[0] * 1000 # 30m -> 30km

# tiling_system = TilingProcessing.generate_tiles(tiles_size, extent=ds.bounds, crs=ds.crs, raster_layer_fn=raster_layer_fn)
# tiling_system = tiling_system.to_crs("EPSG:4326")
# tiling_system = tiling_system[['tile_id','geometry']]
# tiling_system.to_file('/mnt/inca/soc_eu_model/data/000_tile_eu4326.gpkg',  driver="GPKG")
# # tiling_system[tiling_system['raster_mode_count'] > 0].to_file('/mnt/inca/soc_eu_model/data/000_tile_eu4326.gpkg.gpkg',  driver="GPKG")

df = pd.read_csv(f'{folder}/data/test_covar_overlayed.csv',low_memory=False)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))

from shapely.geometry import Point
tiles = gpd.read_file(f'{folder}/data/000_tile_eu4326.gpkg')

gdf.crs = tiles.crs
joined_gdf = gpd.sjoin(gdf, tiles, how="left", op='within')
joined_gdf = joined_gdf.drop(columns=['index_right','geometry'])


  if await self.run_code(code, result, async_=asy):


In [21]:
joined_gdf.to_csv(f'{folder}/data/test_covar_overlayed.csv',index=False)

### epsg 4326

In [None]:
# import warnings
# warnings.filterwarnings("ignore")

# # epsg 4326 overlay
# co4326 = covar.loc[covar['epsg']=='4326']
# path_ori = [i.split('/')[-1][0:-4] for i in co4326['path']]

# for year in np.arange(2000,2024,1):
#     pathl = []
#     namel = []
#     for index,row in co4326.iterrows():
#         paths, names = generate_overlay_path(row, year)
#         pathl.extend(paths)
#         namel.extend(names)
#         # path4326.extend(generate_overlay_path(row,year))
        
# #     for i in pathl:
# #         check_path(i)
        
#     path_stem = [i.split('/')[-1][0:-4] for i in pathl]
#     namel = [i.split('/')[-1][0:-4] for i in namel]
#     name_mapping = dict(zip(path_stem,namel))
    
#     df_overlay = df4326.loc[df4326['time']==year]
#     if len(df_overlay)==0:
#         print(f'no data for year {year}')
#         continue
        
#     ttprint(f'start overlaying for year {str(int(year))}, size: {len(df_overlay)}')
#     pathl = [Path(ii) for ii in pathl]
#     dfo = SpaceOverlay(df_overlay, fn_layers=pathl, max_workers=90, verbose=False)
#     temp = dfo.run()
    
#     temp = temp.rename(columns=name_mapping)
#     temp=temp.drop(columns=['overlay_id'])
#     # temp = pd.read_csv(f'/mnt/inca/soc_eu_model/overlay_intermediate/dft_{str(int(tt))}_4326.csv',index=False)
#     temp.to_csv(f'{folder}/overlay_intermediate/dft_{str(int(year))}_4326.csv',index=False)
#     ttprint(f'finish overlaying for year {str(int(year))}')


# # read in 4326 datasets
# tl = []

# for year in np.arange(2000,2023,1):
#     temp = pd.read_csv(f'{folder}/overlay_intermediate/dft_{str(int(year))}_4326.csv',low_memory=False)
#     temp = temp.rename(columns=name_mapping)
#     print(f'{year}, {len(temp.columns)}, {len(temp)}')
#     tl.append(temp)

# df4326 = pd.concat(tl)
# print(f'whole 4326, {len(df4326.columns)}, {len(df4326)}')


# # merge to merge
# cols_list = df3035.columns.values.tolist()
# meta_list = ['lat', 'lon', 'oc', 'ph_h2o', 'ph_cacl2', 'bulk_density', 'clay','silt', 'sand', 'caco3','N',
#              'K', 'P', 'CEC', 'EC', 'nuts0', 'time','hzn_top','hzn_btm','ref','sample_id']
# dff = pd.merge(df3035,df4326,on = meta_list, how='inner')
