In [1]:
import numpy as np
import matplotlib.pyplot as plt
import multiprocess as mp
import glob
import time
from tqdm import tqdm
import os
import sys
import pandas as pd
from eumap.misc import find_files, nan_percentile, GoogleSheet
from eumap.raster import read_rasters, save_rasters
from eumap.mapper import SpaceOverlay
import geopandas as gpd
from pathlib import Path
from minio import Minio
import rasterio
import pyproj
from shapely.geometry import Point

# os.environ['PROJ_LIB'] = '/opt/conda/share/proj'
# Credentials file handed to GoogleSheet together with `url`
# (presumably a Google service-account JSON key -- TODO confirm).
key_file = '/mnt/apollo/stac/gaia-319808-913d36b5fca4.json'
# Google Sheets document opened through GoogleSheet; its `covariates`
# attribute is what the rest of the notebook works with.
url = 'https://docs.google.com/spreadsheets/d/1AGUnfC1EilHn-7e3kzeLH03rmEkUGEHHtNhE61V-TKE/edit?usp=sharing'

In [2]:
# Load the point dataset and report how many rows survive each filter.
# `a` / `b` hold the row counts before / after the current filtering step.
whole = gpd.read_file('/mnt/inca/soc_eu_model/SOC_overlay_epsg.3035.gpkg')
a = len(whole)
print(f'original {a} total')

# df = pd.read_csv('/mnt/inca/soc_eu_model/training_points_v2_true_coor.csv',low_memory=False)
# df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Step 1: a row needs a latitude to be usable.
whole = whole[whole['gps_lat'].notna()]
b = len(whole)
print(f'left {b}, minus {a-b} data without explicit coordinates')
a = b

# Step 2: a row needs a sampling year that is present, non-zero and not
# later than 2021.
sampling_year = whole['time']
has_valid_year = sampling_year.notna() & (sampling_year != 0) & (sampling_year <= 2021)
whole = whole[has_valid_year]
b = len(whole)
print(f'left {b}, minus {a-b} data without explicit time')

# Renumber the surviving rows 0..n-1.
whole = whole.reset_index(drop=True)

original 132217 total
left 131367, minus 850 data without explicit coordinates
left 125963, minus 5404 data without explicit time


In [3]:
# Covariate catalogue: `covar` is iterated row by row in `paths`, which reads
# its 'use', 'frequency', 'path' and 'oper' fields.
gsheet = GoogleSheet(key_file, url)
covar = gsheet.covariates

# os.environ['PROJ_LIB'] = '/opt/conda/share/proj'

# Non-covariate columns carried by the point dataset.
cols = [
    'ph_h2o', 'ph_ca', 'oc',
    'gps_lat', 'gps_long', 'time',
    'hzn_top', 'hzn_bottom',
    'ref', 'ph_kcl', 'country',
    'geometry', 'overlay_id',
]

In [4]:


def complete_path(dfi,year,name=0):
    """Resolve the path template of one covariate row.

    Parameters
    ----------
    dfi : mapping-like row
        Must provide 'frequency' and 'path'; 'oper' (comma-separated values)
        is read only when the template contains '{oper}'.
    year : int
        Year substituted for '{year}', '{year_minus_1}' and '{year_plus_1}'.
    name : int, default 0
        With 0 the year placeholders are filled in; with any other value they
        are left untouched, so the result does not depend on `year`.

    Returns
    -------
    str or list of str
        A single path, or one path per 'oper' value when the (non-static)
        template contains '{oper}'. Static rows always return the template
        unchanged.
    """
    # Static layers are returned verbatim, placeholders included.
    if dfi['frequency'] == 'static':
        return dfi['path']

    fn = dfi['path']

    # Year placeholders are only filled in when a concrete path is requested.
    if 'year' in fn and name == 0:
        substitutions = (
            ('{year}', year),
            ('{year_minus_1}', year - 1),
            ('{year_plus_1}', year + 1),
        )
        for placeholder, value in substitutions:
            fn = fn.replace(placeholder, str(value))

    if '{oper}' not in fn:
        return fn

    # One path per operator listed in the row.
    return [fn.replace('{oper}', op) for op in dfi['oper'].split(',')]

def paths(year,covar,name=0):
    """Collect the resolved layer paths of every enabled covariate.

    Rows of `covar` whose 'use' field equals the string '0' are skipped; every
    other row is resolved with `complete_path(row, year, name)`. Rows that
    resolve to several paths contribute all of them, in order.

    Returns a flat list of paths in catalogue order.
    """
    copath = []
    for _, row in covar.iterrows():
        if row['use'] != '0':
            resolved = complete_path(row, year, name)
            # A row yields either a single path or a list of paths.
            copath.extend(resolved if isinstance(resolved, list) else [resolved])
    return copath

In [5]:
# Years to process: every sampling year from 2000 onwards, plus 2000.0 itself,
# which the overlay cell uses as the bucket for all samples taken in or
# before 2000. A set is used so that 2000 is not listed twice when the data
# already contains samples from that year (a duplicate would make the later
# loops process and append that year twice).
tl = {i for i in whole['time'].unique().tolist() if i >= 2000}
tl.add(2000.0)
tl = sorted(tl)

In [12]:
# a = ['http://192.168.1.30:8333/tmp-ai4sh-layers/veg/fapar/monthly/fapar_essd.lstm_p95_250m_s_20081101_20081130_eu_epsg.3035_v20230817.tif']
# a = [Path(i) for i in a]
# dfo = SpaceOverlay(f'/mnt/inca/soc_eu_model/overlay_history_data/dft_2008.gpkg', fn_layers=a, max_workers=50, verbose=False)
# temp = dfo.run()
# tm = pd.read_csv('/mnt/inca/soc_eu_model/overlay_history_data/dft_2008_veg.csv')
# temp=temp.drop(columns=['geometry','overlay_id'])
# tm = pd.read_csv('/mnt/inca/soc_eu_model/overlay_history_data/dft_2008_veg.csv')

# tm['fapar_essd.lstm_p95_250m_s_20081101_20081130_eu_epsg.3035_v20230817']
# tm.to_csv('/mnt/inca/soc_eu_model/overlay_history_data/dft_2008_veg.csv')

In [32]:
# Overlay the points of each selected year with the covariate layers of each
# selected class and store one CSV per (year, class).
# Directory holding the per-year point files and the per-class overlay tables.
overlay_dir = '/mnt/inca/soc_eu_model/overlay_history_data'

for tt in [2003]:
    start = time.time()
    print(tt)

    copath = paths(int(tt),covar)
    # For the fapar layers, '0228' in the path becomes '0229' when the year is
    # divisible by 4.
    copath = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i and tt % 4 == 0 else i for i in copath]

    # 2000 is the bucket for every sample taken in or before 2000; all other
    # years only take the samples of that exact year. An explicit copy is
    # taken so that writing the file never operates on a view of `whole`.
    if tt==2000.0:
        dft = whole.loc[whole['time']<=tt].copy()
    else:
        dft = whole.loc[whole['time']==tt].copy()

    # Write the points to the very file SpaceOverlay reads below. Previously
    # the file was written to the working directory but read from
    # `overlay_dir`, so a stale file could be overlaid.
    gpkg_path = f'{overlay_dir}/dft_{int(tt)}.gpkg'
    dft.to_file(gpkg_path)

    for clss in ['clm']: # all classes: ['dtm','veg','lcv','clm','bioclim']
        # A layer belongs to a class when the class name occurs in its path.
        copath_c = [Path(i) for i in copath if clss in i]
        print(clss)
        dfo = SpaceOverlay(gpkg_path, fn_layers=copath_c, max_workers=50, verbose=False)
        temp = dfo.run()

        temp=temp.drop(columns=['geometry','overlay_id'])
        temp.to_csv(f'{overlay_dir}/dft_{int(tt)}_{clss}.csv')

    end = time.time()
#     tra.to_file('overlay_output.gpkg', index=False, driver='GPKG')
    print(f'{tt} finished in {(end-start)/60} mins')

2003
clm
2003 finished in 1.6274741411209106 mins


  super().__setitem__(key, value)


In [44]:
# Merge the per-class overlay tables of every year into one wide table per
# year, written to dft_<year>.csv in the working directory.
cols = ['ph_h2o','ph_ca','oc','gps_lat','gps_long','time','hzn_top','hzn_bottom','ref','ph_kcl','country']
# One (all-NaN column labels, year) tuple per processed year.
nancol = []


def _drop_unnamed(frame):
    """Return `frame` without the columns whose label contains 'Unname'."""
    return frame[[label for label in frame.columns if 'Unname' not in label]]


for tt in tl:
    print(tt)
    year_tag = str(int(tt))

    # The dtm table comes first and keeps its non-covariate columns.
    parts = [_drop_unnamed(pd.read_csv(f'/mnt/inca/soc_eu_model/overlay_history_data/dft_{year_tag}_dtm.csv'))]
#     dft.to_file(f'dft_{str(int(tt))}.gpkg')

    for clss in ['veg','lcv','clm','bioclim']:
        temp = _drop_unnamed(pd.read_csv(f'/mnt/inca/soc_eu_model/overlay_history_data/dft_{year_tag}_{clss}.csv'))
        # Every class except 'clm' has the `cols` columns removed before merging.
        if clss != 'clm':
            temp = temp.drop(columns=cols)
        parts.append(temp)

    # Side-by-side merge, aligned on the row index.
    ini = pd.concat(parts, axis=1)

    nancol.append((ini.columns[ini.isna().all()],tt))

    ini.to_csv(f'dft_{year_tag}.csv')
    print(len(ini.columns))

2000.0
854
2003.0
854
2004.0
854
2007.0
854
2008.0
854
2009.0
854
2010.0
854
2011.0
854
2012.0


  exec(code_obj, self.user_global_ns, self.user_ns)


854
2013.0
854
2014.0
854
2015.0
854
2016.0
854
2017.0
854
2018.0
854


In [56]:
# Stack the per-year tables (dft_<year>.csv) into one training table whose
# covariate columns carry year-independent names.

# Target column names: the raw path templates (name=1 leaves the year
# placeholders untouched, so the year passed here does not matter) reduced to
# their file name minus a fixed-length tail (27 characters for 'v202308'
# layers, 24 otherwise).
# `tt` is set explicitly instead of relying on a value left over from an
# earlier cell.
tt=2000
coname = paths(int(tt),covar,name=1)
# coname = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i and tt % 4 == 0 else i for i in coname]
coname = [i.split('/')[-1][0:-27] if 'v202308' in i else i.split('/')[-1][0:-24] for i in coname]
# coname = [i.replace('0229', '0228') for i in coname]

# Every year, 2000 included, is read back from its own CSV. Previously the
# first block was whatever `ini` the preceding cell left behind (its last
# year), so 2000 was missing and the last year appeared twice.
yearly_frames = []
for tt in dict.fromkeys(tl):  # de-duplicated, original order kept
    # Year-specific layer names (file name without the 4-character extension),
    # in the same order as `coname`.
    copath = paths(int(tt),covar)
    copath = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i and tt % 4 == 0 else i for i in copath]
    copath = [i.split('/')[-1][0:-4] for i in copath]
    name_mapping = dict(zip(copath, coname))

    temp = pd.read_csv(f'dft_{str(int(tt))}.csv')
    temp = temp.rename(columns=name_mapping)
#     temp.to_csv(f'dft_{str(int(tt))}_rename.csv')
    yearly_frames.append(temp)

# Concatenate once instead of growing the frame inside the loop.
ini = pd.concat(yearly_frames, axis=0, ignore_index=True)
ini.to_csv('/mnt/inca/soc_eu_model/training_points_v3.1_covar.csv')

ERROR:asyncio:Task was destroyed but it is pending!
task: <Task pending name='Task-2' coro=<Kernel.poll_control_queue() running at /opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py:227> wait_for=<Future finished result=[<zmq.sugar.fr...x7f8bf3b22880>, <zmq.sugar.fr...x7f8bb27eeeb0>, <zmq.sugar.fr...x7f8ba41b81a0>, <zmq.sugar.fr...x7f8ba41b8250>, <zmq.sugar.fr...x7f8ba41b8930>, <zmq.sugar.fr...x7f8bab29b0f0>, ...]> cb=[_chain_future.<locals>._call_set_state() at /opt/conda/lib/python3.8/asyncio/futures.py:367]>


In [None]:
# ## test if a file exists or not
# year = 2000
# copath = paths(year,covar)
# copath = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i and year % 4 == 0 else i for i in copath]
# len(copath)

# def test(path):
# #     s3_config = {
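# #         'access_key': os.environ['S3_ACCESS_KEY'],         # credential redacted: load from the environment, never commit keys
# #         'secret_access_key': os.environ['S3_SECRET_KEY'],  # credential redacted: rotate the key that was previously committed here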
# #         'host': '192.168.1.30:8333',
# #         'bucket': 'tmp-ai4sh-layers'}
    
# #     client = Minio(s3_config['host'], s3_config['access_key'], s3_config['secret_access_key'], secure=False)

# #     f = path.split('/')[-4]+ '/' +path.split('/')[-3]+ '/' +path.split('/')[-2]+ '/' + path.split('/')[-1]
#     try:
#         src = rasterio.open(path)
# #         client.stat_object(s3_config['bucket'], f)
#     except:
#         return path
    
# with mp.Pool(processes=30) as pool:
#     a = pool.map(test, copath)
    
# b = [i for i in a if i]
# b

In [8]:
# # match crs
# df = pd.read_csv('/mnt/inca/soc_eu_model/training_points_v2_true_coor.csv',low_memory=False)
# df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# src_crs = pyproj.CRS('EPSG:4326')
# target_crs = pyproj.CRS('EPSG:3035')
# transformer = pyproj.Transformer.from_crs(src_crs, target_crs, always_xy=True)
# geometry = [Point(transformer.transform(lon, lat)) for lon, lat in zip(df['gps_long'], df['gps_lat'])]
# gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:3035')
# gdf.to_file('SOC_overlay_epsg.3035.gpkg', driver='GPKG')
# gdf

Unnamed: 0,ph_h2o,ph_ca,oc,gps_lat,gps_long,time,hzn_top,hzn_bottom,ref,ph_kcl,country,geometry
0,5.29,4.48,21.83,54.859897,8.411382,2015.0,0.0,10.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
1,5.11,4.39,20.75,54.859897,8.411382,2015.0,10.0,30.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
2,4.63,3.98,11.51,54.859897,8.411382,2015.0,30.0,50.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
3,4.73,4.15,6.909999999999999,54.859897,8.411382,2015.0,50.0,70.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
4,4.90,4.23,1.57,54.859897,8.411382,2015.0,70.0,100.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
...,...,...,...,...,...,...,...,...,...,...,...,...
132212,7.12,7.00,29,47.638800,8.682200,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4221916.673 2725902.636)
132213,7.54,7.30,33.3,47.642000,9.029200,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4248009.571 2725854.717)
132214,7.82,7.40,17.9,47.675200,8.758100,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4227688.779 2729847.993)
132215,7.45,7.20,23.6,47.711800,8.813400,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4231906.747 2733846.196)
