In [1]:
import numpy as np
import matplotlib.pyplot as plt
import multiprocess as mp
import glob
import time
from tqdm import tqdm
import os
import sys

import pandas as pd
from eumap.misc import find_files, nan_percentile, GoogleSheet, ttprint
from eumap.raster import read_rasters, save_rasters
from eumap.mapper import SpaceOverlay
import geopandas as gpd
from pathlib import Path
from minio import Minio
import rasterio
import pyproj
from shapely.geometry import Point

# os.environ['PROJ_LIB'] = '/opt/conda/share/proj'
key_file = '/mnt/apollo/stac/gaia-319808-913d36b5fca4.json'
url = 'https://docs.google.com/spreadsheets/d/1AGUnfC1EilHn-7e3kzeLH03rmEkUGEHHtNhE61V-TKE/edit?usp=sharing'

In [2]:
whole = gpd.read_file('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/training_point_overlay_3035.gpkg')
a = len(whole)
print(f'original {a} total')

pre = whole.loc[whole['time']<=2000]
aft = whole.loc[whole['time']>2000]


original 122942 total


In [3]:
gsheet = GoogleSheet(key_file, url)
covar = gsheet.covariates

def path_fill_oper(covar):
    copath = []
    for index, row in covar.iterrows():
        if row['exist']=='0':
            continue
        if '{oper}' in row['path']:
            oper_list = row['oper'].split(',')
            for o in oper_list:
                copath.append(row['path'].replace('{oper}',o))
        else:
            copath.append(row['path'])
                
    return copath

def path_fill_time(paths,year):
    copath = []
    for i in paths:
        if '{year}' in i:
            copath.append(i.replace('{year}', str(year)).replace('{year_minus_1}', str(year-1)).replace('{year_plus_1}', str(year+1)))
        else:
            copath.append(i)
    return copath

paths = path_fill_oper(covar)
print(f'{len(paths)} covariate in total')

def get_name(url):
    filename = url.split('/')[-1]
    parts = filename.split('_')
    return '_'.join(parts[:-3])
names = [get_name(i) for i in paths]

886 covariate in total


In [4]:
# pre 2000 overlay
# files = path_fill_time(paths,2000)
# files = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i else i for i in files ]
# fnames = [i.split('/')[-1][0:-4] for i in files]
# name_mapping = dict(zip(fnames, names))
# files = [Path(i) for i in files]

# for coun in pre['country'].unique().tolist():
#     dfc = pre.loc[pre['country']==coun]
#     ttprint(f'pre 2000, {coun}, size:{len(dfc)}')
#     dfc.to_file(f"/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/soc_overlay_pre2000_{coun}_3035.gpkg")
    
#     dfo = SpaceOverlay(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/soc_overlay_pre2000_{coun}_3035.gpkg', fn_layers=files, max_workers=50, verbose=False)
#     temp = dfo.run()
    
#     ttprint('finish overlay')
#     temp = temp.rename(columns=name_mapping)
#     temp=temp.drop(columns=['geometry','overlay_id'])
    
#     temp.to_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/dft_pre2000_{coun}.csv')
#     print()

In [6]:
# import warnings
# warnings.simplefilter("ignore")

# tl = aft['time'].unique().tolist()
# tl = [int(i) for i in tl]
# tl = sorted(tl)

# # after 2000 overlay
# for tt in tl:
#     dft = aft.loc[aft['time']==tt]
#     ttprint(f'aft 2000, {tt}, size:{len(dft)}')
#     dft.to_file(f"/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/soc_overlay_aft2000_{tt}_3035.gpkg")
    
#     files = path_fill_time(paths,tt)
#     files = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i and tt % 4 == 0 else i for i in files]
#     fnames = [i.split('/')[-1][0:-4] for i in files]
#     name_mapping = dict(zip(fnames, names))
#     files = [Path(i) for i in files]
    
#     dfo = SpaceOverlay(f"/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/soc_overlay_aft2000_{tt}_3035.gpkg", fn_layers=files, max_workers=50, verbose=False)
#     temp = dfo.run()
    
#     temp = temp.rename(columns=name_mapping)
#     ttprint('finish overlay')
#     temp=temp.drop(columns=['geometry','overlay_id'])
#     temp.to_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/dft_aft2000_{tt}.csv')
#     print()
    

In [50]:
cols = ['ph_h2o','ph_ca','oc','gps_lat','gps_long','time','hzn_top','hzn_btm','ref','ph_kcl','country','point_index'] + names
ini = pd.DataFrame(columns=cols)

files = path_fill_time(paths,2000)
files = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i else i for i in files ]
fnames = [i.split('/')[-1][0:-4] for i in files]
name_mapping = dict(zip(fnames, names))
print(len(ini.columns))
for coun in pre['country'].unique().tolist():
    temp = pd.read_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/dft_pre2000_{coun}.csv')
    temp = temp.rename(columns=name_mapping)
    temp = temp.drop(columns=['Unnamed: 0'])
    ini = pd.concat([ini,temp])
    print(coun,len(ini.columns))



898
belgium 898
scotland 898
crotia 898


In [52]:
cols = ['ph_h2o','ph_ca','oc','gps_lat','gps_long','time','hzn_top','hzn_btm','ref','ph_kcl','country','point_index'] + names
ini2 = pd.DataFrame(columns=cols)
print(len(ini2.columns))
for tt in tl:
    temp = pd.read_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/dft_aft2000_{tt}.csv')
    temp = temp.drop(columns=['Unnamed: 0'])
    ini2 = pd.concat([ini2,temp])
    print(tt, len(ini2.columns))

898
2003 898
2004 898
2007 898
2008 898
2009 898
2010 898
2011 898
2012 898
2013 898
2014 898
2015 898
2016 898
2017 898
2018 898


In [54]:
ini = pd.concat([ini,ini2])
ini.to_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/overlayed_v1_full.csv',index=False)

In [64]:
# ini = pd.read_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/overlayed_v1_full.csv')
# lack = []
# for i in ini.columns.values.tolist():
#     m = ini[i].isna().sum()
#     if m>1000:
#         lack.append(i)
#         # print(i,m)
        
# lack.remove('oc')
ini = ini.drop(columns=lack)
ini.to_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/overlayed_v2_drop.nan.csv',index=False)
print(f'- {len(lack)} cols with more than 1000 nan rows')

- 66 cols with more than 1000 nan rows


In [65]:
# ini = ini.drop(columns=['ph_h2o','ph_ca','ref','ph_kcl'])
a = len(ini)
ini = ini.dropna(how='any')
ini.to_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/train_v1.csv',index=False)
print(f'- {len(ini)-a} rows with nan')

- -2120 rows with nan


In [66]:
122942-2120

120822

In [None]:
# ## test if a file exists or not
# year = 2000
# copath = paths(year,covar)
# copath = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i and year % 4 == 0 else i for i in copath]
# len(copath)

# def test(path):
# #     s3_config = {
# #         'access_key': 'iwum9G1fEQ920lYV4ol9',
# #         'secret_access_key': 'GMBME3Wsm8S7mBXw3U4CNWurkzWMqGZ0n2rXHggS0',
# #         'host': '192.168.1.30:8333',
# #         'bucket': 'tmp-ai4sh-layers'}
    
# #     client = Minio(s3_config['host'], s3_config['access_key'], s3_config['secret_access_key'], secure=False)

# #     f = path.split('/')[-4]+ '/' +path.split('/')[-3]+ '/' +path.split('/')[-2]+ '/' + path.split('/')[-1]
#     try:
#         src = rasterio.open(path)
# #         client.stat_object(s3_config['bucket'], f)
#     except:
#         return path
    
# with mp.Pool(processes=30) as pool:
#     a = pool.map(test, copath)
    
# b = [i for i in a if i]
# b

In [8]:
# # match crs
# df = pd.read_csv('/mnt/inca/soc_eu_model/training_points_v2_true_coor.csv',low_memory=False)
# df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# src_crs = pyproj.CRS('EPSG:4326')
# target_crs = pyproj.CRS('EPSG:3035')
# transformer = pyproj.Transformer.from_crs(src_crs, target_crs, always_xy=True)
# geometry = [Point(transformer.transform(lon, lat)) for lon, lat in zip(df['gps_long'], df['gps_lat'])]
# gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:3035')
# gdf.to_file('SOC_overlay_epsg.3035.gpkg', driver='GPKG')
# gdf

Unnamed: 0,ph_h2o,ph_ca,oc,gps_lat,gps_long,time,hzn_top,hzn_bottom,ref,ph_kcl,country,geometry
0,5.29,4.48,21.83,54.859897,8.411382,2015.0,0.0,10.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
1,5.11,4.39,20.75,54.859897,8.411382,2015.0,10.0,30.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
2,4.63,3.98,11.51,54.859897,8.411382,2015.0,30.0,50.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
3,4.73,4.15,6.909999999999999,54.859897,8.411382,2015.0,50.0,70.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
4,4.90,4.23,1.57,54.859897,8.411382,2015.0,70.0,100.0,https://literatur.thuenen.de/digbib_extern/dn0...,,germany,POINT (4218956.122 3529371.392)
...,...,...,...,...,...,...,...,...,...,...,...,...
132212,7.12,7.00,29,47.638800,8.682200,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4221916.673 2725902.636)
132213,7.54,7.30,33.3,47.642000,9.029200,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4248009.571 2725854.717)
132214,7.82,7.40,17.9,47.675200,8.758100,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4227688.779 2729847.993)
132215,7.45,7.20,23.6,47.711800,8.813400,2015.0,0.0,20.0,https://esdac.jrc.ec.europa.eu/content/lucas-2...,,swiss,POINT (4231906.747 2733846.196)


In [6]:
import requests
files = path_fill_time(paths,2000)
files = [i.replace('0228', '0229') if 'fapar_essd.lstm' in i else i for i in files ]
for url in files:
    if 'http' not in url:
        continue
    response = requests.head(url)
    
    if response.status_code == 404:
        print(url)
    elif response.status_code == 200:
        continue
    else:
        print(f"Received unexpected status code: {response.status_code}")


http://192.168.1.30:8333/tmp-ai4sh-layers/veg/fapar/monthly/fapar_essd.lstm.whittaker_p95_250m_s_20000201_20000229_eu_epsg.3035_v20230817.tif
