In [1]:
import numpy as np
import matplotlib.pyplot as plt
import multiprocess as mp
import glob
import time
from tqdm import tqdm
import os
import sys
sys.path.append('/opt/conda/share/proj')
import pandas as pd
import dbfread 
import geopandas as gpd
import warnings
import matplotlib.pyplot as plt
import csv
import pyproj
from simpledbf import Dbf5
from datetime import datetime


#### clean depth, time and coordinates

In [2]:
# crotia, harmonized dataset from MultiOne
crotia = pd.read_excel('/mnt/primus/xuemeng_tmp_harbour/soc_eu/crotia/hr_topsoil_db.xlsx')

# organize the depth
crotia.loc[crotia['dbr'].isna(), 'hzn_top'] = np.nan
crotia.loc[crotia['dbr'].isna(), 'hzn_btm'] = np.nan
crotia.loc[~crotia['dbr'].isna(), 'hzn_top'] = crotia.loc[~crotia['dbr'].isna(), 'dbr'] - 10
crotia.loc[~crotia['dbr'].isna(), 'hzn_btm'] = crotia.loc[~crotia['dbr'].isna(), 'dbr'] + 10
crotia.loc[crotia['source_db'].isin(['agricultural_2013','azo_2016']), 'hzn_btm'] = 30
crotia.loc[crotia['source_db'].isin(['agricultural_2013','azo_2016']), 'hzn_top'] = 0
crotia.loc[crotia['source_db'] == 'azo_2013', 'hzn_top'] = 0
crotia.loc[crotia['source_db'] == 'azo_2013', 'hzn_btm'] = 25

column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
temp = pd.DataFrame(columns=column_names)
temp['lat'] = crotia['latitude_decimal_degrees']
temp['lon'] = crotia['longitude_decimal_degrees']
temp['nuts0'] = 'HR'
temp['time'] = crotia['site_obsdate']
temp['hzn_top'] = crotia['hzn_top']
temp['hzn_btm'] = crotia['hzn_btm']
temp['ref'] = 'croatia.multione-'+ crotia['source_db']
temp['oc'] = crotia['oc']*10 # % -> g/kg
temp['ph_cacl2'] = (crotia['ph_kcl']+0.09)*0.987+0.321 # convert from ph_kcl to ph_cacl2
temp['ph_h2o'] = crotia['ph_h2o']
# temp['ph_cacl2'] = np.nan
temp['bulk_density'] = crotia['db_od']
temp['clay'] = crotia['clay_tot_psa']
temp['silt'] = crotia['silt_tot_psa']
temp['sand'] = crotia['sand_tot_psa']
temp['caco3'] = crotia['caco3']*10 # % -> g/kg
temp['N'] = crotia['n_tot_ncs']*10 # % -> g/kg
temp['K'] = crotia['k_mehlich3']*0.965 + 7.13 # mehlich convert to AAE
# temp['P'] = crotia['p_mehlich3'] # mehlich3 - olsen method  not convertable

# basic info
print(f'{len(temp)} data in total')

na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/croatia_harmonized_v1.csv',index=False)

6271 data in total
2556 data with no time info
339 data with no depth info
0 data with no coordinate info


In [3]:
# germany
germany = pd.read_excel(r'/mnt/diskstation/data/soil_points/Germany/LABORATORY_DATA.xlsx', engine='openpyxl')
germany_site = pd.read_excel(r'/mnt/diskstation/data/Soil_points/Germany/SITE.xlsx', engine='openpyxl')
germany = germany.merge(germany_site, on="PointID", how="inner")
utm_projection = pyproj.CRS.from_string(f'+proj=utm +zone={32} +ellps=WGS84')
gps_projection = pyproj.CRS.from_epsg(4326)
# Create transformer objects for the coordinate conversion
transformer = pyproj.Transformer.from_crs(utm_projection, gps_projection)
# Convert UTM coordinates to GPS latitude and longitude
germany['lat'], germany['lon'] = transformer.transform(germany['xcoord'], germany['ycoord'])

column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
temp = pd.DataFrame(columns=column_names)
temp['time'] = germany['Sampling_year']
temp['hzn_top'] = germany['Layer upper limit']
temp['hzn_btm'] = germany['Layer lower limit']
temp['ref'] = 'germany.thuenen-'+germany['County_x']
temp['lat'] = germany['lat']
temp['lon'] = germany['lon']
temp['nuts0'] = 'DE'
temp['oc'] = germany['TOC']
temp['N'] = germany['TN']
temp['ph_kcl'] = np.nan
temp['ph_h2o'] = germany['pH_H2O']
temp['ph_cacl2'] = germany['pH_CaCl2']
temp['bulk_density'] = germany['BD_bulk']
temp['clay'] = germany['Clay']
temp['silt'] = germany['Silt']
temp['sand'] = germany['Sand']
temp['caco3'] = np.nan
temp['K'] = np.nan
temp['P'] = np.nan

# basic info
print(f'{len(temp)} data in total')

na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/germany_harmonized_v1.csv',index=False)

17189 data in total
0 data with no time info
0 data with no depth info
0 data with no coordinate info


In [4]:
# +belgium
# read in 2 sites
belgium_p = pd.read_csv('/mnt/diskstation/data/soil_points/Belgium/Vlaanderen/Aardewerk-Vlaanderen-2010_Profiel.csv')
belgium_h = pd.read_csv('/mnt/diskstation/data/soil_points/Belgium/Vlaanderen/Aardewerk-Vlaanderen-2010_Horizont.csv',low_memory=False,encoding = "ISO-8859-1")
# merge 2 sites
belgium_p = belgium_p.rename(columns={'ID': 'Profiel_ID'}) 
belgium = belgium_h.merge(belgium_p, on="Profiel_ID", how="inner")
# Define the coordinate systems
lambert72 = pyproj.CRS.from_epsg(31370)  # Lambert72 CRS
wgs84 = pyproj.CRS.from_epsg(4326)  # WGS84 CRS (GPS)
transformer = pyproj.Transformer.from_crs(lambert72, wgs84)
belgium['lat'], belgium['lon'] = transformer.transform(belgium['Coordinaat_Lambert72_X'], belgium['Coordinaat_Lambert72_Y'])
# belgium['Y'], belgium['X'] = transformer.transform(belgium['Coordinaat_Bonne_E'], belgium['Coordinaat_Bonne_N'])

# convert humus to oc
belgium.loc[belgium['Humus_koolstof_nieuwe_formule']==0, 'Humus'] = belgium.loc[belgium['Humus_koolstof_nieuwe_formule']==0, 'Humus']*4/3/1.724
belgium.loc[belgium['Humus_koolstof_nieuwe_formule']==1, 'Humus'] = belgium.loc[belgium['Humus_koolstof_nieuwe_formule']==1, 'Humus']*4/3/2 # new scaler

# extract time info 
belgium['Profilering_Datum'] = belgium['Profilering_Datum'].str.split(' ').str[0]
belgium['Profilering_Datum'] = belgium['Profilering_Datum'].str.split('-').str[-1].astype(float)
belgium.loc[belgium['Profilering_Datum'] >2020, 'Profilering_Datum'] = np.nan

# extract depth info
belgium['hzn_top'] = np.nanmin(belgium[['Diepte_grens_boven1', 'Diepte_grens_boven2']], axis=1)
belgium['hzn_btm'] = np.nanmax(belgium[['Diepte_grens_onder1', 'Diepte_grens_onder2']], axis=1)
belgium.loc[belgium['hzn_top'] > belgium['hzn_btm'],['hzn_top','hzn_btm']] = np.nan

column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
temp = pd.DataFrame(columns=column_names)
temp['time'] = belgium['Profilering_Datum']
temp['hzn_top'] = belgium['Diepte_grens_boven1']
temp['hzn_btm'] = belgium['Diepte_grens_onder1']
temp.loc[temp['hzn_top'].isna(),'hzn_top'] = belgium.loc[temp['hzn_top'].isna(),'Diepte_grens_boven2']
temp.loc[temp['hzn_btm'].isna(),'hzn_btm'] = belgium.loc[temp['hzn_btm'].isna(),'Diepte_grens_onder2']
temp['hzn_top'] = belgium['hzn_top'] 
temp['hzn_btm'] = belgium['hzn_btm']   
temp['lat'] = belgium['lat']
temp['lon'] = belgium['lon']
temp['oc'] = belgium['Humus']*10
temp['caco3'] = belgium['Calciumcarbonaatgehalte']*10 # %->g/kg
temp['N'] = np.nan
temp['ph_kcl'] = belgium['pH_KCl']
temp['ph_h2o'] = belgium['pH_H2O']
temp['ph_cacl2'] = np.nan
temp['bulk_density'] = np.nan
temp['clay'] = belgium['T0_2']
temp['silt'] = belgium['T2_10']+belgium['T10_20']+belgium['T20_50']
temp['sand'] = belgium['T50_100']+belgium['T100_200']+belgium['T200_500']+belgium['T500_1000']+belgium['T1000_2000']
temp['K'] = np.nan
temp['P'] = np.nan
temp['ref'] = 'vlaanderen.belgium'
temp['nuts0'] = 'BE'

# possible filter
na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

print(f'{len(temp)} in total')
# temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/belgium_harmonized_v1.csv',index=False)

3541 data with no time info
3172 data with no depth info
740 data with no coordinate info
42529 in total


  belgium['hzn_top'] = np.nanmin(belgium[['Diepte_grens_boven1', 'Diepte_grens_boven2']], axis=1)
  belgium['hzn_btm'] = np.nanmax(belgium[['Diepte_grens_onder1', 'Diepte_grens_onder2']], axis=1)


In [6]:
# # scotland
# scotland = pd.read_excel('/mnt/diskstation/data/soil_points/Scotland/NSIS_1_10km_grid_gh.xlsx', sheet_name='NSIS1_10km')
# osgb36 = pyproj.CRS.from_string("+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +towgs84=446.448,-125.157,542.060,0.1502,0.2470,0.8421,-20.4894 +units=m +no_defs")
# wgs84 = pyproj.CRS.from_epsg(4326)
# transformer = pyproj.Transformer.from_crs(osgb36, wgs84)
# scotland['lat'], scotland['lon'] = transformer.transform(scotland['EASTING'], scotland['NORTHING'])

# column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
# temp = pd.DataFrame(columns=column_names)
# temp['lat'] = scotland['lat']
# temp['lon'] = scotland['lon']
# temp['nuts0'] = 'UK-scotland'
# temp['time'] = scotland['PROFILE_DATE'].astype(str).str[-4:]
# temp['hzn_top'] = scotland['HORZ_TOP']
# temp['hzn_btm'] = scotland['HORZ_BOTTOM']
# temp['ref'] = 'scotland.NSIS1-hutton.ac.uk'

# temp['oc'] = scotland['DP1971_ORGANIC_MATTER']*10/1.72
# temp['N'] = scotland['DP1971_NITROGEN']*10 # % -> g/kg
# temp['caco3'] = np.nan
# temp['bulk_density'] = np.nan
# temp['ph_kcl'] = np.nan
# temp['ph_h2o'] = scotland['DP1971_PH_H2O']
# temp['ph_cacl2'] = scotland['DP1971_PH_CACL2']
# temp['clay'] = scotland['DP1971_CLAY']
# temp['silt'] = scotland['DP1971_UBSILT']
# temp['sand'] = scotland['DP1971_UBSAND']
# temp['K'] = scotland['NIPAQUA_POTASSIUM'] # ppm = mg/kg
# temp['P'] = scotland['NIPAQUA_PHOSPHORUS'] # ppm = mg/kg

# # possible filter
# na = temp['time'].isna().sum()
# print(f'{na} data with no time info')

# na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
# print(f'{na} data with no depth info')

# na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
# print(f'{na} data with no coordinate info')

# print(f'{len(temp)} in total')

# # temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/scotland_harmonized_v1.csv',index=False)

In [7]:
# estonia
# estonia = gpd.read_file('/mnt/diskstation/data/soil_points/Estonia/export_estonia_public_soil_samples.gpkg')
temp = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/estonia_harmonized_v1.csv')
temp['ref'] = 'estonia.kese'
temp['nuts0'] = 'EE'
# column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
# temp = pd.DataFrame(columns=column_names)
# temp['lat'] = estonia['geometry'].y
# temp['lon'] = estonia['geometry'].x
# temp['nuts0'] = 'EE'
# # temp['time'] = estonia['PROFILE_DATE'].astype(str).str[-4:]
# temp['hzn_top'] = estonia['soil_depth']-1
# temp['hzn_btm'] = estonia['soil_depth']+1
# temp.loc[estonia['soil_depth'].isna(),'hzn_top'] = 0
# temp.loc[estonia['soil_depth'].isna(),'hzn_btm'] = 25
# temp['ref'] = 'https://www.hutton.ac.uk/about/facilities/national-soils-archive/resampling-soils-inventory'

# temp['oc'] = estonia['SOC']*10 # % -> g/kg
# # temp['N'] = estonia['DP1971_NITROGEN']*10 # % -> g/kg
# # temp['caco3'] = np.nan
# # temp['bulk_density'] = np.nan
# # temp['ph_kcl'] = np.nan
# # temp['ph_h2o'] = estonia['DP1971_PH_H2O']
# # temp['ph_cacl2'] = estonia['DP1971_PH_CACL2']
# # temp['clay'] = estonia['DP1971_CLAY']
# # temp['silt'] = estonia['DP1971_UBSILT']
# # temp['sand'] = estonia['DP1971_UBSAND']
# # temp['K'] = estonia['NIPAQUA_POTASSIUM'] # ppm = mg/kg
# # temp['P'] = estonia['NIPAQUA_PHOSPHORUS'] # ppm = mg/kg

# possible filter
na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

print(f'{len(temp)} in total')
temp
temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/estonia_harmonized_v1.csv',index=False)

0 data with no time info
0 data with no depth info
0 data with no coordinate info
3015 in total


In [8]:
# gema
gema = gpd.read_file('/mnt/diskstation/data/soil_points/EU/GEMAS/GEMAS.csv')

column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
temp = pd.DataFrame(columns=column_names)
temp['lat'] = gema['YCOO']
temp['lon'] = gema['XCOO']
temp['oc'] = gema['TOC'] 
temp['ph_cacl2'] = gema['pH_CaCl2']
temp['clay'] = gema['clay']
temp['silt'] = gema['silt']
temp['time'] = 2008
temp['hzn_top'] = gema['UHDICM']
temp['hzn_btm'] = gema['LHDICM']
temp = temp.apply(pd.to_numeric)
temp['sand'] = 100-temp['clay']-temp['silt']
temp['oc'] = temp['oc']*10 # % -> g/kg

country_to_nuts0 = {
    'GER': 'DE',  # Germany
    'SKA': 'SK',  # Slovakia
    'EST': 'EE',  # Estonia
    'LIT': 'LT',  # Lithuania
    'NOR': 'NO',  # Norway (Note: Norway is not an EU member but is included in some NUTS classifications)
    'PTG': 'PT',  # Portugal
    'POL': 'PL',  # Poland
    'SWE': 'SE',  # Sweden
    'DEN': 'DK',  # Denmark
    'ITA': 'IT',  # Italy
    'FRA': 'FR',  # France
    'FIN': 'FI',  # Finland
    'UKR': 'UA',  # Ukraine (Note: Ukraine is not an EU member and typically not included in NUTS)
    'CRO': 'HR',  # Croatia
    'HEL': 'EL',  # Greece (Note: The code for Greece in the NUTS classification is EL, not GR)
    'HUN': 'HU',  # Hungary
    'SPA': 'ES',  # Spain
    'CYP': 'CY',  # Cyprus
    'BEL': 'BE',  # Belgium
    'UNK': 'UK',  # United Kingdom (Note: The UK left the EU but was previously included in NUTS)
    'LAV': 'LV',  # Latvia
    'SIL': 'SI',  # Slovenia
    'BUL': 'BG',  # Bulgaria
    'SRB': 'RS',  # Serbia (Note: Serbia is a candidate country for EU membership)
    'CZR': 'CZ',  # Czech Republic
    'BOS': 'BA',  # Bosnia and Herzegovina (Note: Bosnia and Herzegovina is not an EU member)
    'FOM': 'MK',  # North Macedonia (Note: The official NUTS code for North Macedonia is MK)
    'AUS': 'AT',  # Austria
    'NEL': 'NL',  # Netherlands
    'SLO': 'SK',  # Slovakia (Note: This seems to be a duplicate of SKA)
    'IRL': 'IE',  # Ireland
    'MON': 'ME',  # Montenegro (Note: Montenegro is a candidate country for EU membership)
    'LUX': 'LU'   # Luxembourg
}

temp['nuts0'] = gema['COUNTRY']
temp['nuts0'] = temp['nuts0'].map(country_to_nuts0)
temp['ref'] = 'gemas'
temp['lc_survey'] = gema['TYPE']
temp.loc[temp['lc_survey']=='Gr','lc_survey'] = 'permanent grassland'
temp.loc[temp['lc_survey']=='Ap','lc_survey'] = 'arable land'

# possible filter
na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

print(f'{len(temp)} in total')
temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/gemas_harmonized_v1.csv',index=False)

ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/share/proj failed


0 data with no time info
0 data with no depth info
1 data with no coordinate info
4132 in total


In [10]:
# france
france = pd.read_csv('/mnt/diskstation/data/Soil_points/France/RMQS1_analyses_composites_18_11_2021_virgule.csv')
rgf93 = pyproj.CRS.from_epsg(2154)  # RGF 93 coordinate system
wgs84 = pyproj.CRS.from_epsg(4326)  # WGS84 (GPS) coordinate system
transformer = pyproj.Transformer.from_crs(rgf93, wgs84)
france['lat'],france['lon'] = transformer.transform(france['x_theo'], france['y_theo'])

column_names = ['lat','lon','time','hzn_top','hzn_btm','ref']
temp = pd.DataFrame(columns=column_names)
temp['lat'] = france['lat']
temp['lon'] = france['lon']
temp['nuts0'] = 'FR'
temp['time'] = france['date_complete'].str[0:4].astype(float)
france = france.apply(pd.to_numeric, errors='coerce')
temp['hzn_top'] = france['profondeur_hz_sup']
temp['hzn_btm'] = france['profondeur_hz_inf']
temp['ref'] = 'france.RMQS'
temp['oc'] = france['carbone_16_5_1']
temp['N'] = france['n_tot_31_1'] 
temp['caco3'] = france['calc_tot_2_1_2']
temp['bulk_density'] = np.nan
temp['ph_kcl'] = np.nan
temp['ph_h2o'] = france['ph_eau_6_1']
temp['ph_cacl2'] = np.nan
temp['clay'] = france['argile']/10 # g/kg -> %
temp['silt'] = (france['limon_fin']+france['limon_grossier'])/10 # g/kg -> %
temp['sand'] = (france['sable_fin']+france['sable_grossier'])/10 # g/kg -> %
temp['K'] = france['k_tot_hf']*10000 # g/100g -> mg/kg
temp['P'] = france['p_ass_81_1']*1000 # olsen  g/kg -> mg/kg

# possible filter
na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

print(f'{len(temp)} in total')

temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/france_harmonized_v1.csv',index=False)

0 data with no time info
0 data with no depth info
0 data with no coordinate info
4148 in total


In [15]:
# swiss
temp = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/swiss_harmonized_v1.csv')
temp['ref'] = 'swiss.nabo'
temp['nuts0'] = 'CH'

# possible filter
na = temp['time'].isna().sum()
print(f'{na} data with no time info')

na = len(temp[temp['hzn_btm'].isna() | temp['hzn_top'].isna()])
print(f'{na} data with no depth info')

na = len(temp[temp['lat'].isna() | temp['lon'].isna()])
print(f'{na} data with no coordinate info')

print(f'{len(temp)} in total')

temp = temp.drop(columns=['anonymization','date'])
temp.to_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/swiss_harmonized_v1.csv',index=False)

0 data with no time info
0 data with no depth info
0 data with no coordinate info
36031 in total


### merge the dataset

In [32]:
# merge all the data
names = ['germany','swiss','croatia','estonia','france','gemas'] #'belgium','ireland','scotland'
column_names = ['lat', 'lon', 'time', 'hzn_top', 'hzn_btm', 'ref', 'oc', 'ph_h2o', 
                'ph_cacl2', 'bulk_density', 'clay', 'silt', 'sand', 'caco3', 'N', 'K', 'P']
data = pd.DataFrame(columns=column_names)

for i in names:
    temp = pd.read_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/{i}_harmonized_v1.csv')
    print(f'{i}:{len(temp)}')
    data = pd.concat([data,temp])
    
lucas = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/lucas.full_harmonized_v1.csv')
print(f'lucas: {len(lucas)}')
data = pd.concat([data,lucas])
data = data.drop(columns=['point_id','lc_survey','ph_kcl','ID'])

germany:17189
swiss:36031
croatia:6271
estonia:3015
france:4148
gemas:4132
lucas: 75426


  lucas = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/lucas.full_harmonized_v1.csv')


In [33]:
# only keep the data measured after 2000
data = data.loc[data['time']>=2000]

# drop rows without coordinates recorded
data = data.loc[~data['lat'].isna()]

# overview of the dataset
for col in data.columns.values.tolist():
    print(f'{col}: missing {data[col].isna().sum()} data, {round(data[col].isna().sum()*100/len(data))}%')
    
data.to_csv(f'/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/soil.full_harmonized_v1.csv',index=False)


lat: missing 0 data, 0%
lon: missing 0 data, 0%
time: missing 0 data, 0%
hzn_top: missing 0 data, 0%
hzn_btm: missing 0 data, 0%
ref: missing 0 data, 0%
oc: missing 16259 data, 14%
ph_h2o: missing 27763 data, 25%
ph_cacl2: missing 23308 data, 21%
bulk_density: missing 80274 data, 71%
clay: missing 55162 data, 49%
silt: missing 55237 data, 49%
sand: missing 55860 data, 49%
caco3: missing 52402 data, 46%
N: missing 26201 data, 23%
K: missing 41826 data, 37%
P: missing 45943 data, 41%
nuts0: missing 0 data, 0%


In [41]:
# # create gpkg
# from shapely.geometry import Point
# from geopandas import gpd

# df = pd.read_csv('/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/training_point_v2_full.csv', low_memory=False)
# geometry = [Point(xy) for xy in zip(df['gps_long'], df['gps_lat'])]
# gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# gdf_3035 = gdf.to_crs("EPSG:3035")
# gdf_3035['point_index'] = gdf_3035.index
# gdf_3035.to_file("/mnt/primus/xuemeng_tmp_harbour/soc_eu/data/training_point_overlay_3035.gpkg", driver="GPKG")
