In [1]:
import pandas as pd

# read in data
# Every input and output of this cell lives under the same raw-data folder, so
# the folder is derived once from data_path instead of being spelled out twice.
data_path = '/mnt/inca/ai4sh_data.harmo'
raw_dir = f'{data_path}/raw_data/SSL_GEOCRADLE'
df = pd.read_csv(f'{raw_dir}/SSL_GEOCRADLE_1.csv', low_memory=False)

# drop the samples whose origin is Israel or Egypt; every other origin is kept
df = df.loc[~df['origin'].isin(['Israel','Egypt'])]

# split soil properties and soil type
# soil_prop: site metadata plus the measured properties that are harmonized below
soil_prop = df[['ID','origin','Latitude','Longitude','Sampling_date','Elevation','Depth','Climate_Koeppen',
         'Sand_Fraction','Clay_Fraction','Silt_Fraction','USDA_texture','OC','OM','CaCO3','CEC','LOI','pH_H2O','pH_KCl','pH_CaCl2',
         'EC_muS','NO3']]

# soil_type: site metadata plus the soil classification columns; written out
# as-is next to the raw file and not used again in this notebook section
soil_type = df[['ID','origin','Latitude','Longitude','Sampling_date','Elevation','Depth','Soil_type_WRB','Soil_type_extended_WRB',
                'Soil_type_USDA','Soil_extended_WRB','Soil_type_WRB_description','Climate_Koeppen']]
soil_type.to_csv(f'{raw_dir}/soil_type_SSL_GEOCRADLE.csv',index=False)

# rename the columns
# (columns not listed here keep their raw names)
soil_prop = soil_prop.rename(columns={'ID':'sample_id','Latitude':'lat','Longitude':'lon','OC':'oc',
                                     'CaCO3':'caco3','pH_H2O':'ph_h2o','pH_CaCl2':'ph_cacl2','EC_muS':'EC',
                                     'Sand_Fraction':'sand','Clay_Fraction':'clay','Silt_Fraction':'silt'})

In [2]:
# organize time - year
from datetime import datetime, timedelta

# Day zero of Excel's 1900 date system. It is 1899-12-30 rather than 1899-12-31
# because Excel counts a non-existent 29 Feb 1900, so any serial after that day
# is one larger than the true number of days since 1899-12-31.
EXCEL_EPOCH = datetime(1899, 12, 30)


def _normalize_sampling_date(raw):
    """Return `raw` as 'dd.mm.YYYY' if it is a 5-digit Excel serial, else unchanged."""
    if isinstance(raw, str) and len(raw) == 5 and raw.isdigit():
        return (EXCEL_EPOCH + timedelta(days=int(raw))).strftime('%d.%m.%Y')
    return raw


def _sampling_year(date_text):
    """Return the year of a 'dd-mm-yy' (8 characters) or 'dd.mm.YYYY' date string."""
    if len(date_text) == 8:
        # two-digit years are interpreted as 20yy
        return int(date_text.split('-')[2]) + 2000
    return int(date_text.split('.')[2])


# convert excel serial number
# na_action='ignore' passes missing dates through instead of failing on len(NaN)
soil_prop['Sampling_date'] = soil_prop['Sampling_date'].map(_normalize_sampling_date, na_action='ignore')

# extract year
# Nullable Int64 keeps years as whole numbers while letting a missing date stay
# missing, so a later `time.isna()` check can actually detect it (a 0
# placeholder would never be reported as missing).
soil_prop['time'] = soil_prop['Sampling_date'].map(_sampling_year, na_action='ignore').astype('Int64')



In [3]:
# organize nuts0 info

# origin label as it appears in the raw data -> two-letter country code
NUTS0_BY_ORIGIN = {
    'Albania': 'AL',
    'Bulgaria': 'BG',
    'Cyprus': 'CY',
    'FYROM': 'MK',
    'Greece': 'EL',
    'Serbia': 'RS',
    'Turkey': 'TR',
}

# Build the column in a single pass from the lookup table. This avoids first
# creating an integer column and then writing strings into it row-group by
# row-group, which newer pandas versions warn about. An origin that is not in
# the table keeps the placeholder 0.
soil_prop['nuts0'] = soil_prop['origin'].map(lambda origin: NUTS0_BY_ORIGIN.get(origin, 0))


In [4]:
# organize properties

# organic carbon: where oc is missing, fall back to organic matter / 1.725,
# then rescale the whole column (% -> g/kg)
oc_from_om = soil_prop['OM'] / 1.725
soil_prop['oc'] = soil_prop['oc'].fillna(oc_from_om).mul(10)

# carbonates: % -> g/kg
soil_prop['caco3'] = soil_prop['caco3'].mul(10)

# electrical conductivity, scaled by 0.01
# NOTE(review): the original comment read "mS/m -> μS/cm" and the raw column is
# named EC_muS; neither obviously matches a factor of 0.01 - confirm the source
# unit and the intended target unit.
soil_prop['EC'] = soil_prop['EC'].mul(0.01)

# sampling horizon: Depth is used as the bottom, the top is set 10 above it
# and is never allowed to go below 0
sample_depth = soil_prop['Depth']
soil_prop['hzn_top'] = (sample_depth - 10).clip(lower=0)
soil_prop['hzn_btm'] = sample_depth

In [5]:
# remove the raw columns that are not written to the harmonized table
soil_prop = soil_prop.drop(
    [
        'origin', 'Sampling_date', 'Elevation', 'Depth', 'Climate_Koeppen',
        'USDA_texture', 'OM', 'LOI', 'pH_KCl', 'NO3',
    ],
    axis='columns',
)

In [6]:
# possible filter
# Flag, per row, which piece of mandatory information is missing; report the
# counts, then keep only the rows that have time, depth and coordinates.
missing_time = soil_prop['time'].isna()
missing_depth = soil_prop[['hzn_btm', 'hzn_top']].isna().any(axis=1)
missing_coord = soil_prop[['lat', 'lon']].isna().any(axis=1)

na = int(missing_time.sum())
print(f'{na} data with no time info')

na = int(missing_depth.sum())
print(f'{na} data with no depth info')

na = int(missing_coord.sum())
print(f'{na} data with no coordinate info')

# total is reported before the incomplete rows are removed
print(f'{len(soil_prop)} in total')
soil_prop = soil_prop[~(missing_time | missing_depth | missing_coord)]


0 data with no time info
6 data with no depth info
0 data with no coordinate info
1522 in total


In [7]:
# tag every row with its source dataset, then write the harmonized table
soil_prop = soil_prop.assign(ref='geocradle')
harmonized_csv = f'{data_path}/data/geocradle_harmonized_v1.csv'
soil_prop.to_csv(harmonized_csv, index=False)

In [8]:
# preview the first five rows of the harmonized table
soil_prop.head()

Unnamed: 0,sample_id,lat,lon,sand,clay,silt,oc,caco3,CEC,ph_h2o,ph_cacl2,EC,time,nuts0,hzn_top,hzn_btm,ref
0,AL-FL-030-00001,41.9617,19.4667,2.4,51.6,46.0,17.48077,,,,,,2017,AL,20.0,30.0,geocradle
1,AL-FL-030-00002,41.908699,19.5592,18.4,21.6,60.0,6.67416,,,,,,2017,AL,20.0,30.0,geocradle
2,AL-FL-030-00003,41.9632,19.507,16.5,24.7,58.8,21.23256,,,,,,2017,AL,20.0,30.0,geocradle
3,AL-FL-030-00004,41.9855,19.550699,32.4,18.2,49.4,9.43953,,,,,,2017,AL,20.0,30.0,geocradle
4,AL-FL-030-00006,41.951401,19.3647,20.0,24.1,55.9,9.03792,,,,,,2017,AL,20.0,30.0,geocradle
