In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from eumap.misc import find_files, ttprint
from eumap.raster import read_rasters, save_rasters
from eumap.mapper import SpaceOverlay
import warnings
import matplotlib

folder = '/mnt/inca/soc_eu_model'

df = pd.read_csv(f'{folder}/data/001_covar_overlayed.csv',low_memory=False)




### convert categorical variables to numerical using dummy

In [2]:
lithology = '''1	alkaliOlivineBasalt
2	amphibolite
3	andesite
4	anorthositicRock
5	arenite
6	ashBrecciaBombOrBlockTephra
7	ashTuffLapillistoneAndLapilliTuff
8	basalt
9	basanite
10	biogenicSilicaSedimentaryRock
11	breccia
12	brecciaGougeSeries
13	carbonateMud
14	carbonateOoze
15	carbonateSedimentaryRock
16	carbonatite
17	chalk
18	chloriteActinoliteEpidoteMetamorphicRock
19	clasticSandstone
20	clasticSediment
21	clasticSedimentaryMaterial
22	clasticSedimentaryRock
23	clay
24	claystone
25	conglomerate
26	dacite
27	diamictite
28	diamicton
29	diorite
30	dioriticRock
31	doleriticRock
32	dolomite
33	dolomiticOrMagnesianSedimentaryRock
34	eclogite
35	exoticCompositionIgneousRock
36	fineGrainedIgneousRock
37	foidBearingSyenite
38	foidSyenitoid
39	foliatedMetamorphicRock
40	fragmentalIgneousMaterial
41	gabbro
42	gabbroicRock
43	gabbroid
44	glaucophaneLawsoniteEpidoteMetamorphicRock
45	gneiss
46	granite
47	granitoid
48	granodiorite
49	granulite
50	gravel
51	gypsumOrAnhydrite
52	hornfels
53	igneousMaterial
54	igneousRock
55	impactGeneratedMaterial
56	impureCarbonateSediment
57	impureCarbonateSedimentaryRock
58	impureDolomite
59	impureLimestone
60	komatiiticRock
61	limestone
62	marble
63	metamorphicRock
64	metasomaticRock
65	micaSchist
66	migmatite
67	monzogranite
68	monzonite
69	mud
70	mudstone
71	myloniticRock
72	nonClasticSiliceousSedimentaryRock
73	organicBearingMudstone
74	organicRichSediment
75	organicRichSedimentaryMaterial
76	orthogneiss
77	paragneiss
78	peat
79	peridotite
80	phaneriticIgneousRock
81	phonolite
82	phyllite
83	phyllonite
84	porphyry
85	pyroclasticMaterial
86	pyroclasticRock
87	pyroxenite
88	quartzDiorite
89	quartzite
90	residualMaterial
91	rhyolite
92	rhyolitoid
93	rock
94	sand
95	sandstone
96	schist
97	sediment
98	sedimentaryMaterial
99	sedimentaryRock
100	serpentinite
101	shale
102	silicateMud
103	silt
104	siltstone
105	skarn
106	slate
107	spilite
108	syenite
109	syeniticRock
110	syenogranite
111	tephrite
112	tholeiiticBasalt
113	tonalite
114	trachyte
115	trachyticRock
116	trachytoid
117	travertine
118	tuffBrecciaAgglomerateOrPyroclasticBreccia
119	ultramaficIgneousRock
120	wacke
121	waste
122	Unknown
123	Unpopulated'''

lithology = lithology.split('\n')
value = [float(i.split('\t')[0]) for i in lithology]
descp = [i.split('\t')[1] for i in lithology]
descp = ['lithology_EGDI_'+i for i in descp]
lithology_map = dict(zip(value,descp))

# some soil parental material = 0, might from the landmask, mixing water and land
spm = 'EGDI_GE_GeologicUnit_EN_1M_Surface_LithologyPolygon_250m_landmasked_gapfilled_epsg.3035_v20240514'
df.loc[df[spm]==0,spm] = np.nan
dummies = pd.get_dummies(df['EGDI_GE_GeologicUnit_EN_1M_Surface_LithologyPolygon_250m_landmasked_gapfilled_epsg.3035_v20240514'])
dummies = dummies.rename(columns=lithology_map)
dff = pd.concat([df, dummies], axis=1)
dff = dff.drop(columns=[spm])

In [3]:
dff.to_csv(f'{folder}/data/002_data_whole.csv',index=False)

### generate training and validation data

In [4]:
df = pd.read_csv(f'{folder}/data/002_data_whole.csv',low_memory=False)

In [5]:
# get the id of full validation points
bd_val = pd.read_csv(f'{folder}/data/003.0_validate.pnts.rob_bd.csv',low_memory=False)
oc_val = pd.read_csv(f'{folder}/data/003.1_validate.pnts.rob_soc.csv',low_memory=False)

idl = bd_val['id'].values.tolist() + oc_val['id'].values.tolist()
idl = [str(i) for i in idl]

val = df.loc[df['id'].isin(idl)]
val.to_csv(f'{folder}/data/004.0_validate.pnts_soc.csv',index=False)

In [6]:
# repeat for the properties appear less
idl = bd_val['id'].values.tolist()
idl = [str(i) for i in idl]

val = df.loc[df['id'].isin(idl)]
val.to_csv(f'{folder}/data/004.1_validate.pnts_prop.less.freq.csv',index=False)

In [7]:
# repeat to get training data, so without
idl = bd_val['id'].values.tolist() + oc_val['id'].values.tolist()
idl = [str(i) for i in idl]

val = df.loc[~df['id'].isin(idl)]
val.to_csv(f'{folder}/data/005.0_train.pnts_soc.csv',index=False)

In [8]:
print(len(df)-len(val))
print(len(val))

5947
388696


### get only the sharable ones

In [9]:
share = pd.read_csv(f'{folder}/data_share/share_or_not.csv')
merged_df = pd.merge(df, share, on='ref', how='left')
share_df = merged_df.loc[merged_df['Can be shared within the consortium']=='Y']

In [10]:
share_df.to_csv(f'{folder}/data_share/AI4SH_WP5_soil.property.with.cov.csv',index=False)

In [11]:
prop = pd.read_csv('/mnt/inca/ai4sh_data.harmo/data/AI4SH_WP5_soil.property.csv')
len(prop)

  prop = pd.read_csv('/mnt/inca/ai4sh_data.harmo/data/AI4SH_WP5_soil.property.csv')


345645

In [12]:
share_df

Unnamed: 0,id,lat,lon,time,hzn_top,hzn_btm,ref,nuts0,oc,ph_h2o,...,lithology_EGDI_syenite,lithology_EGDI_tephrite,lithology_EGDI_tonalite,lithology_EGDI_trachyte,lithology_EGDI_trachytoid,lithology_EGDI_tuffBrecciaAgglomerateOrPyroclasticBreccia,lithology_EGDI_ultramaficIgneousRock,lithology_EGDI_wacke,lithology_EGDI_Unpopulated,Can be shared within the consortium
0,1-1,40.871480,-7.049093,2000.0,0.0,30.0,portugal.infosolo,PT,6.70,5.2,...,False,False,False,False,False,False,False,False,False,Y
1,1-2,40.871480,-7.049093,2000.0,30.0,55.0,portugal.infosolo,PT,3.90,5.5,...,False,False,False,False,False,False,False,False,False,Y
2,2-3,40.583860,-6.952317,2000.0,0.0,13.0,portugal.infosolo,PT,5.90,4.9,...,False,False,False,False,False,False,False,False,False,Y
3,2-4,40.583860,-6.952317,2000.0,13.0,38.0,portugal.infosolo,PT,6.20,5.2,...,False,False,False,False,False,False,False,False,False,Y
4,3-5,40.756090,-6.862586,2000.0,0.0,25.0,portugal.infosolo,PT,5.40,5.4,...,False,False,False,False,False,False,False,False,False,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395398,Diputación de Palencia - DP220201,41.935616,-4.513888,2022.0,0.0,25.0,Castilla.y.Leon,ES,6.55,,...,False,False,False,False,False,False,False,False,False,Y
395399,Diputación de Palencia - DP220202,41.935616,-4.513888,2022.0,0.0,25.0,Castilla.y.Leon,ES,7.35,,...,False,False,False,False,False,False,False,False,False,Y
395400,Diputación de Palencia - DP220030,41.775509,-3.959406,2022.0,0.0,25.0,Castilla.y.Leon,ES,4.30,,...,False,False,False,False,False,False,False,False,False,Y
395401,Diputación de Palencia - DP220090,42.323577,-4.312706,2022.0,0.0,25.0,Castilla.y.Leon,ES,3.60,,...,False,False,False,False,False,False,False,False,False,Y


In [10]:
for i in dff.columns:
    print(i)

id
lat
lon
time
hzn_top
hzn_btm
ref
nuts0
oc
ph_h2o
ph_cacl2
bulk_density
clay
silt
sand
caco3
N
K
P
CEC
EC
oc_qa
N_qa
caco3_qa
bulk_density_qa
clay_qa
silt_qa
sand_qa
ph_h2o_qa
ph_cacl2_qa
P_qa
K_qa
EC_qa
CEC_qa
wv_mcd19a2v061.seasconv_m_1km_s_{year}0701_{year}0731_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_sd_1km_s_{year}0201_{year}0228_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_m_1km_s_{year}1101_{year}1130_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_sd_1km_s_{year}1101_{year}1130_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_m_1km_s_{year}0401_{year}0430_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_m_1km_s_{year}0601_{year}0630_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_sd_1km_s_{year}1201_{year}1231_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_sd_1km_s_{year}0101_{year}0131_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_sd_1km_s_{year}0401_{year}0430_go_epsg.4326_v20230619
wv_mcd19a2v061.seasconv_sd_1km_s_{year}0801_{year}0831_go_epsg.4326_v20230619
wv_mcd19a2v