# AusPlots Soil Dataset Validation

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from scipy import stats

In [2]:
site_slga_data = pd.read_csv('../DATASETS/Soils_and_Landscape_Grid_of_Australia/Output/site_slga_data.csv', index_col = 0).copy()
site_slga_data.head()

Unnamed: 0,AVP_000_005,AVP_005_015,AVP_015_030,AVP_030_060,AVP_060_100,AVP_100_200,CLY_000_005,CLY_005_015,CLY_015_030,CLY_030_060,...,SLT_005_015,SLT_030_060,SLT_060_100,SLT_100_200,pHc_000_005,pHc_005_015,pHc_015_030,pHc_030_060,pHc_060_100,pHc_100_200
NSABBS0001,41.25,32.25,11.5,8.25,9.0,7.5,15.75,16.75,19.75,22.5,...,12.5,12.25,11.25,13.0,5.911349,5.908731,6.090138,6.149799,6.305011,6.503961
NSABBS0002,41.0,32.75,12.25,8.25,8.25,6.75,16.25,17.5,22.25,27.75,...,13.0,12.25,11.25,13.25,5.641203,5.63668,5.80745,5.846457,6.019695,6.224033
NSABBS0003,8111.0,8107.0,8102.0,8101.833333,8103.666667,8107.0,10.0,12.8,15.0,18.4,...,8.0,8.0,9.6,9.6,5.743322,5.82933,6.072671,6.279499,6.439817,6.459643
NSABBS0004,18.333333,13.5,8.0,7.333333,9.5,13.166667,8.833333,11.833333,13.666667,15.666667,...,7.833334,7.833334,9.0,9.5,5.822774,5.915548,6.17272,6.374738,6.526344,6.53919
NSABBS0005,18.5,14.0,7.0,6.0,8.166667,10.666667,9.833333,12.833333,15.0,13.166667,...,7.833334,7.666666,8.833333,9.0,5.784267,5.874176,6.133616,6.367352,6.542438,6.567882


In [3]:
soils_char = pd.read_csv('../DATASETS/AusPlots_Extracted_Data/Final/extracted_Final_soil_char_2-0-6.csv', index_col = 0).copy()
soils_char.head()

Unnamed: 0,soil.char.site_unique,soil.char.site_location_name,soil.char.site_location_visit_id,soil.char.upper_depth,soil.char.lower_depth,soil.char.horizon,soil.char.texture_grade,soil.char.texture_qualifier,soil.char.texture_modifier,soil.char.colour_when_moist,...,soil.char.ph,soil.char.pedality_grade,soil.char.pedality_fabric,soil.char.next_size_type_2,soil.char.next_size_type_1,soil.char.smallest_size_type_2,soil.char.smallest_size_type_1,soil.char.next_size_2,soil.char.next_size_1,soil.char.layer_barcode
1,NSABHC0013-53608,NSABHC0013,53608,0.5,0.6,NC,NC,,,,...,,NC,NC,NC,NC,NC,NC,,11.0,NSA 050414
2,WAAPIL0030-58057,WAAPIL0030,58057,0.0,0.09,A1,MC,NC,NC,2.5YR2.54,...,6.8,M,R,NC,NC,NC,AB,,,WAA053908
3,WAAPIL0030-58057,WAAPIL0030,58057,0.09,0.45,B21K,MHC,NC,NC,2.5YR34,...,7.9,M,R,NC,NC,NC,AB,,,WAA053909
4,WAAPIL0030-58057,WAAPIL0030,58057,0.45,0.6,B22K,MC,NC,NC,10R34,...,8.6,NC,NC,NC,NC,NC,NC,,,WAA053910
5,NSABHC0013-53608,NSABHC0013,53608,0.3,0.4,NC,NC,,,,...,,NC,NC,NC,NC,NC,NC,,11.0,NSA 050412


In [4]:
classifications = {
    'S': [0, 5],
    'LS': [5, 5],
    'CS': [5, 10],
    'SL': [10, 20],
    'L' : [25, 25],
    'ZL': [25, 25],
    'SCL' : [20, 30],
    'CL': [30, 35],
    'CLS' : [30, 35],
    'ZCL' : [30, 35],
    'LC' : [35, 40],
    'LMC' : [40, 45],
    'MC': [45, 55],
    'MHC': [50, 100],
    'HC': [50, 100],
    'NC': [-1, -1]
}
classifications = pd.DataFrame(classifications)

In [5]:
soils_char

Unnamed: 0,soil.char.site_unique,soil.char.site_location_name,soil.char.site_location_visit_id,soil.char.upper_depth,soil.char.lower_depth,soil.char.horizon,soil.char.texture_grade,soil.char.texture_qualifier,soil.char.texture_modifier,soil.char.colour_when_moist,...,soil.char.ph,soil.char.pedality_grade,soil.char.pedality_fabric,soil.char.next_size_type_2,soil.char.next_size_type_1,soil.char.smallest_size_type_2,soil.char.smallest_size_type_1,soil.char.next_size_2,soil.char.next_size_1,soil.char.layer_barcode
1,NSABHC0013-53608,NSABHC0013,53608,0.50,0.60,NC,NC,,,,...,,NC,NC,NC,NC,NC,NC,,11.0,NSA 050414
2,WAAPIL0030-58057,WAAPIL0030,58057,0.00,0.09,A1,MC,NC,NC,2.5YR2.54,...,6.8,M,R,NC,NC,NC,AB,,,WAA053908
3,WAAPIL0030-58057,WAAPIL0030,58057,0.09,0.45,B21K,MHC,NC,NC,2.5YR34,...,7.9,M,R,NC,NC,NC,AB,,,WAA053909
4,WAAPIL0030-58057,WAAPIL0030,58057,0.45,0.60,B22K,MC,NC,NC,10R34,...,8.6,NC,NC,NC,NC,NC,NC,,,WAA053910
5,NSABHC0013-53608,NSABHC0013,53608,0.30,0.40,NC,NC,,,,...,,NC,NC,NC,NC,NC,NC,,11.0,NSA 050412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4323,NTAPCK0003-58923,NTAPCK0003,58923,0.35,0.40,B22,SL,-,F,10YR 5/8,...,,V,NC,NC,NC,NC,NC,,,
4324,NTAPCK0003-58923,NTAPCK0003,58923,0.40,0.50,B22,SL,-,F,10YR 5/8,...,6.0,V,NC,NC,NC,NC,NC,,,NTA055051
4325,NTAPCK0003-58923,NTAPCK0003,58923,0.50,0.55,B22,SL,-,F,10YR 5/8,...,,V,NC,NC,NC,NC,NC,,,
4326,NTAPCK0003-58923,NTAPCK0003,58923,0.55,0.65,C/B,SCL,-,S,7.5YR 5/8,...,6.0,NC,NC,NC,NC,NC,NC,,,NTA055052


In [6]:
soils_char.loc[soils_char['soil.char.texture_grade'].isna(), 'soil.char.texture_grade'] = 'NC'

In [7]:
cly_min = [] 
cly_max = [] 
for i,v in soils_char.iterrows():
    texture_grade = v['soil.char.texture_grade']
    min_range = np.min(classifications.loc[0, texture_grade])
    max_range = np.max(classifications.loc[1, texture_grade])
    cly_min.append(min_range)
    cly_max.append(max_range)

KeyError: 'LP'

In [None]:
soils_char['cly_min'] = cly_min
soils_char['cly_max'] = cly_max

In [None]:
soils_char['clay_mid'] = (soils_char['cly_min'] + soils_char['cly_max'])/2

In [None]:
soils_char.head()

In [None]:
soils_char['depth_mid'] = (soils_char['soil.char.upper_depth'] + soils_char['soil.char.lower_depth'])/2 

In [None]:
soils_char.head()

In [None]:
depth_intervals = []

for i in soils_char['depth_mid']:
    if i > 0.00 and i < 0.05:
        depth_intervals.append('CLY_000_005')
    elif i >= 0.05 and i < 0.15:
        depth_intervals.append('CLY_005_015')
    elif i >= 0.15 and i < 0.30:
        depth_intervals.append('CLY_015_030')
    elif i >= 0.30 and i < 0.60:
        depth_intervals.append('CLY_030_060')
    elif i >= 0.60 and i < 1.00:
        depth_intervals.append('CLY_060_100')
    elif i >= 1.00 and i < 2.00:
        depth_intervals.append('CLY_100_200')
    else:
        depth_intervals.append(-1)

In [None]:
soils_char['depth_interval'] = depth_intervals
soils_char['depth_length'] = abs(soils_char['soil.char.upper_depth'] - soils_char['soil.char.lower_depth'])

In [None]:
soils_char

In [None]:
soils_char_essential = soils_char[['soil.char.site_location_name', 'soil.char.upper_depth','soil.char.lower_depth','depth_mid','clay_mid', 'depth_interval', 'depth_length', 'soil.char.texture_grade']]
soils_char_essential.head()

In [None]:
soils_char_essential = soils_char_essential[ (soils_char_essential['clay_mid'] != -1.0) & (soils_char_essential['depth_interval'] != -1)]
soils_char_essential.head()

In [None]:
slga_clay = []
for i, v in soils_char_essential.iterrows():
    #print(v['depth_interval'])
    clay_value = site_slga_data.loc[site_slga_data.index == v['soil.char.site_location_name'], v['depth_interval']].values[0]
    slga_clay.append(clay_value)

In [None]:
soils_char_essential['slga_clay'] = slga_clay

In [None]:
soils_char_essential = soils_char_essential[soils_char_essential['slga_clay'].isna() == False]

In [None]:
soils_char_essential.head()

In [None]:
def plot_boxplot(grade_value, classifications, dataset):
    
    dataset = dataset[dataset['soil.char.texture_grade'] == grade_value]
    
    if len(dataset) > 0:
    
        dataset.boxplot(column = ['slga_clay'], by = ['depth_interval'], figsize = (10,5))
        plt.axhline(y = classifications[grade_value][0], color = 'red', linestyle = 'dashed')
        plt.axhline(y = (classifications[grade_value][0] + classifications[grade_value][1])/2, color = 'black', linestyle = 'dashed')
        plt.axhline(y = classifications[grade_value][1], color = 'red', linestyle = 'dashed')
        plt.suptitle(grade_value) 
        plt.ylim(0, 100)
        plt.show()

In [None]:
for i in classifications.keys():
    if i != 'NC':
        plot_boxplot(i, classifications, soils_char_essential)

In [None]:
soils_char_essential['error'] = np.abs(soils_char_essential['slga_clay'] - soils_char_essential['clay_mid'])

In [None]:
soils_char_essential['State'] = [i[:2] for i in soils_char_essential['soil.char.site_location_name'].values]

In [None]:
soils_char_essential[['State','error']].groupby('State').describe()

In [None]:

for i in np.unique(soils_char_essential['depth_interval']):
    subset = soils_char_essential[soils_char_essential['depth_interval'] == i]
    interval = subset.depth_interval[subset.index[0]].split('_')
    interval_upper_lower = interval[1:]
    lo = '-'.join(interval_upper_lower) + ' cm'
    n_sites = len(np.unique(subset['soil.char.site_location_name']))
    n_rows = len(subset)
    subset.plot.scatter(x = 'clay_mid', y = 'slga_clay', title = f'{lo}, n sites = {n_sites}, samples = {n_rows}',
                        ylim = (0,100), xlim = (0,100), 
                        xlabel = "TERN Mean Clay Content (%)", ylabel = "SLGA Mean Clay Content (%)")
    slope, intercept, r, p, se = stats.linregress(x = subset['clay_mid'], y = subset['slga_clay'])
    plt.plot(subset['clay_mid'],intercept + slope*subset['clay_mid'], label='fitted line')
    one_to_one = [i for i in range(101)]
    plt.plot(one_to_one, one_to_one, linestyle = 'dashed')
    plt.annotate(f'$y = {slope:.3f}x + {intercept:.3f}$\n$R^2$ = {r:.3f}\n$p = {p:.5f}$', xy = (10, 80))
    plt.grid(True)

In [None]:
len(np.unique(soils_char_essential['soil.char.site_location_name']))