This notebook outlines the work flow to obtain soil data from NRCS and format it for the whole US in an AnnAGNPS friendly way

In [2]:
import os, subprocess

from pathlib import Path
from tqdm import tqdm

import pandas as pd

Choose if you want to reprocess or work from existing files

In [8]:
reprocess = True

- We assume the raw soil data has been queried and put together (for example from https://sdmdataaccess.nrcs.usda.gov/Query.aspx or gNATSGO tables)

Then we will combine the result of that query sent by Kyle Stephens that also contains the data from Raster Soil Survey

In [15]:
path_to_soil_dir = Path('D:/AIMS/Datasets/Soil/DATABASE_POPULATION_TASKS/SDM_QUERY_AND_NITA_PROCESSING/ALL_US_v3_SSURGO_STATSGO2_RSS/')


path_to_soil_table = path_to_soil_dir / 'raw_query_data' / 'all_raw_soil_data_no_rvindicator_condition_STATSGO2_SSURGO_RSS.parquet'

path_to_split_files = path_to_soil_dir / 'split_files'

path_to_NITA_bin = path_to_soil_dir / 'NITA_v6.00.a.012_release_64-bit_Windows.exe'

Read Soil Data Table (for the entire US)

In [5]:
df = pd.read_parquet(path_to_soil_table)

We need to use the mukey instead of musym as the Soil_ID but NITA expects a column called musym so a trick is performed where the mukey column is renamed musym

In [6]:
df = df.rename(columns={'musym': 'musym_label',
                        'mukey': 'mukey_label'})
df['musym'] = df['mukey_label']

Remove commas from `compname` and `texdesc` columns so that NITA can process it

In [7]:
df['compname'] = df['compname'].str.replace(',','')
df['texdesc'] = df['texdesc'].str.replace(',','')

Write the whole updated table to file but step is not necessary

In [9]:
if reprocess:
    path_to_nasis = path_to_soil_dir / 'all_nasis_mukey_trick.csv'
    df.to_csv(path_to_nasis, index=False)

In [10]:
df

Unnamed: 0,saverest,areasymbol,areaname,musym_label,mukey_label,hydgrp,kwfact,albedodry_r,restrictiondepthr,partdensity,...,fragvol,sandvf_r,caco3_r,ksat_r,wthirdbar_r,wfifteenbar_r,om_r,ph1to1h2o_r,comppct_r,musym
0,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,,0.30,>200,1.40,...,0.0,,0.0,26.00,45.0,20.0,60.0,4.3,90,50226
1,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,0.37,0.30,>200,2.65,...,2.0,18.0,0.0,9.17,35.0,10.0,7.5,5.0,90,50226
2,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,0.43,0.30,>200,2.65,...,2.0,18.0,0.0,9.17,40.0,10.0,3.0,5.6,90,50226
3,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,0.05,0.30,>200,2.65,...,10.0,5.0,0.0,28.23,10.0,2.0,0.5,6.1,90,50226
4,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",102,50227,B,,0.30,>200,1.40,...,0.0,,0.0,26.00,45.0,20.0,60.0,4.3,60,50227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245922,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",4404,3222366,D,0.43,0.23,18,2.62,...,50.0,11.8,10.0,10.00,27.8,13.3,0.2,7.8,45,3222366
1245923,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",4404,3222366,D,,0.23,18,,...,,,,0.20,,,,,45,3222366
1245924,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",4405,3222367,D,0.37,0.23,16,2.62,...,15.0,6.1,20.0,3.00,26.6,14.7,0.5,8.2,35,3222367
1245925,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",4405,3222367,D,0.49,0.23,16,2.62,...,15.0,6.1,20.0,3.00,29.5,16.1,0.2,8.2,35,3222367


Writing multiple files for combining later on

In [16]:
area_symbols = df['areasymbol'].unique()

if reprocess:
    for loc in tqdm(area_symbols):
        outpath = path_to_split_files / f'nasis_{loc}_mukey.csv'
        df[df['areasymbol']==loc].to_csv(outpath, index=False)

  0%|          | 0/3382 [00:00<?, ?it/s]

100%|██████████| 3382/3382 [06:01<00:00,  9.35it/s]


In [17]:
nb_dir = Path('__file__').absolute().parent
os.chdir(nb_dir)

Run NITA seperately for each generated file

In [18]:
if reprocess:
    for label in tqdm(area_symbols):
        os.chdir(nb_dir)
        control_file = path_to_split_files / f'NITA_CONTROL_{label}.csv'

        with control_file.open(mode='a') as file:
            file.write('FILENAME,UNITS_OUT,COMBINE,SAVE_ALL\n')
            file.write(f'nasis_{label}_mukey.csv,1,0,1\n')

        os.chdir(path_to_split_files)
        command = str(path_to_NITA_bin.absolute()) + f' /f:NITA_CONTROL_{label}.csv'
        subprocess.call(command)

        # Rename Excluded Soil Records file so it doesn't get overwritten
        path_to_rename_file = path_to_split_files / 'NITA_Excluded_Soil_Records.csv'
        path_to_rename_file.rename(path_to_rename_file.name.replace('.csv', f'_nasis_{label}_mukey.csv'))

    os.chdir(nb_dir)

100%|██████████| 3382/3382 [11:41<00:00,  4.82it/s]


Combine all `*_soil_data.csv` and `*_soil_layers_data.csv`

In [19]:
soil_data_files = list(path_to_split_files.glob('*_mukey_soil_data.csv'))
soil_layers_data_files = list(path_to_split_files.glob('*_mukey_soil_layers_data.csv'))

for f_d, f_l in zip(soil_data_files,soil_layers_data_files):
    if f_d.stat().st_size*f_l.stat().st_size == 0:
        print(f'Problem with files: {f_d.name} and {f_l.name} (empty)')

In [20]:
df_soil_data = pd.concat([pd.read_csv(f) for f in soil_data_files if f.stat().st_size != 0])
df_soil_layers_data = pd.concat([pd.read_csv(f) for f in soil_layers_data_files if f.stat().st_size != 0])

In [21]:
df_soil_data = df_soil_data.drop_duplicates().reset_index(drop=True)
df_soil_layers_data = df_soil_layers_data.drop_duplicates().reset_index(drop=True)

In [22]:
df_soil_data

Unnamed: 0,Soil_ID,Hydrologic_Soil_Group,K_Factor,Albedo,Time_to_Consolidation,Impervious_Depth,Specific_Gravity,Initial_Soil_Conditions_ID,Soil_Name,Soil_Texture,Number_of_Soil_Layers,Input_Units_Code
0,50226,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
1,50227,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
2,50229,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
3,50231,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
4,50233,B,0.0487,0.23,,1270.0,,,Bodenburg,Silt loam,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
329466,3222363,C,0.0566,0.23,,,,,Saltwells,Loam,7,1
329467,3222364,C,0.0645,0.23,,,,,Saltwells,Loam,6,1
329468,3222365,,-999.0000,,,,,,,Muck,1,1
329469,3222366,D,0.0421,0.23,,180.0,,,Rangecreek,Paragravelly fine sandy loam,2,1


In [23]:
n_soils = len(df_soil_data['Soil_ID'].unique())
print(f'Number of unique Soil_IDs after processing = {n_soils}')

Number of unique Soil_IDs after processing = 329471


In [24]:
df_soil_layers_data

Unnamed: 0,Soil_ID,Layer_Number,Layer_Depth,Bulk_Density,Clay_Ratio,Silt_Ratio,Sand_Ratio,Rock_Ratio,Very_Fine_Sand_Ratio,CaCO3_Content,...,Base_Saturation,Unstable_Aggregate_Ratio,pH,Organic_Matter_Ratio,Organic_N_Ratio,Inorganic_N_Ratio,Organic_P_Ratio,Inorganic_P_Ratio,Soil_Structure_Code,Input_Units_Code
0,50226,1,120.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.0,0.075,,,,,,1
1,50226,2,760.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.6,0.030,,,,,,1
2,50226,3,1520.0,1.43,0.02,0.06,0.92,0.10,0.050,0.00,...,,,6.1,0.005,,,,,,1
3,50227,1,120.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.0,0.075,,,,,,1
4,50227,2,760.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.6,0.030,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122476,3222365,1,-999.0,-999.00,-999.00,-999.00,-999.00,,,,...,,,,,,,,,,1
1122477,3222366,1,50.0,1.54,0.15,0.15,0.70,0.25,0.171,0.04,...,,,7.8,0.005,,,,,,1
1122478,3222366,2,180.0,1.54,0.23,0.37,0.40,0.50,0.118,0.10,...,,,7.8,0.002,,,,,,1
1122479,3222367,1,50.0,1.58,0.29,0.61,0.10,0.15,0.061,0.20,...,,,8.2,0.005,,,,,,1


In [25]:
if reprocess:
    df_soil_data.to_parquet(path_to_soil_dir / 'all_valid_soil_data.parquet')
    df_soil_layers_data.to_parquet(path_to_soil_dir / 'all_valid_soil_layers_data.parquet')

    df_soil_data.to_csv(path_to_soil_dir / 'all_valid_soil_data.csv', index=False)
    df_soil_layers_data.to_csv(path_to_soil_dir / 'all_valid_soil_layers_data.csv', index=False)

In [38]:
len(df['mukey_label'].unique())

329471

In [37]:
len(df_soil_data['Soil_ID'].unique())

329471

## Post-processing of NITA files

In [45]:
import re

def process_NITA_Excluded_Soil_Records_file(f):
    """ f is a Path object """
    soil_ids = []
    reason_ignored = []

    count = 0
    with f.open() as file:
        while True:
            line = file.readline()
            count += 1
            if not line:
                break
            
            known_reason = False

            if 'There was no valid data processed from the input file.' in line:
                continue

            # Process each line
            pattern = r'Soil ID "(\w+)".*\(clay/silt/sand\).*non-blank.*' 
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('all clay silt sand values are blank')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*field capacity.*less than the wilting point.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('field capacity for this layer is less than wilting point')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*\(wilting point\).*is blank.*required field.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('wilting point is blank')
                known_reason = True
                continue
            
            pattern = r'Soil ID "(\w+)".*The bulk density for this layer is blank.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('no valid bulk density found')
                known_reason = True
                continue

            pattern = r'There are no validated soil layers for the soil with ID "(\w+)".*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('no valid soil layers found')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*\(hydrologic soil group\).*is blank in the input data.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('missing hydrologic soil group')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*\(field capacity\).*is blank in the input data.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('missing field capacity')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*\(saturated hydraulic conductivity\).*is blank in the input data.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('missing saturated hydraulic conductivity')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*The sum of clay.*silt.*sand.*is less than 100% and is less than the current threshold limit.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('sum of clay silt sand is too low')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*The sum of clay.*silt.*sand.*is greater than 100% and is more than the current threshold limit.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('sum of clay silt sand is too high')
                known_reason = True
                continue

            pattern = r'Soil ID "(\w+)".*This record has a duplicate soil layer depth.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('duplicate layer depth of that of previous soil')
                known_reason = True
                continue
            
            pattern = r'Soil ID "(\w+)".*"hzdepb_r",\(soil layer depth\) on record.*is blank in the input data.*'
            match = re.match(pattern, line)
            if match:
                soil_ids.append(match.group(1))
                reason_ignored.append('soil layer depth is missing')
                known_reason = True
                continue

            if not known_reason:
                raise Exception(f'Unidentified case for file {f.name} at line {count}')
            
        # Assemble everything into a dataframe
        return pd.DataFrame({'soil_id': soil_ids, 'problems': reason_ignored})

In [46]:
excluded_NITA_files = list(path_to_split_files.glob('NITA_Excluded_Soil_Records*.csv'))

In [47]:
df_excluded_soils = pd.concat([process_NITA_Excluded_Soil_Records_file(f) for f in excluded_NITA_files if f.stat().st_size != 0])
df_excluded_soils['problems'] = df_excluded_soils['problems'].astype('category')


In [48]:
df_excluded_soils['problems'].unique().tolist()

['all clay silt sand values are blank',
 'missing hydrologic soil group',
 'field capacity for this layer is less than wilting point',
 'wilting point is blank',
 'no valid bulk density found',
 'missing field capacity',
 'missing saturated hydraulic conductivity',
 'sum of clay silt sand is too low',
 'soil layer depth is missing',
 'sum of clay silt sand is too high',
 'duplicate layer depth of that of previous soil']

In [49]:
df_excluded_soils.to_csv(path_to_soil_dir / 'nita_excluded_soil_ids_mukey.csv', index=False)

In [50]:
df_excluded_soils.to_parquet(path_to_soil_dir / 'nita_excluded_soil_ids_mukey.parquet')

In [51]:
df_excluded_soils

Unnamed: 0,soil_id,problems
0,50226,all clay silt sand values are blank
1,50227,all clay silt sand values are blank
2,50229,all clay silt sand values are blank
3,50231,all clay silt sand values are blank
4,50233,all clay silt sand values are blank
...,...,...
164,3222357,all clay silt sand values are blank
165,3222361,all clay silt sand values are blank
166,3222365,missing hydrologic soil group
167,3222366,all clay silt sand values are blank


In [67]:
soil_id_excluded = set(df_excluded_soils['soil_id'].apply(str).to_list())
soil_id_valid = set(df_soil_data['Soil_ID'].apply(str).to_list())

In [71]:
unaccounted_soil_ids = soil_id_excluded - soil_id_valid

In [72]:
len(unaccounted_soil_ids)

0