This notebook outlines the workflow to obtain soil data from NRCS and format it for the whole US in an AnnAGNPS-friendly way.

In [1]:
import os, subprocess

from pathlib import Path
from tqdm import tqdm

import pandas as pd

- Go to: https://sdmdataaccess.nrcs.usda.gov/Query.aspx
- Run the following query and download the file by email:
```
    select 
    sa.saverest, 
    l.areasymbol, 
    l.areaname,
    mu.musym, 
    mu.mukey,
    hydgrp,
    kwfact,
    albedodry_r,
    (SELECT CASE when min(resdept_r) is null then '>200' else cast(min(resdept_r) as 
    varchar) END
                from component left outer join corestrictions on component.cokey = 
    corestrictions.cokey where component.cokey = c.cokey and reskind is not null) as 
    restrictiondepthr,
    partdensity,
    c.compname, 
    texdesc, 
    hzdepb_r, 
    dbovendry_r, 
    claytotal_r,
    silttotal_r,
    sandtotal_r,
    (select sum(cf.fragvol_r) as fragvol  FROM chfrags cf WHERE cf.chkey = ch.chkey 
    ) as fragvol,
    sandvf_r,
    caco3_r,
    ksat_r,
    wthirdbar_r,
    wfifteenbar_r,
    om_r, 
    ph1to1h2o_r,
    c.comppct_r

    FROM 
    legend l INNER JOIN mapunit mu ON mu.lkey = l.lkey 
    LEFT OUTER JOIN sacatalog sa ON sa.areasymbol = l.areasymbol
    LEFT OUTER JOIN component c ON c.mukey = mu.mukey and c.cokey = (SELECT TOP 1 component.cokey FROM component WHERE 
    component.mukey=mu.mukey ORDER BY component.comppct_r DESC)
    LEFT OUTER JOIN chorizon ch ON ch.cokey = c.cokey 
    LEFT OUTER JOIN chtexturegrp ct ON ch.chkey=ct.chkey 

    WHERE ct.rvindicator = 'yes'

    Order by l.areasymbol, musym, mukey, compname, hzdepb_r
```

In [2]:
# Input/output folders, relative to the current working directory. They are built
# from individual components so the separator is correct on every platform.
path_to_soil_dir = Path('..', '..', 'inputs', 'soil')
path_to_output_soil_dir = Path('..', '..', 'outputs', 'soil_data_market', 'soil', 'ALL_US')

# CSV table holding the soil data for the whole US.
path_to_soil_table = path_to_soil_dir / 'soil_data_US_with_mukey.csv'

# NITA executable: set the NITA_BIN environment variable to point at a local copy;
# when it is not set, the original hard-coded install location is used.
path_to_NITA_bin = Path(os.environ.get(
    'NITA_BIN',
    r'C:\Users\Luc\projects\pyagnps\src\bins\NITA_v6.00.a.007_release_64-bit_Windows.exe'))

Read Soil Data Table (for the entire US)

In [3]:
# Load the US-wide soil table into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path_to_soil_table)

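We need to use `mukey` instead of `musym` as the Soil_ID, but NITA expects a column called `musym`. As a workaround, the original `musym` and `mukey` columns are renamed to `musym_label` and `mukey_label`, and the `mukey` values are copied into a new `musym` column.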

In [4]:
# Keep the original identifiers under *_label names. The rename is skipped when it
# has already been applied, so re-running this cell cannot rename the copied
# `musym` column again and create a duplicate `musym_label` column.
if 'mukey_label' not in df.columns:
    df = df.rename(columns={'musym': 'musym_label',
                            'mukey': 'mukey_label'})

# NITA expects the identifier in a column called `musym`: expose the mukey there.
df['musym'] = df['mukey_label']

Remove commas from the `compname` and `texdesc` columns so that NITA can process them

In [5]:
# Strip commas from the free-text columns that are passed on to NITA.
# regex=False makes the literal match explicit, because the default value of
# `regex` in Series.str.replace is not the same in every pandas version.
for text_column in ('compname', 'texdesc'):
    df[text_column] = df[text_column].str.replace(',', '', regex=False)

Write the whole updated table to file (this step is not strictly necessary)

In [6]:
# This is the first write into the output folder: create it (and any missing
# parents) so that to_csv does not fail on a fresh checkout.
path_to_output_soil_dir.mkdir(parents=True, exist_ok=True)

# Full table with the mukey-as-musym substitution, without the DataFrame index.
path_to_nasis = path_to_output_soil_dir / 'all_nasis_mukey_trick.csv'
df.to_csv(path_to_nasis, index=False)

In [7]:
# Preview the updated table (pandas shortens the display of large frames).
df

Unnamed: 0,saverest,areasymbol,areaname,musym_label,mukey_label,hydgrp,kwfact,albedodry_r,restrictiondepthr,partdensity,...,fragvol,sandvf_r,caco3_r,ksat_r,wthirdbar_r,wfifteenbar_r,om_r,ph1to1h2o_r,comppct_r,musym
0,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,,0.30,>200,1.40,...,0.0,,0.0,26.00,45.0,20.0,60.00,4.3,90,50226
1,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,0.37,0.30,>200,2.65,...,2.0,18.0,0.0,9.17,35.0,10.0,7.50,5.0,90,50226
2,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,0.43,0.30,>200,2.65,...,2.0,18.0,0.0,9.17,40.0,10.0,3.00,5.6,90,50226
3,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",101,50226,B,0.05,0.30,>200,2.65,...,10.0,5.0,0.0,28.23,10.0,2.0,0.50,6.1,90,50226
4,8/30/2022 7:52:33 PM,AK600,"Matanuska-Susitna Valley Area, Alaska",102,50227,B,,0.30,>200,1.40,...,0.0,,0.0,26.00,45.0,20.0,60.00,4.3,60,50227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230561,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",9325,2925093,A,0.32,0.23,165,2.65,...,2.0,25.4,3.0,100.00,13.8,4.6,0.17,8.6,50,2925093
1230562,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",9325,2925093,A,,0.23,165,,...,,,,0.20,,,,,50,2925093
1230563,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",9A,2572297,B,0.28,0.23,>200,,...,4.0,19.0,7.0,30.00,16.7,7.4,1.50,8.0,85,2572297
1230564,9/12/2022 2:11:51 PM,WY737,"Sweetwater County Area, Wyoming",9A,2572297,B,0.24,0.23,>200,,...,4.0,10.3,7.0,30.00,15.2,6.0,0.25,8.0,85,2572297


Write one file per area symbol so that NITA can process each area separately; the results are combined later on

In [8]:
# Distinct survey-area symbols, in order of first appearance (reused by later cells).
area_symbols = df['areasymbol'].unique()

# Split the table once with groupby instead of building a boolean mask over the
# full table for every area symbol. sort=False keeps the groups in order of first
# appearance, and the rows of each group keep their original order.
# Note: groupby skips rows whose areasymbol is missing.
groups_by_area = df.groupby('areasymbol', sort=False)

for loc, df_area in tqdm(groups_by_area, total=groups_by_area.ngroups):
    outpath = path_to_output_soil_dir / f'nasis_{loc}_mukey.csv'
    df_area.to_csv(outpath, index=False)

100%|██████████| 3240/3240 [03:09<00:00, 17.11it/s]


In [15]:
# A notebook kernel has no usable __file__, so the kernel's current working
# directory is recorded as the notebook directory.
# NOTE(review): assumes the working directory has not been changed since the
# kernel was started in the notebook's folder — confirm before re-running.
nb_dir = Path.cwd()
os.chdir(nb_dir)

Run NITA separately for each generated file

In [17]:
# Values that do not change between iterations are resolved once, up front.
nita_exe = str(path_to_NITA_bin.absolute())
nita_workdir = str(path_to_output_soil_dir.absolute())
failed_labels = []

for label in tqdm(area_symbols):
    control_name = f'NITA_CONTROL_{label}.csv'
    control_file = path_to_output_soil_dir / control_name

    # Mode 'w' (not 'a'): re-running this cell must overwrite the control file,
    # otherwise a second header/data pair is appended to the existing content.
    with control_file.open(mode='w') as file:
        file.write('FILENAME,UNITS_OUT,COMBINE\n')
        file.write(f'nasis_{label}_mukey.csv,1,0\n')

    # NITA is started inside the output folder through `cwd=`, so the kernel's own
    # working directory never changes, even when the loop is interrupted.
    # Passing an argument list avoids quoting problems if the path contains spaces.
    return_code = subprocess.call([nita_exe, f'/f:{control_name}'], cwd=nita_workdir)
    if return_code != 0:
        failed_labels.append(label)

# NOTE(review): assumes NITA exits with code 0 on success — confirm.
if failed_labels:
    print(f'NITA returned a non-zero exit code for {len(failed_labels)} area(s): {failed_labels}')


100%|██████████| 3240/3240 [04:46<00:00, 11.29it/s]


Combine all `*_soil_data.csv` and `*_soil_layers_data.csv` files

In [26]:
data_suffix = '_mukey_soil_data.csv'
layers_suffix = '_mukey_soil_layers_data.csv'

# glob() gives no ordering guarantee: sort both lists so that the order in which
# the files are read and concatenated later is reproducible.
soil_data_files = sorted(path_to_output_soil_dir.glob(f'*{data_suffix}'))
soil_layers_data_files = sorted(path_to_output_soil_dir.glob(f'*{layers_suffix}'))

# Pair the two kinds of file by their shared name prefix instead of by list
# position: zipping two independent listings mis-pairs (or silently drops) files
# as soon as one of the two outputs is missing for an area.
data_by_area = {f.name[:-len(data_suffix)]: f for f in soil_data_files}
layers_by_area = {f.name[:-len(layers_suffix)]: f for f in soil_layers_data_files}

for area in sorted(data_by_area.keys() | layers_by_area.keys()):
    f_d = data_by_area.get(area)
    f_l = layers_by_area.get(area)
    if f_d is None or f_l is None:
        present = f_d if f_d is not None else f_l
        print(f'Problem with files: {present.name} has no matching counterpart')
    elif f_d.stat().st_size == 0 or f_l.stat().st_size == 0:
        print(f'Problem with files: {f_d.name} and {f_l.name} (empty)')

Problem with files: nasis_AK651_mukey_soil_data.csv and nasis_AK651_mukey_soil_data.csv (empty)


In [49]:
# Read every non-empty soil data file and stack the tables; zero-byte files are skipped.
soil_data_frames = [
    pd.read_csv(csv_path)
    for csv_path in soil_data_files
    if csv_path.stat().st_size != 0
]
df_soil_data = pd.concat(soil_data_frames)

# Same treatment for the soil layers files.
soil_layers_frames = [
    pd.read_csv(csv_path)
    for csv_path in soil_layers_data_files
    if csv_path.stat().st_size != 0
]
df_soil_layers_data = pd.concat(soil_layers_frames)

In [50]:
# Remove exact duplicate rows, then renumber the remaining rows from 0.
df_soil_data = df_soil_data.drop_duplicates()
df_soil_data = df_soil_data.reset_index(drop=True)

df_soil_layers_data = df_soil_layers_data.drop_duplicates()
df_soil_layers_data = df_soil_layers_data.reset_index(drop=True)

In [51]:
# Preview the combined soil data table (pandas shortens the display of large frames).
df_soil_data

Unnamed: 0,Soil_ID,Hydrologic_Soil_Group,K_Factor,Albedo,Time_to_Consolidation,Impervious_Depth,Specific_Gravity,Initial_Soil_Conditions_ID,Soil_Name,Soil_Texture,Number_of_Soil_Layers,Input_Units_Code
0,50226,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
1,50227,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
2,50229,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
3,50231,B,0.0487,0.30,,,,,Benka,Silt loam,3,1
4,50233,B,0.0487,0.23,,1270.0,,,Bodenburg,Silt loam,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
307572,3222362,C,0.0724,0.30,,,,,Saltwells,Silt loam,7,1
307573,3222363,C,0.0566,0.23,,,,,Saltwells,Loam,7,1
307574,3222364,C,0.0645,0.23,,,,,Saltwells,Loam,6,1
307575,3222366,D,0.0421,0.23,,180.0,,,Rangecreek,Paragravelly fine sandy loam,2,1


In [52]:
# Count the distinct Soil_ID values present in the combined soil data table.
unique_soil_ids = df_soil_data['Soil_ID'].unique()
n_soils = len(unique_soil_ids)
print(f'Number of unique Soil_IDs after processing = {n_soils}')

Number of unique Soil_IDs after processing = 307577


In [53]:
# Preview the combined soil layers table (pandas shortens the display of large frames).
df_soil_layers_data

Unnamed: 0,Soil_ID,Layer_Number,Layer_Depth,Bulk_Density,Clay_Ratio,Silt_Ratio,Sand_Ratio,Rock_Ratio,Very_Fine_Sand_Ratio,CaCO3_Content,...,Base_Saturation,Unstable_Aggregate_Ratio,pH,Organic_Matter_Ratio,Organic_N_Ratio,Inorganic_N_Ratio,Organic_P_Ratio,Inorganic_P_Ratio,Soil_Structure_Code,Input_Units_Code
0,50226,1,120.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.0,0.075,,,,,,1
1,50226,2,760.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.6,0.030,,,,,,1
2,50226,3,1520.0,1.43,0.02,0.06,0.92,0.10,0.050,0.00,...,,,6.1,0.005,,,,,,1
3,50227,1,120.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.0,0.075,,,,,,1
4,50227,2,760.0,0.88,0.05,0.62,0.33,0.02,0.180,0.00,...,,,5.6,0.030,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099738,3222364,6,2000.0,1.62,0.32,0.58,0.10,,0.068,0.20,...,,,8.6,0.001,,,,,,1
1099739,3222366,1,50.0,1.54,0.15,0.15,0.70,0.25,0.171,0.04,...,,,7.8,0.005,,,,,,1
1099740,3222366,2,180.0,1.54,0.23,0.37,0.40,0.50,0.118,0.10,...,,,7.8,0.002,,,,,,1
1099741,3222367,1,50.0,1.58,0.29,0.61,0.10,0.15,0.061,0.20,...,,,8.2,0.005,,,,,,1


In [66]:
# Save both combined tables, first as Parquet and then as CSV (without the index).
parquet_targets = {
    'all_valid_soil_data.parquet': df_soil_data,
    'all_valid_soil_layers_data.parquet': df_soil_layers_data,
}
csv_targets = {
    'all_valid_soil_data.csv': df_soil_data,
    'all_valid_soil_layers_data.csv': df_soil_layers_data,
}

for filename, frame in parquet_targets.items():
    frame.to_parquet(path_to_output_soil_dir / filename)

for filename, frame in csv_targets.items():
    frame.to_csv(path_to_output_soil_dir / filename, index=False)