In [1]:
import json
from pathlib import Path

import geopandas as gpd
import pandas as pd
from sqlalchemy import URL, bindparam, create_engine, text as sql_text

## Database connection setup

In [2]:
# DATABASE SETUP
# The connection settings are read from a JSON file kept outside the notebook,
# so no secret is hard-coded here.
credentials_path = Path("../../inputs/db_credentials.json")
with credentials_path.open("r", encoding="utf-8") as f:
    # `credentials` is only ever the parsed dict (the path has its own name)
    credentials = json.load(f)

user = credentials["user"]
password = credentials["password"]
host = credentials["host"]
port = credentials["port"]
database = credentials["database"]

# URL.create builds the connection URL from its individual components
url_object = URL.create(
    "postgresql",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database
)

# create a SQLAlchemy engine object
engine = create_engine(url_object)

## User input

In [3]:
# Point of interest (decimal degrees are assumed -- confirm against the database functions)
lon, lat = -89.91528, 34.23195

# `casename` prefixes every exported file name
casename = 'goodwin_creek'
output_folder = Path('outputs')

# parents=True lets the user point `output_folder` at a nested path
# (e.g. Path('outputs/run_01')) without creating the parent folders by hand.
output_folder.mkdir(parents=True, exist_ok=True)

## Get local THUC

In [4]:
# Ask the database which THUC the point belongs to.
# The connection is opened in a context manager so it is closed (returned to the
# pool) as soon as the query has been read, instead of being left dangling.
thuc_query = f"SELECT thuc_near_run_id_tr({lon},{lat})"
with engine.connect() as connection:
    thuc = pd.read_sql_query(sql_text(thuc_query), con=connection)

# The query returns a single value: first row, first column
thuc_id = thuc.iloc[0].values[0]

print(f"(lat, lon) = ({lat}, {lon}) fall within THUC ID: {thuc_id}")

(lat, lon) = (34.23195, -89.91528) fall within THUC ID: 1148


## Get Cells Geometry

### Method 1: Querying the database

In [5]:
# Fetch the cell geometries (and their IDs) for the THUC as a GeoDataFrame.
# The connection is closed when the `with` block exits.
cells_query = f"SELECT geom, cell_id FROM thuc_cell_geo_tr({lon},{lat}, '{thuc_id}')"
with engine.connect() as connection:
    cells_geometry = gpd.read_postgis(sql=sql_text(cells_query), con=connection, geom_col='geom')


In [6]:
# Save the cell geometries as a GeoJSON file inside the output folder
path_to_geojson_cells = output_folder.joinpath(f"{casename}_cells.geojson")
cells_geometry.to_file(path_to_geojson_cells, driver="GeoJSON")

### Method 2: Reading a file

If you have a path to a file or a string representing a geojson file you can read it directly

In [7]:
# Load the cell geometries back from the GeoJSON file written above
cells_geometry = gpd.read_file(path_to_geojson_cells)

## Get Reaches Geometry
Works the same way as the cells

### Method 1: Querying the database

In [8]:
# Fetch the reach geometries (and their IDs) for the THUC as a GeoDataFrame.
# The connection is closed when the `with` block exits.
reaches_query = f"SELECT geom, reach_id FROM thuc_reach_geo_tr({lon},{lat}, '{thuc_id}')"
with engine.connect() as connection:
    reaches_geometry = gpd.read_postgis(sql=sql_text(reaches_query), con=connection, geom_col='geom')

In [9]:
# Save the reach geometries as a GeoJSON file inside the output folder
path_to_geojson_reaches = output_folder.joinpath(f"{casename}_reaches.geojson")
reaches_geometry.to_file(path_to_geojson_reaches, driver="GeoJSON")

### Method 2: Reading a file

In [10]:
# Load the reach geometries back from the GeoJSON file written above
reaches_geometry = gpd.read_file(path_to_geojson_reaches)

## Read Cell and Reaches Data Section

### Get list of cells

#### Method 1: If you already have the GeoDataFrame loaded:

In [11]:
# Collect the cell IDs from the already-loaded GeoDataFrame as a plain Python list
cell_ids = cells_geometry['cell_id'].to_list()

#### Method 2: If you don't care about the geometry and just want the cell_ids of a given watershed you can just query it from the watershed without the geometry

In [12]:
# Same database function as for the geometries, but only the ID column is selected.
# The connection is closed when the `with` block exits.
cells_query = f"SELECT cell_id FROM thuc_cell_geo_tr({lon},{lat}, '{thuc_id}')"
with engine.connect() as connection:
    df_cells = pd.read_sql(sql=sql_text(cells_query), con=connection)

cell_ids = df_cells['cell_id'].to_list()

### Get list of reaches

#### Method 1: If you already have the GeoDataFrame loaded:

In [13]:
# Collect the reach IDs from the already-loaded GeoDataFrame as a plain Python list
reach_ids = reaches_geometry['reach_id'].to_list()

#### Method 2: If you don't care about the geometry and want to query the reach_ids directly from the watershed

In [14]:
# Same database function as for the geometries, but only the ID column is selected.
# The connection is closed when the `with` block exits.
reaches_query = f"SELECT reach_id FROM thuc_reach_geo_tr({lon},{lat}, '{thuc_id}')"
with engine.connect() as connection:
    df_reaches = pd.read_sql(sql=sql_text(reaches_query), con=connection)

reach_ids = df_reaches['reach_id'].to_list()

### Read Cell Data Section

In [15]:
# The ID list is passed as an "expanding" bind parameter: SQLAlchemy renders one
# placeholder per ID at execution time. Formatting a Python tuple into the SQL text
# breaks for a single ID ("(1,)" has a trailing comma) and for no IDs ("()").
query = f"SELECT * FROM thuc_{thuc_id}_annagnps_cell_data_section WHERE cell_id IN :cell_ids"
cell_statement = sql_text(query).bindparams(bindparam("cell_ids", expanding=True))

# The connection is closed when the `with` block exits
with engine.connect() as connection:
    df_cell_data_section = pd.read_sql_query(
        sql=cell_statement, con=connection, params={"cell_ids": cell_ids}
    )
df_cell_data_section.head()

Unnamed: 0,cell_id,soil_id,mgmt_field_id,reach_id,reach_location_code,cell_area,time_of_conc,avg_elevation,rcn_calib_id,avg_land_slope,...,delivery_ratio,constant_usle_c_fctr,constant_usle_p_fctr,all_oc_calib_fctr,all_n_calib_fctr,all_p_calib_fctr,sheet_and_rill_erosion_calib_fctr,gullies_erosion_calib_fctr,input_units_code,soil_id_annagnps_valid
0,956601,568332,Grassland_Pasture,95660,0,10.89,,108.81,,0.02668,...,,,,,,,,,1,1
1,956581,568332,Grassland_Pasture,95658,0,10.35,,105.94,,0.0296,...,,,,,,,,,1,1
2,956611,568326,Grassland_Pasture,95661,0,10.26,,112.79,,0.03681,...,,,,,,,,,1,0
3,956691,568332,Grassland_Pasture,95669,0,9.99,,115.65,,0.04749,...,,,,,,,,,1,1
4,956602,568326,Grassland_Pasture,95660,1,13.68,,102.71,,0.04329,...,,,,,,,,,1,0


### Read Reach Data Section

In [16]:
# The ID list is passed as an "expanding" bind parameter: SQLAlchemy renders one
# placeholder per ID at execution time. Formatting a Python tuple into the SQL text
# breaks for a single ID ("(1,)" has a trailing comma) and for no IDs ("()").
query = f"SELECT * FROM thuc_{thuc_id}_annagnps_reach_data_section WHERE reach_id IN :reach_ids"
reach_statement = sql_text(query).bindparams(bindparam("reach_ids", expanding=True))

# The connection is closed when the `with` block exits
with engine.connect() as connection:
    df_reach_data_section = pd.read_sql_query(
        sql=reach_statement, con=connection, params={"reach_ids": reach_ids}
    )
df_reach_data_section.head()

Unnamed: 0,reach_id,receiving_reach,vegetation_code,elevation,slope,mannings_n,infiltration_rate,hydraulic_geom_id,length,top_width,...,sand_scour_code,small_agg_scour_code,large_agg_scour_code,valley_clay_scour_code,valley_silt_scour_code,valley_sand_scour_code,valley_small_agg_scour_code,valley_large_agg_scour_code,delivery_ratio,input_units_code
0,95625,95624,,67.82,0.00684,,,,307.28,,...,,,,,,,,,,1
1,95626,95625,,70.63,0.0029,,,,1141.25,,...,,,,,,,,,,1
2,95627,95626,,71.8,0.00277,,,,144.85,,...,,,,,,,,,,1
3,95628,95627,,73.15,0.00441,,,,499.71,,...,,,,,,,,,,1
4,95629,95627,,72.26,0.00159,,,,506.98,,...,,,,,,,,,,1


In [17]:
def post_process_df_reach_data_section(df_reach_data_section, outlet_reach_id=None):
    """
    Prepend the "ghost" outlet reach that AnnAGNPS needs to the reach data section.

    AnnAGNPS needs an extra row in the reach data section for a reach that points to
    "OUTLET". That reach is the one that appears in the "receiving_reach" column but
    not in the "reach_id" column. It is also the "nearest reach" found when the user
    clicked on the map, so it can be passed in directly to skip the lookup.

    Parameters
    ----------
    df_reach_data_section : pandas.DataFrame
        Reach table with at least the columns "reach_id", "receiving_reach" and
        "length". It is not modified.
    outlet_reach_id : optional
        ID of the outlet reach. When None it is inferred from the table.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh 0..n index) whose first row is the ghost outlet reach,
        followed by all the original rows.

    Raises
    ------
    ValueError
        If the outlet reach cannot be inferred unambiguously, or if no reach in the
        table drains into `outlet_reach_id`.
    """
    # `is None` (not truthiness) so that a legitimate ID of 0 is not treated as missing
    if outlet_reach_id is None:
        known_reaches = set(df_reach_data_section["reach_id"])
        outlet_candidates = set(df_reach_data_section["receiving_reach"]) - known_reaches
        if len(outlet_candidates) != 1:
            # Picking an arbitrary element of a set would be non-deterministic
            raise ValueError(
                "Could not infer a unique outlet reach: expected exactly one "
                "receiving_reach that is missing from reach_id, "
                f"found {len(outlet_candidates)}"
            )
        outlet_reach_id = outlet_candidates.pop()

    drains_to_outlet = df_reach_data_section["receiving_reach"] == outlet_reach_id
    if not drains_to_outlet.any():
        raise ValueError(f"No reach drains into the outlet reach {outlet_reach_id!r}")

    # Only ONE upstream reach is used as the template for the ghost row: when several
    # reaches drain into the outlet, copying all of them would create several rows
    # sharing the same reach_id.
    outlet_row = df_reach_data_section.loc[drains_to_outlet].head(1).copy()
    outlet_row["reach_id"] = outlet_reach_id
    outlet_row["receiving_reach"] = "OUTLET"
    outlet_row["length"] = 0

    # Insert at the top
    df_reach_data_section_valid = pd.concat([outlet_row, df_reach_data_section], ignore_index=True)
    return df_reach_data_section_valid

# No outlet ID is passed, so the function works it out from the table itself
df_reach_data_section_valid = post_process_df_reach_data_section(
    df_reach_data_section=df_reach_data_section
)
df_reach_data_section_valid.head()

Unnamed: 0,reach_id,receiving_reach,vegetation_code,elevation,slope,mannings_n,infiltration_rate,hydraulic_geom_id,length,top_width,...,sand_scour_code,small_agg_scour_code,large_agg_scour_code,valley_clay_scour_code,valley_silt_scour_code,valley_sand_scour_code,valley_small_agg_scour_code,valley_large_agg_scour_code,delivery_ratio,input_units_code
0,95624,OUTLET,,67.82,0.00684,,,,0.0,,...,,,,,,,,,,1
1,95625,95624,,67.82,0.00684,,,,307.28,,...,,,,,,,,,,1
2,95626,95625,,70.63,0.0029,,,,1141.25,,...,,,,,,,,,,1
3,95627,95626,,71.8,0.00277,,,,144.85,,...,,,,,,,,,,1
4,95628,95627,,73.15,0.00441,,,,499.71,,...,,,,,,,,,,1


## Join geometries with attributes for GIS layer export

In [18]:
# Attach the attribute tables to the geometries (inner join on the shared ID column)
cells_geometry = cells_geometry.merge(
    right=df_cell_data_section, how="inner", on="cell_id"
)
reaches_geometry = reaches_geometry.merge(
    right=df_reach_data_section, how="inner", on="reach_id"
)

## Read Soil Data
Using the same logic we get the `annagnps_soil_data` and `annagnps_soil_layers_data` for the desired cells. As a bonus we can also produce the raw soil data that people could process themselves through NITA if they wanted to

In [19]:
# Several cells can share the same soil, so keep each soil ID once (first-seen order):
# the list is only used to filter the soil tables, where duplicates add nothing.
soil_ids = df_cell_data_section['soil_id'].drop_duplicates().to_list()

In [20]:
# The soil IDs are passed as an "expanding" bind parameter: SQLAlchemy renders one
# placeholder per ID at execution time. Formatting a Python tuple into the SQL text
# breaks for a single ID ("(1,)" has a trailing comma) and for no IDs ("()").
query_soil = 'SELECT * FROM usa_valid_soil_data WHERE "Soil_ID" IN :soil_ids'
query_soil_layers = 'SELECT * FROM usa_valid_soil_layers_data WHERE "Soil_ID" IN :soil_ids'
query_raw = 'SELECT * FROM raw_nrcs_soil_data WHERE "mukey" IN :soil_ids'


def _soil_statement(query):
    """Turn a query containing the :soil_ids placeholder into an executable statement."""
    return sql_text(query).bindparams(bindparam("soil_ids", expanding=True))


soil_params = {"soil_ids": soil_ids}

# One connection serves the three queries and is closed when the block exits
with engine.connect() as connection:
    df_soil_data = pd.read_sql_query(
        sql=_soil_statement(query_soil), con=connection, params=soil_params
    )
    df_soil_layers_data = pd.read_sql_query(
        sql=_soil_statement(query_soil_layers), con=connection, params=soil_params
    )
    df_raw = pd.read_sql_query(
        sql=_soil_statement(query_raw), con=connection, params=soil_params
    )

In [21]:
# Preview the first rows of the soil data table
df_soil_data.head()

Unnamed: 0,Soil_ID,Hydrologic_Soil_Group,K_Factor,Albedo,Time_to_Consolidation,Impervious_Depth,Specific_Gravity,Initial_Soil_Conditions_ID,Soil_Name,Soil_Texture,Number_of_Soil_Layers,Input_Units_Code
0,568308,C,0.0566,0.3,,540.0,,,Calloway,Silt loam,3,1
1,568309,B,0.0645,0.3,,,,,Collins,Silt loam,2,1
2,568310,B,0.0645,0.3,,,,,Collins,Silt loam,2,1
3,568313,B,0.0566,0.3,,,,,Falaya,Silt loam,2,1
4,568314,B,0.0566,0.3,,,,,Falaya,Silt loam,2,1


In [22]:
# Preview the first rows of the soil layers table
df_soil_layers_data.head()

Unnamed: 0,Soil_ID,Layer_Number,Layer_Depth,Bulk_Density,Clay_Ratio,Silt_Ratio,Sand_Ratio,Rock_Ratio,Very_Fine_Sand_Ratio,CaCO3_Content,...,Base_Saturation,Unstable_Aggregate_Ratio,pH,Organic_Matter_Ratio,Organic_N_Ratio,Inorganic_N_Ratio,Organic_P_Ratio,Inorganic_P_Ratio,Soil_Structure_Code,Input_Units_Code
0,568308,1,640.0,1.55,0.2,0.686,0.114,,0.066,0.0,...,,,5.3,0.0125,,,,,,1
1,568308,2,1630.0,1.52,0.21,0.677,0.113,,0.066,0.0,...,,,5.3,0.0025,,,,,,1
2,568308,3,1730.0,1.57,0.24,0.667,0.093,,0.065,0.0,...,,,6.5,0.001,,,,,,1
3,568309,1,200.0,1.52,0.115,0.677,0.208,,0.134,0.0,...,,,5.0,0.0125,,,,,,1
4,568309,2,1570.0,1.52,0.115,0.677,0.208,,0.134,0.0,...,,,5.0,0.0015,,,,,,1


In [23]:
# Preview the first rows of the raw soil table
df_raw.head()

Unnamed: 0,areasymbol,areaname,musym,mukey,hydgrp,kwfact,albedodry_r,restrictiondepthr,partdensity,compname,...,sandtotal_r,fragvol,sandvf_r,caco3_r,ksat_r,wthirdbar_r,wfifteenbar_r,om_r,ph1to1h2o_r,comppct_r
0,MS107,"Panola County, Mississippi",CaB,568308,C/D,0.43,0.3,54,,Calloway,...,11.4,,6.6,0.0,9.17,28.7,13.5,1.25,5.3,90
1,MS107,"Panola County, Mississippi",CaB,568308,C/D,0.55,0.3,54,,Calloway,...,11.3,,6.6,0.0,0.92,27.9,12.5,0.25,5.3,90
2,MS107,"Panola County, Mississippi",CaB,568308,C/D,0.55,0.3,54,,Calloway,...,9.3,,6.5,0.0,0.92,29.4,14.5,0.1,6.5,90
3,MS107,"Panola County, Mississippi",Cm,568309,B,0.49,0.3,>200,,Collins,...,20.8,,13.4,0.0,9.17,24.2,8.4,1.25,5.0,90
4,MS107,"Panola County, Mississippi",Cm,568309,B,0.64,0.3,>200,,Collins,...,20.8,,13.4,0.0,9.17,22.6,6.9,0.15,5.0,90


## Export Everything!

In [24]:
# Destination files: tables go to CSV, geometries to GIS formats
path_to_soil_data = output_folder / f"{casename}_soil_data.csv"
path_to_soil_layers_data = output_folder / f"{casename}_soil_layers_data.csv"
path_to_raw_soil_data = output_folder / f"{casename}_raw_soil_data_gNATSGO.csv"

path_to_cell_data_section = output_folder / f"{casename}_AnnAGNPS_Cell_Data_Section.csv"
path_to_reach_data_section = output_folder / f"{casename}_AnnAGNPS_Reach_Data_Section.csv"

path_to_cells_geom = output_folder / f"{casename}_AnnAGNPS_Cells.gpkg"
path_to_reaches_geom = output_folder / f"{casename}_AnnAGNPS_Reaches.gpkg"

# Tables: soil first, then the cell and reach data sections (index column left out)
csv_exports = (
    (df_soil_data, path_to_soil_data),
    (df_soil_layers_data, path_to_soil_layers_data),
    (df_raw, path_to_raw_soil_data),
    (df_cell_data_section, path_to_cell_data_section),
    (df_reach_data_section_valid, path_to_reach_data_section),
)
for table, csv_path in csv_exports:
    table.to_csv(csv_path, index=False)

# Geometries: GeoJSON was already exported above, but other formats are possible.
#   - GeoPackage: compact and open source, the recommended format
#   - ESRI Shapefile: several files per layer and long attribute names get renamed,
#     NOT recommended
#   - GeoJSON: included for completeness, this one is fine
geometry_formats = (
    (".gpkg", "GPKG"),
    (".shp", "ESRI Shapefile"),
    (".geojson", "GeoJSON"),
)
for suffix, driver in geometry_formats:
    # with_suffix swaps the ".gpkg" extension for the one of the current format
    cells_geometry.to_file(path_to_cells_geom.with_suffix(suffix), driver=driver, index=False)
    reaches_geometry.to_file(path_to_reaches_geom.with_suffix(suffix), driver=driver, index=False)

  cells_geometry.to_file(path_to_cells_geom.with_suffix(".shp"), driver='ESRI Shapefile', index=False)
  reaches_geometry.to_file(path_to_reaches_geom.with_suffix(".shp"), driver='ESRI Shapefile', index=False)


Note: If you don't want to write a file to disk, you can write to an in-memory buffer instead (e.g. `io.StringIO` for text output such as CSV, or `io.BytesIO` for binary output) and handle the result however you need.

cf. https://stackoverflow.com/questions/50959593/write-pandas-dataframe-to-csv-stringio-instead-of-file