In [4]:
import json
from pathlib import Path

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL
from tqdm import tqdm

from pyagnps import soil_data_market as sdm

In [9]:
# DATABASE SETUP
# Connection settings are read from a JSON file so that no secret is written in
# the notebook itself.
credentials_path = Path("../../inputs/db_credentials_old.json")
with open(credentials_path, "r") as f:
    credentials = json.load(f)

user = credentials["user"]
password = credentials["password"]
host = credentials["host"]
port = credentials["port"]
database = credentials["database"]

# Assemble the URL from its components rather than with an f-string: URL.create
# escapes reserved characters (e.g. "@", ":" or "/") in the user name or password
# that would otherwise corrupt the connection string.
db_url = URL.create(
    drivername="postgresql",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database,
)

# pool_pre_ping makes the pool test a connection before handing it out, so a
# connection that went stale while the notebook sat idle is replaced instead of
# failing mid-query.
# NOTE(review): possibly related to the PendingRollbackError recorded further
# down in this notebook — confirm.
engine = create_engine(db_url, pool_pre_ping=True)

In [10]:
# GeoPackage holding the thucs. Former absolute location, kept for reference:
#   "D:/AIMS/Datasets/THUCS_TopAGNPS_Delineations/40k_SM/tophuc_S_M_40000_closed_holes_with_container_thuc_merged_bbox_area_first_kept.gpkg"
path_to_thucs = Path(
    "../../inputs/thucs/tophuc_S_M_40000_closed_holes_with_container_thuc_merged_bbox_area_first_kept.gpkg"
)

# GeoDataFrame containing the thucs and their geometry, ordered by descending
# bbox_area_sqkm.
thucs = gpd.read_file(path_to_thucs).sort_values("bbox_area_sqkm", ascending=False)

# tophucid values in that same order
runlist = thucs["tophucid"].tolist()

In [12]:
# Replace the run list with a single thuc id.
runlist = ["1809"]

### Method 1 to get valid Soil IDs: Query database

In [13]:
# Pull every distinct Soil_ID stored in the usa_valid_soil_data table and keep
# the values as a plain Python list.
soil_ids_query = """SELECT DISTINCT "Soil_ID" FROM usa_valid_soil_data"""
soil_ids_df = pd.read_sql(sql=soil_ids_query, con=engine)
soil_ids = soil_ids_df["Soil_ID"].to_list()

In [14]:
# Number of entries currently held in soil_ids.
len(soil_ids)

307577

### Method 2 to get valid Soil IDs: Read parquet file

In [15]:
# Alternative source for the Soil_ID list: a parquet file read from disk.
# Note that this rebinds `soil_ids`.
path_to_NITA_processed_soil_data = Path(
    "D:/AIMS/Datasets/Soil/DATABASE_POPULATION_TASKS/SDM_QUERY_AND_NITA_PROCESSING/ALL_US_v2_SSURGO_STATSGO2_RSS/all_valid_soil_data.parquet"
)
df_soil_data = pd.read_parquet(path_to_NITA_processed_soil_data)
soil_ids = df_soil_data["Soil_ID"].to_list()

# Displayed as the cell output: number of Soil_ID entries read from the file.
len(soil_ids)

307788

### Reading NITA discarded records

In [28]:
# Parquet file listing the soil ids excluded by NITA.
path_to_NITA_discarded = Path(
    "D:/AIMS/Datasets/Soil/DATABASE_POPULATION_TASKS/SDM_QUERY_AND_NITA_PROCESSING/ALL_US_v2_SSURGO_STATSGO2_RSS/nita_excluded_soil_ids_mukey.parquet"
)

# Load it and keep only the first row found for each soil_id.
df_nita_discarded = pd.read_parquet(path_to_NITA_discarded).drop_duplicates(
    subset=["soil_id"], keep="first"
)

In [29]:
# soil_id column of the discarded records, as a plain Python list.
discarded_soil_ids = df_nita_discarded.loc[:, "soil_id"].to_list()

In [130]:
# Membership test for one soil id, written here as a string.
# NOTE(review): if soil_ids holds integers rather than strings this test is
# always False — confirm the element type before relying on the result.
'567304' in soil_ids

False

In [122]:
# thuc to inspect in the cells below. Other ids left here by the author
# (presumably inspected earlier — confirm):
# '1060', '1481', '0200', '1569', '1128', '1569', '1769', '1809'
thuc_id = "1148"

In [123]:
# For the thuc selected by `thuc_id`, collect every cell whose soil_id is not in
# `soil_ids`, together with the cell geometry reprojected to EPSG:4326.
rows_invalid_soil_id = []

# Always (re)bind `invalid_rows`: when no invalid cell is found, later cells then
# see an empty frame instead of an undefined name or the frame left over from a
# previously inspected thuc.
invalid_rows = gpd.GeoDataFrame(
    columns=['geom', 'thuc', 'cell_id', 'soil_id'], geometry='geom', crs='epsg:4326'
)

# Query cells that have soil_id populated
query_cds = f"SELECT cell_id, soil_id FROM thuc_{thuc_id}_annagnps_cell_data_section WHERE soil_id IS NOT NULL"


with engine.connect() as conn:
    # query all populated cells
    cell_data_section = pd.read_sql(sql=sql_text(query_cds), con=conn)
    # keep only the cells whose soil_id is NOT in soil_ids
    # NOTE(review): isin compares values as they are, so the soil_id column and
    # soil_ids must share the same type (both str or both int) — confirm.
    cell_data_section = cell_data_section[~cell_data_section['soil_id'].isin(soil_ids)]

    populated_invalid_cells = cell_data_section['cell_id'].to_list()

    if populated_invalid_cells:

        # The table name depends on thuc_id and cannot be a bound parameter,
        # hence the string formatting.
        query_geom = "SELECT dn, geom FROM thuc_{}_annagnps_cell_ids WHERE dn in ({})"\
            .format(thuc_id, ','.join(str(x) for x in populated_invalid_cells))

        # geometry of the invalid cells only
        cells = gpd.read_postgis(sql=sql_text(query_geom), con=conn, geom_col="geom")
        cells = cells.to_crs('epsg:4326')
        cells['thuc'] = thuc_id

        # attach the offending soil_id to each geometry (dn matches cell_id)
        invalid_rows = cells.merge(cell_data_section, left_on='dn', right_on='cell_id')
        invalid_rows = invalid_rows[['geom', 'thuc', 'cell_id', 'soil_id']]

        rows_invalid_soil_id.append(invalid_rows)

## Merge with discarded NITA files

In [124]:
# Work on a deep copy of invalid_rows, cast soil_id to str, then left-join the
# discarded records on soil_id so that every invalid cell is kept.
invalid_rows_copy = (
    invalid_rows
    .copy(deep=True)
    .assign(soil_id=lambda frame: frame['soil_id'].astype(str))
    .merge(df_nita_discarded, how='left', on='soil_id')
)

In [125]:
# Display the invalid cells after the left merge with df_nita_discarded; columns
# coming from that frame are NaN where the soil_id has no match in it.
invalid_rows_copy

Unnamed: 0,geom,thuc,cell_id,soil_id,problems
0,"POLYGON ((-89.82490 34.98747, -89.82491 34.987...",1148,664103,1907932,
1,"POLYGON ((-90.23222 34.99988, -90.23232 34.996...",1148,638721,567304,
2,"POLYGON ((-89.81792 34.98945, -89.81794 34.988...",1148,664162,567754,missing hydrologic soil group
3,"POLYGON ((-90.23617 34.99970, -90.23620 34.998...",1148,638472,567304,
4,"POLYGON ((-89.82126 34.98818, -89.82128 34.987...",1148,664142,567754,missing hydrologic soil group
...,...,...,...,...,...
120227,"POLYGON ((-90.90811 32.32047, -90.90812 32.320...",1148,32,808562,
120228,"POLYGON ((-90.90702 32.32613, -90.90704 32.325...",1148,42,808562,
120229,"POLYGON ((-90.90808 32.32155, -90.90809 32.321...",1148,33,808562,
120230,"POLYGON ((-90.89814 32.32463, -90.89814 32.324...",1148,43,633319,


In [127]:
# invalid_rows_copy.explore(column='problems')

In [69]:
# Show only the rows whose `problems` value is not missing.
invalid_rows_copy[invalid_rows_copy['problems'].notna()]

Unnamed: 0,geom,thuc,cell_id,soil_id,problems


In [41]:
# Fetch the rows of raw_nrcs_soil_data for a fixed list of mukey values and
# display them.
raw_mukey_query = """SELECT * FROM raw_nrcs_soil_data WHERE mukey in (2532950, 2501314, 2498837, 2498619, 382198, 128022, 139817)"""
df_mukey = pd.read_sql(sql=raw_mukey_query, con=engine)
df_mukey

Unnamed: 0,areasymbol,areaname,musym,mukey,hydgrp,kwfact,albedodry_r,restrictiondepthr,partdensity,compname,...,sandtotal_r,fragvol,sandvf_r,caco3_r,ksat_r,wthirdbar_r,wfifteenbar_r,om_r,ph1to1h2o_r,comppct_r
0,OK031,"Comanche County, Oklahoma",Ro,382198,D,,,0,,Rock outcrop,...,,,,,0.217,,,,,70


In [128]:
# Fetch the rows of usa_valid_soil_data for a single Soil_ID and display them.
# Note that this rebinds `df_mukey`.
# NOTE(review): the recorded run of this cell ended with a PendingRollbackError;
# the cause is not visible here — confirm the query runs on a fresh connection.
soil_id_query = """SELECT * FROM usa_valid_soil_data WHERE "Soil_ID" = 2532950"""
df_mukey = pd.read_sql(sql=soil_id_query, con=engine)
df_mukey

PendingRollbackError: Can't reconnect until invalid transaction is rolled back.  Please rollback() fully before proceeding (Background on this error at: https://sqlalche.me/e/20/8s2b)

In [106]:
# Display the thucs ordered by ascending bbox_area_sqkm.
thucs.sort_values("bbox_area_sqkm")

Unnamed: 0,tophucid,drainage_area_sqkm,outlethuc12,outlethuc12_hutype,receivinghuc12,contained_hucs,num_contained_hucs,foreign_inflows,num_foreign_inflows,tophucid_bbox_container,totophuc,bbox_area_sqkm,geometry
4774,4780,7.07,210100040502,S,210100040600,210100040502,1,,0,4779,OUTSIDE,11.839246,"POLYGON ((-66.08714 17.96425, -66.08731 17.964..."
4789,4795,7.45,210100050301,S,210100050800,210100050301,1,,0,,OUTSIDE,15.961644,"POLYGON ((-65.67754 18.36661, -65.67745 18.366..."
4779,4785,13.58,210100040509,S,210100040600,210100040509,1,,0,,OUTSIDE,23.814385,"POLYGON ((-65.96558 17.97343, -65.96560 17.973..."
4775,4781,11.92,210100040503,S,210100040600,210100040503,1,,0,,OUTSIDE,26.352014,"POLYGON ((-66.07535 17.96552, -66.07542 17.965..."
4783,4789,17.41,210100050104,S,210100050800,210100050104,1,,0,,OUTSIDE,27.215604,"POLYGON ((-65.78897 18.11622, -65.78858 18.116..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,1481,39815.86,110300040708,S,110300100101,"110200110902,110200120503,110200091008,1102001...",368,110200091001,1,,1483,126025.026268,"POLYGON ((-103.31611 37.28287, -103.31680 37.2..."
1938,1944,67619.22,131000000000,S,130402011403,131000000000,1,,0,,1955,142628.745971,"POLYGON ((-104.87690 29.92567, -104.87557 29.9..."
1949,1955,37626.18,130402052007,S,130402052404,"130402051705,130402051112,130402051502,1304020...",362,"130401000406,130401000408,130401000307,1304010...",7,,1959,146469.298861,"POLYGON ((-105.34992 30.57171, -105.35024 30.5..."
1469,1475,39468.25,110500021110,S,110500030203,"110500010202,110500020107,110400010305,1105000...",386,,0,,1476,152979.310277,"POLYGON ((-103.07970 36.73863, -103.08031 36.7..."
