In [1]:
import json
import socket
import time
from pathlib import Path

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import sessionmaker
from tqdm import tqdm

from pyagnps import soil_data_market as sdm
from pyagnps.utils import log_to_file, get_current_time


In [2]:
# DATABASE SETUP
# Connection settings are read from a JSON file that must provide the keys
# "user", "password", "host", "port" and "database".
credentials_path = Path("../../inputs/db_credentials.json")
with open(credentials_path, "r") as f:
    credentials = json.load(f)

user = credentials["user"]
password = credentials["password"]
host = credentials["host"]
port = credentials["port"]
database = credentials["database"]

# Assemble the URL from its components instead of formatting a string: a
# password or user name containing characters such as "@", ":" or "/" would
# otherwise be misparsed as URL delimiters.
db_url = URL.create(
    drivername="postgresql",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database,
)

engine = create_engine(db_url)

In [3]:
# GeoPackage holding the THUC delineations.
path_to_thucs = Path(
    "D:/AIMS/Datasets/THUCS_TopAGNPS_Delineations/40k_SM/tophuc_S_M_40000_closed_holes_with_container_thuc_merged_bbox_area_first_kept.gpkg"
)

# GeoDataFrame with the THUCs and their geometry, ordered from the largest
# to the smallest bounding-box area.
thucs = gpd.read_file(path_to_thucs).sort_values(
    by=["bbox_area_sqkm"], ascending=False
)

# THUC identifiers, in the order they will be processed.
runlist = thucs["tophucid"].tolist()

In [4]:
# Scan every THUC for cells whose soil_id is NULL in its
# thuc_<id>_annagnps_cell_data_section table.
#
# cells_without_soil: tophucid -> {
#     'total_cells':        number of rows with a non-NULL cell_id in the table,
#     'cells_without_soil': list of cell_id values whose soil_id is NULL,
#     'no_pop_frac':        fraction of cells without soil (missing / total),
# }
# thucs_failed: tophucid -> repr of the error raised while processing that THUC.
cells_without_soil = {}
thucs_failed = {}

for thuc_id in tqdm(runlist):

    # Table names cannot be passed as bound SQL parameters, so the THUC id is
    # interpolated into the statement text; the ids come from `runlist`.
    table_name = f"thuc_{thuc_id}_annagnps_cell_data_section"
    query_cds = f"SELECT cell_id FROM {table_name} WHERE soil_id is NULL"
    query_total_count = f"SELECT COUNT(cell_id) FROM {table_name}"

    try:
        with engine.connect() as conn:
            cell_data_section = pd.read_sql(sql=sql_text(query_cds), con=conn)
            total_count = pd.read_sql(sql=sql_text(query_total_count), con=conn)

        no_soil_cells = cell_data_section['cell_id'].to_list()

        if no_soil_cells:
            # Plain Python int rather than a numpy scalar, so the summary can
            # be serialised (e.g. to JSON) without extra conversion.
            total_cells = int(total_count['count'].iloc[0])

            # Fraction of the THUC's cells that have no soil assigned.
            # COUNT(cell_id) ignores NULL cell_ids, so guard against a zero total.
            if total_cells:
                no_pop_frac = len(no_soil_cells) / total_cells
            else:
                no_pop_frac = float('nan')

            cells_without_soil[thuc_id] = {'total_cells': total_cells,
                                           'cells_without_soil': no_soil_cells,
                                           'no_pop_frac': no_pop_frac}
    except Exception as exc:
        # Best effort: a THUC that cannot be processed (e.g. its table is
        # missing) is skipped, but the reason is kept for later inspection.
        # Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt stop the loop.
        thucs_failed[thuc_id] = repr(exc)
        continue


100%|██████████| 4800/4800 [12:58<00:00,  6.17it/s] 


In [6]:
# for tid, tdata in cells_without_soil.items():
    # print(f"{tid}, frac_no_pop: {tdata['no_pop_frac']}")

In [7]:
# Number of THUCs that have at least one cell with a NULL soil_id.
# THUCs whose queries raised an error were skipped by the loop and are not counted.
len(cells_without_soil)

1740

In [10]:
# One row per THUC that has cells without soil; iterating the dict yields its
# keys (the tophucid values) in insertion order.
df_redo = pd.DataFrame({'tophucid': list(cells_without_soil)})

In [12]:
# Write the list of THUCs to redo as a CSV without the index column.
redo_csv_path = Path('../../tmp/thuc_redo_soil.csv')
df_redo.to_csv(redo_csv_path, index=False)