In [1]:
import json
from pathlib import Path

from tqdm import tqdm

import geopandas as gpd
import pandas as pd

from pyagnps import soil_data_market as sdm

from sqlalchemy import create_engine, text as sql_text

In [2]:
# DATABASE SETUP
credentials = Path("../../inputs/db_credentials.json")
with open(credentials, "r") as f:
    credentials = json.load(f)

user = credentials["user"]
password = credentials["password"]
host = credentials["host"]
port = credentials["port"]
database = credentials["database"]

engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")

In [3]:
# path_to_thucs = Path(
#     "D:/AIMS/Datasets/THUCS_TopAGNPS_Delineations/40k_SM/tophuc_S_M_40000_closed_holes_with_container_thuc_merged_bbox_area_first_kept.gpkg"
# )
path_to_thucs = Path('../../inputs/thucs/tophuc_S_M_40000_closed_holes_with_container_thuc_merged_bbox_area_first_kept.gpkg')


thucs = gpd.read_file(
    path_to_thucs
)  # GeoDataFrame containing the thucs and their geometry
thucs = thucs.sort_values(by=["bbox_area_sqkm"], ascending=False)

runlist = thucs['tophucid'].to_list()

In [4]:
cells_without_soil = {}

for thuc_id in tqdm(runlist):

    # thuc_id = f'{i:04.0f}'
    
    # Query for cells where soil is not populated
    # query_geom = f"SELECT * FROM thuc_{thuc_id}_annagnps_cell_ids LIMIT 1"
    query_cds = f"SELECT cell_id FROM thuc_{thuc_id}_annagnps_cell_data_section WHERE soil_id is NULL"
    query_total_count = f"SELECT COUNT(cell_id) FROM thuc_{thuc_id}_annagnps_cell_data_section"

    try:
        with engine.connect() as conn:
            cell_data_section = pd.read_sql(sql=sql_text(query_cds), con=conn)
            total_count = pd.read_sql(sql=sql_text(query_total_count), con=conn)
            # cells = gpd.read_postgis(sql=sql_text(query), con=conn, geom_col="geom")

            # utm = cells.estimate_utm_crs()
            # cells = cells.to_crs(utm)

        no_soil_cells = cell_data_section['cell_id'].to_list()

        if no_soil_cells:
            cells_without_soil[thuc_id] = {'total_cells' : total_count['count'].values[0],
                                        'cells_without_soil': no_soil_cells,
                                        'no_pop_frac': len(no_soil_cells)/total_count['count'].values[0]}
    except:
        continue


100%|██████████| 4800/4800 [15:53<00:00,  5.03it/s] 


In [29]:
import json

dico = cells_without_soil.copy()

for key in dico.keys():
    # cells_without_soil[key]['no_pop_frac'] = 1/cells_without_soil[key]['no_pop_frac']
    dico[key]['cells_without_soil'] = [x for x in dico[key]['cells_without_soil']]
    # dico[key]['total_cells'] = str(dico[key]['total_cells'])
    dico[key]['no_pop_frac'] = 1/dico[key]['no_pop_frac']
    # break
    

# 1/cells_without_soil['1955']['no_pop_frac']

In [31]:
with open('thuc_cells_no_soil.json', 'w') as json_file:
    
    json.dump(dico, json_file, indent=4)

# Query the empty cells by doing a loop through the dict

In [35]:
cells_geo = []

for thuc in tqdm(cells_without_soil.keys()):

    query = "SELECT * FROM thuc_{}_annagnps_cell_ids WHERE dn in ({})".format(thuc, ','.join(str(x) for x in cells_without_soil[thuc]['cells_without_soil']))
    
    with engine.connect() as conn:
        cells = gpd.read_postgis(sql=sql_text(query), con=conn, geom_col="geom")
        cells = cells.to_crs('epsg:4326')

    break

  0%|          | 0/1739 [00:00<?, ?it/s]

In [26]:
thuc

'1955'

In [13]:
cells_without_soil['1955']['cells_without_soil']

[22,
 23,
 41,
 43,
 61,
 63,
 81,
 132,
 133,
 142,
 143,
 151,
 152,
 153,
 161,
 162,
 163,
 171,
 172,
 173,
 182,
 183,
 191,
 192,
 193,
 201,
 202,
 203,
 281,
 381,
 511,
 512,
 513,
 521,
 522,
 523,
 621,
 681,
 821,
 851,
 913,
 921,
 922,
 923,
 931,
 932,
 933,
 941,
 971,
 1031,
 1061,
 1101,
 1171,
 1211,
 1251,
 1281,
 1282,
 1283,
 1292,
 1293,
 1301,
 1302,
 1303,
 1312,
 1313,
 1322,
 1332,
 1333,
 1341,
 1342,
 1343,
 1352,
 1353,
 1361,
 1362,
 1363,
 1371,
 1372,
 1373,
 1381,
 1382,
 1383,
 1391,
 1392,
 1393,
 1451,
 1713,
 1721,
 1722,
 1723,
 1731,
 1732,
 1733,
 1941,
 1942,
 1943,
 1952,
 1953,
 1962,
 1963,
 1971,
 1972,
 1973,
 1981,
 1982,
 1983,
 1992,
 1993,
 2002,
 2003,
 2011,
 2012,
 2013,
 2022,
 2023,
 2032,
 2042,
 2043,
 2051,
 2052,
 2053,
 2062,
 2063,
 2072,
 2073,
 2081,
 2082,
 2083,
 2091,
 2092,
 2093,
 2101,
 2102,
 2103,
 2112,
 2113,
 2121,
 2122,
 2123,
 2133,
 2141,
 2142,
 2143,
 2152,
 2153,
 2162,
 2163,
 2172,
 2173,
 2183,
 2191,

In [6]:
# for tid, tdata in cells_without_soil.items():
    # print(f"{tid}, frac_no_pop: {tdata['no_pop_frac']}")

In [5]:
len(cells_without_soil)

1739

In [6]:
df_redo = pd.DataFrame({'tophucid': list(cells_without_soil.keys())})

In [7]:
df_redo.to_csv('../../tmp/thuc_redo_soil_v2.csv', index=False)