In [20]:
from pathlib import Path
import json
import pandas as pd
from sqlalchemy import URL, create_engine, text as sql_text
from tqdm import tqdm

In [21]:
credentials = Path('../../inputs/db_credentials.json')
with open(credentials, 'r') as f:
    credentials = json.load(f)

user     = credentials['user']
password = credentials['password']
host     = credentials['host']
port     = credentials['port']
database = credentials['database']

In [22]:
url_object = URL.create(
    "postgresql",
    username=user,
    password=password,
    host=host,
    port=port,
    database=database
)

# create a SQLAlchemy engine object
engine = create_engine(url_object)

Count cells and reaches per THUC

In [23]:
thuc = []
num_cells = []
num_reaches = []

for thuc_id in tqdm(range(4800+1)):

    thuc.append(thuc_id)

    query_cell  = f"SELECT cell_id FROM thuc_{thuc_id:04}_annagnps_cell_data_section"
    query_reach = f"SELECT reach_id FROM thuc_{thuc_id:04}_annagnps_reach_data_section"

    with engine.connect() as conn:

        try:

            df_cell = pd.read_sql_query(sql=sql_text(query_cell), con=conn)
            df_reach = pd.read_sql_query(sql=sql_text(query_reach), con=conn)

            num_cells.append(df_cell.shape[0])
            num_reaches.append(df_reach.shape[0])

        except Exception as e:
            print(e)
            num_cells.append(None)
            num_reaches.append(None)
            continue

df_thucs = pd.DataFrame(
    {"THUC_ID": thuc,
     "NUM_CELLS": num_cells,
     "NUM_REACHES": num_reaches}
)

  0%|          | 0/4801 [00:00<?, ?it/s]

(psycopg2.errors.UndefinedTable) relation "thuc_0000_annagnps_cell_data_section" does not exist
LINE 1: SELECT cell_id FROM thuc_0000_annagnps_cell_data_section
                            ^

[SQL: SELECT cell_id FROM thuc_0000_annagnps_cell_data_section]
(Background on this error at: https://sqlalche.me/e/20/f405)


 25%|██▌       | 1217/4801 [05:25<30:08,  1.98it/s]  

(psycopg2.errors.UndefinedTable) relation "thuc_1217_annagnps_cell_data_section" does not exist
LINE 1: SELECT cell_id FROM thuc_1217_annagnps_cell_data_section
                            ^

[SQL: SELECT cell_id FROM thuc_1217_annagnps_cell_data_section]
(Background on this error at: https://sqlalche.me/e/20/f405)
(psycopg2.errors.UndefinedTable) relation "thuc_1218_annagnps_cell_data_section" does not exist
LINE 1: SELECT cell_id FROM thuc_1218_annagnps_cell_data_section
                            ^

[SQL: SELECT cell_id FROM thuc_1218_annagnps_cell_data_section]
(Background on this error at: https://sqlalche.me/e/20/f405)
(psycopg2.errors.UndefinedTable) relation "thuc_1219_annagnps_cell_data_section" does not exist
LINE 1: SELECT cell_id FROM thuc_1219_annagnps_cell_data_section
                            ^

[SQL: SELECT cell_id FROM thuc_1219_annagnps_cell_data_section]
(Background on this error at: https://sqlalche.me/e/20/f405)
(psycopg2.errors.UndefinedTable) relation "thuc_1

 93%|█████████▎| 4466/4801 [13:41<00:18, 18.30it/s]  

(psycopg2.errors.UndefinedTable) relation "thuc_4463_annagnps_cell_data_section" does not exist
LINE 1: SELECT cell_id FROM thuc_4463_annagnps_cell_data_section
                            ^

[SQL: SELECT cell_id FROM thuc_4463_annagnps_cell_data_section]
(Background on this error at: https://sqlalche.me/e/20/f405)


100%|██████████| 4801/4801 [14:44<00:00,  5.43it/s]


In [24]:
df_thucs

Unnamed: 0,THUC_ID,NUM_CELLS,NUM_REACHES
0,0,,
1,1,146491.0,62291.0
2,2,3555.0,1461.0
3,3,205997.0,85561.0
4,4,173992.0,72114.0
...,...,...,...
4796,4796,126.0,52.0
4797,4797,182.0,74.0
4798,4798,381.0,156.0
4799,4799,596.0,244.0


In [30]:
# df_thucs.loc[df_thucs["THUC_ID"]==1148,"NUM_REACHES"].sum()
df_thucs["NUM_REACHES"].sum()


35413465.0

In [38]:
df_thucs = df_thucs.dropna()

In [39]:
df_thucs["NUM_REACHES"] = df_thucs["NUM_REACHES"].apply(int)
df_thucs["NUM_CELLS"] = df_thucs["NUM_CELLS"].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_thucs["NUM_REACHES"] = df_thucs["NUM_REACHES"].apply(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_thucs["NUM_CELLS"] = df_thucs["NUM_CELLS"].apply(int)


In [35]:
df_thucs["THUC_ID"] = df_thucs["THUC_ID"].apply(str).str.zfill(4)

In [40]:
df_thucs.to_csv('../../outputs/topagnps/aims_cells_reaches.csv', index=False)

In [41]:
df_thucs.describe()

Unnamed: 0,NUM_CELLS,NUM_REACHES
count,4794.0,4794.0
mean,17886.913433,7387.039007
std,63172.403159,26140.037981
min,2.0,2.0
25%,504.0,210.0
50%,1065.0,441.0
75%,3834.0,1589.75
max,763057.0,328460.0


In [44]:
tot_cells = df_thucs["NUM_CELLS"].sum()
tot_reaches = df_thucs["NUM_REACHES"].sum()

print(f"On AIMS there are {df_thucs.shape[0]} T-HUCs, totaling {tot_cells:.0f} cells and {tot_reaches:.0f} reaches")

On AIMS there are 4794 T-HUCs, totaling 85749863 cells and 35413465 reaches


In [45]:
df_thucs["C2R"] = df_thucs["NUM_CELLS"]/df_thucs["NUM_REACHES"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_thucs["C2R"] = df_thucs["NUM_CELLS"]/df_thucs["NUM_REACHES"]


In [48]:
df_thucs["C2R"].describe()

count    4794.000000
mean        2.402489
std         0.097271
min         0.750000
25%         2.375000
50%         2.424851
75%         2.456262
max         2.496732
Name: C2R, dtype: float64