In [2]:
import argparse
import copy
import glob
import os
import pickle
import random
import shutil
import sys
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed, wait
from datetime import datetime, timezone
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tools_shared_functions import (
    aggregate_wbd_hucs,
    filter_nwm_segments_by_stream_order,
    flow_data,
    get_metadata,
    get_nwm_segs,
    get_thresholds,
)

import utils.fim_logger as fl
from utils.shared_variables import VIZ_PROJECTION

In [98]:
## Functions to get, process, and filter the metadata

# -------------------------------------------------------
def list_of_lids(conus_list, verbose):
    '''
    Collect the NWS LID of every site in conus_list, in input order.

    Each site is read at site['identifiers']['nws_lid']; no filtering or
    de-duplication is done, so None values and repeats are kept.
    When verbose is True, the collected list is printed.

    Example:
    lid_list = list_of_lids(conus_list, True)
    '''
    lid_list = [entry['identifiers']['nws_lid'] for entry in conus_list]

    if verbose == True:
        print(f'List of LIDs: {lid_list}')

    return lid_list

# lid_list = list_of_lids(conus_list, True)

# -------------------------------------------------------
def list_duplicate_lids(conus_list, verbose):
    '''
    Split the LIDs found in conus_list into first occurrences and repeats.

    Each site is read at site['identifiers']['nws_lid']. The first time a
    value is seen it goes into lid_list; every later occurrence of the same
    value goes into duplicate_lid_list (so a LID seen three times appears
    twice in the duplicate list). None is treated like any other value.

    Returns a tuple (lid_list, duplicate_lid_list), both in input order.
    When verbose is truthy, the unique count and the duplicate list are printed.

    Example:
    lid_list, duplicate_lid_list = list_duplicate_lids(conus_list, True)
    '''
    lid_list = []
    duplicate_lid_list = []

    # Set used only for O(1) membership tests; lid_list preserves first-seen order
    seen_lids = set()

    for site in conus_list:
        nws_lid = site['identifiers']['nws_lid']

        if nws_lid in seen_lids:
            duplicate_lid_list.append(nws_lid)
        else:
            seen_lids.add(nws_lid)
            lid_list.append(nws_lid)

    if verbose:
        print(f'Length of unique LID list: {len(lid_list)}')
        print(f'List of duplicate LIDs: {duplicate_lid_list}')

    return lid_list, duplicate_lid_list

# lid_list, duplicate_lid_list = list_duplicate_lids(conus_list, True)

# -------------------------------------------------------
def filter_by_lid(lid_filter, conus_list, verbose):
    '''
    Return the sites of conus_list whose LID equals lid_filter.

    The LID is read at site['identifiers']['nws_lid'] and compared with ==,
    so passing None selects the sites that have no LID.
    When verbose is True, the filter value and the match count are printed.

    Example:
    conus_list_filt = filter_by_lid('None', conus_list, True)
    '''
    conus_list_filt = [
        entry for entry in conus_list if entry['identifiers']['nws_lid'] == lid_filter
    ]

    if verbose == True:
        print(f'LID filter: {lid_filter} \nNumber of sites: {len(conus_list_filt)}')

    return conus_list_filt

# conus_list_filt = filter_by_lid('None', conus_list, True)

# -------------------------------------------------------
def filter_by_state(state_filter, conus_list, verbose):
    '''
    Return the sites of conus_list whose state equals state_filter.

    The state is read at site['nws_data']['state'] and compared with ==.
    When verbose is True, the state and the match count are printed.

    Example:
    conus_list_filt = filter_by_state('Alaska', conus_list, True)
    '''
    conus_list_filt = [
        entry for entry in conus_list if entry['nws_data']['state'] == state_filter
    ]

    if verbose == True:
        print(f'State: {state_filter} \nNumber of sites: {len(conus_list_filt)}')

    return conus_list_filt

# conus_list_filt = filter_by_state('Alaska', conus_list, True)

# -------------------------------------------------------

In [None]:
# --------- Inputs --------- 

# Trace distance passed to get_metadata for both the upstream and the
# downstream direction (units are not visible here -- TODO confirm).
search = 5

nwm_us_search, nwm_ds_search = search, search


# output_catfim_dir = 
API_BASE_URL = 'https://nwcal-wrds.nwc.nws.noaa.gov/api/location/v3.0'
metadata_url = f'{API_BASE_URL}/metadata'

# Request metadata for Alaska only (selector=['AK']), passing
# must_include='nws_data.rfc_forecast_point'.
# Only the first element of the returned pair is kept.
# NOTE(review): `___` is also IPython's "third-to-last output" variable;
# consider a different throwaway name.
ak_test_list, ___ = get_metadata(
    metadata_url,
    select_by='state',
    selector=['AK'],
    must_include='nws_data.rfc_forecast_point',
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)


In [None]:
# Display the Alaska metadata returned by get_metadata.
# NOTE(review): this shows the whole list; a slice such as ak_test_list[:5]
# would keep the notebook output short.
ak_test_list

In [73]:
## Testing get_metadata() functionality

# --------- Inputs --------- 

# Trace distance passed to get_metadata for both the upstream and the
# downstream direction (units are not visible here -- TODO confirm).
search = 5

nwm_us_search, nwm_ds_search = search, search


# output_catfim_dir = 
API_BASE_URL = 'https://nwcal-wrds.nwc.nws.noaa.gov/api/location/v3.0'
metadata_url = f'{API_BASE_URL}/metadata'


# lid_to_run = 
# nwm_metafile = 

# --------- Code --------- 

# Placeholder; rebound below to the concatenation of both request results.
all_meta_lists = []


# First request: select_by='nws_lid' with selector=['all'], passing
# must_include='nws_data.rfc_forecast_point'.
# NOTE(review): `___` is also IPython's "third-to-last output" variable;
# consider a different throwaway name.
conus_list, ___ = get_metadata(
    metadata_url,
    select_by='nws_lid',
    selector=['all'],
    must_include='nws_data.rfc_forecast_point',
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)


# Get metadata for Islands and Alaska
# Second request: select_by='state' for HI, PR and AK with must_include=None.
islands_list, ___ = get_metadata(
    metadata_url,
    select_by='state',
    selector=['HI', 'PR', 'AK'],
    must_include=None,
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)
# Append the lists
# Plain concatenation: a site returned by both requests appears twice in
# all_meta_lists (see filter_metadata_list for the de-duplication step).
all_meta_lists = conus_list + islands_list

# print(islands_list)

# with open(meta_file, "wb") as p_handle:
#     pickle.dump(all_meta_lists, p_handle, protocol=pickle.HIGHEST_PROTOCOL)


In [19]:
# ------ New addition: filtering ------

# -- function --
def filter_metadata_list(metadata_list, verbose):
    '''
    Filter metadata list to remove:
    - sites where the nws_lid = None
    - duplicate sites

    Each site is read at site['identifiers']['nws_lid'] and sorted into one
    of three groups, preserving input order:
    - LID is None              -> nonelid_metadata_list
    - LID was already seen     -> duplicate_lids / duplicate_metadata_list
    - first occurrence of LID  -> unique_lids / unique_metadata_list

    When verbose is truthy, the input length and the size of each group are
    printed.

    Returns the 5-tuple
    (unique_lids, duplicate_lids, nonelid_metadata_list,
     duplicate_metadata_list, unique_metadata_list).
    '''

    unique_lids, duplicate_lids = [], []
    duplicate_metadata_list, unique_metadata_list = [], []

    nonelid_metadata_list = []  # TODO: remove eventually?

    # Set used only for O(1) membership tests; unique_lids keeps first-seen order
    seen_lids = set()

    for site in metadata_list:
        nws_lid = site['identifiers']['nws_lid']

        if nws_lid is None:
            # No LID available
            nonelid_metadata_list.append(site)

            # TODO: replace this with Continue, eventually we wont need this list

        elif nws_lid in seen_lids:
            # Duplicate LID
            duplicate_lids.append(nws_lid)
            duplicate_metadata_list.append(site)

        else:
            # Unique/unseen LID that's not None
            seen_lids.add(nws_lid)
            unique_lids.append(nws_lid)
            unique_metadata_list.append(site)

    if verbose:
        print(f'Input metadata list length: {len(metadata_list)}')
        print(f'Output (unique) metadata list length: {len(unique_metadata_list)}')
        print(f'Number of unique LIDs: {len(unique_lids)} \nNumber of duplicate LIDs: {len(duplicate_lids)} \nNumber of None LIDs: {len(nonelid_metadata_list)}')

    # TODO: eventually, have it only return necessary objects
    return unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list



# Filter the combined metadata list (all_meta_lists) and print the summary counts.
unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(all_meta_lists, True)
print()

Input metadata list length: 7631
Output (unique) metadata list length: 7214
Number of unique LIDs: 7214 
Number of duplicate LIDs: 152 
Number of None LIDs: 265



In [23]:
## Get state counts

# State names as compared against site['nws_data']['state'] by filter_by_state.
state_list = ['Puerto Rico', 'Hawaii', 'Alaska']

# Counts per state in conus_list.
print('Current Code: Single API call (only forecast points)')
for state in state_list: 
    currentcode_state = filter_by_state(state, conus_list, True)
    print()

print()
print('Proposed Update: Double API call (forecast points + all HI, AK, and PR points)')
print()
# Counts per state in unique_metadata_list.
# NOTE(review): unique_metadata_list is rebound by more than one cell in this
# notebook, so the counts depend on which of those cells ran last.
for state in state_list: 
    # print('Before filtering out duplicates:')
    # prefilt_state = filter_by_state(state, all_meta_lists, True)
    print('AFTER filtering out duplicates:')
    postfilt_state = filter_by_state(state, unique_metadata_list, True)
    print()

Current Code: Single API call (only forecast points)
State: Puerto Rico 
Number of sites: 5

State: Hawaii 
Number of sites: 2

State: Alaska 
Number of sites: 145


Proposed Update: Double API call (forecast points + all HI, AK, and PR points)

AFTER filtering out duplicates:
State: Puerto Rico 
Number of sites: 238

AFTER filtering out duplicates:
State: Hawaii 
Number of sites: 495

AFTER filtering out duplicates:
State: Alaska 
Number of sites: 1950



In [16]:
# Print site counts in unique_metadata_list for three more states.
# Each call rebinds postfilt_state, so only the Texas result remains afterwards.
postfilt_state = filter_by_state('Connecticut', unique_metadata_list, True)
postfilt_state = filter_by_state('New York', unique_metadata_list, True)
postfilt_state = filter_by_state('Texas', unique_metadata_list, True)

State: Connecticut 
Number of sites: 23
State: New York 
Number of sites: 142
State: Texas 
Number of sites: 380


In [17]:
## Current code formulation

# Run the same filter on conus_list alone.
# NOTE(review): this rebinds unique_lids, duplicate_lids, nonelid_metadata_list,
# duplicate_metadata_list and unique_metadata_list, replacing the values
# computed from all_meta_lists in the earlier cell.

unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(conus_list, True)
print()
# Alaska site count in the unfiltered conus_list.
conus_list_filt = filter_by_state('Alaska', conus_list, True)


Input metadata list length: 4679
Output (unique) metadata list length: 4679
Number of unique LIDs: 4679 
Number of duplicate LIDs: 0 
Number of None LIDs: 0

State: Alaska 
Number of sites: 145


In [18]:
# lid_list, duplicate_lid_list = list_duplicate_lids(all_meta_lists, True)

# Select the sites in islands_list whose nws_lid is None.
# NOTE(review): the result is stored in conus_list_filt although the input
# here is islands_list.
conus_list_filt = filter_by_lid(None, islands_list, True)


LID filter: None 
Number of sites: 265


In [138]:
# state_list, ___ = get_metadata(
#     metadata_url,
#     select_by='state',
#     # selector=['HI', 'PR', 'AK'],
#     selector=['AK'],

#     # must_include='identifiers.nws_lid', ## ddin't work oh well
#     must_include=None,

#     # must_include='nws_data.rfc_forecast_point',
#     upstream_trace_distance=nwm_us_search,
#     downstream_trace_distance=nwm_ds_search,
# )

### Get a HUC list for a given HUC02 region 

In [75]:
# Directory whose entry names are listed below.
# NOTE(review): hard-coded absolute path; adjust for the local environment.
fim_output_path = '/data/previous_fim/fim_4_5_2_11/'

# Prefix used to subset the entries (two-digit region code).
# huc2 = '20' # Hawaii
# huc2 = '21' # Puerto Rico
huc2 = '19' # Alaska

# Every entry name in fim_output_path (files and directories alike).
all_hucs = os.listdir(fim_output_path)

# Keep the entries whose name starts with the chosen prefix.
subsetted_hucs = [x for x in all_hucs if x.startswith(huc2)]

for i in subsetted_hucs:
    print(i)

19020101
19020102
19020103
19020104
19020201
19020202
19020203
19020301
19020302
19020401
19020402
19020501
19020502
19020503
19020504
19020505
19020601
19020602
19020800


### Get stats for current CatFIM

In [3]:
## Inputs
# State values compared against the 'states' column by count_mapped_for_state.
states = ['AK', 'PR', 'HI']

## Previous runs
# Parent folder and result-folder names passed to count_mapped_for_state.
catfim_folder_prev = '/data/catfim/'
result_folders_prev = ['hand_4_5_11_1_flow_based', 'hand_4_5_11_1_stage_based', 'fim_4_5_2_11_flow_based', 'fim_4_5_2_11_stage_based']

## Current test runs
# NOTE(review): hard-coded absolute paths; adjust for the local environment.
catfim_folder_testing = '/data/catfim/emily_test'
result_folders_testing = ['site_filtering_HI_flow_based', 'site_filtering_HI_stage_based', 
                  'site_filtering_PR_flow_based', 'site_filtering_PR_stage_based', 
                  'site_filtering_AK_flow_based', 'site_filtering_AK_stage_based']

def count_mapped_for_state(catfim_folder, result_folders, states):
    '''
    Print mapped / not-mapped site counts per state for each CatFIM result folder.

    For every folder in result_folders the function looks in
    <catfim_folder>/<result_folder>/mapping for a file whose name ends with
    'catfim_sites.csv' and reads it with gpd.read_file. For each requested
    state that has at least one row it prints:
    - the number of rows with mapped == 'yes' and mapped == 'no'
    - the number of duplicate LIDs (all rows, and mapped rows only), using the
      'ahps_lid' column if present, otherwise 'nws_lid'
    - the set of distinct 'HUC8' values

    Folders without a sites csv (or whose csv cannot be read) are reported and
    skipped; the remaining folders are still processed.

    Parameters
    ----------
    catfim_folder : str
        Parent directory that contains the result folders.
    result_folders : list of str
        Names of the result folders to summarize.
    states : str or list of str
        Values compared against the 'states' column. A single string is
        treated as one state rather than being iterated character by character.

    Returns
    -------
    tuple or None
        A one-element tuple holding the sites table of the first folder that
        was read successfully (shape kept for backward compatibility), or
        None if no folder could be read.
    '''
    # A bare string would otherwise be iterated one character at a time
    if isinstance(states, str):
        states = [states]

    first_catfim_points = None

    # Read in CatFIM outputs
    for result_folder in result_folders:
        print()
        print('-----' + result_folder + '-----')

        catfim_outputs_mapping_path = os.path.join(catfim_folder, result_folder, 'mapping')

        # Get filepath (a missing mapping directory is treated like a missing csv)
        catfim_points_path = None
        if os.path.isdir(catfim_outputs_mapping_path):
            for file in os.listdir(catfim_outputs_mapping_path):
                if file.endswith('catfim_sites.csv'):
                    catfim_points_path = os.path.join(catfim_outputs_mapping_path, file)

        if catfim_points_path is None:
            print(f'No site csv found in {catfim_outputs_mapping_path}')
            continue

        # Open points file
        try:
            catfim_points = gpd.read_file(catfim_points_path)
        except Exception as e:
            print('An error occurred', e)
            continue

        if first_catfim_points is None:
            first_catfim_points = catfim_points

        # Get mapped vs unmapped data for the listed states
        for state in states:
            catfim_points_state = catfim_points[catfim_points['states'] == state]

            if len(catfim_points_state) == 0:
                continue

            num_not_mapped = len(catfim_points_state[catfim_points_state['mapped'] == 'no'])

            catfim_points_state_mapped = catfim_points_state[catfim_points_state['mapped'] == 'yes']
            num_mapped = len(catfim_points_state_mapped)

            huc_list = catfim_points_state['HUC8']

            # The LID column name differs between outputs
            if 'ahps_lid' in catfim_points_state.columns:
                lid_column = 'ahps_lid'
            elif 'nws_lid' in catfim_points_state.columns:
                lid_column = 'nws_lid'
            else:
                print('Could not find ahps_lid or nws_lid column in csv.')
                print(catfim_points_state.columns)
                continue

            lid_list = catfim_points_state[lid_column]
            lid_list_mapped = catfim_points_state_mapped[lid_column]

            huc_list_unique = set(huc_list)
            lid_list_unique = set(lid_list)

            num_duplicate_sites = len(lid_list) - len(lid_list_unique)
            num_duplicate_sites_mapped = len(lid_list_mapped) - len(set(lid_list_mapped))

            print(f'{state} \n   Mapped: {num_mapped} \n   Not mapped: {num_not_mapped}')

            print(f'   Number of duplicate LIDs: {num_duplicate_sites}')
            print(f'   Number of duplicate LIDs mapped: {num_duplicate_sites_mapped}')

            print(f'   {len(huc_list_unique)} hucs: {huc_list_unique}')

    # Returned only after every folder has been reported
    if first_catfim_points is None:
        return None
    return (first_catfim_points,)

In [7]:
# catfim_points_pr = count_mapped_for_state('/data/catfim/emily_test/', ['site_filtering_PR_stage_based'], 'PR')

In [6]:
# Summarize the previous runs listed in result_folders_prev.
count_mapped_for_state(catfim_folder_prev, result_folders_prev, states)


-----hand_4_5_11_1_flow_based-----
AK 
   Mapped: 14 
   Not mapped: 39
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   14 hucs: {'19020202', '19020502', '19020201', '19020101', '19020504', '19020301', '19020402', '19020503', '19020104', '19020102', '19020505', '19020302', '19020401', '19020501'}
PR 
   Mapped: 4 
   Not mapped: 1
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   2 hucs: {'21010002', '21010005'}
HI 
   Mapped: 1 
   Not mapped: 1
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   2 hucs: {'20020000', '20010000'}

-----hand_4_5_11_1_stage_based-----
AK 
   Mapped: 13 
   Not mapped: 40
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   14 hucs: {'19020202', '19020502', '19020201', '19020101', '19020504', '19020301', '19020402', '19020503', '19020104', '19020102', '19020505', '19020302', '19020401', '19020501'}
PR 
   Mapped: 0 
   Not mapped: 5
   Number of duplicate LIDs: 0
   Numb

In [8]:
# Summarize the current test runs listed in result_folders_testing.
count_mapped_for_state(catfim_folder_testing, result_folders_testing, states)


-----site_filtering_HI_flow_based-----
HI 
   Mapped: 47 
   Not mapped: 442
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   7 hucs: {'20060000', '20020000', '20030000', '20050000', '20070000', '20010000', '20040000'}

-----site_filtering_HI_stage_based-----
No site csv found in /data/catfim/emily_test/site_filtering_HI_stage_based/mapping

-----site_filtering_PR_flow_based-----
PR 
   Mapped: 59 
   Not mapped: 181
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   5 hucs: {'21010005', '21010002', '21010004', '21010003', '21010008'}

-----site_filtering_PR_stage_based-----
No site csv found in /data/catfim/emily_test/site_filtering_PR_stage_based/mapping

-----site_filtering_AK_flow_based-----
AK 
   Mapped: 30 
   Not mapped: 615
   Number of duplicate LIDs: 0
   Number of duplicate LIDs mapped: 0
   18 hucs: {'19020202', '19020502', '19020201', '19020101', '19020504', '19020601', '19020203', '19020301', '19020402', '19020503', '19020104'

In [194]:
# Count duplicate LIDs and list the distinct HUC8 values in catfim_points_state.
# NOTE(review): catfim_points_state is not assigned at top level in any cell
# visible here (it only appears as a local inside count_mapped_for_state), so
# this cell presumably relies on leftover kernel state -- confirm before
# re-running on a fresh kernel.
huc_list = catfim_points_state['HUC8']
lid_list = catfim_points_state['ahps_lid']

huc_list_unique = set(huc_list)
lid_list_unique = set(lid_list)
# Rows minus distinct LIDs = number of repeated LID rows.
num_duplicate_sites = len(lid_list) - len(lid_list_unique)

print(f'Number of duplicate LIDs: {num_duplicate_sites}')
print(f'{len(huc_list_unique)} hucs: {huc_list_unique}')


Number of duplicate LIDs: 0
7 hucs: {'20050000', '20060000', '20030000', '20020000', '20010000', '20070000', '20040000'}


### Test state column instability

In [167]:
# Metadata list to inspect.
input_meta_list = conus_list # conus_list or islands_list

# -----

# One record per site holding the 'state' value found under each of the four
# metadata sections.
state_data = []

for i, site in enumerate(input_meta_list):
    lid = site['identifiers']['nws_lid']

    nws_data_state = site['nws_data']['state']
    usgs_data_state = site['usgs_data']['state']
    nws_preferred_state = site['nws_preferred']['state']
    usgs_preferred_state = site['usgs_preferred']['state']

    row = {'index': i, 'lid': lid, 
           'nws_data_state':nws_data_state,
           'usgs_data_state':usgs_data_state,
           'nws_preferred_state':nws_preferred_state,
           'usgs_preferred_state':usgs_preferred_state}

    state_data.append(row)

# Build the frame once from the list of records.
state_data_df = pd.DataFrame(state_data)

# Single-row summary: number of missing values in each of the four state columns.
summary_df = pd.DataFrame({
    'nws_data_state': state_data_df['nws_data_state'].isna().sum(),
    'usgs_data_state': state_data_df['usgs_data_state'].isna().sum(),
    'nws_preferred_state': state_data_df['nws_preferred_state'].isna().sum(),
    'usgs_preferred_state': state_data_df['usgs_preferred_state'].isna().sum()}, index=[f'Number of NA Values in State Column, out of {len(state_data_df)} rows'])

In [171]:
# state_data_df[state_data_df['nws_data_state'].isna()]
# state_data_df.loc[(state_data_df['nws_data_state'].isna()) & (state_data_df['usgs_data_state'].isna())]
# Show the rows where both the NWS-preferred and the USGS-preferred state are missing.
state_data_df.loc[(state_data_df['nws_preferred_state'].isna()) & (state_data_df['usgs_preferred_state'].isna())]

Unnamed: 0,index,lid,nws_data_state,usgs_data_state,nws_preferred_state,usgs_preferred_state
302,302,BEAA3,,,,


### Review CatFIM Logs

In [3]:
# Run folder and log file name; the log is expected under <run_path>/logs/.
# NOTE(review): hard-coded absolute path; adjust for the local environment.
run_path = '/data/catfim/emily_test/site_filtering_PR_stage_based/'
log_file = 'catfim_2024_12_12-19_24_09.log'

log_path = os.path.join(run_path, 'logs', log_file)




In [37]:
import re

# Parse the CatFIM log at log_path into one record per WARNING / TRACE line
# that mentions both an 8-digit HUC and a LID.
LOG_COLUMNS = ['huc', 'lid', 'message_type', 'message']
log_records = []

with open(log_path) as f:
    for line in f:
        # Get the HUC8 (first run of eight digits on the line)
        match = re.search(r"\d{8}", line)
        if not match:
            continue
        huc = match.group()

        # Get the LID (the five characters that follow the first ' : ')
        match2 = re.search(r"(?<= : ).{5}", line)
        if not match2:
            continue
        lid = match2.group()

        # Only WARNING and TRACE lines are recorded; WARNING wins if both appear
        if 'WARNING' in line:
            level = 'WARNING'
        elif 'TRACE' in line:
            level = 'TRACE'
        else:
            continue

        # Get the text after '<lid>:'. The LID is escaped because the five
        # captured characters may contain regex metacharacters.
        message_type, message = '', ''
        match3 = re.search(f"(?<={re.escape(lid)}:)(.*)", line)
        if match3:
            message = match3.group()
            message_type = level

        # A record is kept even when no '<lid>:' text was found (empty message)
        log_records.append({'huc': huc, 'lid': lid, 'message_type': message_type, 'message': message})

# Explicit columns keep the frame usable downstream even when no line matched
out_df = pd.DataFrame(log_records, columns=LOG_COLUMNS)


# out_df.to_csv('PR_error_logs.csv')

In [44]:
## __create_acceptable_usgs_elev_df from generate_categorical_fim.py

import argparse
import glob
import math
import os
import shutil
import sys
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed, wait
from datetime import datetime, timezone

import geopandas as gpd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from catfim.generate_categorical_fim_flows import generate_flows
from catfim.generate_categorical_fim_mapping import (
    manage_catfim_mapping,
    post_process_cat_fim_for_viz,
    produce_stage_based_lid_tifs,
)
from tools_shared_functions import (
    filter_nwm_segments_by_stream_order,
    get_datum,
    get_nwm_segs,
    get_thresholds,
    ngvd_to_navd_ft,
)
from tools_shared_variables import (
    acceptable_alt_acc_thresh,
    acceptable_alt_meth_code_list,
    acceptable_coord_acc_code_list,
    acceptable_coord_method_code_list,
    acceptable_site_type_list,
)

import utils.fim_logger as fl
from utils.shared_variables import VIZ_PROJECTION


# from itertools import repeat
# from pathlib import Path


# global RLOG
# Two logger instances created from the project's fim_logger module.
FLOG = fl.FIM_logger()  # the non mp version
MP_LOG = fl.FIM_logger()  # the Multi Proc version

# Select pyogrio as geopandas' I/O engine.
gpd.options.io_engine = "pyogrio"

def __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id):
    '''
    Return the rows of usgs_elev_df that pass the USGS acceptance criteria.

    A row is kept when all of the following hold:
    - 'usgs_data_alt_method_code' is in acceptable_alt_meth_code_list
    - 'usgs_data_site_type' is in acceptable_site_type_list
    - 'usgs_data_alt_accuracy_code' (cast to float) is <= acceptable_alt_acc_thresh
    The coordinate accuracy / coordinate method checks are commented out.

    Two helper columns, 'acceptable_codes' and 'acceptable_alt_error', are
    added along the way. 'acceptable_codes' is written onto the frame passed
    in by the caller, because it is assigned before astype() makes a copy.

    Parameters
    ----------
    usgs_elev_df : pandas.DataFrame
        Table containing the three columns named above.
    huc_lid_id : str
        Identifier used as a prefix in the error log message.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. If any step raises, the error is logged through
        MP_LOG and the unfiltered frame (as it stood when the error occurred)
        is returned instead.
    '''
    acceptable_usgs_elev_df = None
    try:
        # Drop columns that offend acceptance criteria
        print(len(usgs_elev_df))  ## TEMP DEBUG
        usgs_elev_df['acceptable_codes'] = (
            # usgs_elev_df['usgs_data_coord_accuracy_code'].isin(acceptable_coord_acc_code_list)
            # & usgs_elev_df['usgs_data_coord_method_code'].isin(acceptable_coord_method_code_list)
            usgs_elev_df['usgs_data_alt_method_code'].isin(acceptable_alt_meth_code_list)
            & usgs_elev_df['usgs_data_site_type'].isin(acceptable_site_type_list)
        )

        print(len(usgs_elev_df))  ## TEMP DEBUG

        usgs_elev_df = usgs_elev_df.astype({'usgs_data_alt_accuracy_code': float})
        # The comparison already yields booleans (NaN compares as False)
        usgs_elev_df['acceptable_alt_error'] = (
            usgs_elev_df['usgs_data_alt_accuracy_code'] <= acceptable_alt_acc_thresh
        )

        # (A truncated debug print with an unterminated string literal stood
        # here and made the whole cell a SyntaxError; it has been removed.)
        print(len(usgs_elev_df))  ## TEMP DEBUG

        acceptable_usgs_elev_df = usgs_elev_df[
            usgs_elev_df['acceptable_codes'] & usgs_elev_df['acceptable_alt_error']
        ]

        # # TEMP DEBUG Record row difference and write it to a CSV or something
        # label = 'Old code' ## TEMP DEBUG
        # num_potential_rows = usgs_elev_df.shape[0]
        # num_acceptable_rows = acceptable_usgs_elev_df.shape[0]
        # out_message = f'{label}: kept {num_acceptable_rows} rows out of {num_potential_rows} available rows.'

    except Exception:
        # Not sure any of the sites actually have those USGS-related
        # columns in this particular file, so just assume it's fine to use

        # print("(Various columns related to USGS probably not in this csv)")
        # print(f"Exception: \n {repr(e)} \n")
        MP_LOG.error(f"{huc_lid_id}: An error has occurred while working with the usgs_elev table")
        MP_LOG.error(traceback.format_exc())
        acceptable_usgs_elev_df = usgs_elev_df

    return acceptable_usgs_elev_df


In [46]:
# Input csv and the identifier used in log messages.
# NOTE(review): hard-coded absolute path; adjust for the local environment.
usgs_elev_table = '/data/previous_fim/fim_4_5_2_11/21010005/usgs_elev_table.csv'
huc_lid_id = 'bzap4'


usgs_elev_df = pd.read_csv(usgs_elev_table)



# Apply the acceptance filter and display the rows that were kept.
acceptable_usgs_elev_df = __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id)

acceptable_usgs_elev_df

Unnamed: 0,location_id,nws_lid,feature_id,HydroID,levpa_id,dem_elevation,dem_adj_elevation,order_,LakeID,HUC8,...,mainstem,acceptable_codes,acceptable_alt_error,source,stream_stn,fid_xs,geometry,index_right,geometry_ln,geometry_snapped


In [29]:
# Number of distinct LIDs among the parsed log records.
len(set(out_df['lid']))

236

In [35]:

# message_string = 'Unable to find gage data'
# NOTE(review): message_string is assigned but not used below; the two
# searches use hard-coded strings.
message_string = 'stage values'

# LIDs of the records whose message contains each phrase.
stage_val_lids = out_df[out_df['message'].str.contains('stage values')]['lid']
no_gage_data_lids = out_df[out_df['message'].str.contains('Unable to find gage data')]['lid']



# # Using set intersection
# LIDs that appear in both groups.
lids_stageval_nogagedata = list(set(stage_val_lids) & set(no_gage_data_lids))

# len(stage_val_lids)
# len(no_gage_data_lids)
# len(lids_stageval_nogagedata)

# print(stage_val_lids)
# lids_stageval_nogagedata