**README**

This notebook is intended for developer and testing use, so its functions will
probably be less useful to outside users. That said, feel free to peruse the
notebook and use whatever you wish from it!

In [None]:
# Import packages and functions

import argparse
import copy
import glob
import re
import os
import pickle
import random
import shutil
import sys
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed, wait
from datetime import datetime, timezone
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tools_shared_functions import (
    aggregate_wbd_hucs,
    filter_nwm_segments_by_stream_order,
    flow_data,
    get_metadata,
    get_nwm_segs,
    get_thresholds,
)

import utils.fim_logger as fl
from utils.shared_variables import VIZ_PROJECTION

print('Imports complete.')

In [None]:
# Define Functions to retrieve, process, and filter the metadata

# -------------------------------------------------------
def list_of_lids(conus_list, verbose):
    '''
    Collect the NWS LID of every site in conus_list, in input order.

    Parameters:
    - conus_list: iterable of site metadata dicts; each one is read as
      site['identifiers']['nws_lid']
    - verbose: when equal to True, the collected list is printed

    Returns:
    - list with one entry per site (repeated and None LIDs are kept as-is)

    Example:
    lid_list = list_of_lids(conus_list, True)
    '''
    lid_list = [entry['identifiers']['nws_lid'] for entry in conus_list]

    if verbose == True:
        print(f'List of LIDs: {lid_list}')

    return lid_list

# -------------------------------------------------------
def list_duplicate_lids(conus_list, verbose):
    '''
    Split the LIDs found in conus_list into first occurrences and repeats.

    Parameters:
    - conus_list: iterable of site metadata dicts; each one is read as
      site['identifiers']['nws_lid']
    - verbose: when truthy, print the unique count and the duplicate list

    Returns:
    - lid_list: each distinct LID once, in order of first appearance
    - duplicate_lid_list: one entry for every repeat occurrence of an
      already-seen LID (a LID that appears three times is listed twice)

    Note: None is treated like any other LID value here.

    Example:
    lid_list, duplicate_lid_list = list_duplicate_lids(conus_list, True)
    '''
    lid_list = []
    duplicate_lid_list = []

    # Set used only for membership tests, so the scan stays linear instead of
    # re-walking lid_list for every site.
    seen_lids = set()

    for site in conus_list:
        nws_lid = site['identifiers']['nws_lid']

        if nws_lid in seen_lids:
            duplicate_lid_list.append(nws_lid)
        else:
            seen_lids.add(nws_lid)
            lid_list.append(nws_lid)

    if verbose:
        print(f'Length of unique LID list: {len(lid_list)}')
        print(f'List of duplicate LIDs: {duplicate_lid_list}')

    return lid_list, duplicate_lid_list

# -------------------------------------------------------
def filter_by_lid(lid_filter, conus_list, verbose):
    '''
    Keep only the sites in conus_list whose NWS LID equals lid_filter.

    Parameters:
    - lid_filter: value compared (with ==) against site['identifiers']['nws_lid'];
      note that the string 'None' is not equal to a missing (None) LID
    - conus_list: iterable of site metadata dicts
    - verbose: when equal to True, print the filter and the number of matches

    Returns:
    - list of the matching site dicts, in input order

    Example:
    conus_list_filt = filter_by_lid('None', conus_list, True)
    '''
    conus_list_filt = [
        entry for entry in conus_list if entry['identifiers']['nws_lid'] == lid_filter
    ]

    if verbose == True:
        print(f'LID filter: {lid_filter} \nNumber of sites: {len(conus_list_filt)}')

    return conus_list_filt

# -------------------------------------------------------
def filter_by_state(state_filter, conus_list, verbose):
    '''
    Keep only the sites in conus_list whose NWS state equals state_filter.

    Parameters:
    - state_filter: value compared (with ==) against site['nws_data']['state']
    - conus_list: iterable of site metadata dicts
    - verbose: when equal to True, print the state and the number of matches

    Returns:
    - list of the matching site dicts, in input order

    Example:
    conus_list_filt = filter_by_state('Alaska', conus_list, True)
    '''
    conus_list_filt = [
        entry for entry in conus_list if entry['nws_data']['state'] == state_filter
    ]

    if verbose == True:
        print(f'State: {state_filter} \nNumber of sites: {len(conus_list_filt)}')

    return conus_list_filt

# -------------------------------------------------------

print('Define functions complete.')

In [None]:
# Testing get_metadata() functionality
#
# Makes two get_metadata() requests (one selected by nws_lid, one selected by
# state) and concatenates the results into all_meta_lists for the cells below.

# --------- Inputs --------- 

# Trace distance handed to get_metadata() for both the upstream and downstream
# search (units are not visible in this notebook -- TODO confirm in get_metadata)
search = 5

nwm_us_search, nwm_ds_search = search, search


# output_catfim_dir = 
API_BASE_URL = 'https://nwcal-wrds.nwc.nws.noaa.gov/api/location/v3.0'
metadata_url = f'{API_BASE_URL}/metadata'


# lid_to_run = 
# nwm_metafile = 

# --------- Code --------- 

# Placeholder only: this name is rebound to the concatenated list further down
all_meta_lists = []


# Request 1: all LIDs, restricted via must_include (presumably to RFC forecast
# points -- verify against get_metadata). The second return value is discarded.
# NOTE(review): `___` is also one of IPython's output-history names; a plain
# throwaway name would avoid shadowing it.
conus_list, ___ = get_metadata(
    metadata_url,
    select_by='nws_lid',
    selector=['all'],
    must_include='nws_data.rfc_forecast_point',
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)


# Get metadata for Islands and Alaska
# Request 2: selected by state (HI, PR, AK) with no must_include restriction
islands_list, ___ = get_metadata(
    metadata_url,
    select_by='state',
    selector=['HI', 'PR', 'AK'],
    must_include=None,
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)
# Append the lists
# Plain concatenation: a site returned by both requests appears twice here
all_meta_lists = conus_list + islands_list

# print(islands_list)

# with open(meta_file, "wb") as p_handle:
#     pickle.dump(all_meta_lists, p_handle, protocol=pickle.HIGHEST_PROTOCOL)

print('Metadata retrieval complete.')

In [None]:
# ------ New addition: filtering ------

# -- function --
def filter_metadata_list(metadata_list, verbose):
    '''
    Split a metadata list into unique, duplicate, and None-LID sites.

    Filter metadata list to remove:
    - sites where the nws_lid = None
    - duplicate sites (every occurrence of an nws_lid after its first one)

    Parameters:
    - metadata_list: iterable of site metadata dicts; each one is read as
      site['identifiers']['nws_lid']
    - verbose: when truthy, print the input/output lengths and the counts of
      unique, duplicate, and None LIDs

    Returns a 5-tuple:
    - unique_lids: distinct non-None LIDs in order of first appearance
    - duplicate_lids: one LID entry per repeat occurrence
    - nonelid_metadata_list: sites whose LID is None
    - duplicate_metadata_list: the sites behind duplicate_lids
    - unique_metadata_list: the first site seen for each LID in unique_lids
    '''

    unique_lids, duplicate_lids = [], []
    duplicate_metadata_list, unique_metadata_list = [], []

    nonelid_metadata_list = []  # TODO: remove eventually?

    # Membership checks go through a set so the pass stays linear; unique_lids
    # itself is kept as a list because callers receive it in first-seen order.
    seen_lids = set()

    for site in metadata_list:
        nws_lid = site['identifiers']['nws_lid']

        if nws_lid is None:
            # No LID available
            nonelid_metadata_list.append(site)

            # TODO: replace this with Continue, eventually we wont need this list

        elif nws_lid in seen_lids:
            # Duplicate LID
            duplicate_lids.append(nws_lid)
            duplicate_metadata_list.append(site)

        else:
            # Unique/unseen LID that's not None
            seen_lids.add(nws_lid)
            unique_lids.append(nws_lid)
            unique_metadata_list.append(site)

    if verbose:
        print(f'Input metadata list length: {len(metadata_list)}')
        print(f'Output (unique) metadata list length: {len(unique_metadata_list)}')
        print(f'Number of unique LIDs: {len(unique_lids)} \nNumber of duplicate LIDs: {len(duplicate_lids)} \nNumber of None LIDs: {len(nonelid_metadata_list)}')

    # TODO: eventually, have it only return necessary objects
    return unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list


# Run filtering function on the combined (forecast-point + HI/PR/AK) list.
# The five results are bound to notebook-level names that later cells read
# (e.g. unique_metadata_list).
unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(all_meta_lists, True)

print('Filtering complete.')

In [None]:
# Quantify number of CatFIM sites available for each state (before and after filtering out duplicates)
#
# Relies on conus_list and unique_metadata_list from the cells above. The counts
# are printed by filter_by_state() itself (verbose=True); the returned lists are
# bound to currentcode_state / postfilt_state but not used further in this cell.

# Values are compared against site['nws_data']['state'] inside filter_by_state()
state_list = ['Puerto Rico', 'Hawaii', 'Alaska']

print('Current Code: Single API call (only forecast points)')
for state in state_list: 
    currentcode_state = filter_by_state(state, conus_list, True)
    print()

print()
print('Proposed Update: Double API call (forecast points + all HI, AK, and PR points)')
print()
for state in state_list: 
    # print('Before filtering out duplicates:')
    # prefilt_state = filter_by_state(state, all_meta_lists, True)
    print('AFTER filtering out duplicates:')
    postfilt_state = filter_by_state(state, unique_metadata_list, True)
    print()

In [None]:
# Print the filtered site count for a few additional states. As before,
# postfilt_state ends up holding the result for the last state in the list.
for spot_check_state in ['Connecticut', 'New York', 'Texas']:
    postfilt_state = filter_by_state(spot_check_state, unique_metadata_list, True)

In [None]:
# Test current code formulation
#
# Re-runs the filter on conus_list only (the single-request formulation).
# NOTE: this rebinds unique_lids, duplicate_lids, nonelid_metadata_list,
# duplicate_metadata_list and unique_metadata_list, replacing the values computed
# from all_meta_lists earlier. Cells that read unique_metadata_list give
# different results depending on whether they run before or after this one.

unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(conus_list, True)
print()
# Filters the unfiltered conus_list (not the de-duplicated list) by state
conus_list_filt = filter_by_state('Alaska', conus_list, True)

### Get a HUC8 list for a given HUC2 region

Creates a list of HUC8 IDs in the provided FIM output path based on an input HUC2 value. 

In [None]:
# Folder whose entries are listed; entries are treated as HUC IDs by name only
fim_output_path = '/data/previous_fim/fim_4_5_2_11/'

# Two-character prefix used to select entries (one region active at a time)
# huc2 = '20' # Hawaii
# huc2 = '21' # Puerto Rico
huc2 = '19' # Alaska

# os.listdir returns every entry name (files as well as folders), in arbitrary order
all_hucs = os.listdir(fim_output_path)

# Keep the entries whose name starts with the chosen prefix
subsetted_hucs = [x for x in all_hucs if x.startswith(huc2)]

# Print one entry per line
for i in subsetted_hucs:
    print(i)

### Get stats for CatFIM Results

Summarizes the number of mapped and unmapped points for a list of CatFIM results given a list of states. 

In [None]:
# Define function that counts mapped vs unmapped values for a CatFIM output folder

def count_mapped_for_state(catfim_folder, result_folders, states):
    '''
    Print mapped vs. unmapped CatFIM site counts per state for every result folder.

    For each folder in result_folders, the sites table is looked up as a file
    ending in 'catfim_sites.csv' inside <catfim_folder>/<result_folder>/mapping,
    read with gpd.read_file, and summarised for each requested state. Folders
    with no mapping directory, no sites csv, or an unreadable csv are reported
    and skipped so that the remaining folders are still processed.

    Parameters:
    - catfim_folder: path of the folder that contains the result folders
    - result_folders: iterable of result folder names
    - states: iterable of values matched against the 'states' column

    Returns:
    - 1-tuple holding the sites table of the last folder that was read
      successfully, or (None,) if no folder could be read
    '''
    # Kept across iterations so the return value is defined even if every folder is skipped
    catfim_points = None

    for result_folder in result_folders:
        print()
        print('-----' + result_folder + '-----')

        catfim_outputs_mapping_path = os.path.join(catfim_folder, result_folder, 'mapping')

        # A missing mapping folder should not abort the remaining folders
        if not os.path.isdir(catfim_outputs_mapping_path):
            print(f'Mapping folder not found: {catfim_outputs_mapping_path}')
            continue

        # Get filepath
        site_csv_names = [
            file for file in os.listdir(catfim_outputs_mapping_path) if file.endswith('catfim_sites.csv')
        ]
        if not site_csv_names:
            print(f'No site csv found in {catfim_outputs_mapping_path}')
            continue

        # If several files match, use the last one listed
        catfim_points_path = os.path.join(catfim_outputs_mapping_path, site_csv_names[-1])

        # Open points file
        try:
            folder_points = gpd.read_file(catfim_points_path)
        except Exception as e:
            print('An error occurred', e)
            continue

        catfim_points = folder_points
        _print_state_counts(folder_points, states)

    # Return only after every folder has been visited
    return (catfim_points,)


def _print_state_counts(catfim_points, states):
    '''Print mapped/unmapped, duplicate-LID, and HUC8 summaries for each state with rows.'''
    # The LID column name differs between outputs; resolve it once per table
    lid_column = next((col for col in ('ahps_lid', 'nws_lid') if col in catfim_points.columns), None)
    if lid_column is None:
        print('Could not find ahps_lid or nws_lid column in csv.')
        print(catfim_points.columns)
        return

    # Get mapped vs unmapped data for the listed states
    for state in states:
        catfim_points_state = catfim_points[catfim_points['states'] == state]

        # States without any rows are skipped silently
        if len(catfim_points_state) == 0:
            continue

        catfim_points_state_mapped = catfim_points_state[catfim_points_state['mapped'] == 'yes']
        num_mapped = len(catfim_points_state_mapped)
        num_not_mapped = len(catfim_points_state[catfim_points_state['mapped'] == 'no'])

        lid_list = catfim_points_state[lid_column]
        lid_list_mapped = catfim_points_state_mapped[lid_column]
        huc_list_unique = set(catfim_points_state['HUC8'])

        # Duplicates = rows beyond the first for each LID
        num_duplicate_sites = len(lid_list) - len(set(lid_list))
        num_duplicate_sites_mapped = len(lid_list_mapped) - len(set(lid_list_mapped))

        print(f'{state} \n   Mapped: {num_mapped} \n   Not mapped: {num_not_mapped}')

        print(f'   Number of duplicate LIDs: {num_duplicate_sites}')
        print(f'   Number of duplicate LIDs mapped: {num_duplicate_sites_mapped}')

        print(f'   {len(huc_list_unique)} hucs: {huc_list_unique}')

In [None]:
# Set up inputs for counting mapped points in the CatFIM results

# Input state list
# (matched against the 'states' column of each catfim_sites csv)
states = ['AK', 'PR', 'HI']

# Previous runs
# Parent folder and the result folder names under it; each result folder is
# read as <catfim_folder>/<result_folder>/mapping by count_mapped_for_state()
catfim_folder_prev = '/data/catfim/'
result_folders_prev = ['hand_4_5_11_1_flow_based', 'hand_4_5_11_1_stage_based', 'fim_4_5_2_11_flow_based', 'fim_4_5_2_11_stage_based']

# Current test runs
catfim_folder_testing = '/data/catfim/emily_test'
result_folders_testing = ['site_filtering_HI_flow_based', 'site_filtering_HI_stage_based', 
                  'site_filtering_PR_flow_based', 'site_filtering_PR_stage_based', 
                  'site_filtering_AK_flow_based', 'site_filtering_AK_stage_based']

In [None]:
# Run count_mapped_for_state() for previous CatFIM results
# (the summary is printed by the function; the return value is not stored)

count_mapped_for_state(catfim_folder_prev, result_folders_prev, states)

In [None]:
# Run count_mapped_for_state() for most recent CatFIM results
# (the summary is printed by the function; the return value is not stored)

count_mapped_for_state(catfim_folder_testing, result_folders_testing, states)

In [None]:
# Quantify duplicate CatFIM sites
#
# NOTE(review): catfim_points_state is not assigned at notebook scope in any cell
# visible here -- it only exists as a local inside count_mapped_for_state(), and
# the calls above do not store that function's return value. On a fresh kernel
# this cell raises NameError. Bind a per-state table first (which run and state
# to use is not determined by this notebook -- confirm with the author).
# NOTE(review): the LID column is hard-coded to 'ahps_lid' here, whereas
# count_mapped_for_state() also accepts 'nws_lid'.

huc_list = catfim_points_state['HUC8']
lid_list = catfim_points_state['ahps_lid']

huc_list_unique = set(huc_list)
lid_list_unique = set(lid_list)
# Duplicates = rows beyond the first for each LID
num_duplicate_sites = len(lid_list) - len(lid_list_unique)

print(f'Number of duplicate LIDs: {num_duplicate_sites}')
print(f'{len(huc_list_unique)} hucs: {huc_list_unique}')


### Create a Table of Sites with NA Value in the State Column

This code is helpful for testing instability in metadata columns. 

In [None]:
input_meta_list = conus_list # conus_list or islands_list

# -----

# Metadata sections that each carry their own 'state' value
state_sources = ['nws_data', 'usgs_data', 'nws_preferred', 'usgs_preferred']
state_columns = [f'{source}_state' for source in state_sources]

# One record per site: its position in the input list, its LID, and the state
# reported by each metadata section
state_data = []

for i, site in enumerate(input_meta_list):
    row = {'index': i, 'lid': site['identifiers']['nws_lid']}

    for source, column in zip(state_sources, state_columns):
        row[column] = site[source]['state']

    state_data.append(row)

# Columns are named explicitly so the table keeps its shape for an empty input list
state_data_df = pd.DataFrame(state_data, columns=['index', 'lid'] + state_columns)

# Single-row summary: number of missing state values per source column
summary_df = pd.DataFrame(
    {column: state_data_df[column].isna().sum() for column in state_columns},
    index=[f'Number of NA Values in State Column, out of {len(state_data_df)} rows'],
)

# Show the summary as the cell output
summary_df

In [None]:
# Alternative views (swap in as needed):
# state_data_df[state_data_df['nws_data_state'].isna()]
# state_data_df.loc[(state_data_df['nws_data_state'].isna()) & (state_data_df['usgs_data_state'].isna())]

# Rows where both the NWS-preferred and USGS-preferred state values are missing
state_data_df.loc[(state_data_df['nws_preferred_state'].isna()) & (state_data_df['usgs_preferred_state'].isna())]

### Parse CatFIM Logs into a DataFrame

In [None]:
run_path = '/data/catfim/emily_test/site_filtering_PR_stage_based/'
log_file = 'catfim_2024_12_12-19_24_09.log'

log_path = os.path.join(run_path, 'logs', log_file)


def parse_catfim_log_line(line):
    '''
    Turn one CatFIM log line into a record dict, or return None to skip it.

    A line is kept only if it contains a run of 8 digits (taken as the HUC8),
    five characters right after the first ' : ' (taken as the LID), and the
    text 'WARNING' or 'TRACE'. The message is everything after '<lid>:' up to
    the end of the line; if that marker is absent, the record is still returned
    with an empty message and message_type.

    Returns:
    - dict with keys 'huc', 'lid', 'message_type', 'message', or None
    '''
    # Get the HUC8
    huc_match = re.search(r"\d{8}", line)
    if not huc_match:
        return None

    # Get the LID
    lid_match = re.search(r"(?<= : ).{5}", line)
    if not lid_match:
        return None

    if 'WARNING' in line:
        level = 'WARNING'
    elif 'TRACE' in line:
        level = 'TRACE'
    else:
        return None

    lid = lid_match.group()
    message, message_type = '', ''

    # The LID is arbitrary text from the log, so escape it before building the
    # pattern; otherwise characters such as '(' or '*' are read as regex syntax.
    message_match = re.search(f"(?<={re.escape(lid + ':')})(.*)", line)
    if message_match:
        message = message_match.group()
        message_type = level

    return {'huc': huc_match.group(), 'lid': lid, 'message_type': message_type, 'message': message}


# Collect the records first and build the DataFrame once at the end
log_records = []

with open(log_path) as f:
    for line in f:
        record = parse_catfim_log_line(line)
        if record is not None:
            log_records.append(record)

# Explicit columns keep the table shape stable when no line matched
out_df = pd.DataFrame(log_records, columns=['huc', 'lid', 'message_type', 'message'])

# out_df.to_csv('PR_error_logs.csv')