In [179]:
import argparse
import copy
import glob
import os
import pickle
import random
import shutil
import sys
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed, wait
from datetime import datetime, timezone
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tools_shared_functions import (
    aggregate_wbd_hucs,
    filter_nwm_segments_by_stream_order,
    flow_data,
    get_metadata,
    get_nwm_segs,
    get_thresholds,
)

import utils.fim_logger as fl
from utils.shared_variables import VIZ_PROJECTION

In [72]:
## Functions to process and filter the metadata

# -------------------------------------------------------
def list_of_lids(conus_list, verbose):
    '''
    Collect the NWS LID of every site in conus_list, in input order.

    Parameters:
    - conus_list: iterable of site metadata dicts; each must provide
      site['identifiers']['nws_lid'].
    - verbose: when equal to True, the collected list is printed.

    Returns:
    - list of LIDs (repeated and None values are kept as-is).

    Example:
    lid_list = list_of_lids(conus_list, True)
    '''
    collected = [entry['identifiers']['nws_lid'] for entry in conus_list]

    if verbose == True:
        print(f'List of LIDs: {collected}')

    return collected

# lid_list = list_of_lids(conus_list, True)

# -------------------------------------------------------
def list_duplicate_lids(conus_list, verbose):
    '''
    Split the LIDs found in conus_list into first occurrences and repeats.

    Parameters:
    - conus_list: iterable of site metadata dicts; each must provide
      site['identifiers']['nws_lid'].
    - verbose: when truthy, print the unique count and the duplicate list.

    Returns:
    - lid_list: each distinct LID once, in order of first appearance.
    - duplicate_lid_list: one entry for every repeat occurrence of a LID
      (a LID seen three times appears here twice).

    Example:
    lid_list, duplicate_lid_list = list_duplicate_lids(conus_list, True)
    '''
    lid_list = []
    duplicate_lid_list = []

    # Set mirror of lid_list: constant-time membership checks instead of
    # rescanning the growing list for every site.
    seen_lids = set()

    for site in conus_list:
        nws_lid = site['identifiers']['nws_lid']

        if nws_lid in seen_lids:
            duplicate_lid_list.append(nws_lid)
        else:
            seen_lids.add(nws_lid)
            lid_list.append(nws_lid)

    if verbose:
        print(f'Length of unique LID list: {len(lid_list)}')
        print(f'List of duplicate LIDs: {duplicate_lid_list}')

    return lid_list, duplicate_lid_list

# lid_list, duplicate_lid_list = list_duplicate_lids(conus_list, True)

# -------------------------------------------------------
def filter_by_lid(lid_filter, conus_list, verbose):
    '''
    Keep only the sites whose NWS LID equals lid_filter.

    Parameters:
    - lid_filter: value compared (with ==) against
      site['identifiers']['nws_lid']. Passing the None object selects
      sites that have no LID; the string 'None' does not.
    - conus_list: iterable of site metadata dicts.
    - verbose: when equal to True, print the filter and the match count.

    Returns:
    - list of matching site dicts, in input order.

    Example:
    conus_list_filt = filter_by_lid(None, conus_list, True)
    '''
    matching_sites = [
        entry for entry in conus_list if entry['identifiers']['nws_lid'] == lid_filter
    ]

    if verbose == True:
        print(f'LID filter: {lid_filter} \nNumber of sites: {len(matching_sites)}')

    return matching_sites

# conus_list_filt = filter_by_lid('None', conus_list, True)

# -------------------------------------------------------
def filter_by_state(state_filter, conus_list, verbose):
    '''
    Keep only the sites whose NWS state entry equals state_filter.

    Parameters:
    - state_filter: value compared (with ==) against
      site['nws_data']['state']; this notebook passes full state names
      such as 'Alaska'.
    - conus_list: iterable of site metadata dicts.
    - verbose: when equal to True, print the state and the match count.

    Returns:
    - list of matching site dicts, in input order.

    Example:
    conus_list_filt = filter_by_state('Alaska', conus_list, True)
    '''
    sites_in_state = [
        entry for entry in conus_list if entry['nws_data']['state'] == state_filter
    ]

    if verbose == True:
        print(f'State: {state_filter} \nNumber of sites: {len(sites_in_state)}')

    return sites_in_state

# conus_list_filt = filter_by_state('Alaska', conus_list, True)

# -------------------------------------------------------

In [73]:
## Testing get_metadata() functionality


# --------- Inputs --------- 


# Trace distance handed to get_metadata() for both the upstream and the
# downstream direction (units are defined by get_metadata — TODO confirm).
search = 5

nwm_us_search, nwm_ds_search = search, search


# output_catfim_dir = 
# Metadata endpoint of the location API that both requests below query.
API_BASE_URL = 'https://nwcal-wrds.nwc.nws.noaa.gov/api/location/v3.0'
metadata_url = f'{API_BASE_URL}/metadata'


# lid_to_run = 
# nwm_metafile = 

# --------- Code --------- 

# Placeholder only: rebound at the end of this cell to the combined list.
all_meta_lists = []


# Request 1: all LIDs, restricted via must_include to 'nws_data.rfc_forecast_point'.
# The second return value of get_metadata() is discarded.
# NOTE(review): `___` is also IPython's shortcut for the third-most-recent
# cell output, so assigning to it shadows that shortcut.
conus_list, ___ = get_metadata(
    metadata_url,
    select_by='nws_lid',
    selector=['all'],
    must_include='nws_data.rfc_forecast_point',
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)


# Get metadata for Islands and Alaska
# Request 2: every site in HI, PR and AK, with no must_include restriction.
islands_list, ___ = get_metadata(
    metadata_url,
    select_by='state',
    selector=['HI', 'PR', 'AK'],
    must_include=None,
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)
# Append the lists
# Plain concatenation: a site returned by both requests appears twice here.
all_meta_lists = conus_list + islands_list

# print(islands_list)

# with open(meta_file, "wb") as p_handle:
#     pickle.dump(all_meta_lists, p_handle, protocol=pickle.HIGHEST_PROTOCOL)


In [13]:
# print(f'Length of conus_list: {len(conus_list)}')
# print(f'Length of islands_list: {len(islands_list)}')
# print(f'Length of all_meta_lists: {len(all_meta_lists)}')

In [19]:
# ------ New addition: filtering ------



# -- function --
def filter_metadata_list(metadata_list, verbose):
    '''
    Filter metadata list to remove:
    - sites where the nws_lid is None
    - duplicate sites (only the first site seen for each LID is kept)

    Parameters:
    - metadata_list: iterable of site metadata dicts; each must provide
      site['identifiers']['nws_lid'].
    - verbose: when truthy, print the input/output lengths and the
      unique / duplicate / None counts.

    Returns (in this order):
    - unique_lids: each non-None LID once, in order of first appearance.
    - duplicate_lids: one entry per repeat occurrence of a LID.
    - nonelid_metadata_list: sites whose LID is None.
    - duplicate_metadata_list: the sites behind duplicate_lids.
    - unique_metadata_list: the first site seen for each unique LID.
    '''

    unique_lids, duplicate_lids = [], []
    duplicate_metadata_list, unique_metadata_list = [], []

    nonelid_metadata_list = []  # TODO: remove eventually?

    # Set mirror of unique_lids: constant-time membership checks instead of
    # rescanning the growing list for each of the several thousand sites.
    seen_lids = set()

    for site in metadata_list:
        nws_lid = site['identifiers']['nws_lid']

        if nws_lid is None:
            # No LID available
            nonelid_metadata_list.append(site)

            # TODO: replace this with Continue, eventually we wont need this list

        elif nws_lid in seen_lids:
            # Duplicate LID
            duplicate_lids.append(nws_lid)
            duplicate_metadata_list.append(site)

        else:
            # Unique/unseen LID that's not None
            seen_lids.add(nws_lid)
            unique_lids.append(nws_lid)
            unique_metadata_list.append(site)

    if verbose:
        print(f'Input metadata list length: {len(metadata_list)}')
        print(f'Output (unique) metadata list length: {len(unique_metadata_list)}')
        print(f'Number of unique LIDs: {len(unique_lids)} \nNumber of duplicate LIDs: {len(duplicate_lids)} \nNumber of None LIDs: {len(nonelid_metadata_list)}')

    # TODO: eventually, have it only return necessary objects
    return unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list



# Filter the combined list (forecast points + all HI/PR/AK sites); the names
# bound here are the ones the state-count cell reads.
unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(all_meta_lists, True)
print()




Input metadata list length: 7631
Output (unique) metadata list length: 7214
Number of unique LIDs: 7214 
Number of duplicate LIDs: 152 
Number of None LIDs: 265



In [23]:
## Get state counts

# Full state names, as compared against site['nws_data']['state'] by filter_by_state().
state_list = ['Puerto Rico', 'Hawaii', 'Alaska']

# Baseline: counts per state in the forecast-point-only list (conus_list).
print('Current Code: Single API call (only forecast points)')
for state in state_list: 
    currentcode_state = filter_by_state(state, conus_list, True)
    print()

print()
print('Proposed Update: Double API call (forecast points + all HI, AK, and PR points)')
print()
# Proposed: counts per state in unique_metadata_list.
# NOTE(review): unique_metadata_list is rebound by more than one cell, so these
# numbers depend on which filter_metadata_list() call ran last.
for state in state_list: 
    # print('Before filtering out duplicates:')
    # prefilt_state = filter_by_state(state, all_meta_lists, True)
    print('AFTER filtering out duplicates:')
    postfilt_state = filter_by_state(state, unique_metadata_list, True)
    print()


Current Code: Single API call (only forecast points)
State: Puerto Rico 
Number of sites: 5

State: Hawaii 
Number of sites: 2

State: Alaska 
Number of sites: 145


Proposed Update: Double API call (forecast points + all HI, AK, and PR points)

AFTER filtering out duplicates:
State: Puerto Rico 
Number of sites: 238

AFTER filtering out duplicates:
State: Hawaii 
Number of sites: 495

AFTER filtering out duplicates:
State: Alaska 
Number of sites: 1950



In [16]:
# Spot-check three states outside HI/PR/AK; only the printed counts matter,
# postfilt_state is overwritten by each call.
postfilt_state = filter_by_state('Connecticut', unique_metadata_list, True)
postfilt_state = filter_by_state('New York', unique_metadata_list, True)
postfilt_state = filter_by_state('Texas', unique_metadata_list, True)

State: Connecticut 
Number of sites: 23
State: New York 
Number of sites: 142
State: Texas 
Number of sites: 380


In [17]:
## Current code formulation


# Same filter applied to the forecast-point-only list (conus_list).
# NOTE(review): this rebinds unique_lids, duplicate_lids, nonelid_metadata_list,
# duplicate_metadata_list and unique_metadata_list, which are also assigned from
# all_meta_lists elsewhere in the notebook; later cells see whichever ran last.
unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(conus_list, True)
print()
conus_list_filt = filter_by_state('Alaska', conus_list, True)


Input metadata list length: 4679
Output (unique) metadata list length: 4679
Number of unique LIDs: 4679 
Number of duplicate LIDs: 0 
Number of None LIDs: 0

State: Alaska 
Number of sites: 145


In [18]:
# lid_list, duplicate_lid_list = list_duplicate_lids(all_meta_lists, True)

# Count the HI/PR/AK sites that have no LID: the None object is passed here,
# not the string 'None'.
conus_list_filt = filter_by_lid(None, islands_list, True)


LID filter: None 
Number of sites: 265


In [3]:
## Trying to time the API call (but the results were weird)
# from time import process_time
# t1_start = process_time() # stopwatch
 

# conus_list, ___ = get_metadata(
#     metadata_url,
#     select_by='nws_lid',
#     selector=['all'],
#     must_include='nws_data.rfc_forecast_point',
#     upstream_trace_distance=nwm_us_search,
#     downstream_trace_distance=nwm_ds_search,
# )



# print('List length:', len(conus_list))

# t1_stop = process_time() # stopwatch
# print('Runtime (seconds):', t1_stop-t1_start) 

In [5]:
# conus_list[1]
# conus_list[1]['identifiers']['nws_lid']

In [138]:
# state_list, ___ = get_metadata(
#     metadata_url,
#     select_by='state',
#     # selector=['HI', 'PR', 'AK'],
#     selector=['AK'],

#     # must_include='identifiers.nws_lid', ## didn't work, oh well
#     must_include=None,

#     # must_include='nws_data.rfc_forecast_point',
#     upstream_trace_distance=nwm_us_search,
#     downstream_trace_distance=nwm_ds_search,
# )

In [6]:

# unique_lids, duplicate_lids, nonelid_metadata_list, duplicate_metadata_list, unique_metadata_list =  filter_metadata_list(state_list, True)

In [84]:
# filt_list = conus_list[conus_list['identifiers']['nws_lid']=='00RDR']
# type(conus_list)
# list of dictionaries of dictionaries 
# type(conus_list[1])
# conus_list[1].keys()
# conus_list[1]

### Get a HUC list for a given HUC02 region 

In [75]:
# Folder whose entries are listed below; entry names are matched by prefix.
fim_output_path = '/data/previous_fim/fim_4_5_2_11/'

# huc2 = '20' # Hawaii
# huc2 = '21' # Puerto Rico
huc2 = '19' # Alaska


all_hucs = os.listdir(fim_output_path)

# os.listdir() returns names in arbitrary order; sort the subset so the
# printed list is reproducible from run to run.
subsetted_hucs = sorted(x for x in all_hucs if x.startswith(huc2))


for huc in subsetted_hucs:
    print(huc)

19020101
19020102
19020103
19020104
19020201
19020202
19020203
19020301
19020302
19020401
19020402
19020501
19020502
19020503
19020504
19020505
19020601
19020602
19020800


### Get stats for current CatFIM

In [205]:
## Inputs
# Two-letter codes, compared against the 'states' column of each sites csv
# inside count_mapped_for_state().
states = ['AK', 'PR', 'HI']

## Previous runs
# Each result folder is expected to contain a 'mapping' subfolder holding a
# '*catfim_sites.csv' file (that is what count_mapped_for_state() looks for).
catfim_folder_prev = '/data/catfim/'
result_folders_prev = ['hand_4_5_11_1_flow_based', 'hand_4_5_11_1_stage_based', 'fim_4_5_2_11_flow_based', 'fim_4_5_2_11_stage_based']

## Current test runs
catfim_folder_testing = '/data/catfim/emily_test'
result_folders_testing = ['site_filtering_HI_flow_based', 'site_filtering_HI_stage_based', 
                  'site_filtering_PR_flow_based', 'site_filtering_PR_stage_based', 
                  'site_filtering_AK_flow_based', 'site_filtering_AK_stage_based']



def count_mapped_for_state(catfim_folder, result_folders, states):
    '''
    Print per-state mapped / not-mapped site counts for a set of CatFIM runs.

    For every run folder, the '*catfim_sites.csv' file found in
    <catfim_folder>/<result_folder>/mapping is read. For each requested
    state that has at least one row, the function prints the number of
    rows with mapped == 'yes' and mapped == 'no', the number of repeated
    LIDs, and the set of distinct HUC8 values.

    A run whose mapping folder cannot be listed, that has no sites csv,
    whose csv cannot be read, or whose csv has neither an 'ahps_lid' nor
    an 'nws_lid' column is reported and skipped; the remaining runs are
    still processed.

    Parameters:
    - catfim_folder: parent folder that holds the run folders.
    - result_folders: iterable of run folder names inside catfim_folder.
    - states: iterable of values compared against the csv 'states' column.

    Returns:
    - None; all results are printed. Nothing is bound at notebook scope.
    '''
    for result_folder in result_folders:
        print()
        print('-----' + result_folder + '-----')

        catfim_outputs_mapping_path = os.path.join(catfim_folder, result_folder, 'mapping')

        # A missing or unreadable run folder must not abort the other runs.
        try:
            mapping_files = sorted(os.listdir(catfim_outputs_mapping_path))
        except OSError as e:
            print(f'Could not list {catfim_outputs_mapping_path}: {e}')
            continue

        # Sorted listing makes the pick deterministic when several files
        # match; the last match is used.
        site_csvs = [name for name in mapping_files if name.endswith('catfim_sites.csv')]
        if not site_csvs:
            print(f'No site csv found in {catfim_outputs_mapping_path}')
            continue
        catfim_points_path = os.path.join(catfim_outputs_mapping_path, site_csvs[-1])

        # Open points file
        try:
            catfim_points = gpd.read_file(catfim_points_path)
        except Exception as e:
            print('An error occurred', e)
            continue

        # The LID column is named differently between runs; resolve it once
        # per file instead of once per state.
        if 'ahps_lid' in catfim_points.columns:
            lid_column = 'ahps_lid'
        elif 'nws_lid' in catfim_points.columns:
            lid_column = 'nws_lid'
        else:
            print('Could not find ahps_lid or nws_lid column in csv.')
            print(catfim_points.columns)
            continue

        # Get mapped vs unmapped data for the listed states
        for state in states:
            catfim_points_state = catfim_points[catfim_points['states'] == state]

            # States without any site in this run are skipped silently.
            if len(catfim_points_state) == 0:
                continue

            num_not_mapped = len(catfim_points_state[catfim_points_state['mapped'] == 'no'])
            num_mapped = len(catfim_points_state[catfim_points_state['mapped'] == 'yes'])

            huc_list = catfim_points_state['HUC8']
            lid_list = catfim_points_state[lid_column]

            huc_list_unique = set(huc_list)
            lid_list_unique = set(lid_list)
            num_duplicate_sites = len(lid_list) - len(lid_list_unique)

            print(f'{state} \n   Mapped: {num_mapped} \n   Not mapped: {num_not_mapped}')

            print(f'   Number of duplicate LIDs: {num_duplicate_sites}')
            print(f'   {len(huc_list_unique)} hucs: {huc_list_unique}')
        



In [206]:
# Print mapped / not-mapped counts per state for the previous CatFIM runs.
count_mapped_for_state(catfim_folder_prev, result_folders_prev, states)


-----hand_4_5_11_1_flow_based-----
AK 
   Mapped: 14 
   Not mapped: 39
   Number of duplicate LIDs: 0
   14 hucs: {'19020503', '19020202', '19020302', '19020502', '19020402', '19020401', '19020104', '19020101', '19020102', '19020505', '19020504', '19020501', '19020201', '19020301'}
PR 
   Mapped: 4 
   Not mapped: 1
   Number of duplicate LIDs: 0
   2 hucs: {'21010005', '21010002'}
HI 
   Mapped: 1 
   Not mapped: 1
   Number of duplicate LIDs: 0
   2 hucs: {'20020000', '20010000'}

-----hand_4_5_11_1_stage_based-----
AK 
   Mapped: 13 
   Not mapped: 40
   Number of duplicate LIDs: 0
   14 hucs: {'19020503', '19020202', '19020302', '19020502', '19020402', '19020401', '19020104', '19020101', '19020102', '19020505', '19020504', '19020501', '19020201', '19020301'}
PR 
   Mapped: 0 
   Not mapped: 5
   Number of duplicate LIDs: 0
   2 hucs: {'21010005', '21010002'}
HI 
   Mapped: 0 
   Not mapped: 2
   Number of duplicate LIDs: 0
   2 hucs: {'20020000', '20010000'}

-----fim_4_5_2_11_fl

In [None]:
# Print mapped / not-mapped counts per state for the current site-filtering test runs.
count_mapped_for_state(catfim_folder_testing, result_folders_testing, states)

yes
Index(['field_1', 'wrds_timestamp', 'nrldb_timestamp', 'nwis_timestamp',
       'metadata_sources', 'ahps_lid', 'usgs_gage', 'nwm_seg',
       'identifiers_goes_id', 'identifiers_env_can_gage_id', 'nws_data_name',
       'nws_data_wfo', 'nws_data_rfc', 'nws_data_geo_rfc', 'nws_data_latitude',
       'nws_data_longitude', 'nws_data_map_link',
       'nws_data_horizontal_datum_name', 'nws_data_state', 'nws_data_county',
       'nws_data_county_code', 'nws_data_huc', 'nws_data_hsa',
       'nws_data_zero_datum', 'nws_data_vertical_datum_name',
       'nws_data_rfc_forecast_point', 'nws_data_rfc_defined_fcst_point',
       'nws_data_riverpoint', 'usgs_data_name', 'usgs_data_geo_rfc',
       'usgs_data_latitude', 'usgs_data_longitude', 'usgs_data_map_link',
       'usgs_data_coord_accuracy_code', 'usgs_data_latlon_datum_name',
       'usgs_data_coord_method_code', 'usgs_data_state', 'usgs_data_huc',
       'usgs_data_site_type', 'usgs_data_altitude',
       'usgs_data_alt_accuracy_code'

In [194]:

# NOTE(review): catfim_points_state is not assigned at notebook scope in any
# visible cell (it is a local inside count_mapped_for_state), so this cell
# depends on leftover kernel state and will fail on a fresh Run All.
huc_list = catfim_points_state['HUC8']
lid_list = catfim_points_state['ahps_lid']


# Distinct values; the length difference is the number of repeated LID rows.
huc_list_unique = set(huc_list)
lid_list_unique = set(lid_list)
num_duplicate_sites = len(lid_list) - len(lid_list_unique)


print(f'Number of duplicate LIDs: {num_duplicate_sites}')
print(f'{len(huc_list_unique)} hucs: {huc_list_unique}')




Number of duplicate LIDs: 0
7 hucs: {'20050000', '20060000', '20030000', '20020000', '20010000', '20070000', '20040000'}


In [188]:
# Display the per-state subset.
# NOTE(review): catfim_points_state is not defined by any visible cell at notebook scope.
catfim_points_state

Unnamed: 0,field_1,wrds_timestamp,nrldb_timestamp,nwis_timestamp,metadata_sources,ahps_lid,usgs_gage,nwm_seg,identifiers_goes_id,identifiers_env_can_gage_id,...,crosswalk_datasets_nws_usgs_crosswalk_dataset_nws_usgs_crosswalk_dataset_id,crosswalk_datasets_nws_usgs_crosswalk_dataset_name,crosswalk_datasets_nws_usgs_crosswalk_dataset_description,assigned_crs,HUC8,name,states,mapped,status,geometry
0,0,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,nlih1,16717000,800008995,D114953C,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20010000,Hawaii,HI,yes,Missing flow data for action; moderate; major;...,POINT (-17271423.078598868 2245133.7728782427)
1,1,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,wwkh1,16518000,800015240,DD5E710A,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20020000,Maui,HI,yes,Missing flow data for action; moderate; record,POINT (-17381756.156252272 2369756.847743014)
2,2,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,aeph1,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20070000,Kauai,HI,no,Error getting stages values from WRDS API,POINT (-17754685.729507785 2501056.3808664195)
3,3,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,ahmh1,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20010000,Hawaii,HI,no,Error getting stages values from WRDS API,POINT (-17322766.10523033 2253658.9837820856)
4,4,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,ahuh1,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20060000,Oahu,HI,no,Error getting stages values from WRDS API,POINT (-17570359.20600229 2443458.5061386973)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,484,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,ito,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20010000,Hawaii,HI,no,Error getting stages values from WRDS API,POINT (-17260732.700543668 2240150.578406945)
485,485,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,lih,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20070000,Kauai,HI,no,Error getting stages values from WRDS API,POINT (-17737502.947662175 2509184.187952809)
486,486,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,ogg,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20020000,Maui,HI,no,Error getting stages values from WRDS API,POINT (-17414375.861736543 2378612.189414868)
487,487,USGS data: USGS NWIS - Last updated: 2024-12-1...,NWS data: NRLDB - Last updated: 2024-12-10 22:...,USGS data: USGS NWIS - Last updated: 2024-12-1...,['NWS data: NRLDB - Last updated: 2024-12-10 2...,phnl,,,,,...,3.0,NWS Station to USGS Gages 3.0,Created 20240328. Authoritative 3.0 dataset m...,EPSG:4269 Assumed,20060000,Oahu,HI,no,Error getting stages values from WRDS API,POINT (-17581738.531727582 2430541.04639686)


In [43]:
# print(len(catfim_points[catfim_points['mapped']=='yes']))

# mapped_points_state = catfim_points_state[catfim_points_state['mapped']=='yes']




# Table to check; it must have an 'ahps_lid' column.
# NOTE(review): catfim_points_state is not defined by any visible cell at
# notebook scope, so this cell depends on leftover kernel state.
input_points = catfim_points_state


# ---- function ----

# Vectorized duplicate detection: duplicated(keep='first') flags every
# occurrence of a LID after its first one, which replaces the row-by-row
# iterrows() loop and its repeated list scans.
lid_series = input_points['ahps_lid']
is_repeat = lid_series.duplicated(keep='first')

# First occurrence of each LID, in table order.
lid_list = lid_series[~is_repeat].tolist()
# One entry per repeat occurrence, in table order.
duplicate_lid_list = lid_series[is_repeat].tolist()

print(f'Length of unique LID list: {len(lid_list)}')
print(f'List of duplicate LIDs: {duplicate_lid_list}')


Length of unique LID list: 489
List of duplicate LIDs: []


In [38]:
# NOTE(review): mapped_points_state is only assigned in commented-out code in
# the visible cells, so this line fails on a fresh kernel.
type(mapped_points_state)

pandas.core.frame.DataFrame

### Test state column instability

In [167]:



# Metadata list to inspect: conus_list or islands_list
input_meta_list = conus_list

# -----

# Metadata sections that each carry their own 'state' entry; the order here
# is the column order of the resulting tables.
state_sources = ['nws_data', 'usgs_data', 'nws_preferred', 'usgs_preferred']

# One record per site: position, LID, and the state reported by each section.
state_data = []
for position, site in enumerate(input_meta_list):
    record = {'index': position, 'lid': site['identifiers']['nws_lid']}
    for source in state_sources:
        record[f'{source}_state'] = site[source]['state']
    state_data.append(record)

state_data_df = pd.DataFrame(state_data)

# One-row summary: number of missing values in each state column.
na_row_label = f'Number of NA Values in State Column, out of {len(state_data_df)} rows'
summary_df = pd.DataFrame(
    {
        f'{source}_state': state_data_df[f'{source}_state'].isna().sum()
        for source in state_sources
    },
    index=[na_row_label],
)
 


In [168]:
# Per-site table of the four 'state' fields.
state_data_df

Unnamed: 0,index,lid,nws_data_state,usgs_data_state,nws_preferred_state,usgs_preferred_state
0,0,00BRD,California,,California,California
1,1,00RDR,California,,California,California
2,2,1TEST,Vermont,,Vermont,Vermont
3,3,AABDB,California,,California,California
4,4,AACLA,California,,California,California
...,...,...,...,...,...,...
4674,4674,RAPV2,Virginia,,Virginia,Virginia
4675,4675,YWPC1,California,,California,California
4676,4676,WSPV2,Virginia,,Virginia,Virginia
4677,4677,AMLC1,California,,California,California


In [169]:
# Missing-value count for each of the four 'state' columns.
summary_df

Unnamed: 0,nws_data_state,usgs_data_state,nws_preferred_state,usgs_preferred_state
"Number of NA Values in State Column, out of 4679 rows",2,1041,1,1


In [171]:
# noneval = 'None'

# len(state_data_df[state_data_df['usgs_data_state']==None])

# state_data_df[state_data_df['lid']=='YWPC1']

# state_data_df[state_data_df['index']==2]





# state_data_df[state_data_df['nws_data_state'].isna()]

# state_data_df.loc[(state_data_df['nws_data_state'].isna()) & (state_data_df['usgs_data_state'].isna())]
# Rows where both the NWS-preferred and the USGS-preferred state are missing.
state_data_df.loc[(state_data_df['nws_preferred_state'].isna()) & (state_data_df['usgs_preferred_state'].isna())]



## So now the question is: which of these 'state' columns is being used to pull the data? If it's anything other than the USGS data state column, then we're actually OK, I think...


Unnamed: 0,index,lid,nws_data_state,usgs_data_state,nws_preferred_state,usgs_preferred_state
302,302,BEAA3,,,,
