## Make ras2fim/ripple boundary dataset

###### This comes in two parts. First, it loads all of the stats.csv files from the ripple downloads, which are often split into smaller groups, e.g. for FIM30 there were 485 MCs (model collections), but they were downloaded in sets of 50. Second, it loads the ras2fim v2 model counts and combines them with the ripple data.

Last edited: Mar 5, 2025

### Processing steps
1) Load all of the ripple stats csvs
   
2) Load the old ras2fim v2 data. At this point, we don't have a specific dataset for ras2fim that has the HUC number and number of models included. We will make a simple csv with a HUC column and a model count column (the code below reads them as `huc` and `num_features`).

3) Merge the ripple and ras2fim df together

4) Make a new dataframe starting with just unique HUCs. Each HUC can iterate back through the original merged df and look for the count of models for ripple mip, ripple ble and ras2fim

5) Save as csv and gpkg (csv for HV and gpkg for easy visuals)


In [1]:
import os
import glob
import stat

import pandas as pd


# Row display limit is left at the pandas default; enable one of these while debugging.
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_rows', 10)

# Show at most 10 columns when printing a dataframe.
pd.options.display.max_columns = 10

# Never truncate the text inside a cell.
pd.options.display.max_colwidth = None

print("Done loading imports")

Done loading imports


In [None]:
# GLOBAL VARIABLES
# NOTE: Be careful when checking this in - do not commit real server names or paths.

# Root folder of the data tree (placeholder, fill in before running).
ROOT_PATH = "/{fill in}"

# Folder holding the downloaded ripple stats csv files.
# Ends with a slash so a file pattern can be appended to it.
RIPPLE_STATS_CSV_DIR = ROOT_PATH + "/fim-data/ripple/fim_30_prod_data/stats_csv_temp/"

# csv with the ras2fim v2 HUC list and feature counts.
RAS2FIM_STATUS_FILE = (
    ROOT_PATH + "/fim-data/ripple/fim_30_prod_data/ras2fim_v2_huc_list_w_feature_counts.csv"
)

# Destination of the final csv (has its own placeholder, independent of ROOT_PATH).
OUTPUT_CSV_path = "/{fill in}/outputs/hecras_boundaries.csv"

# When True, each cell prints the dataframe it just built.
is_verbose = False

print("Done loading global variables")


Done loading global variables


In [3]:
# Load the ripple stats csv files.
# Every csv found in RIPPLE_STATS_CSV_DIR is read and the rows are stacked
# into a single dataframe (df_ripple_stats).

# os.path.join works whether or not the folder setting ends with a slash, and
# sorted() keeps the row order reproducible since glob does not guarantee one.
stats_files = sorted(glob.glob(os.path.join(RIPPLE_STATS_CSV_DIR, "*.csv")))

if not stats_files:
    # Without this check pd.concat fails with "No objects to concatenate",
    # which does not say which folder was empty.
    raise FileNotFoundError(f"No ripple stats csv files found in {RIPPLE_STATS_CSV_DIR}")

# Only the three columns used by the later cells are read; huc is read as
# text (not a number) so it can be zero-padded later.
df_ripple_stats_files = [
    pd.read_csv(filename,
                index_col=None,
                usecols=['huc', 'source', 'num_features'],
                dtype={'huc': str})
    for filename in stats_files
]

df_ripple_stats = pd.concat(df_ripple_stats_files, ignore_index=True)

if is_verbose:
    print(df_ripple_stats)


In [4]:
# Load the ras2fim data
# huc is read as text, and every row is labelled with source "ras2fim" so the
# rows can be told apart once they are combined with the ripple rows.
df_ras2fim = pd.read_csv(
    RAS2FIM_STATUS_FILE,
    index_col=None,
    dtype={'huc': str},
).assign(source="ras2fim")

if is_verbose:
    print(df_ras2fim)


In [5]:
# Stack the ripple and ras2fim rows into one table, then left-pad every huc
# with zeros to a width of 8 characters.
df_stats = pd.concat(
    [df_ripple_stats, df_ras2fim],
    ignore_index=True,
).assign(huc=lambda frame: frame['huc'].str.zfill(8))

if is_verbose:
    print(df_stats)

In [6]:
# Reshape to one row per huc with one column per source (ble, mip, ras2fim),
# each holding that source's num_features value.
# NOTE(review): pivot_table uses its default aggregation (mean), so a huc that
# appears more than once for the same source is averaged - confirm that is intended.
df_pivot = df_stats.pivot_table(index='huc', columns='source', values='num_features')

source_cols = ["ble", "mip", "ras2fim"]

# A huc with no rows for a source comes out of the pivot as NaN; treat it as 0.
# The result is assigned back to the column instead of calling
# df_pivot[col].fillna(..., inplace=True): that form works on an intermediate
# object, is deprecated in pandas 2.x and has no effect under Copy-on-Write,
# which would make the integer cast below fail on the remaining NaN values.
for col in source_cols:
    df_pivot[col] = df_pivot[col].fillna(0).astype(int)

# drop blank rows (hucs with no features from any source)
has_features = (df_pivot[source_cols] > 0).any(axis=1)

# .copy() makes the filtered table independent, so the columns added in later
# cells do not raise SettingWithCopyWarning.
df_pivot = df_pivot[has_features].copy()

if is_verbose:
    print(df_pivot)

In [7]:

# just testing
#df = df_pivot.loc[df_pivot['ble'] > 0]
# df_pivot.loc[(df_pivot['mip'] == 0) & (df_pivot['ble'] > 0) ]
# df_pivot.loc[df_pivot['ras2fim'] > 0]


In [8]:
# For each huc, record which source has the highest number of features.
# idxmax returns the first label holding the maximum, so ties are resolved
# in the order the columns are listed here.
cols_to_check = ['ble', 'mip', 'ras2fim']
df_pivot["selected_source"] = df_pivot.loc[:, cols_to_check].idxmax(axis="columns")

if is_verbose:
    print(df_pivot)


In [9]:
# table adjustments

# Flag every row as active. The value is stored as the text "True", not a boolean.
df_pivot["is_active"] = "True"

# Give the per-source count columns descriptive names.
feature_count_names = {
    "ble": "num_ble_features",
    "mip": "num_mip_features",
    "ras2fim": "num_ras2fim_features",
}
df_pivot.rename(columns=feature_count_names, inplace=True)

if is_verbose:
    print(df_pivot)

In [10]:

# Write the final table to csv (the huc index is written as the first column).
df_pivot.to_csv(OUTPUT_CSV_path)

# Open the file up to read/write/execute for owner, group and everyone else.
all_access = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
os.chmod(OUTPUT_CSV_path, all_access)

print("df_pivot saved")

df_pivot saved
