In [1]:
import os
import sys

import pandas as pd
import numpy as np

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from attributes_calc import process_basin

pd.set_option("display.max_rows", None)

## Load the Data

Get file

In [2]:
def open_basin(path: str) -> pd.DataFrame:
    df = None

    try:
        with open(path, "r") as fp:
            # load area from header
            fp.readline()
            fp.readline()
            area = int(fp.readline())
            # load the dataframe from the rest of the stream
            df = pd.read_csv(fp, sep="\s+")
            df["date"] = pd.to_datetime(
                df.Year.map(str) + "/" + df.Mnth.map(str) + "/" + df.Day.map(str),
                format="%Y/%m/%d",
            )
    except:
        return None
    
    # df = df.set_index("date")
    df.rename(columns={"prcp(mm/day)": "prcp", "tmax(C)": "t_max", "tmin(C)": "t_min"}, inplace=True)
    df["t_mean"] = (df["t_max"] + df["t_min"]) / 2
    return df

In [3]:
attr_path = "../../../data/CAMELS_US/camels_attributes_v2.0"
attributes = os.listdir(attr_path)

actual = pd.read_csv(os.path.join(attr_path, f"{attributes[0]}"), sep=";")
attributes.remove(attributes[0])

for file in attributes:
    df_tmp = pd.read_csv(os.path.join(attr_path, f"{file}"), sep=";")
    actual = actual.merge(df_tmp, on="gauge_id")

actual = actual.rename(columns={"gauge_id": "basin_id", "frac_snow": "frac_snow_daily"})

In [4]:
actual.head()

Unnamed: 0,basin_id,q_mean,runoff_ratio,slope_fdc,baseflow_index,stream_elas,q5,q95,high_q_freq,high_q_dur,...,pet_mean,p_seasonality,frac_snow_daily,aridity,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,low_prec_dur,low_prec_timing
0,1013500,1.699155,0.543437,1.528219,0.585226,1.845324,0.241106,6.373021,6.1,8.714286,...,1.971555,0.18794,0.31344,0.630559,12.95,1.348958,son,202.2,3.427119,mam
1,1022500,2.173062,0.602269,1.77628,0.554478,1.702782,0.204734,7.123049,3.9,2.294118,...,2.119256,-0.11453,0.245259,0.587356,20.55,1.205279,son,233.65,3.662226,jja
2,1030500,1.820108,0.555859,1.87111,0.508441,1.377505,0.107149,6.854887,12.25,7.205882,...,2.043594,0.047358,0.277018,0.624111,17.15,1.207746,son,215.6,3.514262,djf
3,1031500,2.030242,0.576289,1.494019,0.445091,1.648693,0.111345,8.010503,18.9,3.286957,...,2.071324,0.104091,0.291836,0.58795,18.9,1.148936,son,227.35,3.473644,djf
4,1047000,2.18287,0.656868,1.415939,0.473465,1.510238,0.196458,8.095148,14.95,2.577586,...,2.090024,0.147776,0.280118,0.628929,20.1,1.165217,son,235.9,3.691706,djf


In [5]:
forcing_path = "../../../data/CAMELS_US/basin_mean_forcing/daymet"

actual_basins = actual.basin_id.tolist()

df = open_basin(os.path.join(forcing_path, "01", "01013500_lump_cida_forcing_leap.txt"))
df = process_basin(df, basin_id=int("1013500"))

for region in sorted(os.listdir(forcing_path)):
    for basin in sorted(os.listdir(os.path.join(forcing_path, region))):
        if not basin.endswith(".txt"):
            continue
        if basin == "01013500_lump_cida_forcing_leap.txt":
            continue

        basin_id = int(basin.split("_")[0])
        if basin_id not in actual_basins:
            print(basin)
            continue
        module_path = os.path.abspath(os.path.join(".."))

        df_tmp = open_basin(os.path.join(forcing_path, region, basin))
        if df_tmp is None:
            continue

        df_tmp = process_basin(df_tmp, basin_id=basin_id)
        df = pd.concat([df, df_tmp])

01150900_lump_cida_forcing_leap.txt
02081113_lump_cida_forcing_leap.txt
03448942_lump_cida_forcing_leap.txt
06775500_lump_cida_forcing_leap.txt
06846500_lump_cida_forcing_leap.txt
09535100_lump_cida_forcing_leap.txt


In [6]:
for i in df.basin_id.tolist():
    if i not in actual.basin_id.tolist():
        print(i)
        df = df[df.basin_id != i]

In [7]:
cols = df.columns.tolist()
df = df.reset_index()
df = df[cols]

In [8]:
len(df), len(actual)

(671, 671)

In [9]:
actual = actual[df.columns]

In [10]:
df.head()

Unnamed: 0,basin_id,p_mean,p_seasonality,frac_snow_daily,high_prec_freq,high_prec_dur,low_prec_freq,low_prec_dur
0,1013500,3.120536,0.226636,0.274001,13.457143,1.328896,202.028571,3.393134
1,1022500,3.620896,-0.058069,0.20062,20.314286,1.20908,232.685714,3.622306
2,1030500,3.275458,0.092734,0.231223,16.914286,1.244232,213.057143,3.451518
3,1031500,3.466448,0.128724,0.242675,18.942857,1.163905,227.171429,3.499052
4,1047000,3.321473,0.164534,0.237749,20.228571,1.173942,236.742857,3.688173


In [11]:
actual.head()

Unnamed: 0,basin_id,p_mean,p_seasonality,frac_snow_daily,high_prec_freq,high_prec_dur,low_prec_freq,low_prec_dur
0,1013500,3.126679,0.18794,0.31344,12.95,1.348958,202.2,3.427119
1,1022500,3.608126,-0.11453,0.245259,20.55,1.205279,233.65,3.662226
2,1030500,3.274405,0.047358,0.277018,17.15,1.207746,215.6,3.514262
3,1031500,3.522957,0.104091,0.291836,18.9,1.148936,227.35,3.473644
4,1047000,3.323146,0.147776,0.280118,20.1,1.165217,235.9,3.691706


In [12]:
df = df.sort_values(by="basin_id")
actual = actual.sort_values(by="basin_id")

In [13]:
df_err = (df[cols].iloc[0] - actual[cols].iloc[0]) / actual[cols].iloc[0]
df_err["basin_id"] = df.iloc[0].basin_id
df_err = df_err.to_frame().T

for i in range(len(df)):
    row1 = df.iloc[i]
    row2 = actual.iloc[i]

    if int(row1.basin_id) != int(row2.basin_id):
        print(i, row1.basin_id, row2.basin_id)

    if i == 0:
        continue

    df_err_tmp = (row1 - row2) / row2
    df_err_tmp["basin_id"] = df.iloc[i].basin_id

    df_err = pd.concat([df_err, df_err_tmp.to_frame().T])

In [14]:
cols = df_err.columns.tolist()
cols.remove("basin_id")
df_err = df_err.reset_index()
df_err = df_err[["basin_id", *cols]]

In [15]:
df_err.head()

Unnamed: 0,basin_id,p_mean,p_seasonality,frac_snow_daily,high_prec_freq,high_prec_dur,low_prec_freq,low_prec_dur
0,1013500.0,-0.001965,0.205894,-0.125828,0.039162,-0.014872,-0.000848,-0.009916
1,1022500.0,0.003539,-0.492978,-0.182006,-0.01147,0.003154,-0.004127,-0.0109
2,1030500.0,0.000322,0.958151,-0.165314,-0.013744,0.030209,-0.011794,-0.017854
3,1031500.0,-0.01604,0.236652,-0.168454,0.002268,0.013028,-0.000785,0.007315
4,1047000.0,-0.000503,0.113407,-0.151256,0.006397,0.007487,0.003573,-0.000957


In [16]:
df.iloc[0]

basin_id           1.013500e+06
p_mean             3.120536e+00
p_seasonality      2.266361e-01
frac_snow_daily    2.740008e-01
high_prec_freq     1.345714e+01
high_prec_dur      1.328896e+00
low_prec_freq      2.020286e+02
low_prec_dur       3.393134e+00
Name: 0, dtype: float64

In [17]:
actual.iloc[0]

basin_id           1.013500e+06
p_mean             3.126679e+00
p_seasonality      1.879403e-01
frac_snow_daily    3.134404e-01
high_prec_freq     1.295000e+01
high_prec_dur      1.348958e+00
low_prec_freq      2.022000e+02
low_prec_dur       3.427119e+00
Name: 0, dtype: float64

In [18]:
(df.iloc[0] - actual.iloc[0]) / actual.iloc[0]

basin_id           0.000000
p_mean            -0.001965
p_seasonality      0.205894
frac_snow_daily   -0.125828
high_prec_freq     0.039162
high_prec_dur     -0.014872
low_prec_freq     -0.000848
low_prec_dur      -0.009916
Name: 0, dtype: float64

In [19]:
df_err.to_csv("../../data/camels_us_err.csv", index=False)