In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path, PosixPath
from tqdm import tqdm
import sys
import tensorflow as tf
from typing import List
import json

In [2]:
# When True, the cells under "Create csv Files" re-parse the raw text files and rewrite the CSVs.
CREATE_CSV_FILES = True
# Selects the attribute blacklist (INVALID_ATTR) and the "_combined" suffix of the saved outputs.
CAMELS_COMBINED = True
# Base folder on the mounted Google Drive; every input and output path below is built from it.
root_path = "/content/gdrive/MyDrive/Colab Datasets/Hydrology/us"

# Colab-only: mount Google Drive so that root_path is reachable.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


# Create csv Files



## Read Forcing Files

In [5]:
def read_forcing_file(fpath: PosixPath):
    """Read one basin forcing file into a frame indexed by (basin_id, Year_Mnth_Day).

    The basin id is the integer before the first underscore of the file name.
    The first three lines of the file are read as three scalars and stored in
    the columns ``gauge_lat``, ``gauge_elv(m)`` and ``basin_area(m2)`` (repeated
    on every row). Line four is the column header; the remaining lines are
    whitespace-separated records whose first three columns (year, month, day)
    are merged into a single datetime column.

    Parameters
    ----------
    fpath : PosixPath
        Path of the forcing text file.

    Returns
    -------
    pandas.DataFrame
        Forcing records with a ``(basin_id, Year_Mnth_Day)`` MultiIndex.
    """
    basin_id = int(fpath.name.split('_')[0])

    # `delim_whitespace=True` and nested-list `parse_dates` are deprecated in
    # recent pandas; use the regex separator and assemble the date explicitly.
    data = pd.read_csv(fpath, sep=r"\s+", skiprows=3)

    date_cols = list(data.columns[:3])
    dates = pd.to_datetime(pd.DataFrame({
        "year": data[date_cols[0]],
        "month": data[date_cols[1]],
        "day": data[date_cols[2]],
    }))
    data = data.drop(columns=date_cols)
    # Same column name the nested `parse_dates` form produced ("Year_Mnth_Day").
    data.insert(0, "_".join(date_cols), dates)

    # The three header lines hold one value each.
    gauge_lat, gauge_elv_m, basin_area_m2 = np.genfromtxt(fpath, max_rows=3)

    data["basin_id"] = basin_id
    data["gauge_lat"] = gauge_lat
    data["gauge_elv(m)"] = gauge_elv_m
    data["basin_area(m2)"] = int(basin_area_m2)

    data.set_index(['basin_id', 'Year_Mnth_Day'], inplace=True)

    return data

def read_forcing_data(data_dir: str):
    """Load every ``*_forcing_leap.txt`` file below ``data_dir`` into one frame.

    Files are discovered recursively, parsed one by one with
    ``read_forcing_file`` (a progress bar names the file being processed) and
    concatenated row-wise.

    Parameters
    ----------
    data_dir : str
        Directory searched recursively for forcing files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames.
    """
    forcing_paths = list(Path(data_dir).glob('**/*_forcing_leap.txt'))
    progress = tqdm(forcing_paths, file=sys.stdout, position=0)

    frames = []
    for forcing_path in progress:
        progress.set_description("process " + forcing_path.name)
        frames.append(read_forcing_file(forcing_path))

    return pd.concat(frames, axis=0, copy=False)

In [6]:
if CREATE_CSV_FILES:
  # Parse every Maurer forcing file below root_path and cache the combined frame as CSV.
  maurer_dir = root_path + "/basin_timeseries_v1p2_metForcing_obsFlow/basin_dataset_public_v1p2/basin_mean_forcing/maurer"
  forcing_data = read_forcing_data(maurer_dir)
  forcing_data.to_csv(root_path + "/forcing_data_us.csv")
  # NOTE(review): a bare expression nested inside `if` is not rendered as cell output; this line has no effect.
  forcing_data

process 08195000_lump_maurer_forcing_leap.txt: 100%|██████████| 675/675 [08:26<00:00,  1.33it/s]


In [11]:
# Inspect the combined forcing frame (only defined when the CREATE_CSV_FILES cell has run).
forcing_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Hr,Dayl(s),PRCP(mm/day),SRAD(W/m2),SWE(mm),Tmax(C),Tmin(C),Vp(Pa),gauge_lat,gauge_elv(m),basin_area(m2)
basin_id,Year_Mnth_Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4127918,1980-01-01,12,30438.21,0.03,153.26,0.0,-1.17,-1.17,399.96,46.21,265.0,522953962
4127918,1980-01-02,12,30688.13,0.10,178.17,0.0,-5.34,-5.34,271.42,46.21,265.0,522953962
4127918,1980-01-03,12,30758.40,0.03,231.17,0.0,-12.35,-12.35,134.96,46.21,265.0,522953962
4127918,1980-01-04,12,30758.40,0.00,236.84,0.0,-13.10,-13.10,114.45,46.21,265.0,522953962
4127918,1980-01-05,12,30758.40,0.46,202.74,0.0,-11.54,-11.54,134.06,46.21,265.0,522953962
...,...,...,...,...,...,...,...,...,...,...,...,...
8195000,2008-12-27,12,36288.00,0.02,288.51,0.0,18.73,18.73,783.07,29.91,686.0,1007074709
8195000,2008-12-28,12,36289.87,0.01,372.77,0.0,10.55,10.55,401.43,29.91,686.0,1007074709
8195000,2008-12-29,12,36311.07,0.00,391.41,0.0,6.07,6.07,240.96,29.91,686.0,1007074709
8195000,2008-12-30,12,36381.23,0.00,398.28,0.0,8.23,8.23,238.01,29.91,686.0,1007074709


## Read Discharge Files

In [6]:
def read_discharge_file(fpath: PosixPath, area):
    """Read one streamflow file and convert discharge to mm/day.

    Parameters
    ----------
    fpath : PosixPath
        Path of the whitespace-separated streamflow file with the columns
        basin id, year, month, day, discharge, flag.
    area : number
        Basin area used for normalisation; callers in this notebook pass the
        ``basin_area(m2)`` value of the forcing data.

    Returns
    -------
    pandas.DataFrame
        Columns ``QObs(mm/d)`` and ``flag`` with a
        ``(basin_id, Year_Mnth_Day)`` MultiIndex.
    """
    # `delim_whitespace=True` and nested-list `parse_dates` are deprecated in
    # recent pandas; use the regex separator and assemble the date explicitly.
    # NOTE(review): header=0 discards the first line of the file. If the
    # streamflow files carry no header line, the first daily record of every
    # basin is silently lost - confirm against the raw files.
    data = pd.read_csv(fpath, sep=r"\s+", header=0,
                       names=["basin_id", "Year", "Mnth", "Day", "QObs(mm/d)", "flag"])

    dates = pd.to_datetime(pd.DataFrame({
        "year": data["Year"],
        "month": data["Mnth"],
        "day": data["Day"],
    }))
    data = data.drop(columns=["Year", "Mnth", "Day"])
    data.insert(0, "Year_Mnth_Day", dates)

    # normalize discharge from cubic feet per second to mm per day:
    # 28316846.592 mm^3 per ft^3, 86400 s per day, area [m^2] * 10**6 -> mm^2
    data["QObs(mm/d)"] = 28316846.592 * data["QObs(mm/d)"] * 86400 / (area * 10**6)

    data.set_index(['basin_id', 'Year_Mnth_Day'], inplace=True)

    return data


def read_discharge_data(data_dir: str, forcing: pd.DataFrame = None):
    """Load every ``*_streamflow_qc.txt`` file below ``data_dir`` into one frame.

    For each file the basin id is taken from the file name, the basin area is
    looked up in the forcing frame and the file is parsed with
    ``read_discharge_file``.

    Parameters
    ----------
    data_dir : str
        Directory searched recursively for streamflow files.
    forcing : pandas.DataFrame, optional
        Forcing frame whose first index level is the basin id and which has a
        ``basin_area(m2)`` column. Defaults to the notebook-global
        ``forcing_data`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames.

    Raises
    ------
    NameError
        If ``forcing`` is omitted and ``forcing_data`` has not been created.
    KeyError
        If a streamflow basin is absent from the forcing frame.
    """
    if forcing is None:
        # Backward-compatible fallback on the global built by the forcing cell.
        try:
            forcing = forcing_data
        except NameError as err:
            raise NameError(
                "forcing_data is not defined; run the forcing cell first or pass `forcing=`"
            ) from err

    files = list(Path(data_dir).glob('**/*_streamflow_qc.txt'))

    data_list = []

    pbar = tqdm(files, file=sys.stdout, position=0)
    for fpath in pbar:
        pbar.set_description("process " + fpath.name)
        basin_id = int(fpath.name.split('_')[0])
        # The area is constant within a basin, so the first row is enough.
        area = forcing.loc[basin_id].iloc[0]['basin_area(m2)']

        data = read_discharge_file(fpath, area)

        data_list.append(data)

    discharge_data = pd.concat(data_list, axis=0, copy=False)

    return discharge_data

In [21]:
if CREATE_CSV_FILES:
  # Parse every streamflow file below root_path and cache the combined frame as CSV.
  # read_discharge_data reads the global forcing_data, so the forcing cell must have run first.
  streamflow_dir = root_path + "/basin_timeseries_v1p2_metForcing_obsFlow/basin_dataset_public_v1p2/usgs_streamflow"
  discharge_data = read_discharge_data(streamflow_dir)
  discharge_data.to_csv(root_path + "/discharge_data_us.csv")
  # NOTE(review): a bare expression nested inside `if` is not rendered as cell output; this line has no effect.
  discharge_data

process 08324000_streamflow_qc.txt: 100%|██████████| 674/674 [00:37<00:00, 17.96it/s]


## Read Attributes Files

In [7]:
def read_attributes(data_dir: str):
    """Combine all ``camels_*.txt`` attribute tables found directly in ``data_dir``.

    Each file is read as a ``;``-separated table indexed by ``gauge_id``; the
    tables are then joined side by side (column-wise) on that index.

    Parameters
    ----------
    data_dir : str
        Directory containing the attribute files (not searched recursively).

    Returns
    -------
    pandas.DataFrame
        One row per ``gauge_id`` with the columns of every attribute file.
    """
    attribute_paths = list(Path(data_dir).glob('camels_*.txt'))
    progress = tqdm(attribute_paths, file=sys.stdout, position=0)

    tables = []
    for attribute_path in progress:
        progress.set_description("process " + attribute_path.name)
        tables.append(pd.read_csv(attribute_path, delimiter=";", index_col="gauge_id"))

    return pd.concat(tables, axis=1, copy=False)

In [8]:
if CREATE_CSV_FILES:
  # Merge the attribute tables and cache them as CSV (gauge_id is written as the first column).
  attributes_dir = root_path + "/camels_attributes_v2.0"
  attr = read_attributes(attributes_dir)
  attr.to_csv(root_path + "/attributes_us.csv")
  # NOTE(review): a bare expression nested inside `if` is not rendered as cell output; this line has no effect.
  attr

process camels_vege.txt: 100%|██████████| 7/7 [00:02<00:00,  2.42it/s]


# Read csv Files

In [19]:
# Key columns shared by the forcing and discharge CSVs: basin identifier and daily timestamp.
BASIN_COL = 'basin_id'
DATE_COL = 'Year_Mnth_Day'

In [12]:
# Reload the cached CSVs. In the discharge and forcing files column 1 is Year_Mnth_Day
# (the second index level when the files were written), so it is parsed as a datetime.
# No index_col is given, so basin_id / gauge_id come back as ordinary columns.
attr_p = pd.read_csv(root_path + "/attributes_us.csv")
discharge_p = pd.read_csv(root_path + "/discharge_data_us.csv", dtype=None, parse_dates=[1])
forcing_p = pd.read_csv(root_path + "/forcing_data_us.csv", dtype=None, parse_dates= [1])

In [19]:
# Inspect the reloaded attribute table (one row per gauge_id).
attr_p

Unnamed: 0,gauge_id,p_mean,pet_mean,p_seasonality,frac_snow,aridity,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,...,area_geospa_fabric,frac_forest,lai_max,lai_diff,gvf_max,gvf_diff,dom_land_cover_frac,dom_land_cover,root_depth_50,root_depth_99
0,1013500,3.126679,1.971555,0.187940,0.313440,0.630559,12.95,1.348958,son,202.20,...,2303.95,0.9063,4.167304,3.340732,0.804567,0.371648,0.883452,Mixed Forests,,
1,1022500,3.608126,2.119256,-0.114530,0.245259,0.587356,20.55,1.205279,son,233.65,...,620.38,0.9232,4.871392,3.746692,0.863936,0.337712,0.820493,Mixed Forests,0.237435,2.238444
2,1030500,3.274405,2.043594,0.047358,0.277018,0.624111,17.15,1.207746,son,215.60,...,3676.09,0.8782,4.685200,3.665543,0.858502,0.351393,0.975258,Mixed Forests,,
3,1031500,3.522957,2.071324,0.104091,0.291836,0.587950,18.90,1.148936,son,227.35,...,766.53,0.9548,4.903259,3.990843,0.870668,0.398619,1.000000,Mixed Forests,0.250000,2.400000
4,1047000,3.323146,2.090024,0.147776,0.280118,0.628929,20.10,1.165217,son,235.90,...,904.94,0.9906,5.086811,4.300978,0.891383,0.445473,0.850450,Mixed Forests,0.241027,2.340180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,14309500,4.977781,3.122204,-0.995847,0.061255,0.627228,15.10,1.776471,djf,222.65,...,226.31,1.0000,4.227902,1.986325,0.883414,0.115741,1.000000,Evergreen Needleleaf Forest,0.170000,1.800000
667,14316700,4.543400,2.277630,-0.821172,0.176337,0.501305,14.75,1.446078,djf,214.85,...,588.01,1.0000,4.859652,2.828735,0.914354,0.171176,1.000000,Evergreen Needleleaf Forest,0.170000,1.800000
668,14325000,6.297437,2.434652,-0.952055,0.030203,0.386610,14.60,1.467337,djf,219.05,...,444.92,1.0000,4.150730,1.867148,0.873517,0.115977,0.997677,Evergreen Needleleaf Forest,0.170186,1.801394
669,14362250,2.781676,3.325188,-0.985486,0.141500,1.195390,20.45,1.786026,djf,260.35,...,43.88,1.0000,4.430338,2.451489,0.868294,0.117102,1.000000,Evergreen Needleleaf Forest,0.170000,1.800000


In [15]:
# Inspect the reloaded forcing table.
forcing_p

Unnamed: 0,Hr,Dayl(s),PRCP(mm/day),SRAD(W/m2),SWE(mm),Tmax(C),Tmin(C),Vp(Pa),gauge_lat,gauge_elv(m),basin_area(m2)
0,12,30438.21,0.03,153.26,0.0,-1.17,-1.17,399.96,46.21,265.0,522953962
1,12,30688.13,0.10,178.17,0.0,-5.34,-5.34,271.42,46.21,265.0,522953962
2,12,30758.4,0.03,231.17,0.0,-12.35,-12.35,134.96,46.21,265.0,522953962
3,12,30758.4,0.00,236.84,0.0,-13.10,-13.10,114.45,46.21,265.0,522953962
4,12,30758.4,0.46,202.74,0.0,-11.54,-11.54,134.06,46.21,265.0,522953962
...,...,...,...,...,...,...,...,...,...,...,...
7150270,12,36288.0,0.02,288.51,0.0,18.73,18.73,783.07,29.91,686.0,1007074709
7150271,12,36289.87,0.01,372.77,0.0,10.55,10.55,401.43,29.91,686.0,1007074709
7150272,12,36311.07,0.00,391.41,0.0,6.07,6.07,240.96,29.91,686.0,1007074709
7150273,12,36381.23,0.00,398.28,0.0,8.23,8.23,238.01,29.91,686.0,1007074709


In [20]:
# All basin ids that appear in at least one of the three tables (sorted, unique).
locs = np.union1d(np.union1d(forcing_p[BASIN_COL], discharge_p[BASIN_COL]), attr_p['gauge_id'])
Nloc = len(locs)

# Confirm that the date column was parsed as datetime rather than left as strings.
print(type(discharge_p[DATE_COL].dtypes))
print('Initial Nloc', Nloc)

<class 'numpy.dtypes.DateTime64DType'>
Initial Nloc 675


## Sanity Check

In [21]:
def remove_basins_globally(basins: List[int]):
  """Drop the given basins from every global table and refresh ``locs`` / ``Nloc``.

  ``discharge_p``, ``forcing_p`` and ``attr_p`` are modified in place, so any
  other reference to the same frames sees the rows disappear as well.
  ``locs`` and ``Nloc`` are rebound to the remaining basin ids and their count.

  Parameters
  ----------
  basins : List[int]
      Basin ids to remove.
  """
  global discharge_p
  global forcing_p
  global attr_p
  global locs
  global Nloc

  print("removing", basins)

  # (table, name of its basin id column)
  tables = (
      (discharge_p, BASIN_COL),
      (forcing_p, BASIN_COL),
      (attr_p, 'gauge_id'),
  )
  for table, id_col in tables:
    doomed_rows = table.index[table[id_col].isin(basins)]
    table.drop(doomed_rows, inplace=True, errors='ignore')

  locs = np.setdiff1d(locs, basins)
  Nloc = len(locs)
  print('new Nloc', Nloc)

In [22]:
# Basins present in all three tables.
intersect = np.intersect1d(np.intersect1d(forcing_p[BASIN_COL], discharge_p[BASIN_COL]), attr_p['gauge_id'])

# Basins missing from at least one table; these cannot be used and are removed everywhere.
diff = np.setdiff1d(locs, intersect)
print("diff", diff)

if len(diff) > 0:
  remove_basins_globally(diff)

diff [1150900 6775500 6846500 9535100]
removing [1150900 6775500 6846500 9535100]
new Nloc 671


In [23]:
def find_time_range(df: pd.DataFrame, min_records: int = 1000):
  """Return the date range covered by every sufficiently long basin in ``df``.

  Basins with fewer than ``min_records`` dated rows are reported, removed from
  the global tables via ``remove_basins_globally`` and excluded from the range.
  The range is the latest per-basin first date and the earliest per-basin last
  date, i.e. the period for which all remaining basins have data.

  Parameters
  ----------
  df : pandas.DataFrame
      Table with ``BASIN_COL`` and ``DATE_COL`` columns.
  min_records : int, default 1000
      Minimum number of non-null dates a basin needs to be kept.

  Returns
  -------
  tuple
      ``(min_d, max_d)`` as returned by ``np.max`` / ``np.min`` over the
      per-basin first and last dates.
  """
  keyed = df[[BASIN_COL, DATE_COL]]
  counts = keyed.groupby([BASIN_COL]).count()

  abnormal_cols = counts[counts[DATE_COL] < min_records].index.values

  if len(abnormal_cols) > 0:
    print('abnormal cols', abnormal_cols)
    remove_basins_globally(abnormal_cols)
    # Filter locally as well: the range must not depend on `df` being the very
    # object that remove_basins_globally mutates in place.
    keyed = keyed[~keyed[BASIN_COL].isin(abnormal_cols)]

  groups = keyed.groupby([BASIN_COL])
  mins = groups.min()
  maxs = groups.max()

  min_d  = np.max(mins.values)
  max_d  = np.min(maxs.values)
  print("range", min_d, max_d)

  return min_d, max_d

In [24]:
# Common date range of the discharge records and of the forcing records, respectively.
min1, max1 = find_time_range(discharge_p)
min2, max2 = find_time_range(forcing_p)

range 1989-10-02T00:00:00.000000000 2009-09-30T00:00:00.000000000
range 1980-01-01T00:00:00.000000000 2008-12-31T00:00:00.000000000


In [25]:
# Overlap of the two ranges: latest start and earliest end.
InitialDate = max(min1, min2)
print("InitialDate ", InitialDate)

EndDate = min(max1, max2)
print("EndDate ", EndDate)

InitialDate  1989-10-02T00:00:00.000000000
EndDate  2008-12-31T00:00:00.000000000


In [26]:
# filter data sets based on the InitialDate and EndDate
# Rows outside [InitialDate, EndDate] (both bounds inclusive) are dropped in place.
discharge_p.drop(discharge_p[(discharge_p[DATE_COL] < InitialDate) | (discharge_p[DATE_COL] > EndDate)].index,
                 inplace=True, errors='ignore')
forcing_p.drop(forcing_p[(forcing_p[DATE_COL] < InitialDate) | (forcing_p[DATE_COL] > EndDate)].index,
                 inplace=True, errors='ignore')

In [27]:
# create basin meta data
# The per-basin constants are repeated on every forcing row; de-duplicating yields one row per basin.
basin_meta_p = forcing_p[[BASIN_COL, 'gauge_lat', 'gauge_elv(m)', 'basin_area(m2)']].copy()
basin_meta_p.drop_duplicates(inplace=True, ignore_index=True)

# Fails if a basin carries more than one distinct metadata tuple or has no forcing rows left.
assert len(basin_meta_p) == Nloc

In [28]:
# Inspect the per-basin metadata table.
basin_meta_p

Unnamed: 0,basin_id,gauge_lat,gauge_elv(m),basin_area(m2)
0,4127918,46.21,265.0,522953962
1,4059500,46.03,424.0,1164996450
2,4127997,45.03,326.0,497065152
3,4056500,46.35,264.0,2847769100
4,4027000,46.25,493.0,1545561957
...,...,...,...,...
666,8200000,29.64,506.0,247497024
667,8178880,29.78,655.0,849152968
668,8196000,29.70,668.0,326199006
669,8189500,28.61,106.0,1786327890


In [29]:
# drop metadata from forcing dataset
# (errors='ignore' makes this cell safe to re-run after the columns are gone)
forcing_p.drop(['gauge_lat', 'gauge_elv(m)', 'basin_area(m2)'], axis=1, inplace=True, errors='ignore')

# create new average temperature column
forcing_p['Tmean(C)'] = (forcing_p['Tmin(C)'] + forcing_p['Tmax(C)'])/2
# NOTE(review): this prints the discharge columns, not the forcing columns edited above - confirm intent.
print(discharge_p.columns)

Index(['basin_id', 'Year_Mnth_Day', 'QObs(mm/d)', 'flag'], dtype='object')


## Dynamic Data

In [30]:
# Columns carried into the model input: the two merge keys plus the selected variables.
input_forcing_fields = ['Year_Mnth_Day', 'basin_id', 'PRCP(mm/day)', 'Tmean(C)']
input_discharge_fields = ['Year_Mnth_Day', 'basin_id', 'QObs(mm/d)']

forcing_p[input_forcing_fields]

Unnamed: 0,Year_Mnth_Day,basin_id,PRCP(mm/day),Tmean(C)
3562,1989-10-02,4127918,1.19,12.25
3563,1989-10-03,4127918,0.41,4.77
3564,1989-10-04,4127918,0.05,2.54
3565,1989-10-05,4127918,8.07,3.44
3566,1989-10-06,4127918,7.04,5.92
...,...,...,...,...
7150270,2008-12-27,8195000,0.02,18.73
7150271,2008-12-28,8195000,0.01,10.55
7150272,2008-12-29,8195000,0.00,6.07
7150273,2008-12-30,8195000,0.00,8.23


In [31]:
# Inspect the discharge columns that will be merged into the input table.
discharge_p[input_discharge_fields]

Unnamed: 0,Year_Mnth_Day,basin_id,QObs(mm/d)
3561,1989-10-02,2070000,17.325588
3562,1989-10-03,2070000,4.856415
3563,1989-10-04,2070000,2.896348
3564,1989-10-05,2070000,2.310078
3565,1989-10-06,2070000,2.030069
...,...,...,...
8435630,2008-12-27,8324000,0.038203
8435631,2008-12-28,8324000,0.046246
8435632,2008-12-29,8324000,0.048257
8435633,2008-12-30,8324000,0.050268


In [32]:
# Join forcing and discharge on (basin, date). merge defaults to an inner join, and
# validate="1:1" raises if either side has duplicate (basin, date) keys.
input_p = forcing_p[input_forcing_fields].merge(discharge_p[input_discharge_fields], on=[BASIN_COL, DATE_COL], copy=True, validate="1:1")
input_p

Unnamed: 0,Year_Mnth_Day,basin_id,PRCP(mm/day),Tmean(C),QObs(mm/d)
0,1989-10-02,4127918,1.19,12.25,0.285381
1,1989-10-03,4127918,0.41,4.77,0.280703
2,1989-10-04,4127918,0.05,2.54,0.294738
3,1989-10-05,4127918,8.07,3.44,0.290059
4,1989-10-06,4127918,7.04,5.92,0.369592
...,...,...,...,...,...
4717796,2008-12-27,8195000,0.02,18.73,0.080170
4717797,2008-12-28,8195000,0.01,10.55,0.077740
4717798,2008-12-29,8195000,0.00,6.07,0.075311
4717799,2008-12-30,8195000,0.00,8.23,0.077740


In [33]:
# Order rows by date first and basin second, and renumber the index from 0.
input_p.sort_values([DATE_COL, BASIN_COL], inplace=True, ignore_index=True)

In [34]:
# Force precipitation to float and give it the name used by find-NaN helpers and the saved dataset.
input_p['PRCP(mm/day)'] = input_p['PRCP(mm/day)'].astype('float')
input_p.rename(columns = {'PRCP(mm/day)':'precipitation_us'}, inplace = True)

In [35]:
# Equal lengths mean the inner merge kept every forcing row and every discharge row.
assert len(input_p) == len (forcing_p) and len(input_p) == len (discharge_p)

In [36]:
# group intervals of NaN values
def find_intervals(df, date, column):
  """Return the dates of each run of consecutive NaN values in ``column``.

  The result is a list with one entry per run; each entry is the list of
  ``df[date]`` values of the rows in that run, in row order.
  """
  is_missing = df[column].isna()
  # The running count of non-missing rows is constant within a NaN run, so it labels the runs.
  run_label = (~is_missing).cumsum()
  missing_rows = df[is_missing]

  intervals = []
  for _, run in missing_rows.groupby(run_label):
    intervals.append(list(run[date]))
  return intervals


# get intervals containing NaN values in dataframe
def get_nan_intervals(df):
  """Summarise the NaN runs of ``precipitation_us`` for every basin in ``df``.

  Returns ``(NaN_intervals, ids)``: ``ids`` lists the basins that have at least
  one NaN run, and ``NaN_intervals`` holds, in the same order, the lengths of
  that basin's runs.
  """
  NaN_intervals, ids = [], []

  for basin in df['basin_id'].unique():
    basin_rows = df[df['basin_id'] == basin]
    gaps = find_intervals(basin_rows, 'Year_Mnth_Day', 'precipitation_us')
    if gaps:
      NaN_intervals.append([len(gap) for gap in gaps])
      ids.append(basin)

  return NaN_intervals, ids

In [37]:
# Dense copy of the merged table; columns keep the order of input_p (date, basin id, then the variables).
BasicInputTimeSeries = input_p.to_numpy()

In [38]:
# Sanity check: (rows, columns) and the layout of the first record.
print(BasicInputTimeSeries.shape)
print(BasicInputTimeSeries[0])

(4717801, 5)
[Timestamp('1989-10-02 00:00:00') 1013500 10.38 8.71 0.3215057525056905]


## Static Data

In [39]:
# Attribute columns to exclude from the static properties. The list is applied later with
# drop(..., errors='ignore'), so names that do not exist in the table are simply skipped.
INVALID_ATTR= []

if CAMELS_COMBINED:
  INVALID_ATTR = [
      'gauge_name', 'huc_02', 'area_geospa_fabric', 'lai_max', 'lai_diff', 'gvf_max', 'gvf_diff', 'root_depth_XX',
      'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity', 'max_water_content',
      'sand_frac', 'silt_frac', 'clay_frac', 'water_frac', 'organic_frac', 'other_frac', 'geo_class_1st', 'geo_1st_class',
      'glim_1st_class_frac', 'geo_2nd_class', 'glim_2nd_class_frac', 'dom_land_cover_frac', 'carbonate_rocks_frac',
      'geo_class_1st_frac', 'geo_class_2nd', 'geo_class_2nd_frac', 'carb_rocks_frac', 'geol_porostiy',
      'geol_permeability', 'root_depth_50', 'root_depth_99', 'frac_forest', 'slope_mean', 'geol_1st_class', 'geol_2nd_class'
  ]


# else:
#   INVALID_ATTR = [
#       'gauge_name', 'huc_02', 'area_geospa_fabric', 'lai_max', 'lai_diff', 'gvf_max', 'gvf_diff', 'root_depth_XX',
#       'soil_depth_pelletier', 'soil_depth_statsgo', 'soil_porosity', 'soil_conductivity', 'max_water_content',
#       'sand_frac', 'silt_frac', 'clay_frac', 'water_frac', 'organic_frac', 'other_frac', 'geo_class_1st',
#       'geo_class_1st_frac', 'geo_class_2nd', 'geo_class_2nd_frac', 'carb_rocks_frac', 'geol_porosity',
#       'geol_permeability', 'high_prec_timing', 'low_prec_timing', 'geol_1st_class', 'geol_2nd_class', 'dom_land_cover'
#   ]

In [40]:
# Inspect the human-readable gauge names (later stored as 'loc_names' in the metadata).
attr_p['gauge_name'].tolist()

['Fish River near Fort Kent, Maine',
 'Narraguagus River at Cherryfield, Maine',
 'Mattawamkeag River near Mattawamkeag, Maine',
 'Piscataquis River near Dover-Foxcroft, Maine',
 'Carrabassett River near North Anson, Maine',
 'Diamond River near Wentworth Location, NH',
 'Wild River at Gilead, Maine',
 'Swift River near Roxbury, Maine',
 'Little Androscoggin River near South Paris, Maine',
 'OYSTER RIVER NEAR DURHAM, NH',
 'SMITH RIVER NEAR BRISTOL, NH',
 'PENDLETON HILL BROOK NEAR CLARKS FALLS, CT.',
 'MOUNT HOPE RIVER NEAR WARRENVILLE, CT.',
 'LITTLE RIVER NEAR HANOVER, CT.',
 'MOOSE RIVER AT VICTORY, VT',
 'AMMONOOSUC RIVER AT BETHLEHEM JUNCTION, NH',
 'WELLS RIVER AT WELLS RIVER, VT',
 'EAST ORANGE BRANCH AT EAST ORANGE, VT',
 'AYERS BROOK AT RANDOLPH, VT',
 'WHITE RIVER AT WEST HARTFORD, VT',
 'PRIEST BROOK NEAR WINCHENDON, MA',
 'NORTH RIVER AT SHATTUCKVILLE, MA',
 'GREEN RIVER NEAR COLRAIN, MA',
 'WEST BRANCH WESTFIELD RIVER AT HUNTINGTON, MA',
 'HUBBARD RIVER NR. WEST HARTLAND,

In [41]:
# Work on a copy so attr_p keeps every column (gauge_name is still needed for the metadata).
filtered_attr = attr_p.copy()
filtered_attr.drop(INVALID_ATTR, axis=1, inplace=True, errors='ignore')
filtered_attr.dtypes

gauge_id              int64
p_mean              float64
pet_mean            float64
p_seasonality       float64
frac_snow           float64
aridity             float64
high_prec_freq      float64
high_prec_dur       float64
high_prec_timing     object
low_prec_freq       float64
low_prec_dur        float64
low_prec_timing      object
q_mean              float64
runoff_ratio        float64
slope_fdc           float64
baseflow_index      float64
stream_elas         float64
q5                  float64
q95                 float64
high_q_freq         float64
high_q_dur          float64
low_q_freq          float64
low_q_dur           float64
zero_q_freq         float64
hfd_mean            float64
gauge_lat           float64
gauge_lon           float64
elev_mean           float64
area_gages2         float64
dom_land_cover       object
dtype: object

In [42]:
# Columns that remain after removing INVALID_ATTR.
filtered_attr.columns

Index(['gauge_id', 'p_mean', 'pet_mean', 'p_seasonality', 'frac_snow',
       'aridity', 'high_prec_freq', 'high_prec_dur', 'high_prec_timing',
       'low_prec_freq', 'low_prec_dur', 'low_prec_timing', 'q_mean',
       'runoff_ratio', 'slope_fdc', 'baseflow_index', 'stream_elas', 'q5',
       'q95', 'high_q_freq', 'high_q_dur', 'low_q_freq', 'low_q_dur',
       'zero_q_freq', 'hfd_mean', 'gauge_lat', 'gauge_lon', 'elev_mean',
       'area_gages2', 'dom_land_cover'],
      dtype='object')

In [43]:
# The attribute table must hold exactly one row per remaining basin.
assert len(filtered_attr) == Nloc

In [44]:
# Categorical variable encoding
# Every non-numeric column is replaced by its integer category code divided by the number of
# distinct non-null categories, giving values k / num_cat for k = 0 .. num_cat - 1.
# NOTE(review): pandas assigns code -1 to missing values, which would become a negative number here.
cols = filtered_attr.columns  # not used in this cell
categorical_cols = filtered_attr.select_dtypes(exclude = ['number']).columns

for cat_col in categorical_cols:
  num_cat = filtered_attr[cat_col].value_counts().count()
  filtered_attr[cat_col] = (filtered_attr[cat_col].astype('category').cat.codes)/num_cat

  print('Processed ', cat_col)

Processed  high_prec_timing
Processed  low_prec_timing
Processed  dom_land_cover


In [45]:
# Number of missing values per attribute column.
filtered_attr.isna().sum()

gauge_id            0
p_mean              0
pet_mean            0
p_seasonality       0
frac_snow           0
aridity             0
high_prec_freq      0
high_prec_dur       0
high_prec_timing    0
low_prec_freq       0
low_prec_dur        0
low_prec_timing     0
q_mean              1
runoff_ratio        1
slope_fdc           1
baseflow_index      0
stream_elas         1
q5                  1
q95                 1
high_q_freq         1
high_q_dur          1
low_q_freq          1
low_q_dur           1
zero_q_freq         1
hfd_mean            1
gauge_lat           0
gauge_lon           0
elev_mean           0
area_gages2         0
dom_land_cover      0
dtype: int64

In [46]:
# Remove NaN values
# Missing entries are replaced by the mean of their column.
nan_counts = filtered_attr.isna().sum()  # computed once instead of once per column
nan_cols = [col for col in filtered_attr.columns if nan_counts[col] > 0]
print(nan_cols)

for nan_col in nan_cols:
  filtered_attr[nan_col] = filtered_attr[nan_col].fillna(filtered_attr[nan_col].mean())

['q_mean', 'runoff_ratio', 'slope_fdc', 'stream_elas', 'q5', 'q95', 'high_q_freq', 'high_q_dur', 'low_q_freq', 'low_q_dur', 'zero_q_freq', 'hfd_mean']


In [47]:
# Total number of columns, including gauge_id.
print(len(filtered_attr.columns))

30


In [48]:
# Dense copy of the static attributes; gauge_id is one of the columns.
BasicInputStaticProps = filtered_attr.to_numpy()
# Number of attribute columns excluding gauge_id.
NpropperTimeStatic = len(filtered_attr.columns) - 1
print('NpropperTimeStatic', NpropperTimeStatic)

NpropperTimeStatic 29


# Save Datasets

In [49]:
# Persist the dynamic series and the static attributes as CSV and as NumPy arrays.
# The two dataset flavours differ only in the "_combined" file-name suffix.
dataset_suffix = "_combined" if CAMELS_COMBINED else ""

input_p.to_csv(root_path + "/BasicInputTimeSeries_us" + dataset_suffix + ".csv")
filtered_attr.to_csv(root_path + "/BasicInputStaticProps_us" + dataset_suffix + ".csv")
# np.save appends the ".npy" extension itself. BasicInputTimeSeries mixes timestamps and
# numbers; if it is an object array it is stored via pickle and np.load needs allow_pickle=True.
np.save(root_path + "/BasicInputTimeSeries_us" + dataset_suffix, BasicInputTimeSeries)
np.save(root_path + "/BasicInputStaticProps_us" + dataset_suffix, BasicInputStaticProps)

In [50]:
# Step between consecutive timestamps of the series. It is derived from the distinct dates:
# column 1 of BasicInputTimeSeries is basin_id, so subtracting that column between two rows
# (as was done before) yields a difference of basin ids, not a time step.
unique_dates = pd.DatetimeIndex(input_p[DATE_COL].unique()).sort_values()
time_delta = unique_dates[1] - unique_dates[0] if len(unique_dates) > 1 else pd.Timedelta(0)

meta_data = {
    'Nloc': Nloc,
    'locs': locs.tolist(),
    # Look the names up by id so that loc_names[i] always belongs to locs[i],
    # whatever the row order of attr_p is.
    'loc_names': attr_p.set_index('gauge_id').loc[locs, 'gauge_name'].tolist(),
    'BasicInputTimeSeries':{
      'fields': input_p.columns.values.tolist(),
      'index_fields': [BASIN_COL, DATE_COL],
      'initial_date': str(InitialDate),
      'end_date': str(EndDate),
      'time_delta': str(time_delta),
      # Number of days from InitialDate to EndDate, both included.
      'time_steps': int(((EndDate-InitialDate + np.timedelta64(1, 'D'))/np.timedelta64(1, 'D'))),
    },
    'BasicInputStaticProps': {
        'fields': filtered_attr.columns.values.tolist(),
        # NOTE(review): key is spelled 'index_fileds' (not 'index_fields'); kept as-is because
        # readers of the JSON may depend on it - confirm before renaming.
        'index_fileds': ['gauge_id'],
    },
    'NpropperTimeStatic': len(filtered_attr.columns) - 1
}

In [51]:
# Write the metadata next to the datasets; the two flavours differ only in the
# "_combined" file-name suffix.
metadata_suffix = "_combined" if CAMELS_COMBINED else ""

with open(root_path + "/metadata_us" + metadata_suffix + ".json", 'w') as outfile:
  json.dump(meta_data, outfile, indent='\t')

In [52]:
# Inspect the saved array (NumPy abbreviates the output).
BasicInputTimeSeries

array([[Timestamp('1989-10-02 00:00:00'), 1013500, 10.38, 8.71,
        0.3215057525056905],
       [Timestamp('1989-10-02 00:00:00'), 1022500, 12.31, 10.22,
        0.5536971983815129],
       [Timestamp('1989-10-02 00:00:00'), 1030500, 3.21, 9.58,
        0.6797832874495239],
       ...,
       [Timestamp('2008-12-31 00:00:00'), 14325000, 28.09, 3.52,
        14.203440434346788],
       [Timestamp('2008-12-31 00:00:00'), 14362250, 9.32, 2.25,
        0.165380610569215],
       [Timestamp('2008-12-31 00:00:00'), 14400000, 12.77, 3.19,
        29.920203240566472]], dtype=object)

In [53]:
# Element type of column 3 (the fourth field of input_p) in the first row.
type(BasicInputTimeSeries[0,3])

float

In [54]:
# String form of InitialDate as it is written to the metadata.
str(InitialDate)

'1989-10-02T00:00:00.000000000'

In [55]:
# NOTE(review): column 1 of BasicInputTimeSeries holds basin_id (column 0 is the date), so this
# shows the difference between two basin ids rather than a time step.
str(BasicInputTimeSeries[1, 1] - BasicInputTimeSeries[0, 1])

'9000'

In [56]:
# Number of days from InitialDate to EndDate, counting both end points.
int(((EndDate-InitialDate + np.timedelta64(1, 'D'))/np.timedelta64(1, 'D')))

7031

In [57]:
# Inspect the metadata dictionary that was written to JSON (prints the full list of basin ids).
meta_data

{'Nloc': 671,
 'locs': [1013500,
  1022500,
  1030500,
  1031500,
  1047000,
  1052500,
  1054200,
  1055000,
  1057000,
  1073000,
  1078000,
  1118300,
  1121000,
  1123000,
  1134500,
  1137500,
  1139000,
  1139800,
  1142500,
  1144000,
  1162500,
  1169000,
  1170100,
  1181000,
  1187300,
  1195100,
  1333000,
  1350000,
  1350080,
  1350140,
  1365000,
  1411300,
  1413500,
  1414500,
  1415000,
  1423000,
  1434025,
  1435000,
  1439500,
  1440000,
  1440400,
  1451800,
  1466500,
  1484100,
  1485500,
  1486000,
  1487000,
  1491000,
  1510000,
  1516500,
  1518862,
  1532000,
  1539000,
  1542810,
  1543000,
  1543500,
  1544500,
  1545600,
  1547700,
  1548500,
  1549500,
  1550000,
  1552000,
  1552500,
  1557500,
  1567500,
  1568000,
  1580000,
  1583500,
  1586610,
  1591400,
  1594950,
  1596500,
  1605500,
  1606500,
  1613050,
  1620500,
  1632000,
  1632900,
  1634500,
  1638480,
  1639500,
  1644000,
  1658500,
  1664000,
  1666500,
  1667500,
  1669000,
  1669520,