# Pipeline (download various datasets/scripts)

This notebook downloads the following items:
- CFE model scripts from the GitHub repo
- Hydroshare resources
    - Hydrofabric outputs
    - CFE (C-version) parameter config files for CAMELS catchments from Hydroshare
- CAMELS-US dataset from Zenodo by Gauch et al. (2020)
- CAMELS-US attributes from UCAR server

To be edited  
Written by Ryoko Araki (San Diego State University & UCSB, raraki8159@sdsu.edu) in 2023 SI 

In [1]:
import os
import numpy as np
import pandas as pd
import wget
from tqdm import tqdm
import tarfile
import yaml
import requests
from hsclient import HydroShare

In [4]:
# HydroShare Python client documentation: https://github.com/hydroshare/hsclient/
# Create the client and sign in; `hs` is used further down to look up a resource.
# NOTE(review): sign_in() presumably prompts for credentials interactively — confirm.
hs = HydroShare()
hs.sign_in()

In [2]:
# Switch for the download cells: the Python download cells only fetch files
# when this equals 'Python'. The %%bash cells do not read this variable.
download_mode = 'Python' #['Python','CLI']

## Read in file paths

In [2]:
# Read the config file
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)


def _resolve_path(raw_path):
    """Return a config path with its "${cwd}" placeholder replaced by ".."."""
    return raw_path.replace("${cwd}", "..")


# Access the config variables
data_dir = _resolve_path(config['io_dir']['data_dir'])
camels_dir = _resolve_path(config['io_dir']['camels_dir'])
camels_data_dir = _resolve_path(config['io_dir']['gauch_2020_dir'])
camels_attr_dir = _resolve_path(config['io_dir']['ucar_dir'])
usgs_dir = _resolve_path(config['io_dir']['usgs_streamflow_dir'])
nldas_dir = _resolve_path(config['io_dir']['nldas_forcing_dir'])
basin_filename = _resolve_path(config['model_settings']['basin_file'])
config_dir = _resolve_path(config['io_dir']['config_dir'])

# Create the directories this notebook writes into.
# os.makedirs (unlike os.mkdir) also creates missing parent directories, and
# exist_ok=True keeps the cell safe to re-run when the folders already exist.
for directory in (data_dir, camels_dir, camels_data_dir, camels_attr_dir, config_dir):
    os.makedirs(directory, exist_ok=True)



## Clone Git repo  

In [4]:
# Clone the CFE model scripts into ../cfe_py (shell command run from the notebook)
!git clone https://github.com/NWC-CUAHSI-Summer-Institute/cfe_py ../cfe_py

# If ../cfe_py already exists, run `git pull` inside that directory instead to get the updated version of CFE

## Download initial parameter configuration
https://www.hydroshare.org/resource/f7d6db8f8677402d808531924bbcf60c/

In [6]:
# Identifier of the HydroShare resource that holds the initial parameter configuration
resIdentifier = "f7d6db8f8677402d808531924bbcf60c"
# Look the resource up through the HydroShare client created earlier
existing_resource = hs.resource(resIdentifier)
print(f'Just retrieved the resource with ID: {resIdentifier}')


Just retrieved the resource with ID: f7d6db8f8677402d808531924bbcf60c


In [10]:
# Pick the zipped CFE config archive inside the resource and save it into config_dir.
# The call is left as the last expression so its return value is shown as the cell output.
init_param_file = existing_resource.file(path="config/CFE_Config_Cver_from_Luciana.zip")
existing_resource.file_download(init_param_file, save_path=config_dir)

'..\\calibrate_cfe\\configs\\soil_ode\\CFE_Config_Cver_from_Luciana.zip'

In [13]:
import zipfile

# Unpack the downloaded config archive into the same folder it was saved to
init_param_path = os.path.join(config_dir, "CFE_Config_Cver_from_Luciana.zip")
with zipfile.ZipFile(init_param_path) as archive:
    archive.extractall(path=config_dir)

## Download CAMELS observation & forcing (Gauch et al., 2020)
This notebook downloads the CAMELS data, loads it into memory, cleans it, and then saves a cleaned data product that can be used later for analysis.

Written by Ryoko Araki (San Diego State University & UCSB, raraki8159@sdsu.edu) in 2023 SI 


### Download files from on-line host.
Homepage: https://zenodo.org/record/4072701/

Citation: Gauch, Martin, Kratzert, Frederik, Klotz, Daniel, Nearing, Grey, Lin, Jimmy, & Hochreiter, Sepp. (2020). Data for "Rainfall-Runoff Prediction at Multiple Timescales with a Single Long Short-Term Memory Network" [Data set]. Zenodo. https://doi.org/10.5281/zenodo.4072701

In [5]:
# A download URL is built as url_head + <file name> + url_end
url_head = r'https://zenodo.org/record/4072701/files/'
url_end = '?download=1'
# Files to fetch from the Zenodo record.
# Keep this order: the unzip cell picks the two .tar.gz archives by position (indices 1 and 2).
camels_files = [
    'README.md',
    'usgs_streamflow_csv.tar.gz',
    'nldas_hourly_csv.tar.gz',
    'usgs-streamflow-nldas_hourly.nc',
    ]

#### Wget through Python

In [6]:
if download_mode == 'Python':
    # Build the full download URL of every file up front, then fetch them one by one
    download_urls = [f"{url_head}{name}{url_end}" for name in camels_files]
    for url in tqdm(download_urls):
        print(f"Processing: {url}")
        wget.download(url, out=camels_data_dir)

#### In the Linux environment, you can also run a bash file

In [7]:
%%bash
# Download the Gauch et al. (2020) files with command-line wget.
# WARNING: DATA_DIR is deleted first, so anything already stored in it is lost.
DATA_DIR="../data/camels/"
# Quote the path so an empty or space-containing value cannot expand into a different rm target
if [ -d "$DATA_DIR" ]; then rm -Rf "$DATA_DIR"; fi
# -p also creates ../data when it does not exist yet
mkdir -p "$DATA_DIR"
filenames=(nldas_hourly_csv.tar.gz README.md usgs-streamflow-nldas_hourly.nc usgs_streamflow_csv.tar.gz)
for filename in "${filenames[@]}"
do
    wget -O "${DATA_DIR}${filename}" "https://zenodo.org/record/4072701/files/${filename}?download=1"
done

### Unzip files

In [8]:
# camels_files[1] is the usgs-streamflow archive, camels_files[2] the nldas-forcing archive
for i in (1, 2):
    filename = os.path.join(camels_data_dir, camels_files[i])
    if not os.path.exists(filename):
        print("The file hasn't been downloaded yet")
        continue
    # Unpack the whole tar.gz archive next to where it was downloaded
    with tarfile.open(filename, 'r:gz') as tar:
        tar.extractall(path=camels_data_dir)
    

### Save the list of gauge ID

In [5]:
# List every file in the extracted USGS streamflow folder
file_names = os.listdir(os.path.join(camels_data_dir, 'usgs_streamflow'))

# The ID is taken as the first 8 characters of each file name
id_list = [file_name[:8] for file_name in file_names]
sorted_id_list = sorted(id_list)

# Write the sorted IDs to the basin list file, one per line
with open(basin_filename, 'w') as file:
    file.writelines(f"{item}\n" for item in sorted_id_list)

### Check the number of files 

In [None]:
# Report how many entries were listed in the usgs_streamflow folder (file_names comes from the previous cell)
print(f'# USGS streamflow files: {len(file_names)}') # Should be 516

In [6]:
# List every file in the extracted NLDAS forcing folder
file_names = os.listdir(os.path.join(camels_data_dir, 'nldas_hourly'))

# The ID is taken as the first 8 characters of each file name
id_list = [file_name[:8] for file_name in file_names]
sorted_id_list = sorted(id_list)

print(f'# NLDAS forcing files: {len(sorted_id_list)}') # Should be 671

516

##  Download CAMELS attributes (UCAR)
### Download the data

Home page: https://gdex.ucar.edu/dataset/camels.html

In [24]:
# Attribute tables to download from the UCAR server
filenames=[
    "camels_clim.txt",
    "camels_geol.txt",
    "camels_hydro.txt",
    "camels_name.txt",
    "camels_soil.txt",
    "camels_topo.txt",
    "camels_vege.txt"
    ]

# MD5 checksums, listed in the same order as `filenames` (the two lists are zipped together when downloading)
md5_checksums = [
    '67f22592f3fb72c57df81358ce68458b',
    'f5ce5de53eb1ea2532cda7e3b4813993',
    '55ebdeb36c42ee7acdb998229c3edb3a',
    'c96491b32c4df55a31bead7ceca7d64b',
    '8edb46a363a20b466a4b7105ba633767',
    '0f6267838c40b1507b64582433bc0b8e',
    'f40e843defc1e654a800be9fe5fd5090'
    ]
# NOTE: this reassigns `url_head`, replacing the Zenodo value set earlier in the notebook
url_head = r'https://gdex.ucar.edu/api/v1/dataset/camels/file/'

#### Download through Python (requests)

In [25]:
import hashlib
import requests
if download_mode == 'Python':

    # Download each attribute table and check it against its expected MD5 checksum.
    # zip() has no length, so pass total= to let tqdm show real progress.
    for camels_file, md5_checksum in tqdm(zip(filenames, md5_checksums), total=len(filenames)):
        url = url_head + camels_file
        print(f"Processing: {url}")
        response = requests.get(url)

        if response.status_code == 200:
            with open(os.path.join(camels_attr_dir, camels_file), 'wb') as f:
                f.write(response.content)
            print("File downloaded successfully.")

            # Verify the payload; a mismatch is reported but the file is still kept
            # so the reader can inspect what the server actually returned.
            actual_md5 = hashlib.md5(response.content).hexdigest()
            if actual_md5 != md5_checksum:
                print(f"WARNING: MD5 mismatch for {camels_file} (expected {md5_checksum}, got {actual_md5}).")
        else:
            print("Failed to download the file.")


0it [00:00, ?it/s]

Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_clim.txt



1it [00:00,  1.95it/s]

File downloaded successfully.
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_geol.txt



2it [00:00,  2.10it/s]

File downloaded successfully.
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_hydro.txt



3it [00:01,  2.05it/s]

File downloaded successfully.
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_name.txt



4it [00:01,  2.25it/s]

File downloaded successfully.
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_soil.txt



5it [00:02,  2.12it/s]

File downloaded successfully.
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_topo.txt



6it [00:02,  2.16it/s]

File downloaded successfully.
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_vege.txt


7it [00:03,  2.12it/s]

File downloaded successfully.





#### In the Linux environment, you can also run a bash file

In [None]:
%%bash
# Download the CAMELS attribute tables from the UCAR server with command-line wget.
# WARNING: DATA_DIR is deleted first, so anything already stored in it is lost.
DATA_DIR="../data/camels/ucar/"
# Quote the path so an empty or space-containing value cannot expand into a different rm target
if [ -d "$DATA_DIR" ]; then rm -Rf "$DATA_DIR"; fi
# -p also creates missing parent directories
mkdir -p "$DATA_DIR"
# Bash array elements are separated by whitespace only (no commas)
filenames=(camels_clim.txt camels_geol.txt camels_hydro.txt camels_name.txt camels_soil.txt camels_topo.txt camels_vege.txt)
for filename in "${filenames[@]}"
do 
    # Same UCAR endpoint as the Python download cell uses
    wget -O "${DATA_DIR}${filename}" "https://gdex.ucar.edu/api/v1/dataset/camels/file/${filename}"
done

### Check the data and save

In [26]:
# Read each semicolon-separated attribute table, indexed by gauge_id
# NOTE(review): gauge_id appears to be parsed as an integer (the displayed index has no
# leading zeros) — confirm this is what downstream code expects.
dfs = {}
for filename in filenames:
    attr_path = os.path.join(camels_attr_dir, filename)
    with open(attr_path) as f:
        dfs[filename] = pd.read_csv(f, sep=";", index_col="gauge_id")

# Put the tables side by side (column-wise), in the order given by `filenames`
attr_tables = [dfs[name] for name in filenames]
df = pd.concat(attr_tables, axis=1)
df.head()

Unnamed: 0_level_0,p_mean,pet_mean,p_seasonality,frac_snow,aridity,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,low_prec_dur,...,area_geospa_fabric,frac_forest,lai_max,lai_diff,gvf_max,gvf_diff,dom_land_cover_frac,dom_land_cover,root_depth_50,root_depth_99
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1013500,3.126679,1.971555,0.18794,0.31344,0.630559,12.95,1.348958,son,202.2,3.427119,...,2303.95,0.9063,4.167304,3.340732,0.804567,0.371648,0.883452,Mixed Forests,,
1022500,3.608126,2.119256,-0.11453,0.245259,0.587356,20.55,1.205279,son,233.65,3.662226,...,620.38,0.9232,4.871392,3.746692,0.863936,0.337712,0.820493,Mixed Forests,0.237435,2.238444
1030500,3.274405,2.043594,0.047358,0.277018,0.624111,17.15,1.207746,son,215.6,3.514262,...,3676.09,0.8782,4.6852,3.665543,0.858502,0.351393,0.975258,Mixed Forests,,
1031500,3.522957,2.071324,0.104091,0.291836,0.58795,18.9,1.148936,son,227.35,3.473644,...,766.53,0.9548,4.903259,3.990843,0.870668,0.398619,1.0,Mixed Forests,0.25,2.4
1047000,3.323146,2.090024,0.147776,0.280118,0.628929,20.1,1.165217,son,235.9,3.691706,...,904.94,0.9906,5.086811,4.300978,0.891383,0.445473,0.85045,Mixed Forests,0.241027,2.34018


In [27]:
# Save the combined attribute table as a single CSV inside camels_attr_dir
df.to_csv(os.path.join(camels_attr_dir, "camels_attributes_concat.csv"))