# Download CAMELS-US data
This notebook downloads CAMELS data, loads them into memory, cleans that data then saves a cleaned data product, which may be used later on for some analysis.

Written by Ryoko Araki (San Diego State University & UCSB, raraki8159@sdsu.edu) in 2023 SI 


In [None]:
# !pip install tqdm
# !pip install wget

In [2]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
import wget
from tqdm import tqdm
import tarfile

In [7]:
download_mode = 'Python' #['Python','CLI']
data_folder = '../data/camels/gauch_etal_2020'
attr_folder = '../data/camels/ucar'

if not os.path.exists('../data/'):
    os.mkdir('../data/')
if not os.path.exists('../data/camels'):
    os.mkdir('../data/camels')
if not os.path.exists(data_folder):
    os.mkdir(data_folder)
if not os.path.exists(attr_folder):
    os.mkdir(attr_folder) 


# CAMELs observation & forcing (Gauch et al., 2020)
## Download files from on-line host.
Homepage: https://zenodo.org/record/4072701/

Citation: Gauch, Martin, Kratzert, Frederik, Klotz, Daniel, Nearing, Grey, Lin, Jimmy, & Hochreiter, Sepp. (2020). Data for "Rainfall-Runoff Prediction at Multiple Timescales with a Single Long Short-Term Memory Network" [Data set]. Zenodo. https://doi.org/10.5281/zenodo.4072701

In [52]:
url_head = r'https://zenodo.org/record/4072701/files/'
url_end = '?download=1'
camels_files = [
    'README.md',
    'usgs_streamflow_csv.tar.gz',
    'nldas_hourly_csv.tar.gz',
    'usgs-streamflow-nldas_hourly.nc',
    ]

In [19]:
if download_mode == 'Python':
        
    # Download the data
    for camels_file in tqdm(camels_files):
        url = url_head + camels_file + url_end
        print(f"Processing: {url}")
        wget.download(url, out=data_folder)

NameError: name 'camels_files' is not defined

In the Linux environment, you can also run a bash file

In [2]:
%%bash
DATA_DIR="../data/camels/"
if [ -d "$DATA_DIR" ]; then rm -Rf $DATA_DIR; fi
mkdir $DATA_DIR
filenames=(nldas_hourly_csv.tar.gz README.md usgs-streamflow-nldas_hourly.nc usgs_streamflow_csv.tar.gz)
for filename in ${filenames[@]}
do 
    wget -O "${DATA_DIR}${filename}" "https://zenodo.org/record/4072701/files/${filename}?download=1"
done

SyntaxError: invalid syntax (3182278642.py, line 4)

## Unzip files

In [53]:
output_dir = data_folder
for i in [1, 2]:
    # 1 for the usgs-streamflow
    # 2 for the nldas-forcing
    filename = os.path.join(data_folder, camels_files[i])
    if os.path.exists(filename):
        with tarfile.open(filename, 'r:gz') as tar:
            # Extract all files in the tar.gz file
            tar.extractall(path=output_dir)
    else:
        print('The file hasn\'t been downloaded yet')
    

## Save the list of gauge ID (Gauch et al., 2020)

In [49]:
id_list = []
file_names = os.listdir(os.path.join(data_folder, 'usgs_streamflow'))  # Get all file names in the folder

# Loop through files in the folder
for file_name in file_names:
    # Extract the first 8 digits of the file name
    file_id = file_name[:8]
    id_list.append(file_id)
sorted_id_list = sorted(id_list)

# Save the list as a text file
output_file = os.path.join(data_folder, 'basin_561.txt')
with open(output_file, 'w') as file:
    for item in sorted_id_list:
        file.write("%s\n" % item)

In [31]:
len(file_names)

516

## (Skip the followings) 
## Comparison with previous groups' list (Krazert et al., 2020)

## Check if all the files necessary are extracted 

In [25]:
basin_file = r"G:\Shared drives\SI_NextGen_Aridity\data\camels\basin_list_531.txt"
with open(basin_file, 'r') as file:
    lines = file.readlines()
    # Remove leading/trailing whitespaces and newline characters
    lines = [line.strip() for line in lines]
basin_list = lines
print(len(basin_list))
print(basin_list[0:5])

531
['01022500', '01031500', '01047000', '01052500', '01054200']


4

In [30]:
file_names = os.listdir(os.path.join(data_folder, 'usgs_streamflow'))  # Get all file names in the folder
missing_basin_ids = [id for id in basin_list if not any(file_name.startswith(id) for file_name in file_names)]
print('Missing IDs:', missing_basin_ids)

Missing IDs: ['02112120', '02112360', '02125000', '02342933', '02464360', '04127918', '04161580', '04233000', '03238500', '03500240', '06291500', '09035800', '12383500', '11230500', '11237500']


#  CAMELs attributes (UCAR)
## Download the data

Home page: https://gdex.ucar.edu/dataset/camels.html

In [39]:
filenames=[
    "camels_clim.txt",
    "camels_geol.txt",
    "camels_hydro.txt",
    "camels_name.txt",
    "camels_soil.txt",
    "camels_topo.txt",
    "camels_vege.txt"
    ]

md5_checksums = [
    '67f22592f3fb72c57df81358ce68458b',
    'f5ce5de53eb1ea2532cda7e3b4813993',
    '55ebdeb36c42ee7acdb998229c3edb3a',
    'c96491b32c4df55a31bead7ceca7d64b',
    '8edb46a363a20b466a4b7105ba633767',
    '0f6267838c40b1507b64582433bc0b8e',
    'f40e843defc1e654a800be9fe5fd5090'
    ]
url_head = r'https://gdex.ucar.edu/api/v1/dataset/camels/file/'

In [44]:
# Still debugging
import subprocess
if download_mode == 'Win':

    # If it doesn't exist, create it
    if not os.path.exists(attr_folder):
        os.makedirs(attr_folder)
        
    # Download the data
    for camels_file, md5_checksum in tqdm(zip(filenames, md5_checksums)):
        url = url_head + camels_file
        print(f"Processing: {url}")
        
        # wget_options = f"--header='Content-MD5: {md5_checksum}'"
        # wget.download(url, out=attr_folder, options=wget_options)
        # cmd = f"wget --header='Content-MD5: {md5_checksum}' {url} -P {attr_folder}"
        # subprocess.run(cmd, shell=True)



Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_clim.txt
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_geol.txt
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_hydro.txt
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_name.txt
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_soil.txt
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_topo.txt
Processing: https://gdex.ucar.edu/api/v1/dataset/camels/file/camels_vege.txt


7it [00:00, 32.01it/s]


In [None]:
%%bash
DATA_DIR="../data/camels/ucar/"
if [ -d "$DATA_DIR" ]; then rm -Rf $DATA_DIR; fi
mkdir $DATA_DIR
filenames=(camels_clim.txt, camels_geol.txt camels_hydro.txt camels_name.txt camels_soil.txt camels_topo.txt camels_vege.txt)
for filename in ${filenames[@]}
do 
    wget -O "${DATA_DIR}${filename}" "https://zenodo.org/record/4072701/files/${filename}?download=1"
done

In [45]:
dfs = {}
for filename in filenames:
    with open(os.path.join(attr_folder, filename)) as f:
        dfs[filename] = pd.read_csv(f, sep=";", index_col="gauge_id")
df = pd.concat([dfs[filename] for filename in filenames], axis=1)
df.head()

Unnamed: 0_level_0,p_mean,pet_mean,p_seasonality,frac_snow,aridity,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,low_prec_dur,...,area_geospa_fabric,frac_forest,lai_max,lai_diff,gvf_max,gvf_diff,dom_land_cover_frac,dom_land_cover,root_depth_50,root_depth_99
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1013500,3.126679,1.971555,0.18794,0.31344,0.630559,12.95,1.348958,son,202.2,3.427119,...,2303.95,0.9063,4.167304,3.340732,0.804567,0.371648,0.883452,Mixed Forests,,
1022500,3.608126,2.119256,-0.11453,0.245259,0.587356,20.55,1.205279,son,233.65,3.662226,...,620.38,0.9232,4.871392,3.746692,0.863936,0.337712,0.820493,Mixed Forests,0.237435,2.238444
1030500,3.274405,2.043594,0.047358,0.277018,0.624111,17.15,1.207746,son,215.6,3.514262,...,3676.09,0.8782,4.6852,3.665543,0.858502,0.351393,0.975258,Mixed Forests,,
1031500,3.522957,2.071324,0.104091,0.291836,0.58795,18.9,1.148936,son,227.35,3.473644,...,766.53,0.9548,4.903259,3.990843,0.870668,0.398619,1.0,Mixed Forests,0.25,2.4
1047000,3.323146,2.090024,0.147776,0.280118,0.628929,20.1,1.165217,son,235.9,3.691706,...,904.94,0.9906,5.086811,4.300978,0.891383,0.445473,0.85045,Mixed Forests,0.241027,2.34018


In [47]:
df.to_csv(os.path.join(attr_folder, "camels_attributes_concat.csv"))