In [None]:
import numpy as np
import json
import pandas as pd
import os
from pathlib import Path, PosixPath
from tqdm import tqdm
import sys
import tensorflow as tf
from typing import List
import gc
import time
import datetime
from datetime import timedelta,date,datetime
!pip install cloudmesh-common -U
from cloudmesh.common.StopWatch import StopWatch

In [None]:
# Switch for the guarded cell below that rebuilds forcing_data.csv from the raw
# forcing files; leave False to skip that step.
CREATE_CSV_FILES = False

# create csv files



## read forcing files

In [None]:
def read_forcing_file(fpath: PosixPath):
    """Load one basin forcing file into a DataFrame indexed by basin and date.

    The file is read as three single-value header lines (stored as gauge
    latitude, gauge elevation and basin area), followed by a column-header
    line and whitespace-separated records whose first three columns are
    year, month and day.

    Args:
        fpath: Path of the forcing file. The basin id is the integer prefix of
            the file name, i.e. everything before the first underscore.

    Returns:
        DataFrame indexed by ``(basin_id, Year_Mnth_Day)`` holding the
        remaining record columns plus the constant columns ``gauge_lat``,
        ``gauge_elv(m)`` and ``basin_area(m2)``.
    """
    basin_id = int(fpath.name.split('_')[0])

    # `delim_whitespace=True` and nested `parse_dates=[[0, 1, 2]]` are
    # deprecated since pandas 2.2, so split on a whitespace regex and build
    # the date column explicitly from the first three columns.
    data = pd.read_csv(fpath, sep=r"\s+", skiprows=3)
    date_cols = list(data.columns[:3])
    dates = pd.to_datetime(pd.DataFrame({
        "year": data[date_cols[0]],
        "month": data[date_cols[1]],
        "day": data[date_cols[2]],
    }))
    # Mirror the old combined-column behaviour: the three source columns are
    # replaced by a single column named after them ("Year_Mnth_Day").
    data = data.drop(columns=date_cols)
    data.insert(0, "_".join(str(c) for c in date_cols), dates)

    # The three lines skipped above carry one number each.
    gauge_lat, gauge_elv_m, basin_area_m2 = np.genfromtxt(fpath, max_rows=3)

    data["basin_id"] = basin_id
    data["gauge_lat"] = gauge_lat
    data["gauge_elv(m)"] = gauge_elv_m
    data["basin_area(m2)"] = int(basin_area_m2)

    data.set_index(['basin_id', 'Year_Mnth_Day'], inplace=True)

    return data

def read_forcing_data(data_dir: str):
    """Read every ``*_forcing_leap.txt`` file below ``data_dir`` into one frame.

    Files are discovered recursively, parsed with ``read_forcing_file`` and
    concatenated row-wise; progress is reported on stdout through tqdm.

    Args:
        data_dir: Directory searched recursively for forcing files.

    Returns:
        The row-wise concatenation of all per-basin forcing frames.
    """
    forcing_files = list(Path(data_dir).glob('**/*_forcing_leap.txt'))
    progress = tqdm(forcing_files, file=sys.stdout, position=0)

    frames = []
    for forcing_file in progress:
        progress.set_description("process " + forcing_file.name)
        frames.append(read_forcing_file(forcing_file))

    return pd.concat(frames, axis=0, copy=False)

In [None]:
# Rebuild the merged forcing table and write it next to the raw files; runs
# only when CREATE_CSV_FILES is True.
# NOTE(review): `forcing_data` is created only inside this branch, yet
# read_discharge_data() reads it as a global.
if CREATE_CSV_FILES:
  # NOTE(review): absolute, user-specific path — adjust for your environment.
  maurer_ext_path = "/N/u2/d/dnperera/Colab Datasets/maurer_extended"
  forcing_data = read_forcing_data(maurer_ext_path)
  forcing_data.to_csv(maurer_ext_path + "/forcing_data.csv")
  # Bare expression nested inside the `if`: it is evaluated but not rendered
  # as cell output.
  forcing_data

## read discharge files

In [None]:
def read_discharge_file(fpath: PosixPath, area):
    """Load one streamflow file and convert discharge to mm per day.

    Args:
        fpath: Path of a whitespace-separated streamflow file with six
            columns: basin id, year, month, day, discharge, quality flag.
        area: Basin area. The conversion below multiplies it by 10**6, i.e.
            it is treated as square metres.

    Returns:
        DataFrame indexed by ``(basin_id, Year_Mnth_Day)`` with the columns
        ``QObs(mm/d)`` (converted discharge) and ``flag``.
    """
    # `delim_whitespace=True` and nested `parse_dates=[[1, 2, 3]]` are
    # deprecated since pandas 2.2, so split on a whitespace regex and build
    # the date column explicitly.
    # NOTE(review): header=0 consumes the first line of the file as a header
    # row; if the files have no header line the first record is lost — confirm.
    data = pd.read_csv(fpath, sep=r"\s+", header=0,
                       names=["basin_id", "Year", "Mnth", "Day", "QObs(mm/d)", "flag"])
    dates = pd.to_datetime(pd.DataFrame({
        "year": data["Year"],
        "month": data["Mnth"],
        "day": data["Day"],
    }))
    data = data.drop(columns=["Year", "Mnth", "Day"])
    data.insert(0, "Year_Mnth_Day", dates)

    # Normalize discharge from cubic feet per second to mm per day:
    # 28316846.592 mm^3 per ft^3 (304.8 mm cubed), 86400 s per day, and the
    # area scaled by 10**6 (m^2 -> mm^2).
    data["QObs(mm/d)"] = 28316846.592 * data["QObs(mm/d)"] * 86400 / (area * 10**6)

    data.set_index(['basin_id', 'Year_Mnth_Day'], inplace=True)

    return data


def read_discharge_data(data_dir: str):
    """Read every ``*_streamflow_qc.txt`` file below ``data_dir`` into one frame.

    For each file the basin id is taken from the file-name prefix and the
    basin area is looked up in the module-level ``forcing_data`` frame (first
    row of that basin, column ``basin_area(m2)``) before the file is parsed
    with ``read_discharge_file``.

    Args:
        data_dir: Directory searched recursively for streamflow files.

    Returns:
        The row-wise concatenation of all per-basin discharge frames.
    """
    streamflow_files = list(Path(data_dir).glob('**/*_streamflow_qc.txt'))
    progress = tqdm(streamflow_files, file=sys.stdout, position=0)

    frames = []
    for streamflow_file in progress:
        progress.set_description("process " + streamflow_file.name)
        gauge = int(streamflow_file.name.split('_')[0])
        # `forcing_data` is a global built by the forcing step.
        basin_rows = forcing_data.loc[gauge]
        basin_area = basin_rows.iloc[0]['basin_area(m2)']
        frames.append(read_discharge_file(streamflow_file, basin_area))

    return pd.concat(frames, axis=0, copy=False)

## read attributes files

In [None]:
def read_attributes(data_dir: str):
    """Read all ``camels_*.txt`` attribute tables in ``data_dir`` side by side.

    Each file is a ``;``-separated table indexed by ``gauge_id``; the tables
    are concatenated column-wise so every gauge ends up with one wide row.

    Args:
        data_dir: Directory containing the attribute files (not searched
            recursively).

    Returns:
        The column-wise concatenation of all attribute tables.
    """
    attribute_files = list(Path(data_dir).glob('camels_*.txt'))
    progress = tqdm(attribute_files, file=sys.stdout, position=0)

    tables = []
    for attribute_file in progress:
        progress.set_description("process " + attribute_file.name)
        tables.append(pd.read_csv(attribute_file, delimiter=";", index_col="gauge_id"))

    return pd.concat(tables, axis=1, copy=False)

# Read csv files

In [None]:
# Colab-only: mount Google Drive under /content/gdrive. The paths used in the
# rest of the notebook point below this mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!cp /content/gdrive/My\ Drive/Caravan

In [None]:
# Extract the bzip2-compressed tar archive ./Caravan into the working directory.
!tar xjf ./Caravan

In [None]:
# Column names used as the basin key and the date key when sorting and
# grouping the time-series frame.
BASIN_COL = 'gauge_id'
DATE_COL = 'date'

## Merge Individual Catchment Timeseries Files into One

In [None]:
import glob
from tqdm import tqdm

# One-off preprocessing step: set to True to merge the per-catchment CSV files
# into a single combined CSV.
merge_catchments = False
if merge_catchments:
  path = "/content/gdrive/MyDrive/Caravan/timeseries/csv/hysets"
  csv_files = glob.glob(path + "/*.csv")
  # Gauge id = file name without its 4-character ".csv" suffix.
  gauge_ids = [f.split('/')[-1][:-4] for f in csv_files]
  print(csv_files)
  print(gauge_ids)

  # Read every catchment file and tag its rows with the gauge id (inserted as
  # the second column). A plain loop replaces the former list comprehension
  # that was run only for its side effect and produced a list of None.
  df_original = []
  for csv_file, gauge in zip(tqdm(csv_files), gauge_ids):
    catchment_df = pd.read_csv(csv_file)
    catchment_df.insert(1, 'gauge_id', gauge)
    df_original.append(catchment_df)

  combined_csv = pd.concat(df_original)
  #export to csv
  # The per-file row index is written as an unnamed first column.
  path = "/content/gdrive/MyDrive/Caravan/timeseries/csv/hysets_combined.csv"
  with open(path, 'w', encoding = 'utf-8-sig') as f:
    combined_csv.to_csv(f)

## Shift Date Forward by One Day

In [None]:
#camels
#camelsbr
#camelscl
#hysets
# One-off preprocessing step: set to True to move every timestamp of the
# combined CSV forward by one day and write the result to a new CSV.
shift_date = False
if shift_date:
  # Read in chunks, then concatenate once; concatenating inside the loop
  # re-copies everything read so far on each iteration.
  chunks = pd.read_csv("/content/gdrive/MyDrive/Caravan/timeseries/csv/hysets_combined.csv", chunksize = 100000, low_memory=False)
  input_p = pd.concat(list(chunks))

  # Shift through a DatetimeIndex so only the dates move, then copy the
  # shifted index back into the 'date' column.
  input_p = input_p.set_index(pd.DatetimeIndex(input_p['date']))
  input_p = input_p.shift(1, freq = 'D')
  input_p['date'] = input_p.index
  input_p = input_p.reset_index(drop = True)

  path = "/content/gdrive/MyDrive/Caravan/timeseries/csv/hysets_combined_shifted.csv"
  with open(path, 'w', encoding = 'utf-8-sig') as f:
    input_p.to_csv(f)

## Remove Dynamic Variables

In [None]:
# Reduce a previously saved time-series array to five columns (date, gauge id,
# precipitation, mean temperature, streamflow) and save it under a
# "_unique" name.
remove_variables = True
nation = 'lamah'
if remove_variables:
  TestInputTimeSeries = np.load('/content/gdrive/My Drive/Caravan/training/{}/BasicInputTimeSeries_{}.npy'.format(nation, nation), allow_pickle = True)
  with open('/content/gdrive/My Drive/Caravan/training/{}/metadata_{}.json'.format(nation, nation), 'r') as f:
    metadata = json.load(f)

  # Column names come from the metadata saved alongside the array.
  df = pd.DataFrame(TestInputTimeSeries, columns = metadata["BasicInputTimeSeries"]["fields"])
  # Explicit copy: assigning into a column-sliced frame triggers
  # SettingWithCopyWarning and may not write through.
  df1 = df[['date', 'gauge_id', 'total_precipitation_sum', 'temperature_2m_mean', 'streamflow']].copy()
  df1['date'] = pd.to_datetime(df1['date'], format='%Y-%m-%d')
  BasicInputTimeSeries = df1.to_numpy()
  np.save("/content/gdrive/MyDrive/Caravan/training/{}/BasicInputTimeSeries_{}_unique".format(nation, nation), BasicInputTimeSeries)

In [None]:
# Record the reduced column list in the metadata and save it under the
# "_unique" name. `metadata` is loaded only when `remove_variables` is True,
# so the same flag guards this cell against a NameError.
if remove_variables:
  metadata["BasicInputTimeSeries"]["fields"] = ['date', 'gauge_id', 'total_precipitation_sum', 'temperature_2m_mean', 'streamflow']
  with open('/content/gdrive/MyDrive/Caravan/training/{}/metadata_{}_unique.json'.format(nation, nation), 'w') as outfile:
    json.dump(metadata, outfile, indent='\t')

## Choose Nation

In [None]:
nation = "camels"   # camels, camelsaus, camelscl, camelsgb, camelsbr, hysets, lamah

# Per-nation input files on the mounted Drive: Caravan attributes, combined
# time series, and the "other" attributes table.
attr_path = f"/content/gdrive/MyDrive/Caravan/attributes/{nation}/attributes_caravan_{nation}.csv"
input_path = f"/content/gdrive/MyDrive/Caravan/timeseries/csv/{nation}_combined.csv"
other_path = f"/content/gdrive/MyDrive/Caravan/attributes/{nation}/attributes_other_{nation}.csv"

## Read Nation CSVs

In [None]:
# Static
attr_p = pd.read_csv(attr_path)
other_p = pd.read_csv(other_path)

In [None]:
def timenow():
  """Return the current UTC time formatted as ``MM/DD/YYYY, HH:MM:SS UTC``."""
  # Function-scope import: the notebook imports both the `datetime` module
  # and the `datetime` class under the same name, so bind the class (and
  # `timezone`) explicitly here.
  from datetime import datetime, timezone
  # Ask for UTC explicitly; a bare now() returns local time, which would make
  # the " UTC" suffix wrong on any machine not running in UTC.
  now = datetime.now(timezone.utc)
  return now.strftime("%m/%d/%Y, %H:%M:%S") + " UTC"

# Load the combined time series of the selected nation into `input_p`.
# hysets is read in chunks (with progress lines) from its own preprocessed
# file; every other nation is read in one go from `input_path`.
input_p = pd.DataFrame()
iterator = 0
if nation == "hysets":
  # Collect the chunks and concatenate once; concatenating inside the loop
  # re-copies everything read so far on each iteration.
  hysets_chunks = []
  for chunk in pd.read_csv("/content/gdrive/MyDrive/Caravan/timeseries/csv/hysets_combined_processed_final.csv", chunksize = 1000000, low_memory=False):
    fred = chunk.shape
    print( str(iterator) + ' chunk ' + str(fred) + ' ' + timenow() )
    iterator += 1
    hysets_chunks.append(chunk)
  if hysets_chunks:
    input_p = pd.concat(hysets_chunks)
else:
  input_p = pd.read_csv(input_path)

In [None]:
# Drop the first column by position (presumably the unnamed index column that
# to_csv wrote — confirm against the CSV).
# NOTE(review): not idempotent — re-running this cell removes another column.
input_p = input_p.drop(input_p.columns[0],axis=1)

In [None]:
# Inspect the loaded time-series frame.
input_p

In [None]:
from datetime import timedelta

# Gauge ids kept for the "camels" subset. This is the same list that the PCA
# section assigns; it is defined here as well so that this cell also runs on
# a fresh kernel, before the PCA section has been executed.
unique = [1139000, 1365000, 1664000, 2324400, 4045500,
          4127918, 5120500, 6280300, 6431500, 6470800,
          6479438, 6622700, 6632400, 7142300, 7197000,
          8086290, 8190000, 8377900, 9210500, 9492400,
          10172700, 10249300, 10259200, 10263500, 12381400,
          13083000, 13161500, 13240000, 13313000]

input_p.sort_values([DATE_COL, BASIN_COL], inplace=True, ignore_index=True)
# Reduce ids such as "camels_01139000" to the integer after the last
# underscore, with upper-case letters stripped. str(x) keeps the conversion
# re-runnable once the column already holds integers.
input_p['gauge_id'] = [int(str(x).split('_')[-1].translate({ord(i): None for i in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'})) for x in input_p['gauge_id']]
input_p['date'] = pd.to_datetime(input_p['date'], format='%Y-%m-%d')
# Keep the common window 1989-10-02 .. 2008-12-31 (inclusive).
input_p = input_p[(input_p['date'] >= '1989-10-02') & (input_p['date'] <= '2008-12-31')].reset_index(drop = True)
NaN_rows = []

# remove the three catchments with NaN values in camels_us
if nation == "camels":
  # gauge_id is an integer column at this point, so compare against integers;
  # comparing with the original string ids never matched any row.
  input_p = input_p[~input_p['gauge_id'].isin([3066000, 3281100, 12141300])].reset_index(drop = True)

  input_p = input_p.loc[input_p['gauge_id'].isin(unique)]


## sanity check

In [None]:
def find_time_range(df: pd.DataFrame):
  """Return the date range that every basin in ``df`` covers.

  The lower bound is the latest of the per-basin first dates and the upper
  bound the earliest of the per-basin last dates. Basins with fewer than 1000
  rows are printed as a warning but are still included in the bounds.

  Args:
    df: Frame containing the ``BASIN_COL`` and ``DATE_COL`` columns.

  Returns:
    Tuple ``(min_d, max_d)`` with the common start and end date.
  """
  per_basin = df[[BASIN_COL, DATE_COL]].groupby([BASIN_COL])

  # Flag basins with suspiciously short records.
  row_counts = per_basin.count()
  short_basins = row_counts[row_counts[DATE_COL] < 1000].index.values
  if len(short_basins) > 0:
    print('abnormal cols', short_basins)

  # Latest start / earliest end across all basins.
  min_d = np.max(per_basin.min().values)
  max_d = np.min(per_basin.max().values)
  print(min_d, max_d)

  return min_d, max_d

In [None]:
# Common date range shared by all basins; reused later for the metadata.
InitialDate, EndDate = find_time_range(input_p)
print("InitialDate ", InitialDate)
print("EndDate ", EndDate)

In [None]:
# read basin meta data
basin_meta_p = pd.read_csv("/content/gdrive/MyDrive/Caravan/attributes/{}/attributes_other_{}.csv".format(nation, nation))
basin_meta_p.drop_duplicates(inplace=True, ignore_index=True)
basin_meta_p['gauge_id'] = [int(x.split('_')[-1].translate({ord(i): None for i in 'ABCDEFGHJKLMNPOQRSTUVWXYZ'})) for x in basin_meta_p['gauge_id']]
if nation == "camels":
  basin_meta_p.drop(basin_meta_p[basin_meta_p['gauge_id'] == '3066000'].index, inplace = True)
  basin_meta_p.drop(basin_meta_p[basin_meta_p['gauge_id'] == '3281100'].index, inplace = True)
  basin_meta_p.drop(basin_meta_p[basin_meta_p['gauge_id'] == '12141300'].index, inplace = True)

In [None]:
# Inspect the basin metadata table.
basin_meta_p

## Dynamic Data

In [None]:
# Edit gauge ids to only contain integers
input_p.sort_values([DATE_COL, BASIN_COL], inplace=True, ignore_index=True)
input_p['gauge_id'] = [int(x.split('_')[-1].translate({ord(i): None for i in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'})) for x in input_p['gauge_id']]

In [None]:
# Inspect the frame after sorting and id normalisation.
input_p

In [None]:
# Quick overview: number of columns, NaN count per column, and the number of
# rows (length of the streamflow column).
print(len(input_p.columns))
print(input_p.isna().sum())
print('Total number of data for each column: ', len(input_p['streamflow']))

In [None]:
import matplotlib.pyplot as plt
# Set to True to plot, per date, how many rows have a missing streamflow value.
analyze_NaN = False

if analyze_NaN:
  from collections import Counter

  # Dates of all rows whose streamflow is NaN.
  nan_dates = input_p.loc[input_p['streamflow'].isna(), 'date']

  # Count every date in a single pass. The earlier list.count() call per
  # distinct date was quadratic, and the old global named `date` shadowed
  # datetime.date.
  nan_per_date = Counter(nan_dates)
  distinct_date = list(nan_per_date)
  count = [nan_per_date[d] for d in distinct_date]
  print(count)
  print(len(count))

  plt.bar(distinct_date, count)

### NaN Sequence **Over** Catchment Graph

In [None]:
# group intervals of NaN values
def find_intervals(df, date, column):
  """Group the ``date`` values of consecutive NaN rows of ``column``.

  Args:
    df: Frame to scan, in the row order that defines "consecutive".
    date: Name of the column whose values are collected.
    column: Name of the column checked for NaN.

  Returns:
    A list with one entry per uninterrupted run of NaN rows; each entry is
    the list of ``date`` values of that run.
  """
  missing = df[column].isna()
  # The running count of non-NaN rows is constant within a NaN run and
  # increases between runs, so it serves as a run identifier.
  run_id = (~missing).cumsum()
  return [list(block[date]) for _, block in df[missing].groupby(run_id)]

In [None]:
import matplotlib.pyplot as plt
# Set to True to draw, per catchment, a stacked bar of its NaN runs.
analyze_NaN = False
if analyze_NaN:
  from matplotlib.patches import Patch

  # NaN interpolation
  test_df = input_p

  # group dataframe by catchment
  catchments = list(test_df['gauge_id'].unique())
  print(len(catchments))
  print(catchments)

  # For every catchment with missing streamflow, express the length of each
  # NaN run as a percentage of 7031 time steps (the number of days from
  # 1989-10-02 to 2008-12-31 inclusive). groupby visits each catchment once
  # instead of filtering the whole frame per catchment.
  NaN_intervals = []
  ids = []
  for c, sub_df in test_df.groupby('gauge_id', sort=False):
    ranges = find_intervals(sub_df, 'date', 'streamflow')
    if len(ranges) > 0:
      NaN_intervals.append([(len(x)/7031)*100 for x in ranges])
      ids.append(c)


  print(NaN_intervals)
  print(len(NaN_intervals))
  print(ids)


  def _gap_colour(pct):
    # Colour of one stacked segment, chosen by the length of the NaN run.
    if pct >= 0 and pct < 20:
      return 'b'
    elif pct >= 20 and pct < 50:
      return 'g'
    elif pct >= 50 and pct < 100:
      return 'r'
    return 'y'


  # Plot
  width = 1
  position = 0

  plt.figure(figsize=(50,15))
  for i in range(len(ids)):
    # One bar per catchment; its NaN runs are stacked on top of each other.
    b = 0
    for cur_length in NaN_intervals[i]:
      plt.bar(position, cur_length, width, bottom = b, color = _gap_colour(cur_length))
      b += cur_length

    position += width

  plt.title('{} NaN Distribution Plot'.format(nation))
  plt.ylabel('NaN Percentages (%)')
  plt.xlabel('Catchment')
  # Explicit proxy handles: passing only labels attaches them to the first
  # four bars drawn, whatever their colour.
  plt.legend(handles = [Patch(color = 'b', label = 'Blue: x < 20%'),
                        Patch(color = 'g', label = 'Green: 20% <= x < 50%'),
                        Patch(color = 'r', label = 'Red: 50% <= x < 100% '),
                        Patch(color = 'y', label = 'Yellow: 100%')])
  plt.show()

### Determine Dynamic Variable Normalization

In [None]:
# Check RAM usage
import sys

local_vars = list(locals().items())
for var, obj in local_vars:
    print(var, sys.getsizeof(obj))

In [None]:
# Inspect the time-series frame before it is converted to a NumPy array.
input_p

In [None]:
# Append the shifted hysets CSV to `input_p`, reading it in chunks.
# NOTE(review): this runs for every nation and extends whatever `input_p`
# already holds — confirm that it is only meant to be executed for hysets.
# Chunks are collected and concatenated once; concatenating inside the loop
# re-copies everything read so far on each iteration.
shifted_chunks = [input_p]
for chunk in pd.read_csv("/content/gdrive/MyDrive/Caravan/timeseries/csv/hysets_combined_shifted.csv", chunksize = 500000, low_memory=False):
    fred = chunk.shape
    print( str(iterator) + ' chunk ' + str(fred) + ' ' + timenow() )
    iterator += 1
    shifted_chunks.append(chunk)
input_p = pd.concat(shifted_chunks)

In [None]:
# Convert the time-series frame to a NumPy array (one row per frame row, one
# column per frame column) and show its shape and first row.
BasicInputTimeSeries = input_p.to_numpy()

print(BasicInputTimeSeries.shape)
print(BasicInputTimeSeries[0])

In [None]:
# Drop the references to the large frames and force a garbage collection.
# After this cell `input_p` is None, so any cell that needs the frame must be
# run before this one (or the data must be reloaded).
input_p = None
chunk = None
gc.collect()

In [None]:
# `input_p` is set to None by the memory-release cell just above, so only
# inspect the columns while the frame is still loaded; the bare attribute
# access raised AttributeError when the notebook was run top to bottom.
input_p.columns if input_p is not None else None

### Check NaN-containing Sequence Fraction

In [None]:
# For every catchment, measure the fraction of sliding windows of Tseq
# consecutive time steps that contain at least one NaN, then plot it.
# Runs only when analyze_NaN is True.
from datetime import datetime

if analyze_NaN:
  #TODO: Open numpy from Gdrive
  TestInputTimeSeries = np.load('/content/gdrive/My Drive/Caravan/training/{}/BasicInputTimeSeries_{}.npy'.format(nation, nation), allow_pickle = True)
  TestInputStaticProps = np.load('/content/gdrive/My Drive/Caravan/training/{}/BasicInputStaticProps_{}.npy'.format(nation, nation), allow_pickle = True)
  print(len(TestInputStaticProps))

  # Window length in time steps.
  Tseq = 21
  # NOTE(review): the "- 3" presumably accounts for the three camels basins
  # that are meant to be removed — confirm it is valid for other nations.
  Nloc = len(attr_p['gauge_id'])-3
  NuminputSeries = TestInputTimeSeries.shape[1]
  print(NuminputSeries)
  # InitialDate = datetime.strptime(str(InitialDate),'%Y-%m-%dT%H:%M:%S.%f000')
  # FinalDate = datetime.strptime(str(EndDate),'%Y-%m-%dT%H:%M:%S.%f000')

  # Hard-coded analysis window.
  init = datetime.strptime('1989-10-02T00:00:00.000000000','%Y-%m-%dT%H:%M:%S.%f000')
  final = datetime.strptime('2008-12-31T00:00:00.000000000','%Y-%m-%dT%H:%M:%S.%f000')

  # Number of daily steps in the window (both end dates included) and the
  # number of window start positions that are evaluated.
  NumberofTimeunits = (final-init).days + 1
  Num_Seq = int(NumberofTimeunits - Tseq)

  num_catchments = Nloc
  RawInputSeqDimension = Tseq
  print(num_catchments)

  # TODO: Reshape
  # Drop the first two columns, then view the rows as
  # [time step, catchment, variable].
  # NOTE(review): assumes the first two columns are date and gauge id and that
  # rows are ordered by date, then by catchment — confirm against the saved array.
  TestInputTimeSeries = np.delete(TestInputTimeSeries,[0,1],1)
  TestInputTimeSeries = np.reshape(TestInputTimeSeries,[NumberofTimeunits,Nloc,NuminputSeries-2])


  iseq = 0
  # nans[k]: number of windows of catchment k that contain a NaN;
  # nan / no_nan: totals over all catchments.
  nans = [0 for i in range(num_catchments)]
  nan = 0
  no_nan = 0


  while iseq < Num_Seq:
    icatchment = 0
    while icatchment < num_catchments:
      # if pd.isna(TestInputTimeSeries[iseq:iseq+Tseq, icatchment]).any():
      #   nans[icatchment] += 1

      # A window counts as "NaN" if any variable at any of its Tseq steps is missing.
      if pd.isna(TestInputTimeSeries[iseq:iseq+Tseq, icatchment]).any():
        nan += 1
        nans[icatchment] += 1
      else:
        no_nan += 1

      icatchment += 1

    iseq += 1

  print(nan, no_nan)


  # One bar per catchment: red when more than half of its windows contain a
  # NaN, green otherwise.
  nan_ratios = [x/Num_Seq for x in nans]
  plt.figure(figsize=(50,15))
  position = 0
  for i in range(len(nan_ratios)):
    if nan_ratios[i] > 0.5:
      plt.bar(position, nan_ratios[i], color = 'r')
    else:
      plt.bar(position, nan_ratios[i], color = 'g')
    position += 1

  plt.xlabel('catchment #')
  plt.ylabel('nan ratio')
  plt.show()

## Static data

In [None]:
# attr_p['gauge_id'] = [int(x.split('_')[-1]) for x in attr_p['gauge_id']]
# Normalise gauge ids to integers (part after the last underscore, upper-case
# letters stripped). str(x) keeps the conversion re-runnable once the column
# already holds integers.
attr_p['gauge_id'] = [int(str(x).split('_')[-1].translate({ord(i): None for i in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'})) for x in attr_p['gauge_id']]

if nation == "camels":
  # `unique` is not assigned in this cell; in the original notebook it is
  # assigned in the PCA section, which must have been executed beforehand.
  attr_p = attr_p.loc[attr_p['gauge_id'].isin(unique)]
  # Filter the basin metadata the same way (as the PCA section does for its
  # `other` table). The column-wise concat below aligns on the row index, so
  # an unfiltered metadata table would re-add every removed basin as a row of
  # NaN attributes.
  basin_meta_p = basin_meta_p.loc[basin_meta_p['gauge_id'].isin(unique)]

# Taken after filtering so that they describe the basins actually kept.
locs = attr_p['gauge_id']
Nloc = len(attr_p['gauge_id'])

# Append the remaining basin metadata columns to the attribute table.
# NOTE(review): rows are matched by index position, i.e. both CSV files are
# assumed to list the basins in the same order. Re-running this cell appends
# the metadata columns a second time.
meta_concat = basin_meta_p.drop(columns=['gauge_id', 'gauge_name', 'gauge_lat', 'gauge_lon', 'country'])
attr_p = pd.concat([attr_p, meta_concat], axis=1)

print(len(attr_p))

In [None]:
# Inspect the combined static attribute table.
attr_p

## PCA

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

nation_li = ['camels', 'camelsaus', 'camelsbr', 'camelscl', 'camelsgb', 'hysets', 'lamah']
caravan_attr_combined = pd.DataFrame()

# process each nation's dataset
for nat in nation_li:
  attr = pd.read_csv("/content/gdrive/MyDrive/Caravan/attributes/{}/attributes_caravan_{}.csv".format(nat, nat))
  other = pd.read_csv("/content/gdrive/MyDrive/Caravan/attributes/{}/attributes_other_{}.csv".format(nat, nat))

  if nat == "camels":
    unique = [1139000, 1365000, 1664000, 2324400, 4045500,
              4127918, 5120500, 6280300, 6431500, 6470800,
              6479438, 6622700, 6632400, 7142300, 7197000,
              8086290, 8190000, 8377900, 9210500, 9492400,
              10172700, 10249300, 10259200, 10263500, 12381400,
              13083000, 13161500, 13240000, 13313000]
    attr['gauge_id'] = [int(x.split('_')[-1].translate({ord(i): None for i in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'})) for x in attr['gauge_id']]
    other['gauge_id'] = [int(x.split('_')[-1].translate({ord(i): None for i in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'})) for x in other['gauge_id']]
    other = other.loc[other['gauge_id'].isin(unique)]
    attr = attr.loc[attr['gauge_id'].isin(unique)]

  if nat == "hysets":
    other_concat = other.drop(columns=['gauge_id', 'gauge_name', 'country'])
    attr = pd.concat([attr, other_concat], axis=1)

  else:
    other_concat = other.drop(columns=['gauge_id', 'gauge_name', 'gauge_lat', 'gauge_lon', 'country'])
    attr = pd.concat([attr, other_concat], axis=1)


  if len(caravan_attr_combined) == 0:
    caravan_attr_combined = attr
  else:
    caravan_attr_combined = pd.concat([caravan_attr_combined, attr], axis=0)


# for column in caravan_attr_combined.columns:
#   print(sum(caravan_attr_combined[column].isna()))


# Standardize Data

data = caravan_attr_combined.drop(columns = ['gauge_id'])
data = (data - data.mean(axis = 0)) / data.std(axis = 0)


# Calculate Covariance Matrix
covariance_matrix = np.cov(data, ddof = 0, rowvar = False)


# Eigendecomposition on the Covariance Matrix
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
print(eigenvalues)
print("="*100)


# np.argsort can only provide lowest to highest; use [::-1] to reverse the list
order_of_importance = np.argsort(eigenvalues)[::-1]


# utilize the sort order to sort eigenvalues and eigenvectors
labels = data.columns
sorted_eigenvalues = eigenvalues[order_of_importance]
sorted_eigenvectors = eigenvectors[:,order_of_importance]
sorted_labels = labels[order_of_importance]
print("="*100)
for i in range(len(sorted_labels)):
  print("{} : {}".format(sorted_labels[i], eigenvalues[i]))

# import matplotlib.pyplot as plt
# labels = data.columns
# plt.bar(labels, eigenvalues, width = 0.4)

In [None]:
# Convert the static attribute table to a NumPy array.
BasicInputStaticProps = attr_p.to_numpy()
# Number of columns minus one (presumably every column except gauge_id — confirm).
NpropperTimeStatic = len(attr_p.columns) - 1
print('NpropperTimeStatic', NpropperTimeStatic)

# save datasets

In [None]:
# Save the time-series array to Drive (np.save appends the ".npy" extension).
# NOTE(review): the "remove variables" cell writes to this same file name.
np.save("/content/gdrive/MyDrive/Caravan/training/{}/BasicInputTimeSeries_{}_unique".format(nation, nation), BasicInputTimeSeries)

In [None]:
# Shape of the array that was just saved.
BasicInputTimeSeries.shape

In [None]:
# Python type of a single element (row 0, column 3) of the array.
type(BasicInputTimeSeries[0,3])

In [None]:
# Save the static attribute array to Drive (np.save appends the ".npy" extension).
np.save("/content/gdrive/MyDrive/Caravan/training/{}/BasicInputStaticProps_{}_unique".format(nation, nation), BasicInputStaticProps)

In [None]:
# String form of the start date, as it is stored in the metadata.
str(InitialDate)

In [None]:
# str(BasicInputTimeSeries[1, 1] - BasicInputTimeSeries[0, 1])

In [None]:
# Number of daily steps from InitialDate to EndDate, both dates included.
int(((EndDate-InitialDate + np.timedelta64(1, 'D'))/np.timedelta64(1, 'D')))

In [None]:
# Difference between column 0 of row 500 and of row 0, as a string.
# NOTE(review): this is the time step only if column 0 holds the date and
# rows 0 and 500 fall on consecutive dates — confirm.
str(BasicInputTimeSeries[500, 0] - BasicInputTimeSeries[0, 0])

In [None]:
# Metadata written next to the saved arrays: basin ids and names, plus the
# column layout of the time-series and static-property arrays.
meta_data = {
    'Nloc': len(attr_p),
    'locs': attr_p['gauge_id'].tolist(),
    'loc_names': basin_meta_p['gauge_name'].tolist(),
    'BasicInputTimeSeries':{
      # NOTE(review): this column list is hard-coded rather than taken from
      # the frame the array was built from — confirm that it matches.
      'fields': [
			"date",
			"gauge_id",
			"snow_depth_water_equivalent_mean",
			"surface_net_solar_radiation_mean",
			"surface_net_thermal_radiation_mean",
			"surface_pressure_mean",
			"temperature_2m_mean",
			"dewpoint_temperature_2m_mean",
			"u_component_of_wind_10m_mean",
			"v_component_of_wind_10m_mean",
			"volumetric_soil_water_layer_1_mean",
			"volumetric_soil_water_layer_2_mean",
			"volumetric_soil_water_layer_3_mean",
			"volumetric_soil_water_layer_4_mean",
			"snow_depth_water_equivalent_min",
			"surface_net_solar_radiation_min",
			"surface_net_thermal_radiation_min",
			"surface_pressure_min",
			"temperature_2m_min",
			"dewpoint_temperature_2m_min",
			"u_component_of_wind_10m_min",
			"v_component_of_wind_10m_min",
			"volumetric_soil_water_layer_1_min",
			"volumetric_soil_water_layer_2_min",
			"volumetric_soil_water_layer_3_min",
			"volumetric_soil_water_layer_4_min",
			"snow_depth_water_equivalent_max",
			"surface_net_solar_radiation_max",
			"surface_net_thermal_radiation_max",
			"surface_pressure_max",
			"temperature_2m_max",
			"dewpoint_temperature_2m_max",
			"u_component_of_wind_10m_max",
			"v_component_of_wind_10m_max",
			"volumetric_soil_water_layer_1_max",
			"volumetric_soil_water_layer_2_max",
			"volumetric_soil_water_layer_3_max",
			"volumetric_soil_water_layer_4_max",
			"total_precipitation_sum",
			"potential_evaporation_sum",
			"streamflow"
		  ],
      'index_fields': [BASIN_COL, DATE_COL],
      'initial_date': str(InitialDate),
      'end_date': str(EndDate),
      # NOTE(review): equals the time step only if rows 0 and 500 fall on
      # consecutive dates — confirm.
      'time_delta': str(BasicInputTimeSeries[500, 0] - BasicInputTimeSeries[0, 0]),
      # 'time_steps': int(((EndDate-InitialDate + np.timedelta64(1, 'D'))/np.timedelta64(1, 'D'))),
      # Hard-coded: 7031 is the number of days from 1989-10-02 to 2008-12-31
      # inclusive, i.e. the window applied when filtering the time series.
      'time_steps': 7031,
    },
    'BasicInputStaticProps': {
        'fields': attr_p.columns.values.tolist(),
        # NOTE(review): the key is spelled 'index_fileds' (the time-series
        # section uses 'index_fields'); left unchanged because readers of the
        # JSON may depend on the existing key.
        'index_fileds': ['gauge_id'],
    }
}

In [None]:
# Inspect the metadata before writing it.
meta_data

In [None]:
# Write the metadata as tab-indented JSON next to the saved arrays.
with open('/content/gdrive/MyDrive/Caravan/training/{}/metadata_{}_unique.json'.format(nation, nation), 'w') as outfile:
  json.dump(meta_data, outfile, indent='\t')

In [None]:
# Pack the metadata JSON and all Basic*.npy files of the working directory
# into a bzip2-compressed archive.
# NOTE(review): names are hard-coded for hysets and refer to the working
# directory, whereas the cells above save "*_unique" files to Drive — confirm
# that these files exist before running.
!tar cjf hysets.tar.bz2 metadata_hysets.json Basic*.npy

In [None]:
# Copy the archive to the hysets training folder on Drive, overwriting an
# existing copy.
!cp -f hysets.tar.bz2 /content/gdrive/My\ Drive/Caravan/training/hysets

In [None]:
# Reload the saved (non-"_unique") time-series array and show its shape as a
# sanity check.
a = np.load("/content/gdrive/MyDrive/Caravan/training/{}/BasicInputTimeSeries_{}.npy".format(nation, nation), allow_pickle=True)
a.shape

In [None]:
# Archive holding the prepared data of the selected nation, and the folder it
# is extracted into.
PreparedDataFile = "/content/gdrive/MyDrive/Caravan/timeseries/csv/{}.tar.bz2".format(nation)
APPLDIR = '/content/gdrive/MyDrive/Caravan/training/{}'.format(nation)

In [None]:
# Extract the prepared-data archive into the training folder.
!tar xjf $PreparedDataFile -C $APPLDIR
# NOTE(review): `PreparedDataFile2` is not defined anywhere in the visible
# notebook, so this second command has no archive to extract — define the
# variable or remove the line.
!tar xjf $PreparedDataFile2 -C $APPLDIR

In [None]:
# Load the extracted arrays back into memory; allow_pickle is needed for
# object-dtype arrays. Only load .npy files from a trusted source when
# pickling is enabled.
BasicInputStaticProps = np.load(APPLDIR + '/BasicInputStaticProps_{}.npy'.format(nation), allow_pickle = True)
BasicInputTimeSeries = np.load(APPLDIR + '/BasicInputTimeSeries_{}.npy'.format(nation), allow_pickle = True)

In [None]:
# Shape of the reloaded time-series array.
BasicInputTimeSeries.shape

In [None]:
# First five rows of the reloaded time-series array.
BasicInputTimeSeries[0:5]

In [None]:
# Load the metadata JSON that accompanies the extracted arrays.
with open(APPLDIR + '/metadata_{}.json'.format(nation), 'r') as f:
  metadata = json.load(f)

In [None]:
# Inspect the loaded metadata.
metadata