In [1]:
# ---------------------------------------------
# IMPORT LIBRARIES
# ---------------------------------------------
import numpy as np
import pandas as pd
import os
import pickle

# Directory whose entries are listed with os.listdir and read one by one with
# np.load further down (one saved array per watershed).
data_path = '/path/hru_region02_np_arr'
# Output prefix for the per-HRU files written with np.save. It is concatenated
# directly with the file name, so the trailing slash is required.
save_dir = '/path/HRU_TGCN_sub/'


# ---------------------------------------------
# LOAD FEATURE NAMES
# feat_names is sliced by position further down to label the
# columns of each watershed array, so its order matters.
# NOTE(review): a feature description dictionary
# (feature_name: feature definition pairs based on SWAT IO
# documentation) is not loaded by this script; only the
# feature-name pickle is read. If the descriptions are needed,
# load them from their own file into a separate variable.
# NOTE: pickle.load can execute arbitrary code -- only point
# this at a file you trust.
# --------------------------------------------
with open('/path/SWAT_feat_names.pkl', 'rb') as f:
    feat_names = pickle.load(f)

    
# ---------------------------------------------
# SUBSET OF FEATURES FOR SM PREDICTION
# **************** USER INPUT *****************
# Define features used for estimating soil moisture
# --------------------------------------------- 
# Columns kept for soil-moisture estimation. Each name must be one of the
# column labels assigned to the per-watershed DataFrame further down.
sub_feat_names = ['AREA', 'PRECIP' , 'ET', 'SW_END', 'PERC', 'GW_RCHG', 'DA_RCHG', 'REVAP', 'SA_IRR', 'DA_IRR', 'SA_ST', 'DA_ST',
                 'WYLD', 'DAILYCN', 'TMP_AV', 'SOL_TMP', 'SOLAR']

# Every remaining entry of data_path is later passed to np.load as a watershed
# array, so bookkeeping files that may sit next to the data are filtered out.
# (The filter is applied to watershed_names_list itself; an undefined name
# here would raise NameError as soon as one of these files is present.)
watershed_names_list = [
    entry for entry in os.listdir(data_path)
    if entry not in ('.DS_Store', 'README')
]
    
    
# ---------------------------------------------
# INDEPENDENT HRU FILES
# Instead of keeping HRUs clumped together in watersheds,
# associate universal IDs with the HRUs for traceability
# and save them as independent files.
# --------------------------------------------- 
# Feature names from position 5 onward (not used again in this section).
clustering_feature_names = feat_names[5:]
n_tstep = 38 * 12                 # 38 simulated years x 12 monthly steps
n_feat = len(sub_feat_names)      # number of features kept per HRU

for name in watershed_names_list:
    features = np.load(f'{data_path}/{name}')

    # Label the columns, then keep only the monthly records: rows whose MON
    # exceeds 12 are the annual summaries (one year) and the simulation
    # summary (all 38 years).
    df = pd.DataFrame(features, columns=feat_names[5:86])
    df = df[~(df.MON > 12)]

    # Index label of the first row whose MON is not 1. This is used as the
    # HRU count -- assumes the file starts with one MON == 1 row per HRU and
    # that the original row labels are still in place (TODO confirm).
    n_hru = df.MON.ne(1).idxmax()

    # Keep the selected features and fold the flat table into
    # (monthly time step, HRU, feature).
    sub_df = df[sub_feat_names]
    num_features = sub_df.to_numpy().reshape(n_tstep, n_hru, n_feat)

    # One file per HRU, shaped (time step, feature), named
    # <watershed id>.<HRU count>.<HRU position> under save_dir.
    watershed_id = name.split('_')[0]
    for i in range(n_hru):
        np.save(f'{save_dir}{watershed_id}.{n_hru}.{i}', num_features[:, i, :])
        
# Read back the saved HRU names, grouped by cluster. Within each cluster the
# names follow the order in which the HRUs appear in that cluster's
# distance array.
with open('/path/hru_names_clusterwise.pkl', 'rb') as f:
    hru_names_clusterwise = pickle.load(f)

    
# ---------------------------------------------
# Load the HRUs and save them as a numpy array; each array
# is ordered according to the results from 03_clustering.
# This, along with the cluster-specific distance array,
# is used to generate the graphs.
# --------------------------------------------- 
# Assemble one array per cluster (clusters 0..11) from the per-HRU files.
for i in range(12):
    n_hrus = len(hru_names_clusterwise[i])

    # Output layout must be (time step, HRU, feature). The HRU axis follows
    # the order of the names stored for this cluster.
    data = np.zeros(shape=(n_tstep, n_hrus, n_feat))
    for j, name in enumerate(hru_names_clusterwise[i]):
        # Each per-HRU file holds a (time step, feature) table.
        data[:, j, :] = np.load(f'/path/{name}.npy')

    np.save(f'/path/sm_est_hru_data_clstr_{i}', data)
    
    
# ---------------------------------------------
# VISUALIZE GRAPH FROM CLUSTER 
# Run at your own risk - Very slow for large graphs
# you have been warned !! 
# --------------------------------------------- 

# import networkx as nx
# from torch_geometric.utils.convert import to_networkx
# from sklearn.neighbors import radius_neighbors_graph

# cluster_id = 6
# clstr_distance = np.load('/path/'+'dist.in.cluster_'+str(cluster_id)+'.npy')
# adj_csr = radius_neighbors_graph( clstr_distance, 1, include_self = True)

# G = nx.Graph(adj_csr)
# nx.draw(G)