# In [8]:
# ---------------------------------------------
# IMPORT LIBRARIES
# ---------------------------------------------
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMinMax, TimeSeriesScalerMeanVariance


# ---------------------------------------------
# DEPENDENCIES 
# the following depend on 01_Gen_Clustering_Data:
# - clustering_feature_names
# - n_hrus
# - all_data
# ---------------------------------------------
# Load the list of clustering feature names from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a file produced by a trusted run of 01_Gen_Clustering_Data.
# NOTE(review): this path has no user directory, unlike data_path/save_path
# further down - confirm it is correct on the target machine.
with open('/Users/Documents/GitHub/GNNs_PrecisionAgriculture/SWAT Data Reader/clustering_feature_names.pkl', 'rb') as f:
    clustering_feature_names = pickle.load(f)
# Number of features = number of loaded names (the previous code used the
# undefined name `feat_names`, which raised NameError).
n_feat = len(clustering_feature_names)

data_path = '/Users/muneeza/Documents/GitHub/DATA_SMest/HRU_clustering'
save_path = '/Users/muneeza/Documents/GitHub/DATA_SMest/HRU_Clustering_results/'
# Skip the macOS '.DS_Store' metadata file when it exists. Filtering (instead
# of list.remove) avoids a ValueError when the directory does not contain it.
# The order of the remaining entries is whatever os.listdir returns.
names_list = [name for name in os.listdir(data_path) if name != '.DS_Store']

# Since every watershed file has a different number of HRUs,
# we parse the number of HRUs from the file name (the second-to-last
# dot-separated token) and sum them to allocate memory for all HRUs
# in region02.
n_hrus = np.array([name.split('.')[-2] for name in names_list]).astype(int)
hrus_total = np.sum(n_hrus)
all_data = np.zeros((12, hrus_total, 10))   # (months, hrus, features)


# ---------------------------------------------
# LOAD DATA
# loop through all watershed data and concatenate 
# all hrus in one array
# ---------------------------------------------
# Each watershed file occupies a contiguous slice of the HRU axis; the slice
# boundaries are the running totals of the per-file HRU counts.
slice_ends = np.cumsum(n_hrus)
slice_starts = slice_ends - n_hrus
for name, st, en in zip(names_list, slice_starts, slice_ends):
    all_data[:, st:en, :] = np.load(f"{data_path}/{name}")

# Reorder axes from (months, hrus, features) to (hrus, months, features).
all_data = all_data.transpose(1, 0, 2)


# ---------------------------------------------
# DEFINE NORMALIZATION AND CLUSTERING FUNCTIONS
# ---------------------------------------------
def normalization(type, all_data, n_feat):
    """Return a normalized copy of a 3-D data array.

    Parameters
    ----------
    type : str
        Normalization scheme (the name shadows the builtin but is kept so
        existing callers keep working):

        - ``'custom'``: min-max scale each of the first ``n_feat`` features
          to [0, 1], using the global minimum and maximum of that feature
          over the first two axes.
        - ``'minmax'``: ``TimeSeriesScalerMinMax(value_range=(0, 1))``.
        - ``'std'``: ``TimeSeriesScalerMeanVariance(mu=0, std=1)``.
        - anything else: no normalization.
    all_data : numpy.ndarray
        Array indexed as ``[:, :, feature]``.
    n_feat : int
        Number of leading features to scale. Only used by ``'custom'``.

    Returns
    -------
    numpy.ndarray
        The normalized array. For an unrecognized ``type`` the input object
        itself is returned, not a copy.

    Raises
    ------
    IndexError
        In ``'custom'`` mode when ``n_feat`` exceeds the size of the last axis.
    """
    if type == 'custom':
        X_train_norm = np.zeros(all_data.shape)
        for i in range(n_feat):
            feature = all_data[:, :, i]
            lowest = np.min(feature)
            span = np.max(feature) - lowest
            if span == 0:
                # Constant feature: dividing by a zero range would fill the
                # output with NaN, so leave this feature at 0.
                continue
            X_train_norm[:, :, i] = (feature - lowest) / span
    elif type == 'minmax':
        X_train_norm = TimeSeriesScalerMinMax(value_range=(0, 1)).fit_transform(all_data)
    elif type == 'std':
        X_train_norm = TimeSeriesScalerMeanVariance(mu=0, std=1).fit_transform(all_data)
    else:
        X_train_norm = all_data
    return X_train_norm


# ---------------------------------------------
# NORMALIZE DATA AND PERFORM CLUSTERING
# Data (n_ts , sz, d) 
# n_ts : number of time series (hrus)
# sz : size of time series (n time steps)
# d : dimension of data (n features)
# CAUTION !! data shape must be (hrus , timesteps , features )
# ---------------------------------------------
X_train_norm = normalization('custom', all_data, n_feat)

# DTW-based k-means with 12 clusters, capped at 10 iterations.
model = TimeSeriesKMeans(n_clusters=12, metric="dtw", max_iter=10)
model.fit(X_train_norm)
labels = model.labels_                  # cluster label of each time series
dist = model.transform(X_train_norm)    # data mapped to cluster-distance space
# cluster_centers_ is a fitted attribute, not a method; calling
# model.cluster_centers() raised AttributeError.
centers = model.cluster_centers_


# ---------------------------------------------
# SAVE RESULTS
# save hru names and their corresponding cluster labels
# save training data so as to keep the same order 
# save cluster distances and cluster centers
# ---------------------------------------------
# `labels` has one entry per HRU, while `names_list` has one entry per
# watershed file. Zipping them directly stopped after len(names_list) pairs
# and matched labels to the wrong files. Repeat each file name once per HRU
# it contains, in the same order the files were stacked into `all_data`.
hru_file_names = []
for name, count in zip(names_list, n_hrus):
    hru_file_names.extend([name] * int(count))

clustering_custom = list(zip(hru_file_names, labels))
# Context manager guarantees the file is closed even if a write fails.
with open("clustering_custom.txt", "w") as textfile:
    for element in clustering_custom:
        print(element)
        textfile.write(f"{element[0]} , {element[1]}\n")

# Make sure the output directory exists before np.save writes into it.
os.makedirs(save_path, exist_ok=True)
np.save(save_path+'norm_hru_clustering_data', X_train_norm)
np.save(save_path+'hru_cluster_labels.npy', labels)
np.save(save_path+'hru_cluster_distance', dist)
np.save(save_path+'cluster_centers.npy', centers)