# In [None]:
# ---------------------------------------------
# IMPORT LIBRARIES
# ---------------------------------------------
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMinMax , TimeSeriesScalerMeanVariance


# ---------------------------------------------
# DEPENDENCIES 
# the following depend on 01_Gen_Clustering_Data:
# - clustering_feature_names
# - n_hrus
# - all_data
# ---------------------------------------------
# NOTE: pickle.load can execute arbitrary code stored in the file. Only point
# this at a pickle you produced yourself (here: 01_Gen_Clustering_Data).
with open('/Users/Documents/GitHub/GNNs_PrecisionAgriculture/SWAT Data Reader/clustering_feature_names.pkl', 'rb') as f:
    clustering_feature_names = pickle.load(f)
n_feat = len(clustering_feature_names)

data_path = '/Users/Documents/GitHub/DATA_SMest/HRU_clustering'
# Drop the macOS Finder metadata file only when it is present: list.remove()
# raises ValueError on a directory that does not contain it. Sorting makes the
# HRU order inside 'all_data' reproducible, since os.listdir() returns entries
# in arbitrary order.
names_list = sorted(name for name in os.listdir(data_path) if name != '.DS_Store')

# Every watershed file holds a different number of HRUs. That count is the
# second-to-last dot-separated field of the file name, so it is parsed from
# each name and summed to allocate one array for all HRUs in region02.
n_hrus = np.array([x.split('.')[-2] for x in names_list]).astype(int)
hrus_total = np.sum(n_hrus)
# NOTE(review): the feature dimension is hard-coded to 10; presumably it should
# equal n_feat -- confirm against 01_Gen_Clustering_Data before changing it.
all_data = np.zeros((12, hrus_total, 10))   # (months, hrus, features)


# ---------------------------------------------
# LOAD DATA
# load all hrus from across different subbasins
# to the same array 'all_data'
# ---------------------------------------------
# Copy each watershed file into its own contiguous slice of the HRU axis.
# 'st'/'en' are the running start/end offsets of the current file's HRUs.
st = 0
en = 0
for i, name in enumerate(names_list):
    en += n_hrus[i]
    all_data[:, st:en, :] = np.load(os.path.join(data_path, name))
    st = en

# 'all_data' is already laid out as (months, n_hrus, features), which is what
# the statistics below and the subset sampling rely on (both reduce / index
# over axis 1 = HRUs). The former bare 'all_data.transpose(1,0,2)' call was
# removed: ndarray.transpose returns a new view and does not modify the array,
# so the statement had no effect.
mean_t = np.mean(all_data, 1)   # per-month, per-feature mean over all HRUs
var_t = np.var(all_data, 1)     # per-month, per-feature variance over all HRUs


# ---------------------------------------------
# SUBSET SAMPLING
# Sample a subset of data that is 'close enough' 
# in the mean and variance sense to the original data.
# total hrus = 99047
# sampled hrus = 1000
# ---------------------------------------------
# Random search for a representative subset of HRUs: draw 'iterations' random
# subsets and keep the one whose per-month/per-feature mean and variance are
# both closer (in relative Frobenius-norm error) to the full data set than the
# best subset seen so far.
n_hrus_available = all_data.shape[1]   # derived from the data, not hard-coded
n_sampled_hrus = 1000
min_var_diff = 100000
min_mean_diff = 100000
iterations = 1000
for _ in range(iterations):   # '_' avoids shadowing the builtin iter()
    # Integer HRU indices in [0, n_hrus_available), drawn with replacement.
    sample_ids = np.random.randint(0, n_hrus_available, size=n_sampled_hrus)
    sample_data = all_data[:, sample_ids, :]
    sample_mean_t = np.mean(sample_data, 1)
    sample_var_t = np.var(sample_data, 1)
    mean_diff = np.linalg.norm(mean_t - sample_mean_t) / np.linalg.norm(mean_t)
    var_diff = np.linalg.norm(var_t - sample_var_t) / np.linalg.norm(var_t)
    # A candidate replaces the current best only if it improves BOTH errors.
    if (var_diff < min_var_diff) and (mean_diff < min_mean_diff):
        min_var_diff = var_diff
        min_mean_diff = mean_diff
        min_sample_ids = sample_ids
        min_sample_data = sample_data

# Report the errors of the subset that was actually kept, not those of the
# last candidate drawn by the loop.
print('percent relative err mean: ', min_mean_diff*100)
print('percent relative err var ' ,  min_var_diff*100)


# ---------------------------------------------
# DEFINE NORMALIZATION AND CLUSTERING FUNCTIONS
# ---------------------------------------------
def normalization(type, X_train, n_features=9):
    """Return a normalized copy of a 3-D data array.

    Parameters
    ----------
    type : str
        Normalization scheme:
        'custom' - global min-max scaling of each of the first ``n_features``
                   features (last axis) to [0, 1], using the minimum and
                   maximum taken over the first two axes;
        'minmax' - tslearn ``TimeSeriesScalerMinMax`` with range (0, 1);
        'std'    - tslearn ``TimeSeriesScalerMeanVariance`` with mean 0, std 1;
        anything else - no normalization.
    X_train : numpy.ndarray
        3-D array whose last axis indexes features.
    n_features : int, optional
        Number of leading features scaled by the 'custom' scheme (default 9,
        the value that used to be hard-coded). Features beyond this count are
        left at zero in the output. It is capped at the number of features
        actually present in ``X_train``.

    Returns
    -------
    numpy.ndarray
        The normalized array. For an unrecognized ``type`` the input object
        itself is returned unchanged.
    """
    if type == 'custom':
        X_train_norm = np.zeros(X_train.shape)
        n_scaled = min(n_features, X_train.shape[2])
        if n_scaled > 0:
            feats = X_train[:, :, :n_scaled]
            min_arr = feats.min(axis=(0, 1))
            max_arr = feats.max(axis=(0, 1))
            span = max_arr - min_arr
            # A constant feature has zero span; dividing by it would yield
            # NaN (0/0). Use a unit span so such a feature maps to 0 instead.
            span[span == 0] = 1
            X_train_norm[:, :, :n_scaled] = (feats - min_arr) / span
    elif type == 'minmax':
        X_train_norm = TimeSeriesScalerMinMax(value_range=(0, 1)).fit_transform(X_train)
    elif type == 'std':
        X_train_norm = TimeSeriesScalerMeanVariance(mu=0, std=1).fit_transform(X_train)
    else:
        X_train_norm = X_train
    return X_train_norm

def elbow_iter (k,X_train_norm, n_sample):
    """Fit a DTW k-means model with ``k`` clusters and return its distortion.

    Parameters
    ----------
    k : int
        Number of clusters.
    X_train_norm : array-like
        Time-series data set passed to ``TimeSeriesKMeans.fit`` / ``transform``.
    n_sample : int
        Number of leading series whose distances enter the average.

    Returns
    -------
    float
        Mean, over the first ``n_sample`` series, of the squared value that
        ``transform`` reports for each series against its own assigned cluster.
    """
    kmeans = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=10)
    kmeans.fit(X_train_norm)
    to_all_clusters = kmeans.transform(X_train_norm)
    # Pick, for every series, the column that belongs to its assigned cluster.
    rows = np.arange(n_sample)
    to_own_cluster = to_all_clusters[rows, kmeans.labels_[:n_sample]]
    return np.average(to_own_cluster ** 2)


# ---------------------------------------------
# PERFORM ELBOW TEST
# --------------------------------------------- 
# Normalize Data (CUSTOM)
X_train_norm = normalization('custom', min_sample_data)

# tslearn expects (n_hrus, n_time samples, n_features), while the sampled data
# is (months, n_hrus, features). The axes must be PERMUTED with transpose:
# reshape() keeps the flat memory order and would merely re-chunk it, mixing
# values of different HRUs into one "time series".
X_train_norm = np.transpose(X_train_norm, (1, 0, 2))
n_sampled = X_train_norm.shape[0]   # number of series, taken from the data

# List of cluster numbers for elbow test
k_list = [2,5,10,15,20,25,30, 40,50,80]
distor = np.zeros(len(k_list))
for i, k in enumerate(k_list):
    distor[i] = elbow_iter(k, X_train_norm, n_sampled)
    
    
# ---------------------------------------------
# VISUALIZE RESULTS 
# --------------------------------------------- 
# Draw distortion against the number of clusters and write the figure to disk.
fs = 14
text_style = {'fontsize': fs}   # one font size shared by both labels and the title
plt.plot(np.array(k_list), distor)
plt.xlabel('Number of Clusters (K)', **text_style)
plt.ylabel('Distortion', **text_style)
plt.title('Elbow Test for Selecting Number of Clusters (K)', **text_style)
plt.grid()
plt.savefig('Elbow_Test_results.png')