# Imports

In [2]:
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

import os, math

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Preprocessing
from sklearn.preprocessing import MinMaxScaler
# Algorithms
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans, KernelKMeans, silhouette_score
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

from collections import Counter
from tqdm import tqdm

# Set Tensorflow 

In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Set random seed for reproducibility

In [4]:
# np.random.seed(1234)
# tf.random.set_seed(1234)

# Loading Data

In [15]:
with open("options.txt", 'r') as f:
    options = f.readlines()
    options = {option.split("=")[0]: option.split("=")[1].strip() for option in options}
print(options)

{'hanoi_scenario_dir': 'C:\\Users\\mjnst\\Desktop\\Thesis\\Hanoi_CMH\\Scenario-1', 'RUG_dir': 'C:\\Users\\mjnst\\Desktop\\Thesis\\RUG_data_5years', 'RUG_raw_csv': 'C:\\Users\\mjnst\\Desktop\\Thesis\\rug_csv.csv', 'RUG_timeseries': 'C:\\Users\\mjnst\\Desktop\\Thesis\\rug_timeseries.pkl', 'RUG_obfuscated': 'C:\\Users\\mjnst\\Desktop\\Thesis\\obfuscated_data.pkl'}


In [26]:
RUG = pd.read_pickle(options['RUG_obfuscated'])

# Preparing and Transforming Data

In [28]:
RUG.interpolate(method='linear', inplace=True, limit=20)

In [9]:
def get_data(col_name):
    groups = RUG['Location 2 - consumption'].groupby(pd.Grouper(freq='D'))

    # get the calender date of the groups
    days = list(groups.first().index.strftime('%Y:%m:%d'))

    gro = [groups.get_group(x).reset_index(drop=True) for x in groups.groups]

    temp = pd.concat(gro, axis=1, keys=days)

    temp.index = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')

    # drop all columns of temp dataframe which contain nan values
    temp.dropna(axis=1, how='any', inplace=True)

    return temp[::10]

In [9]:
def scale_data(data):

    temp = data.copy()

    train_percentage = 0.8
    train_size = int(len(temp.columns) * train_percentage)
    
    train = temp.iloc[:, :train_size]
    test = temp.iloc[:, train_size:]

    scaler = MinMaxScaler(feature_range=(0, 1))

    scaled_list_train = [train[col] for col in train]
    scaled_list_train = scaler.fit_transform(scaled_list_train)

    scaled_list_test = [test[col] for col in test]
    scaled_list_test = scaler.transform(scaled_list_test)

    return scaler, scaled_list_train, scaled_list_test

# Principal Component Analysis

In [15]:
def create_pca(data):
    temp = data.copy()
    
    pca = PCA(n_components=0.85, svd_solver='full')
 
    # Fit and transform data
    pca_features = pca.fit_transform(temp)

    return pca_features

In [17]:
def create_kmeans(pca_data, scaled_train, scaled_test):
    temp_pca_data = pca_data.copy()
    temp_scaled_train = scaled_train.copy()
    temp_scaled_test = scaled_test.copy()

    kmeans_pca = TimeSeriesKMeans(n_clusters=4, metric="dtw", n_jobs=-1).fit(temp_pca_data)
    train_pca_features = kmeans_pca.predict(temp_scaled_train)
    test_pca_features = kmeans_pca.predict(temp_scaled_test)

    return train_pca_features, test_pca_features

# Train different lstm models

In [24]:
def func(train1, test1, scaler, look_back=3):

    training, testing = train1.copy(), test1.copy()

    look_back = 3
    
    def create_dataset(dataset, look_back=3):
        dataX, dataY = [], []
        for i in range(len(dataset)-look_back-1):
            a = dataset[i:(i+look_back), 0]
            dataX.append(a)
            dataY.append(dataset[i + look_back, 0])
        return np.array(dataX), np.array(dataY)


    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=2, min_lr=0.001, verbose=2)

    # create and fit the LSTM network
    model = Sequential()
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

    if training.ndim > 1:
        for train_it in tqdm(training): 
            train_it = train_it.reshape(-1, 1)
            
            # reshape into X=t and Y=t+1
            trainX, trainY = create_dataset(train_it, look_back)
            # testX, testY = create_dataset(testing, look_back)

        # reshape input to be [samples, time steps, features]
            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            # testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

            model.fit(trainX, trainY, epochs=50, verbose=0, callbacks=[early_stopping, reduce_lr])
    else:
        train_it = training
        train_it = train_it.reshape(-1, 1)
        
        # reshape into X=t and Y=t+1
        trainX, trainY = create_dataset(train_it, look_back)
        # testX, testY = create_dataset(testing, look_back)

    # reshape input to be [samples, time steps, features]
        trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
        # testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

        model.fit(trainX, trainY, epochs=50, verbose=0, callbacks=[early_stopping, reduce_lr])

    rmse_train = []
    rmse_test = []

    mae_train = []
    mae_test = []

    mape_train = []
    mape_test = []

    if training.ndim > 1:
        for train_it in training:
            train_it = train_it.reshape(-1, 1)

            trainX, trainY = create_dataset(train_it, look_back)

            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            
            trainPredict = model.predict(trainX, verbose=0)
            
            trainPredict = np.repeat(trainPredict, train1.shape[1], axis=-1)
            trainPredict = scaler.inverse_transform(trainPredict)[:,0]
            
            trainY = np.repeat(trainY.reshape(-1, 1), train1.shape[1], axis=-1)
            trainY = scaler.inverse_transform(trainY)[:,0]
            
            rmse_train.append(np.sqrt(mean_squared_error(trainY, trainPredict)))
            mae_train.append(tf.keras.metrics.mean_absolute_error(trainY, trainPredict).numpy())
            mape_train.append(tf.keras.metrics.mean_absolute_percentage_error(trainY, trainPredict).numpy())
    else:
        train_it = training
        train_it = train_it.reshape(-1, 1)

        trainX, trainY = create_dataset(train_it, look_back)

        trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
        
        trainPredict = model.predict(trainX, verbose=0)
        
        trainPredict = np.repeat(trainPredict, train1.shape[1], axis=-1)
        trainPredict = scaler.inverse_transform(trainPredict)[:,0]
        
        trainY = np.repeat(trainY.reshape(-1, 1), train1.shape[1], axis=-1)
        trainY = scaler.inverse_transform(trainY)[:,0]
        
        rmse_train.append(np.sqrt(mean_squared_error(trainY, trainPredict)))
        mae_train.append(tf.keras.metrics.mean_absolute_error(trainY, trainPredict).numpy())
        mape_train.append(tf.keras.metrics.mean_absolute_percentage_error(trainY, trainPredict).numpy())


    if testing.ndim > 1:
        for test_it in testing:   
            try:
                
                test_it = test_it.reshape(-1, 1) 
                # reshape into X=t and Y=t+1
                
                testX, testY = create_dataset(test_it, look_back)
            # reshape input to be [samples, time steps, features]
                
                testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

            # make predictions
                
                testPredict = model.predict(testX, verbose=0)
                # invert predictions
            
                testPredict = np.repeat(testPredict, test1.shape[1], axis=-1)
                testPredict = scaler.inverse_transform(testPredict)[:,0]

                testY = np.repeat(testY.reshape(-1, 1), test1.shape[1], axis=-1)
                testY = scaler.inverse_transform(testY)[:,0]

                # calculate different evaluation metrics
                
                rmse_test.append(np.sqrt(mean_squared_error(testY, testPredict)))
                mae_test.append(tf.keras.metrics.mean_absolute_error(testY, testPredict).numpy())
                mape_test.append(tf.keras.metrics.mean_absolute_percentage_error(testY, testPredict).numpy())
            except:
                print("exception occured")
                rmse_train.append(-1)
                mae_train.append(-1)
                mape_train.append(-1)
    else:
        try:
            test_it = testing
            test_it = test_it.reshape(-1, 1) 
            # reshape into X=t and Y=t+1
            
            testX, testY = create_dataset(test_it, look_back)
        # reshape input to be [samples, time steps, features]
            
            testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

        # make predictions
            
            testPredict = model.predict(testX, verbose=0)
            # invert predictions
        
            testPredict = np.repeat(testPredict, test1.shape[1], axis=-1)
            testPredict = scaler.inverse_transform(testPredict)[:,0]

            testY = np.repeat(testY.reshape(-1, 1), test1.shape[1], axis=-1)
            testY = scaler.inverse_transform(testY)[:,0]

            # calculate different evaluation metrics
            
            rmse_test.append(np.sqrt(mean_squared_error(testY, testPredict)))
            mae_test.append(tf.keras.metrics.mean_absolute_error(testY, testPredict).numpy())
            mape_test.append(tf.keras.metrics.mean_absolute_percentage_error(testY, testPredict).numpy())
        except:
            print("exception occured")
            rmse_test.append(-1)
            mae_test.append(-1)
            mape_test.append(-1)

    return (rmse_train, rmse_test, mae_train, mae_test, mape_train, mape_test)
    # return (name, (rmse_train, rmse_test, mae_train, mae_test, mape_train, mape_test))

In [25]:


complete_results = []
for location in RUG.columns:
    print(location)
    data = get_data(location)

    scaler, scaled_list_train, scaled_list_test = scale_data(data)
    
    pca_features = create_pca(scaled_list_train)

    train_pca_features, test_pca_features = create_kmeans(pca_features, scaled_list_train, scaled_list_test)
    print(Counter(train_pca_features), Counter(test_pca_features))

    for cluster in [*Counter(train_pca_features)]:
        cluster_train = scaled_list_train[np.where(train_pca_features == cluster)]
        cluster_test = scaled_list_test[np.where(test_pca_features == cluster)]

        reply = func(cluster_train, cluster_test, scaler)
        
        complete_results.append([location, [cluster, [np.mean(reply[0]), np.mean(reply[1]), np.mean(reply[2]), np.mean(reply[3]), np.mean(reply[4]), np.mean(reply[5])]]])

Location 1 - flow




Counter({1: 1658, 3: 33, 2: 11}) Counter({1: 423, 3: 2, 2: 1})


100%|██████████| 1658/1658 [06:27<00:00,  4.28it/s]
100%|██████████| 33/33 [00:19<00:00,  1.69it/s]
100%|██████████| 11/11 [00:12<00:00,  1.10s/it]


Location 2 - consumption




Counter({2: 1649, 1: 52, 3: 1}) Counter({2: 422, 1: 4})


100%|██████████| 1649/1649 [06:18<00:00,  4.36it/s]
100%|██████████| 52/52 [00:23<00:00,  2.24it/s]
100%|██████████| 1/1 [00:02<00:00,  2.38s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Location 3 - consumption




Counter({1: 1688, 2: 8, 3: 6}) Counter({1: 425, 3: 1})


100%|██████████| 1688/1688 [06:41<00:00,  4.20it/s]
100%|██████████| 6/6 [00:07<00:00,  1.20s/it]
100%|██████████| 8/8 [00:09<00:00,  1.14s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Location 4 - consumption




Counter({0: 1691, 2: 8, 3: 3}) Counter({0: 424, 2: 1, 3: 1})


100%|██████████| 1691/1691 [06:37<00:00,  4.25it/s]
100%|██████████| 8/8 [00:07<00:00,  1.08it/s]
100%|██████████| 3/3 [00:04<00:00,  1.35s/it]


Location 5 - consumption




Counter({1: 1696, 3: 6}) Counter({1: 425, 3: 1})


100%|██████████| 1696/1696 [06:29<00:00,  4.36it/s]
100%|██████████| 6/6 [00:06<00:00,  1.07s/it]


Location 6 - head




Counter({2: 1696, 1: 6}) Counter({2: 424, 1: 2})


100%|██████████| 1696/1696 [06:29<00:00,  4.36it/s]
100%|██████████| 6/6 [00:06<00:00,  1.08s/it]


Location 7 - head




Counter({0: 1600, 1: 102}) Counter({0: 419, 1: 6, 3: 1})


100%|██████████| 1600/1600 [06:03<00:00,  4.40it/s]
100%|██████████| 102/102 [00:33<00:00,  3.00it/s]


Location 8 - flow




Counter({0: 1697, 2: 5}) Counter({0: 425, 2: 1})


100%|██████████| 1697/1697 [06:22<00:00,  4.44it/s]
100%|██████████| 5/5 [00:06<00:00,  1.22s/it]


Location 9 - head




Counter({0: 1686, 2: 16}) Counter({0: 425, 2: 1})


100%|██████████| 1686/1686 [06:50<00:00,  4.10it/s]
100%|██████████| 16/16 [00:13<00:00,  1.18it/s]


Location 10 - flow




Counter({0: 1308, 2: 392, 1: 2}) Counter({0: 226, 2: 200})


100%|██████████| 392/392 [01:47<00:00,  3.66it/s]
100%|██████████| 1308/1308 [04:57<00:00,  4.40it/s]
100%|██████████| 2/2 [00:02<00:00,  1.42s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Location 11 - head




Counter({0: 1479, 3: 221, 2: 2}) Counter({0: 375, 3: 51})


100%|██████████| 1479/1479 [05:51<00:00,  4.20it/s]
100%|██████████| 221/221 [00:54<00:00,  4.06it/s]
100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Location 11 - flow




Counter({0: 1696, 3: 6}) Counter({0: 425, 3: 1})


100%|██████████| 1696/1696 [06:48<00:00,  4.15it/s]
100%|██████████| 6/6 [00:06<00:00,  1.13s/it]


Location 12 - head




Counter({1: 1684, 2: 11, 0: 7}) Counter({1: 423, 2: 2, 0: 1})


100%|██████████| 1684/1684 [06:27<00:00,  4.34it/s]
100%|██████████| 11/11 [00:11<00:00,  1.00s/it]
100%|██████████| 7/7 [00:08<00:00,  1.15s/it]


In [26]:
complete_results

[['Location 1 - flow',
  [1,
   [0.5268215716499038,
    0.5463612214867182,
    0.26355627,
    0.2817759,
    64.06256,
    98.609726]]],
 ['Location 1 - flow',
  [3,
   [0.5021563569856381,
    0.6352111145525676,
    0.24563965,
    0.3960265,
    21.834711,
    90.095024]]],
 ['Location 1 - flow',
  [2,
   [0.6262935132500647,
    0.7785258541264611,
    0.29811725,
    0.4877977,
    52.022095,
    128.21033]]],
 ['Location 2 - consumption',
  [2,
   [0.5249310207056396,
    0.5441503245155949,
    0.2593826,
    0.27772963,
    58.671265,
    90.83446]]],
 ['Location 2 - consumption',
  [1,
   [0.5513351836915648,
    0.6789877511067335,
    0.25420567,
    0.41334838,
    35.38569,
    85.69394]]],
 ['Location 2 - consumption',
  [3, [1.170838254320146, nan, 1.0903722, nan, 29.467968, nan]]],
 ['Location 3 - consumption',
  [1,
   [0.5271949852953033,
    0.5427170489333447,
    0.2617731,
    0.27891603,
    52.49678,
    80.89638]]],
 ['Location 3 - consumption',
  [3,
   [0.

In [27]:
import pickle

with open("complete_results.txt", 'wb') as f:
    pickle.dump(complete_results, f)

In [509]:

import pandas as pd
multi_index = pd.MultiIndex.from_tuples([(Location, Cluster) for Location in RUG.columns for Cluster in [*Counter(train_pca_features)][:2]],
                                       names=['Location','Cluster'])


cols = pd.MultiIndex.from_tuples([('Train', 'rmse'), ('Train', 'mae'), ('Train', 'mape'), ('Test', 'rmse'), ('Test', 'mae'), ('Test', 'mape')])

df = pd.DataFrame(complete_results, columns=cols,index=multi_index)
df


ValueError: Length of values (2) does not match length of index (26)

In [29]:
with open('complete_results.txt', 'rb') as f:
    temp = pickle.load(f)

In [30]:
temp

[['Location 1 - flow',
  [1,
   [0.5268215716499038,
    0.5463612214867182,
    0.26355627,
    0.2817759,
    64.06256,
    98.609726]]],
 ['Location 1 - flow',
  [3,
   [0.5021563569856381,
    0.6352111145525676,
    0.24563965,
    0.3960265,
    21.834711,
    90.095024]]],
 ['Location 1 - flow',
  [2,
   [0.6262935132500647,
    0.7785258541264611,
    0.29811725,
    0.4877977,
    52.022095,
    128.21033]]],
 ['Location 2 - consumption',
  [2,
   [0.5249310207056396,
    0.5441503245155949,
    0.2593826,
    0.27772963,
    58.671265,
    90.83446]]],
 ['Location 2 - consumption',
  [1,
   [0.5513351836915648,
    0.6789877511067335,
    0.25420567,
    0.41334838,
    35.38569,
    85.69394]]],
 ['Location 2 - consumption',
  [3, [1.170838254320146, nan, 1.0903722, nan, 29.467968, nan]]],
 ['Location 3 - consumption',
  [1,
   [0.5271949852953033,
    0.5427170489333447,
    0.2617731,
    0.27891603,
    52.49678,
    80.89638]]],
 ['Location 3 - consumption',
  [3,
   [0.

In [31]:
for i in temp:
    print(i)

['Location 1 - flow', [1, [0.5268215716499038, 0.5463612214867182, 0.26355627, 0.2817759, 64.06256, 98.609726]]]
['Location 1 - flow', [3, [0.5021563569856381, 0.6352111145525676, 0.24563965, 0.3960265, 21.834711, 90.095024]]]
['Location 1 - flow', [2, [0.6262935132500647, 0.7785258541264611, 0.29811725, 0.4877977, 52.022095, 128.21033]]]
['Location 2 - consumption', [2, [0.5249310207056396, 0.5441503245155949, 0.2593826, 0.27772963, 58.671265, 90.83446]]]
['Location 2 - consumption', [1, [0.5513351836915648, 0.6789877511067335, 0.25420567, 0.41334838, 35.38569, 85.69394]]]
['Location 2 - consumption', [3, [1.170838254320146, nan, 1.0903722, nan, 29.467968, nan]]]
['Location 3 - consumption', [1, [0.5271949852953033, 0.5427170489333447, 0.2617731, 0.27891603, 52.49678, 80.89638]]]
['Location 3 - consumption', [3, [0.6093396067607374, 0.7964783652206937, 0.32435653, 0.5029354, 33.27697, 108.62179]]]
['Location 3 - consumption', [2, [0.6406690976392875, nan, 0.31403244, nan, 52.48872, na