# Imports

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from collections import Counter
from tqdm import tqdm
import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

from tslearn.clustering import TimeSeriesKMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Set Tensorflow 

In [None]:
# Enable on-demand GPU memory growth for every GPU TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device=device, enable=True)

# Set random seed for reproducibility

In [None]:
# np.random.seed(1234)
# tf.random.set_seed(1234)

# Loading Data

In [None]:
# Parse "key=value" lines from options.txt into a dict.
# Only the first "=" separates key from value, so values may themselves
# contain "=". Lines without any "=" (e.g. blank lines) are skipped instead
# of raising IndexError. Values are stripped of surrounding whitespace; keys
# are kept exactly as written.
with open("options.txt", 'r') as f:
    options = {
        key: value.strip()
        for key, _, value in (line.partition("=") for line in f if "=" in line)
    }
print(options)

In [None]:
# Load the dataset from the pickle file named by the 'RUG_no_outliers' option.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
rug_pickle_path = options['RUG_no_outliers']
RUG = pd.read_pickle(rug_pickle_path)

# Preparing and Transforming Data

In [None]:
# Linearly fill gaps of at most 20 consecutive missing values; longer gaps
# keep their remaining NaNs.
RUG = RUG.interpolate(method='linear', limit=20)

In [None]:
def get_data(col_name):
    """Reshape one column of ``RUG`` into a time-of-day x day table.

    The column is grouped by calendar day, each day becomes one column
    (labelled ``YYYY:MM:DD``), and rows are labelled ``HH:MM``. Days with
    any missing value are dropped, then every 10th row is returned.

    Assumes ``RUG`` has a datetime index with one sample per minute, since
    the row labels are the 1440 minutes from 00:00 to 23:59.
    """
    series = RUG[col_name].copy()
    daily = series.groupby(pd.Grouper(freq='D'))

    # Calendar date of each daily group, used as the column labels.
    day_labels = list(daily.first().index.strftime('%Y:%m:%d'))

    # One positionally indexed Series per day, so days line up row by row.
    per_day = []
    for day_key in daily.groups:
        per_day.append(daily.get_group(day_key).reset_index(drop=True))

    wide = pd.concat(per_day, axis=1, keys=day_labels)
    wide.index = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')

    # Keep only the days (columns) that have no missing values.
    complete_days = wide.dropna(axis=1, how='any')
    return complete_days[::10]

In [None]:
def scale_data(data):
    """Split the columns of ``data`` 80/20 and min-max scale them to [0, 1].

    The first 80% of the columns form the training set and the rest the test
    set. Each column is passed to the scaler as one sample (row), so the
    scaler is fitted on the training columns only and then applied to the
    test columns.

    Returns:
        ``(scaler, scaled_list_train, scaled_list_test)``: the fitted
        ``MinMaxScaler`` and the two scaled arrays with one row per column
        of ``data``.
    """
    def columns_as_rows(frame):
        # Each column of the frame becomes one sample for the scaler.
        return [frame[name] for name in frame]

    frame = data.copy()
    n_train = int(len(frame.columns) * 0.8)

    train_part = frame.iloc[:, :n_train]
    test_part = frame.iloc[:, n_train:]

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_list_train = scaler.fit_transform(columns_as_rows(train_part))
    scaled_list_test = scaler.transform(columns_as_rows(test_part))

    return scaler, scaled_list_train, scaled_list_test

# Principal Component Analysis

In [None]:
def create_pca(data):
    """Return ``data`` projected onto its leading principal components.

    A float ``n_components`` of 0.85 with the full SVD solver keeps as many
    components as are needed to explain that share of the variance. The
    fitted PCA object itself is not returned.
    """
    reducer = PCA(n_components=0.85, svd_solver='full')
    # Fit on a copy so the caller's array is never touched.
    return reducer.fit_transform(data.copy())

In [None]:
def create_kmeans(pca_data, scaled_train, scaled_test, clusters=4):
    """Cluster the training data with DTW k-means and label the test data.

    Args:
        pca_data: PCA features of the training series; the model is fitted
            on a copy of this array.
        scaled_train: unused; kept so existing callers keep working.
        scaled_test: scaled test series to assign to the fitted clusters.
        clusters: number of clusters.

    Returns:
        ``(train_labels, test_labels)``: the cluster index of every training
        sample (from the fit) and of every test sample (from ``predict``).
    """
    model = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=-1)
    model.fit(pca_data.copy())
    train_labels = model.labels_

    # NOTE(review): the model is fitted on PCA features but predicts on the
    # raw scaled test series, which presumably live in a different space.
    # Fixing this needs the fitted PCA object, which create_pca does not
    # return -- confirm intent before relying on the test labels.
    test_labels = model.predict(scaled_test.copy())

    return train_labels, test_labels

# Train different lstm models

In [None]:
def func(train1, test1, scaler, look_back=3):
    """Fit one LSTM on a group of series and score it on train and test series.

    A single model (``LSTM(4)`` followed by ``Dense(1)``) is trained one
    series at a time to predict the next value from the previous
    ``look_back`` values. It is then evaluated on every training and every
    test series after predictions and targets have been mapped back through
    ``scaler.inverse_transform``.

    Args:
        train1: array of training series, one series per row. A single 1-D
            series is also accepted.
        test1: array of test series in the same layout.
        scaler: fitted scaler providing ``inverse_transform``; it must accept
            rows as wide as one series.
        look_back: number of past values used as input for each prediction.

    Returns:
        Tuple ``(rmse_train, rmse_test, mae_train, mae_test, mape_train,
        mape_test)`` of lists with one entry per series. A test series whose
        evaluation raises contributes ``-1`` to each of the test lists.
    """
    # A lone 1-D series becomes a one-row batch so both layouts share one path.
    training = np.atleast_2d(train1.copy())
    testing = np.atleast_2d(test1.copy())

    def create_dataset(dataset, look_back=look_back):
        """Slice an (n, 1) series into look_back-long inputs and next-step targets."""
        dataX, dataY = [], []
        # NOTE(review): the extra "- 1" leaves the last possible window
        # unused; kept as-is so results stay comparable with earlier runs.
        for i in range(len(dataset) - look_back - 1):
            dataX.append(dataset[i:(i + look_back), 0])
            dataY.append(dataset[i + look_back, 0])
        return np.array(dataX), np.array(dataY)

    def windowed(series):
        """Return model inputs shaped [samples, 1, look_back] and their targets."""
        inputs, targets = create_dataset(series.reshape(-1, 1), look_back)
        return np.reshape(inputs, (inputs.shape[0], 1, inputs.shape[1])), targets

    def unscale(values, n_features):
        """Map scaled values back to original units via the fitted scaler."""
        # The scaler expects n_features columns, so the single column is
        # tiled to that width and column 0 is read back.
        # NOTE(review): this un-scales everything with the first feature's
        # range -- confirm that is intended.
        tiled = np.repeat(values.reshape(-1, 1), n_features, axis=-1)
        return scaler.inverse_transform(tiled)[:, 0]

    def evaluate(series, n_features):
        """Return (rmse, mae, mape) of the model on one series, in original units."""
        inputs, targets = windowed(series)
        predicted = unscale(model.predict(inputs, verbose=0), n_features)
        actual = unscale(targets, n_features)
        return (
            np.sqrt(mean_squared_error(actual, predicted)),
            tf.keras.metrics.mean_absolute_error(actual, predicted).numpy(),
            tf.keras.metrics.mean_absolute_percentage_error(actual, predicted).numpy(),
        )

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    # NOTE(review): min_lr equals Adam's default learning rate (0.001), so
    # this callback presumably never lowers the rate -- confirm intent.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=2, min_lr=0.001, verbose=2)

    # create the LSTM network
    model = Sequential()
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

    # The same model keeps training across all series of the group.
    for series in tqdm(training):
        inputs, targets = windowed(series)
        model.fit(inputs, targets, epochs=50, verbose=0, callbacks=[early_stopping, reduce_lr])

    rmse_train, mae_train, mape_train = [], [], []
    rmse_test, mae_test, mape_test = [], [], []

    for series in training:
        rmse, mae, mape = evaluate(series, training.shape[1])
        rmse_train.append(rmse)
        mae_train.append(mae)
        mape_train.append(mape)

    for series in testing:
        try:
            rmse, mae, mape = evaluate(series, testing.shape[1])
        except Exception as exc:
            # Best effort: a failing test series is reported and recorded as
            # -1 in the *test* metrics rather than aborting the whole run.
            print("exception occured", repr(exc))
            rmse, mae, mape = -1, -1, -1
        rmse_test.append(rmse)
        mae_test.append(mae)
        mape_test.append(mape)

    return (rmse_train, rmse_test, mae_train, mae_test, mape_train, mape_test)

In [None]:
import warnings

# Suppress every warning category for the rest of the run.
warnings.simplefilter("ignore")

# Num of clusters per column

The number of clusters per column was chosen based on the elbow method and the silhouette score.

In [None]:
# Cluster count for each column of RUG, matched by position: the experiment
# loop zips RUG.columns with this list, so a length mismatch silently drops
# the surplus entries.
clusters = [4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4]

In [None]:
# For every column: reshape, scale, reduce with PCA, cluster, then train and
# score one LSTM per cluster. Each result row has the form
# [location, [cluster, [mean of each of the six metric lists from func]]].
complete_results = []
for location, clust_n in zip(RUG.columns, clusters):
    print(location)
    data = get_data(location)

    scaler, scaled_list_train, scaled_list_test = scale_data(data)
    pca_features = create_pca(scaled_list_train)

    train_pca_features, test_pca_features = create_kmeans(
        pca_features, scaled_list_train, scaled_list_test, clust_n
    )
    train_counts = Counter(train_pca_features)
    print(train_counts, Counter(test_pca_features))

    # Visit clusters in the order they first appear in the training labels.
    for cluster in train_counts:
        cluster_train = scaled_list_train[train_pca_features == cluster]
        cluster_test = scaled_list_test[test_pca_features == cluster]

        reply = func(cluster_train, cluster_test, scaler)
        metric_means = [np.mean(metric) for metric in reply]
        complete_results.append([location, [cluster, metric_means]])

# Persist the results as a pickle (binary, despite the .txt extension).
with open(r"results.txt", 'wb') as f:
    pickle.dump(complete_results, f)