# Imports

In [11]:
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
import datetime


import os, math

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Preprocessing
from sklearn.preprocessing import MinMaxScaler
# Algorithms
from tslearn.clustering import TimeSeriesKMeans, KernelKMeans, silhouette_score
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

from collections import Counter
from tqdm import tqdm
import pickle

import warnings
warnings.filterwarnings("ignore")


# Loading Data

In [12]:
with open("options.txt", 'r') as f:
    options = f.readlines()
    options = {option.split("=")[0]: option.split("=")[1].strip() for option in options}
print(options)

{'hanoi_scenario_dir': 'C:\\Users\\mjnst\\Desktop\\Thesis\\Hanoi_CMH\\Scenario-1', 'RUG_dir': 'C:\\Users\\mjnst\\Desktop\\Thesis\\RUG_data_5years', 'RUG_raw_csv': 'C:\\Users\\mjnst\\Desktop\\Thesis\\rug_csv.csv', 'RUG_timeseries': 'C:\\Users\\mjnst\\Desktop\\Thesis\\rug_timeseries.pkl', 'RUG_obfuscated': 'C:\\Users\\mjnst\\Desktop\\Thesis\\obfuscated_data.pkl', 'RUG_no_outliers': 'C:\\Users\\mjnst\\Desktop\\Thesis\\obfuscated_data_rm_outlier.pkl'}


In [13]:
RUG = pd.read_pickle(options['RUG_no_outliers'])

# Preparing and Transforming Data

In [14]:
RUG.interpolate(method='linear', inplace=True, limit=20)

In [15]:
def get_data(col_name):
    df = RUG[col_name].copy()
    
    groups = df.groupby(pd.Grouper(freq='D'))

    # get the calender date of the groups
    days = list(groups.first().index.strftime('%Y:%m:%d'))

    gro = [groups.get_group(x).reset_index(drop=True) for x in groups.groups]

    temp = pd.concat(gro, axis=1, keys=days)

    temp.index = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')

    # drop all columns of temp dataframe which contain nan values
    temp.dropna(axis=1, how='any', inplace=True)
    return temp[::10]

In [16]:
def scale_data(data):

    temp = data.copy()

    train_percentage = 0.8
    train_size = int(len(temp.columns) * train_percentage)
    
    train = temp.iloc[:, :train_size]
    test = temp.iloc[:, train_size:]

    scaler = MinMaxScaler(feature_range=(0, 1))

    scaled_list_train = [train[col] for col in train]
    scaled_list_train = scaler.fit_transform(scaled_list_train)

    scaled_list_test = [test[col] for col in test]
    scaled_list_test = scaler.transform(scaled_list_test)

    return scaler, scaled_list_train, scaled_list_test

# Principal Component Analysis

In [17]:
def create_pca(data):
    temp = data.copy()
    
    pca = PCA(n_components=0.85, svd_solver='full')
 
    # Fit and transform data
    pca_features = pca.fit_transform(temp)

    return pca_features

In [18]:
def create_kmeans(pca_data, scaled_train, scaled_test, clusters=4):
    temp_pca_data = pca_data.copy()
    temp_scaled_train = scaled_train.copy()
    temp_scaled_test = scaled_test.copy()

    kmeans_pca = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=-1).fit(temp_pca_data)
    train_pca_features = kmeans_pca.predict(temp_scaled_train)
    test_pca_features = kmeans_pca.predict(temp_scaled_test)

    return train_pca_features, test_pca_features

# Num of clusters per column

based on elbow method and silhouette score

In [107]:
clusters = [4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4]

n_iterations = 3

In [187]:
# complete_results = []
complete_results_2 = []
for location, clust_n in zip(RUG.columns, clusters):
    if location != 'Location 6 - head':
        continue
    print(location)
    indice_results = []
    for it in range(n_iterations):
        print(it)
        data = get_data(location)

        scaler, scaled_list_train, scaled_list_test = scale_data(data)
        
        pca_features = create_pca(scaled_list_train)

        train_pca_features, test_pca_features = create_kmeans(pca_features, scaled_list_train, scaled_list_test, clust_n)
        # print(Counter(train_pca_features), Counter(test_pca_features))
        indice_results.append(train_pca_features)
    complete_results_2.append(indice_results)


Location 6 - head
0
1
2


In [172]:
from itertools import combinations

In [191]:
col_stabilty = {}
# check how many indices intersect between all combinations of labels
for column, name in zip(complete_results, RUG.columns):
    print(name)
    run_stability = []
    

    run_combinations = combinations(np.arange(n_iterations), 2)
    for run_base, run_compare in run_combinations:
        print(f"Run {run_base} -> Run {run_compare}")

        run_base_stats = dict(Counter(column[run_base]))
        run_compare_stats = dict(Counter(column[run_compare]))

        label_combinations = set(combinations(np.concatenate((np.unique(column[run_base]), np.unique(column[run_compare]))), 2))
        print(label_combinations)
        highest = []
        for label_base in np.unique(column[run_base]):
            temp_high = 0
            for label_compare in np.unique(column[run_compare]):
                
                if (label_base, label_compare)  in label_combinations:
                    res = len(np.intersect1d((column[run_base] == label_base).nonzero(), (column[run_compare] == label_compare).nonzero()))
                    # print(run_base_stats[label_base])
                    print(label_base, run_base_stats, label_compare, run_compare_stats, res)
                    res2 = (res/run_base_stats[label_base]*100 + res/run_compare_stats[label_compare]*100)/2
                    # print(res2)
                    if res2 > temp_high:
                        temp_high = res2
                highest.append(temp_high)
        run_stability.append(np.mean(highest))
    # break
    col_stabilty[name] = np.mean(run_stability)


Location 1 - flow
Run 0 -> Run 1
{(0, 1), (0, 0), (3, 1), (1, 1), (0, 3), (3, 0), (1, 0), (1, 3)}
0 {3: 573, 0: 1128, 1: 3} 0 {0: 1433, 1: 271} 857
0 {3: 573, 0: 1128, 1: 3} 1 {0: 1433, 1: 271} 271
1 {3: 573, 0: 1128, 1: 3} 0 {0: 1433, 1: 271} 3
1 {3: 573, 0: 1128, 1: 3} 1 {0: 1433, 1: 271} 0
3 {3: 573, 0: 1128, 1: 3} 0 {0: 1433, 1: 271} 573
3 {3: 573, 0: 1128, 1: 3} 1 {0: 1433, 1: 271} 0
Run 0 -> Run 2
{(0, 1), (1, 2), (0, 0), (0, 3), (3, 0), (0, 2), (1, 0), (3, 2), (1, 3)}
0 {3: 573, 0: 1128, 1: 3} 0 {2: 895, 0: 809} 800
0 {3: 573, 0: 1128, 1: 3} 2 {2: 895, 0: 809} 328
1 {3: 573, 0: 1128, 1: 3} 0 {2: 895, 0: 809} 3
1 {3: 573, 0: 1128, 1: 3} 2 {2: 895, 0: 809} 0
3 {3: 573, 0: 1128, 1: 3} 0 {2: 895, 0: 809} 6
3 {3: 573, 0: 1128, 1: 3} 2 {2: 895, 0: 809} 567
Run 1 -> Run 2
{(0, 1), (1, 2), (0, 0), (0, 2), (1, 0)}
0 {0: 1433, 1: 271} 0 {2: 895, 0: 809} 538
0 {0: 1433, 1: 271} 2 {2: 895, 0: 809} 895
1 {0: 1433, 1: 271} 0 {2: 895, 0: 809} 271
1 {0: 1433, 1: 271} 2 {2: 895, 0: 809} 0
Locati

In [192]:
pd.DataFrame(col_stabilty, index=[0]).T

Unnamed: 0,0
Location 1 - flow,62.684773
Location 2 - consumption,49.623881
Location 3 - consumption,62.723006
Location 4 - consumption,74.932615
Location 5 - consumption,65.79359
Location 6 - head,100.0
Location 7 - head,83.333333
Location 8 - flow,49.274492
Location 9 - head,75.0
Location 10 - flow,40.626417
