In [None]:
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy
import numpy as np
import pandas
import pandas as pd
import pylab
import seaborn
import sklearn
import sklearn.cluster
import tslearn
import tslearn.utils
from tslearn.clustering import TimeSeriesKMeans, KShape

def preprocessing(meter, max_value=25):
    """Clean a raw meter export and resample it onto a gap-free 15-minute grid.

    Steps, in order:
      1. keep only the "Time stamp" and "Value" columns and drop incomplete rows;
      2. convert "Value" to float (thousands separators are stripped from
         strings; already-numeric values are accepted as they are);
      3. parse "Time stamp" using the "%d/%m/%Y %I:%M:%S %p" format;
      4. discard readings that are not strictly below ``max_value``;
      5. align the data to a 15-minute grid and fill missing readings by
         linear interpolation;
      6. keep only rows strictly between 23:45 on the first day and 00:00 on
         the last day, i.e. the partial first and last days are removed;
      7. drop repeated timestamps (the first occurrence is kept).

    Parameters
    ----------
    meter : pandas.DataFrame
        Must contain the columns "Time stamp" and "Value".
    max_value : float, optional
        Exclusive upper bound for a reading to be kept (default 25, the
        threshold that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns "Time stamp" (datetime) and "Value" (float), sorted by time.
    """

    def _to_float(raw):
        # Strings may carry thousands separators ("1,234.5"); numeric values
        # have no .replace() and are converted directly.
        if isinstance(raw, str):
            raw = raw.replace(",", "")
        return float(raw)

    # .copy() so the column assignments below act on an independent frame.
    meter_preprocessed = meter[["Time stamp", "Value"]].dropna(axis=0, how='any').copy()
    meter_preprocessed["Value"] = meter_preprocessed["Value"].apply(_to_float)
    meter_preprocessed["Time stamp"] = pandas.to_datetime(
        meter_preprocessed["Time stamp"], format="%d/%m/%Y %I:%M:%S %p"
    )
    meter_preprocessed = meter_preprocessed[meter_preprocessed["Value"] < max_value]
    # Cheap guard in case the conversions above produced NaN/NaT entries.
    meter_preprocessed = meter_preprocessed[["Time stamp", "Value"]].dropna(axis=0, how='any')

    # Grid boundaries: last slot of the first day .. first slot of the last day.
    beg = meter_preprocessed["Time stamp"].min().replace(hour=23, minute=45, second=0)
    end = meter_preprocessed["Time stamp"].max().replace(hour=0, minute=0, second=0)
    helper = pandas.DataFrame({"Time stamp": pandas.date_range(beg, end, freq='15min')})

    # The outer merge adds every grid slot that has no reading (Value = NaN);
    # the inner merge then discards readings that are not on the grid.
    meter_preprocessed = pandas.merge(meter_preprocessed, helper, on='Time stamp', how='outer').sort_values('Time stamp')
    meter_preprocessed = pandas.merge(meter_preprocessed, helper, on='Time stamp', how='inner').sort_values('Time stamp')

    meter_preprocessed['Value'] = meter_preprocessed['Value'].interpolate(method='linear')
    # Strict bounds drop the two boundary slots themselves.
    meter_preprocessed = meter_preprocessed[meter_preprocessed["Time stamp"] > beg]
    meter_preprocessed = meter_preprocessed[(meter_preprocessed["Time stamp"] < end)]
    meter_preprocessed = meter_preprocessed.drop_duplicates(subset=["Time stamp"])
    return meter_preprocessed

In [None]:
# Number of clusters requested from every model, and the folder that receives
# the generated report images.
n_clusters = 8
output_path = "./report_Library"
# Figures are saved below output_path later on; create it up front so that
# savefig does not fail when the folder does not exist yet.
os.makedirs(output_path, exist_ok=True)

# Alternative input: raw meter export for Lady Huntingfield (needs preprocessing()).
# point_path = "./iHUB - CoM - Lady Huntingfield/Trends from BMS/Meters/LH_ElectricMeter_EMG_9(MSB)_14Jul2020-18Aug2021.csv"
# meter = pandas.read_csv(point_path)
# meter_preprocessed = preprocessing(meter)

# Reading data for the library (already preprocessed).
meter_preprocessed = pandas.read_csv("./library.csv")
# Drop the first column (presumably the index saved with the CSV; confirm).
meter_preprocessed = meter_preprocessed.drop(meter_preprocessed.columns[0], axis=1)
meter_preprocessed["Time stamp"] = pandas.to_datetime(meter_preprocessed["Time stamp"])

In [None]:
# Seed for the randomly initialised tslearn models so that repeated runs of the
# notebook produce the same clusters.
random_state = 0

# Registry of the clustering models to compare.
#   "model":   estimator instance, built with the shared n_clusters
#   "sklearn": True for scikit-learn estimators, False for tslearn estimators;
#              read by the clustering loop to pick the input shape / fit call
# Note: "KMeansEuclidean" used to be listed twice with identical settings; a
# repeated key in a dict literal is silently overwritten, so one entry is kept.
clusters = {
    "AgglomerativeClustering": {
        "model": sklearn.cluster.AgglomerativeClustering(n_clusters=n_clusters),
        "sklearn": True
    },
    "Birch": {
        "model": sklearn.cluster.Birch(n_clusters=n_clusters),
        "sklearn": True
    },
    "KMeansEuclidean": {
        "model": TimeSeriesKMeans(n_clusters=n_clusters, random_state=random_state),
        "sklearn": False
    },
    "KMeansDTW": {
        "model": TimeSeriesKMeans(n_clusters=n_clusters, metric='dtw', random_state=random_state),
        "sklearn": False
    },
    "KShape": {
        "model": KShape(n_clusters=n_clusters, random_state=random_state),
        "sklearn": False
    },
}

In [23]:
def getimgname(cluster_name, n_clusters, type):
    """Return the path of a report image inside the module-level ``output_path``.

    The file name has the form ``<cluster_name>-<n_clusters>-<type>.png``.
    """
    file_name = f"{cluster_name}-{n_clusters}-{type}.png"
    return os.path.join(output_path, file_name)


def get_cluster_heatmap(label, time):
    """Draw a week-by-weekday heatmap of cluster labels and return its figure.

    Every row is one ISO week and every column one weekday ("1".."7", indexed
    by pandas ``dayofweek``, so "1" is Monday). A cell holds the cluster label
    of that day; days without an entry keep the placeholder -1.

    Rows are labelled "<ISO year>-<ISO week>". Both parts come from
    ``isocalendar()`` so they always belong together, also around New Year
    (2021-01-03 is labelled "2020-53"). A new row starts whenever the ISO week
    changes, so a gap in ``time`` can no longer merge two weeks into one row.

    Parameters
    ----------
    label : sequence
        Cluster label of each day, aligned with ``time``.
    time : sequence of pandas.Timestamp
        One timestamp per day; assumed to be in ascending order.

    Returns
    -------
    The figure that the seaborn heatmap was drawn into.

    Notes
    -----
    Reads the module-level ``n_clusters`` to size the colour map
    (``n_clusters + 1`` colours, presumably one extra for the -1 placeholder).
    """
    label_x = ["1", "2", "3", "4", "5", "6", "7"]
    label_y = []
    heatmap = []
    current_week = None
    row = None
    for day, day_label in zip(time, label):
        iso = day.isocalendar()
        week_key = (iso[0], iso[1])
        if week_key != current_week:
            # First day seen for this ISO week: open a new row.
            current_week = week_key
            row = [-1] * 7
            heatmap.append(row)
            label_y.append("{}-{}".format(week_key[0], week_key[1]))
        row[day.dayofweek] = day_label
    plt.figure(figsize=(5, 15))
    fig = seaborn.heatmap(heatmap, annot=True, cmap=pylab.get_cmap('PiYG', n_clusters + 1), xticklabels=label_x, yticklabels=label_y).get_figure()
    return fig
def draw_multiple_timelines(dataset, timestamp, index_list, ax, ylim=20.0):
    """Overlay the selected daily profiles and their envelope on ``ax``.

    Every row of ``dataset`` listed in ``index_list`` is drawn as a faint blue
    line against its own ``timestamp`` row. On top of these, the element-wise
    maximum and minimum (red) and the median (yellow) of the selected rows are
    drawn against ``timestamp[0]``. All lines share the y-range ``(0, ylim)``.

    Parameters
    ----------
    dataset : numpy array indexed by row number (one row per profile)
    timestamp : per-row time strings accepted by ``pandas.to_timedelta``
    index_list : row numbers to draw
    ax : axes passed through to the pandas plotting call
    ylim : upper limit of the y-axis

    Returns
    -------
    ``(max_line, min_line, median_line)``, or three empty lists when
    ``index_list`` is empty (nothing is drawn in that case).
    """
    if not len(index_list):
        return [], [], []

    def _plot_series(clock, values, colour, opacity):
        # One line on the shared axes; the time strings become timedeltas.
        frame = pandas.DataFrame({'time': clock, 'value': values})
        frame["time"] = pandas.to_timedelta(frame["time"])
        frame.plot.line(x='time', y='value', ax=ax, color=colour, alpha=opacity, legend=False, ylim=(0, ylim))

    members = dataset[index_list]
    upper = numpy.max(members, axis=0)
    lower = numpy.min(members, axis=0)
    middle = numpy.median(members, axis=0)

    # Individual profiles first, then the summary lines on top of them.
    for row in index_list:
        _plot_series(timestamp[row], dataset[row], "blue", 0.1)
    for summary, colour in ((upper, "red"), (lower, "red"), (middle, "yellow")):
        _plot_series(timestamp[0], summary, colour, 1)

    return upper, lower, middle

In [None]:
# --- Build the per-day dataset ------------------------------------------------
# The daily grouping does not depend on the clustering model, so it is computed
# once here instead of once per model.
grouped_day = meter_preprocessed.groupby(pandas.Grouper(key="Time stamp", freq="D"))
dataset = []    # "Value" readings, one array per day
timestamp = []  # matching "HH:MM:SS" strings, one array per day
time = []       # group key (the day) for each row of dataset
for day, day_frame in grouped_day:
    time.append(day)
    dataset.append(numpy.array(day_frame["Value"]))
    timestamp.append(numpy.array(day_frame["Time stamp"].dt.strftime("%H:%M:%S")))
timestamp = numpy.array(timestamp)
dataset = numpy.array(dataset)
ts_dataset = tslearn.utils.to_time_series_dataset(dataset)

# Shared y-limit so the profile plots of all clusters are directly comparable.
ylim = numpy.max(dataset)
# Bar colours: Monday-Friday green, Saturday/Sunday yellow (dayofweek 5 and 6).
color_week = ['g'] * 5 + ['y'] * 2
# Bar colours per month: January-April and November-December blue, May-October red.
color_month = ['b'] * 4 + ['r'] * 6 + ['b'] * 2

# --- Fit every model and write its two report images --------------------------
for cluster_name, obj in clusters.items():
    # The per-model flag is honoured again; a leftover debugging override used
    # to force every model down the scikit-learn branch.
    using_sklearn = obj["sklearn"]
    km = obj["model"]

    if using_sklearn:
        # scikit-learn estimators take a 2-D (day, sample) matrix. Only the
        # trailing axis is squeezed so a single-day dataset stays 2-D.
        label = km.fit_predict(ts_dataset.squeeze(axis=-1))
    else:
        km.fit(ts_dataset)
        label = km.predict(ts_dataset)

    # Graph: class heatmap (cluster label per weekday and week).
    heatmap_fig = get_cluster_heatmap(label, time)
    heatmap_fig.savefig(getimgname(cluster_name, n_clusters, "heatmap"), dpi=400)

    # Graph: per cluster, the weekly / monthly / yearly distribution of its
    # days plus the overlaid daily profiles.
    # squeeze=False keeps axs two-dimensional even when n_clusters == 1.
    # (For tslearn models a further column could show km.cluster_centers_.)
    fig, axs = plt.subplots(n_clusters, 4, figsize=(16, 4 * n_clusters), squeeze=False)
    for cluster_no in range(n_clusters):
        index_list = numpy.where(label == cluster_no)[0].tolist()

        # Count the days of this cluster per weekday, month and year.
        weekday = dict.fromkeys(range(7), 0)
        month = dict.fromkeys(range(12), 0)
        year = {}
        for index in index_list:
            member_day = time[index]
            year[member_day.year] = year.get(member_day.year, 0) + 1
            weekday[member_day.dayofweek] += 1
            month[member_day.month - 1] += 1

        weekday_ax, month_ax, year_ax, profile_ax = axs[cluster_no]

        # Fixed y-limits keep the bar charts comparable between clusters.
        weekday_ax.bar([str(x + 1) for x in weekday.keys()], list(weekday.values()), color=color_week)
        weekday_ax.set_ylim(0, 30)
        weekday_ax.set_title("Weekly Distribution\nCluster # {}, including {} days".format(cluster_no, sum(weekday.values())))

        month_ax.bar([str(x + 1) for x in month.keys()], list(month.values()), color=color_month)
        month_ax.set_ylim(0, 30)
        month_ax.set_title("Monthly Distribution\nCluster # {}, including {} days".format(cluster_no, sum(month.values())))

        year_ax.bar(list(year.keys()), list(year.values()))
        year_ax.set_ylim(0, 150)
        year_ax.set_title("Yearly Distribution\nCluster # {}, including {} days".format(cluster_no, sum(year.values())))

        draw_multiple_timelines(dataset, timestamp, index_list, profile_ax, ylim)

    # Figures are left open on purpose so the notebook can still render them.
    fig.tight_layout()
    fig.savefig(getimgname(cluster_name, n_clusters, "distribution"), dpi=400)