In [4]:
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from pprint import pprint
from collections import defaultdict

from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

### Simulated Data

In [5]:
t = np.linspace(0,10,100)

# data of shape [timesteps x variables]
# data = np.array([2.4*np.cos(t), 2*np.sin(t), 2*np.sin(t)+0.1, 2.4*np.cos(t+0.1)]).T
# data = np.array([2.4*np.cos(t), 2*np.sin(t), 2*np.sin(t)+0.1, 2.4*np.cos(t+0.1), 2.4*np.cos(t+0.1)*np.cos(t+0.1)]).T
data = np.random.rand(100,10)


In [6]:
def distinct_clusters(X, labels, cluster_cnt):
        # computes most disinct clusters using Simplex Volume Maximization (SiVM) matrix factorization
        # see Understanding Building Operation from Semantic Context, IECON

        # pick the one with lowest similarity to others
        n = len(labels)
        XX = np.zeros((n, n))
        ij = 0
        for i in range(len(labels)):
            for j in range(i+1, len(labels)):
                XX[i, j] = XX[j, i] = X[ij]
                ij += 1
        centers = [np.argmin(XX.sum(axis=1), axis=0)]

        for k in range(1, cluster_cnt):
            XXsub = XX[:, centers]
            # add new centers with lowest similarity to others centers (most distinct)
            centers = centers + [np.argmin(XXsub.sum(axis=1), axis=0)]
        # cluster now based on which has the highest similarity to one of the centers
        XXsub = XX[:, centers]
        cluster = np.argmin(XXsub, axis=1)
        print(cluster)

        # compute groups
        groups = defaultdict(list)
        for i in range(len(centers)):
            grp = labels[i]
            for j in range(len(cluster)):
                if cluster[j] == i:
                    groups[grp].append(labels[j])
        return groups

# compute distances
X = []
for i in range(data.shape[-1]):
    for j in range(i+1, data.shape[-1]):
        d = euclidean(data[:,i], data[:,j])
        # d, _ = fastdtw(data[:,i], data[:,j], dist=euclidean)
        X.append(-d) # negative distance = similarity

# SiVM cluster
groups = distinct_clusters(X, [x for x in range(data.shape[-1])], 5)

pprint(groups)

defaultdict(<class 'list'>, {0: [0, 2, 7, 9], 1: [1, 8], 2: [4, 5, 6], 4: [3]})


### Real Data

In [9]:
# data of shape [timesteps x variables]
data = np.load('../Data/Building/buildingheating.npy')
n_clusters = 10

# compute distances
X = []
for i in range(data.shape[-1]):
    for j in range(i+1, data.shape[-1]):
        d = euclidean(data[:,i], data[:,j])
        # d, _ = fastdtw(data[:,i], data[:,j], dist=euclidean)
        X.append(-d) # negative distance = similarity

# SiVM cluster
groups = distinct_clusters(X, [x for x in range(data.shape[-1])], n_clusters)
for i in range(n_clusters):
    if len(groups[i]) > 0:
        print(i, groups[i])


0 [0, 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29]
1 [3, 8, 13, 23]


## KMeans

In [11]:
decomp_period = 144
periods = [i*decomp_period for i in range(len(data)//decomp_period + 1)]

# warmup for 10% of data
for i, (start, end) in enumerate(zip(periods, periods[1:])):
    if i > 0:
        km = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", init=km.cluster_centers_, n_jobs=-1)
        y_pred = km.fit_predict(np.expand_dims(data[start:end, :].transpose(), axis=-1))
    else:
        km = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", n_jobs=-1)
        y_pred = km.fit_predict(np.expand_dims(data[start:end, :].transpose(), axis=-1))
    if i > len(periods)/10:
        break

for i, (start, end) in enumerate(zip(periods, periods[1:])):
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", init=km.cluster_centers_, n_jobs=-1)
    y_pred = km.fit_predict(np.expand_dims(data[start:end, :].transpose(), axis=-1))

    cluster_dict = {x: y for x, y in zip([x for x in range(len(data[0]))], y_pred)}
    cluster_map = {}
    for k, v in cluster_dict.items():
        cluster_map[v] = cluster_map.get(v, []) + [k]
    print(i, cluster_map)

0 {5: [0, 4, 9, 14], 7: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 3: [3], 8: [5, 10, 16], 0: [8, 13], 6: [15], 9: [18], 1: [19], 4: [20, 24, 26], 2: [23]}
1 {5: [0, 4, 9, 14], 7: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 3: [3], 8: [5, 10, 16], 0: [8, 13], 6: [15], 9: [18], 1: [19], 4: [20, 24, 26], 2: [23]}
2 {5: [0], 7: [1, 2, 6, 7, 11, 12, 21, 22, 25, 27, 28], 3: [3], 4: [4, 9, 14, 20, 23, 24, 26], 8: [5, 10, 16], 0: [8, 13], 6: [15], 1: [17, 19, 29], 9: [18]}
3 {5: [0, 4, 9, 14], 7: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 3: [3], 8: [5, 10, 16], 0: [8, 13], 6: [15], 9: [18], 1: [19], 4: [20, 24, 26], 2: [23]}
4 {5: [0, 4], 7: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 3: [3], 8: [5, 9, 10, 14, 16], 0: [8, 13, 23], 6: [15], 9: [18], 1: [19], 4: [20, 24, 26]}
5 {5: [0, 4], 7: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 3: [3], 8: [5, 10, 16], 0: [8, 13, 23], 6: [9, 14, 15], 1: [18, 19], 4: [20, 24, 26]}
6 {5: [0], 7: [1, 2, 6, 7, 11, 12, 17, 21,

## Uninitialised K-Means

In [12]:
decomp_period = 144
periods = [i*decomp_period for i in range(len(data)//decomp_period + 1)]

for i, (start, end) in enumerate(zip(periods, periods[1:])):
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", n_jobs=-1)
    y_pred = km.fit_predict(np.expand_dims(data[start:end, :].transpose(), axis=-1))

    cluster_dict = {x: y for x, y in zip([x for x in range(len(data[0]))], y_pred)}
    cluster_map = {}
    for k, v in cluster_dict.items():
        cluster_map[v] = cluster_map.get(v, []) + [k]
    print(i, cluster_map)

0 {4: [0, 5, 10, 16], 0: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 2: [3], 9: [4, 9, 14], 1: [8, 13], 8: [15], 6: [18], 7: [19], 5: [20, 24, 26], 3: [23]}
1 {4: [0, 5, 10, 16], 6: [1, 2, 6, 7, 11, 12, 21, 22, 25, 27, 28], 3: [3], 8: [4, 9, 14], 1: [8, 13], 7: [15], 0: [17, 19, 29], 9: [18], 5: [20, 24, 26], 2: [23]}
2 {0: [0], 4: [1, 2, 6, 7, 11, 12, 21, 22, 25, 27, 28], 2: [3], 5: [4, 9, 14], 9: [5, 10, 16], 1: [8, 13], 6: [15, 18], 7: [17, 19, 29], 8: [20, 24, 26], 3: [23]}
3 {7: [0, 5, 10, 16], 4: [1, 2, 6, 7, 11, 12, 21, 22, 25, 27, 28], 3: [3], 0: [4, 9, 14], 1: [8, 13], 8: [15], 9: [17, 19, 29], 6: [18], 5: [20, 24, 26], 2: [23]}
4 {9: [0, 4, 9, 14], 4: [1, 2, 6, 7, 11, 12, 17, 21, 22, 25, 27, 28, 29], 0: [3], 1: [5, 10, 16], 2: [8, 13], 7: [15], 6: [18], 8: [19], 5: [20, 24, 26], 3: [23]}
5 {0: [0, 4, 5, 10, 16], 4: [1, 2, 6, 7, 11, 12, 21, 22, 25, 27, 28], 2: [3], 3: [8, 13], 6: [9, 14, 15], 7: [17, 29], 9: [18], 8: [19], 5: [20, 24, 26], 1: [23]}
6 {4: [0, 5, 10, 16], 

In [None]:
decomp_period = 144
periods = [i*decomp_period for i in range(len(data)//decomp_period + 1)]

for i, (start, end) in enumerate(zip(periods, periods[1:])):
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric="softdtw", n_jobs=-1)
    y_pred = km.fit_predict(np.expand_dims(data[start:end, :].transpose(), axis=-1))

    cluster_dict = {x: y for x, y in zip([x for x in range(len(data[0]))], y_pred)}
    cluster_map = {}
    for k, v in cluster_dict.items():
        cluster_map[v] = cluster_map.get(v, []) + [k]
    print(i, cluster_map)

In [None]:
decomp_period = 144
periods = [i*decomp_period for i in range(len(data)//decomp_period + 1)]

for i, (start, end) in enumerate(zip(periods, periods[1:])):
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric="euclidean", n_jobs=-1)
    y_pred = km.fit_predict(np.expand_dims(data[start:end, :].transpose(), axis=-1))

    cluster_dict = {x: y for x, y in zip([x for x in range(len(data[0]))], y_pred)}
    cluster_map = {}
    for k, v in cluster_dict.items():
        cluster_map[v] = cluster_map.get(v, []) + [k]
    print(i, cluster_map)