In [2]:
import sys
import os
# Project root, resolved as two directory levels above the current working
# directory (this assumes the kernel was started in the notebook's folder).
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# Put the root first on the import path so the project's own packages can be
# imported by the statements that follow.
sys.path.insert(0, ROOT_DIR)

import h5py
from sklearn.cluster import KMeans

from idea.data.hdf5.utils import get_table_df



In [3]:
# Location of the HDF5 data file, relative to the project root.
hd5_filepath = os.path.join(ROOT_DIR, 'idea/data/hdf5/files/task_data.hdf5')
# Open explicitly read-only: the notebook only reads from the file, and older
# h5py releases default to append mode, which can create or modify the file.
# The handle stays open under the name `file` so that further tables can be
# read from it; call file.close() once no more reads are needed.
file = h5py.File(hd5_filepath, 'r')
# Load the 'branches' table, requesting the end-node and flow columns.
branches_df = get_table_df(file, 'branches', ['node_from', 'node_to', 'flow'])

In [4]:
# Show the branch table that was just loaded, as a quick visual sanity check.
print(branches_df)

      node_from  node_to       flow hour
0           1.0      2.0  -0.399469    1
1           1.0     39.0   0.399469    1
2           2.0      3.0  10.019257    1
3           2.0     25.0  -0.018727    1
4           3.0      4.0  -2.959464    1
...         ...      ...        ...  ...
1099       22.0     35.0  -6.870000    9
1100       23.0     36.0  -5.800000    9
1101       25.0     37.0  -5.640000    9
1102       29.0     38.0  -8.650000    9
1103       31.0      6.0   6.368000    9

[1104 rows x 4 columns]


In [5]:
# Sanity check on branch orientation: count the rows recorded for every
# (node_from, node_to) pair. If a branch were ever stored with its end nodes
# swapped, it would show up as an extra pair with a smaller count.
branch_pair_counts = branches_df.groupby(['node_from', 'node_to']).count()
print(branch_pair_counts)

                   flow  hour
node_from node_to            
1.0       2.0        24    24
          39.0       24    24
2.0       3.0        24    24
          25.0       24    24
          30.0       24    24
3.0       4.0        24    24
          18.0       24    24
4.0       5.0        24    24
          14.0       24    24
5.0       6.0        24    24
          8.0        24    24
6.0       7.0        24    24
          11.0       24    24
7.0       8.0        24    24
8.0       9.0        24    24
9.0       39.0       24    24
10.0      11.0       24    24
          13.0       24    24
          32.0       24    24
12.0      11.0       24    24
          13.0       24    24
13.0      14.0       24    24
14.0      15.0       24    24
15.0      16.0       24    24
16.0      17.0       24    24
          19.0       24    24
          21.0       24    24
          24.0       24    24
17.0      18.0       24    24
          27.0       24    24
19.0      20.0       24    24
          

In [12]:
# Keep only the rows recorded for hours 1, 2 and 3.
selected_hours = {1, 2, 3}
is_selected_hour = branches_df['hour'].isin(selected_hours)
branches_in_hours = branches_df.loc[is_selected_hour]
print(branches_in_hours)

     node_from  node_to       flow hour
0          1.0      2.0  -0.399469    1
1          1.0     39.0   0.399469    1
2          2.0      3.0  10.019257    1
3          2.0     25.0  -0.018727    1
4          3.0      4.0  -2.959464    1
..         ...      ...        ...  ...
823       22.0     35.0  -6.870000    3
824       23.0     36.0  -5.800000    3
825       25.0     37.0  -5.640000    3
826       29.0     38.0  -8.650000    3
827       31.0      6.0   6.368000    3

[138 rows x 4 columns]


In [14]:
# Mean absolute flow per branch over the selected hours.
# The numeric columns are selected explicitly instead of calling .abs() and
# .mean() on the whole frame: the non-numeric 'hour' column used to be dropped
# silently by the aggregation, which pandas >= 2.0 no longer does (it raises a
# TypeError instead). The result is a frame indexed by (node_from, node_to)
# with a single 'flow' column.
branches_avg = (
    branches_in_hours[['node_from', 'node_to', 'flow']]
    .abs()
    .groupby(['node_from', 'node_to'])
    .mean()
)

In [15]:
# Clustering settings
columns = ['flow']  # Columns used as clustering features
n_clusters = 3  # Number of clusters
# Fixed seed: without it KMeans starts from random centroids, so the numeric
# label given to each cluster can change from one run to the next.
random_state = 0
algorithm = KMeans(n_clusters=n_clusters, random_state=random_state)

In [24]:
# Fit the clustering algorithm on the configured feature columns and get one
# cluster label per branch.
clustering_features = branches_avg[columns]
y = algorithm.fit_predict(clustering_features)
# Store the labels next to the averaged flows (adds or overwrites 'cluster').
branches_avg['cluster'] = y
print(branches_avg)

                        flow  Cluster  cluster
node_from node_to                             
1.0       2.0       1.934551        1        0
          39.0      1.934551        1        0
2.0       3.0      11.109459        0        1
          25.0      0.426147        1        0
          30.0     10.400000        0        1
3.0       4.0       2.341373        1        0
          18.0      5.481236        2        2
4.0       5.0       4.636625        2        2
          14.0      1.139447        1        0
5.0       6.0       5.089671        2        2
          8.0       2.805420        1        0
6.0       7.0       3.163743        1        0
          11.0      2.033938        1        0
7.0       8.0       0.882572        1        0
8.0       9.0       2.120812        1        0
9.0       39.0      2.120812        1        0
10.0      11.0      2.229926        1        0
          13.0      5.306392        2        2
          32.0      7.250000        0        1
12.0      11.

In [26]:
# Build a lookup from a branch, identified by its (node_from, node_to) index
# pair, to the cluster label assigned to it.
branch_to_cluster = {}

for node_pair, cluster_label in branches_avg['cluster'].items():
    # The index entry is the (node_from, node_to) tuple; abs() normalises the
    # sign of each node id before it is used as a key.
    key = tuple(abs(node) for node in node_pair)
    print(key)
    branch_to_cluster[key] = int(cluster_label)

print(branch_to_cluster)

(1.0, 2.0)
(1.0, 39.0)
(2.0, 3.0)
(2.0, 25.0)
(2.0, 30.0)
(3.0, 4.0)
(3.0, 18.0)
(4.0, 5.0)
(4.0, 14.0)
(5.0, 6.0)
(5.0, 8.0)
(6.0, 7.0)
(6.0, 11.0)
(7.0, 8.0)
(8.0, 9.0)
(9.0, 39.0)
(10.0, 11.0)
(10.0, 13.0)
(10.0, 32.0)
(12.0, 11.0)
(12.0, 13.0)
(13.0, 14.0)
(14.0, 15.0)
(15.0, 16.0)
(16.0, 17.0)
(16.0, 19.0)
(16.0, 21.0)
(16.0, 24.0)
(17.0, 18.0)
(17.0, 27.0)
(19.0, 20.0)
(19.0, 33.0)
(20.0, 34.0)
(21.0, 22.0)
(22.0, 23.0)
(22.0, 35.0)
(23.0, 24.0)
(23.0, 36.0)
(25.0, 26.0)
(25.0, 37.0)
(26.0, 27.0)
(26.0, 28.0)
(26.0, 29.0)
(28.0, 29.0)
(29.0, 38.0)
(31.0, 6.0)
{(1.0, 2.0): 0, (1.0, 39.0): 0, (2.0, 3.0): 1, (2.0, 25.0): 0, (2.0, 30.0): 1, (3.0, 4.0): 0, (3.0, 18.0): 2, (4.0, 5.0): 2, (4.0, 14.0): 0, (5.0, 6.0): 2, (5.0, 8.0): 0, (6.0, 7.0): 0, (6.0, 11.0): 0, (7.0, 8.0): 0, (8.0, 9.0): 0, (9.0, 39.0): 0, (10.0, 11.0): 0, (10.0, 13.0): 2, (10.0, 32.0): 1, (12.0, 11.0): 0, (12.0, 13.0): 0, (13.0, 14.0): 2, (14.0, 15.0): 2, (15.0, 16.0): 2, (16.0, 17.0): 0, (16.0, 19.0): 1, (16.0, 21.