In [None]:
import time
import pandas as pd
import numpy as np
from ScenarioLoader import ScenarioCollection, Scenario

# Variables
We want to simulate different leakage-scenarios. Each setting consists of a combination of 

- a leak position (any node from the node with ID '2' to the node with ID '32'),
- a leak diameter (5, 10 or 15 cm).

For each such setting, we define a *setting start ID*, i.e., a time point (measured in time IDs 0, 1, 2, 3, ...), and a *setting end ID*. Within the time window from *setting start ID* to *setting end ID*, a leak, defined by its location and size, is active from a *leak start ID* until a *leak end ID*. In particular, *setting start ID* $\leq$ *leak start ID* $<$ *leak end ID* $\leq$ *setting end ID* holds. These time IDs (not to be confused with the node IDs!) are defined in such a way that each setting's time window has a chosen length *length setting* = *setting end ID* - *setting start ID*, and that the total number of samples (over all settings) in which any leak is active is approximately as large as the number of samples in which no leak is active in the WDN at all.

Moreover, we assign the node IDs, i.e., the leak locations, to *groups*. Later on, we want to find out how the models we work with behave on these different groups. (We could also define one group per node ID, i.e., per leak position, but this would not scale to larger WDNs.)

In [None]:
# --- setting definition
# IDs of the nodes at which leaks are simulated: '2', '3', ..., '32'
nodes = [str(node_id) for node_id in range(2, 33)]
# leak diameters in cm (each value listed exactly once)
diameters = [15, 5, 10]
# number of samples that make up one setting
length_setting = 1440 # ten days if delta = 10min = 600s

# --- group definition
# each group is a list of node IDs
group1 = ['23', '24', '25', '28', '29', '30', '31', '32']
group2 = ['2', '3', '14', '15', '16', '17', '18', '19', '20', '21', '22', '26', '27'] # possibly move 26 to group 1
group3 = ['4', '5', '6', '7', '8', '9', '10', '11', '12', '13'] # possibly move 4 and 5 to group 2

# lookup tables: group name -> member node IDs, and node ID -> group name
groups_values = [group1, group2, group3]
groups_keys = ['group{}'.format(position)
               for position, _ in enumerate(groups_values, start=1)]
groups_per_group = dict(zip(groups_keys, groups_values))
# keys follow the order of `nodes`; a node listed in several groups
# ends up with the last group that contains it
groups_per_node = {
    node_id: group_name
    for node_id in nodes
    for group_name, members in groups_per_group.items()
    if node_id in members
}

# Label generation

In [None]:
def LeakInformation(nodes,
                    diameters,
                    length_setting,
                    groups_per_node,
                    option=3,
                    offset=0,
                    print_info=False):

    """
    Build the leak schedule: one setting per (diameter, node ID) combination,
    each with a setting window and a leak window expressed in time IDs.

    Inputs
    nodes:               list of node IDs for nodes on which leak should be simulated
    diameters:           list of diameters the leaks should have
    length_setting:      integer corresponding to the length of each setting
    groups_per_node:     dictionary with keys=node ID, value=group name
    option:              integer, 1 - leak appears in the middle of each setting
                                  2 - leak appears at the start of each setting
                                  3 - leak appears at the start of each setting,
                                      first setting is not taken into account
                                      for balanced label creation
    offset:              an offset that in the end will be added on top of each time ID
                         (this is useful if a time window before the actual time series
                          is required for, e.g., preprocessing)
    print_info:          boolean for whether intermediate results should be printed

    Outputs
    df_leak_information: data frame indexed by setting number (1, 2, ...) with the
                         columns 'group', 'node ID', 'diameter', the four time ID
                         columns and four sample-count columns used as a double check
    leak_information:    nested dictionary, leak_information[node ID][leak number]
                         with leak number '1', '2', ... and the entries 'group',
                         'diameter', 'leak start ID' and 'leak end ID'

    Raises
    ValueError:          if option is not 1, 2 or 3, or if nodes or diameters is empty
    KeyError:            if a node ID is missing from groups_per_node
    """

    if option not in (1, 2, 3):
        # previously an unknown option returned a frame full of NaN
        # (or failed with a NameError when print_info was set)
        raise ValueError('option must be 1, 2 or 3, got {!r}'.format(option))

    n = length_setting
    K = len(nodes) * len(diameters)
    if K == 0:
        raise ValueError('nodes and diameters must both be non-empty')

    # some computations which will assure that |{Y=0}| approx. |{Y=1}| holds
    rd = round( ( (K + 1) * n + 1 ) / (2 * K) )

    if print_info:
        print('Nb. of settings with leak: {}'.format(K))
        print('Length of each setting: {} (= {} d)'.format(n,
                                                           n*600/60/60/24))
        print('rd: {}'.format(rd))

    # ----- position of the leak inside a setting
    # i_s / i_e are the distances of leak start / leak end
    # from the setting start ID
    if option == 1:
        i_s = int((n-rd)/2)
        i_e = int((n+rd)/2)
    elif option == 2:
        i_s = 0
        i_e = rd
    else:
        i_s = 0
        i_e = int(n/2)

    # information for double check
    setting_1 = i_e - i_s
    setting_0 = n - setting_1
    # options 1 and 2 additionally count the n + 1 samples of the
    # leading no-leak setting, option 3 leaves them out
    total_0 = K * setting_0 + (0 if option == 3 else n + 1)
    total_1 = K * setting_1

    # ----- leak information in data frame style
    # setting numbers (each combination of node ID and diameter)
    settings = list(range(1, K + 1))
    # all nodes for the first diameter, then all nodes for the second one, ...
    node_column = list(nodes) * len(diameters)
    diameter_column = [diameter for diameter in diameters for _ in nodes]
    # the groups are looked up before the frame is built, so the column
    # never has to be upcast from float NaN placeholders to strings
    group_column = [groups_per_node[node_id] for node_id in node_column]

    # time IDs are kept as float64, the dtype this function has always returned
    setting_start = np.array(settings, dtype=float) * n + 1 + offset

    df = pd.DataFrame(data={'group': group_column,
                            'node ID': node_column,
                            'diameter': diameter_column,
                            'setting start ID': setting_start,
                            'leak start ID': setting_start + i_s,
                            'leak end ID': setting_start + i_e,
                            'setting end ID': setting_start + (n - 1)},
                      index=settings)

    # double check
    # NOTE: the column names are kept unchanged because they end up in exported
    # files; 'ride side' holds the no-leak samples *before* the leak and
    # 'left side' the ones from leak end ID up to setting end ID
    df['Y = 0 on ride side'] = (df['leak start ID'] - 1) - df['setting start ID'] + 1
    df['Y = 1'] = (df['leak end ID'] - 1) - df['leak start ID'] + 1
    df['Y = 0 on left side'] = df['setting end ID'] - df['leak end ID'] + 1
    df['Y = 0'] = df['Y = 0 on ride side'] + df['Y = 0 on left side']

    if print_info:
        print('\n')
        print('i_e: {}'.format(i_e))
        print('i_e - i_s: {}'.format(i_e - i_s))
        print('Expected samples Y = 0 per setting: {}'.format(setting_0))
        print('Expected samples Y = 1 per setting: {}'.format(setting_1))
        print('Expected samples Y = 0 in total: {}'.format(total_0))
        print('Expected samples Y = 1 in total: {}'.format(total_1))
        print('Expected samples in total: {}'.format(total_0+total_1))
        print('\nNote that there are some rounding errors, find exact results by removing round() and int() in the code.')

    # ----- leak information in dictionary style
    leak_information = dict()
    for idx in df.index:
        leaks_of_node = leak_information.setdefault(df.at[idx, 'node ID'], dict())
        # leaks of one node are numbered '1', '2', ... in order of appearance
        leaks_of_node[str(len(leaks_of_node) + 1)] = {
            'group': df.at[idx, 'group'],
            'diameter': df.at[idx, 'diameter'],
            'leak start ID': df.at[idx, 'leak start ID'],
            'leak end ID': df.at[idx, 'leak end ID'],
        }

    return df, leak_information

In [None]:
def GenerateLabels(nodes,
                   df_leak_information,
                   leak_information,
                   length_setting,
                   groups_values,
                   offset=0,
                   print_info=False):

    """
    Turn the leak schedule into binary labels, one value per time ID.

    Inputs
    nodes:               list of node IDs for nodes on which leak should be simulated
    df_leak_information: data frame, first output from LeakInformation()
                         (only its number of rows is used here)
    leak_information:    dictionary, second output from LeakInformation()
    length_setting:      integer corresponding to the length of each setting
    groups_values:       list of lists of all groups
    offset:              the offset that was added on top of each time ID
                         (this is useful if a time window before the actual time series
                          is required for, e.g., preprocessing)
    print_info:          boolean for whether intermediate results should be printed

    Outputs
    y:                   pandas series, 1 for every time ID i with
                         leak start ID <= i < leak end ID for some leak, else 0
    y_per_group:         dictionary with keys 'group1', 'group2', ... and one such
                         series per group; if several leaks cover the same time ID,
                         only the group of the first one (in the order of nodes and
                         of the leaks of a node) is marked
    """

    nb_settings = len(df_leak_information.index)
    # nb_settings + 1 setting for a setting with no leak at all
    nb_samples_incl_offset = (nb_settings + 1) * length_setting + offset + 1
    if print_info:
        print('Nb. of samples inclusive offset: {}'.format(nb_samples_incl_offset))

    # --- initialize labels
    # np.zeros, not np.empty: only the 1-entries are written below, so every
    # other sample has to start out as a well-defined 0
    y = np.zeros(nb_samples_incl_offset)
    y_per_group = {'group' + str(i+1): np.zeros(nb_samples_incl_offset)
                   for i in range(len(groups_values))}

    # --- compute labels
    # one slice assignment per leak instead of testing every leak for every time ID
    for node_id in nodes:
        for leak in leak_information[node_id].values():
            start_id = leak['leak start ID']
            end_id = leak['leak end ID']
            if np.isnan(start_id) or np.isnan(end_id):
                # an undefined window contains no time ID
                continue

            # integer time IDs i with start_id <= i < end_id, clipped to the series
            first = max(int(np.ceil(start_id)), 0)
            last = min(int(np.ceil(end_id)), nb_samples_incl_offset)
            if first >= last:
                continue

            # both windows are views, so writing to them updates the label arrays
            y_window = y[first:last]
            group_window = y_per_group[leak['group']][first:last]
            # time IDs already claimed by an earlier leak keep that leak's group
            unclaimed = y_window == 0
            group_window[unclaimed] = 1
            y_window[unclaimed] = 1

    # transform numpy array to pandas series
    y = pd.Series(y)
    for group in y_per_group:
        y_per_group[group] = pd.Series(y_per_group[group])

    return y, y_per_group

In [None]:
# --- generate leak information
df_leak_information, leak_information = LeakInformation(nodes=nodes, 
                                                        diameters=diameters, 
                                                        length_setting=length_setting,
                                                        groups_per_node=groups_per_node,
                                                        option=2,
                                                        offset=100,
                                                        print_info=True)

In [None]:
# Inspect the leak schedule returned by LeakInformation (one row per setting).
df_leak_information

In [None]:
# --- generate labels based on leak information
start = time.time()
y, y_per_group = GenerateLabels(nodes=nodes,
                                df_leak_information=df_leak_information,
                                leak_information=leak_information,
                                length_setting=length_setting,
                                groups_values=groups_values,
                                offset=100)
y_noleaks = np.zeros(len(y))
end = time.time()
print('Time needed: {}'.format(end-start))

# Feature generation

In [None]:
# --- generate features
# initialize scenario collection
collection = ScenarioCollection('../1_FeatureGeneration/scenarios')

# list scenarios and configs of interest
scenarios = sorted(collection.list_scenarios())
# work with the first scenario in sorted order
scenario = scenarios[0]
# query the configs once instead of once per use
# NOTE(review): assumes list_configs() is a plain lookup without side effects - confirm
configs = collection.list_configs(scenario)
print('Scenarios:', scenarios)
print('Configs:', configs)
print('\n')

# load data
sensor_config = configs['SensorConfigs'][0]
sensorfault_config = configs['SensorfaultConfigs'][0]
# the first leak config is used as the run with leaks, the last one as the
# run without leaks
# NOTE(review): this relies on the order of 'LeakConfigs' - verify
leak_config_leaks = configs['LeakConfigs'][0]
leak_config_noleaks = configs['LeakConfigs'][-1]
print('Scenario:', scenario)
print('Sensor config:', sensor_config)
print('Sensor fault config:', sensorfault_config)
print('Leak config "Leaks":', leak_config_leaks)
print('Leak config "NoLeaks":', leak_config_noleaks)

# keep only the 'pressure' part, turn the index into columns and drop 'time'
df_leaks = collection.get(scenario, leak_config_leaks, sensor_config, sensorfault_config)
df_noleaks = collection.get(scenario, leak_config_noleaks, sensor_config, sensorfault_config)
df_leaks = df_leaks['pressure'].reset_index().drop('time', axis=1)
df_noleaks = df_noleaks['pressure'].reset_index().drop('time', axis=1)

In [None]:
# Inspect the 'pressure' data of the run with leaks (the 'time' column was dropped).
df_leaks

In [None]:
# Inspect the 'pressure' data of the run without leaks (the 'time' column was dropped).
df_noleaks

# Data generation

In [None]:
# --- add labels to features
for group in y_per_group:
    df_leaks['y_' + group] = y_per_group[group]
    df_noleaks['y_' + group] = y_noleaks
df_leaks['y'] = y
df_noleaks['y'] = y_noleaks

In [None]:
# df_leaks now also carries the label columns 'y_<group>' and 'y'.
df_leaks

In [None]:
# df_noleaks now also carries the label columns 'y_<group>' and 'y' (filled from y_noleaks).
df_noleaks

In [None]:
# Leak schedule once more, for reference next to the labelled data.
df_leak_information

In [None]:
# --- store data
df_leaks.to_excel('data_leaks.xlsx',
                  sheet_name='leaks')
df_noleaks.to_excel('data_noleaks.xlsx',
                   sheet_name='noleaks')

# --- store leak information
df_leak_information.to_excel('information_leaks.xlsx',
                             sheet_name='information')