# Import data

In [1]:
#imports
import os
import pandas as pd
import json
import ast
import re
from pprint import pprint
import numpy as np

In [2]:
# Build the DataFrame from the data extracted from the database
filename='Buildings_infos_2025-07-30.csv'
# Take the extraction date (YYYY-MM-DD) from the file name by pattern rather than
# by fixed character offsets, so a different prefix or extension cannot
# silently yield a wrong date.
date_match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
if date_match is None:
    raise ValueError(f"No YYYY-MM-DD date found in file name: {filename!r}")
extraced_date = date_match.group(0)  # name (incl. spelling) kept: reused by the export cell
filepath = os.path.join('INs', filename)
df = pd.read_csv(
    filepath,
    converters={
        # These columns hold dict literals serialized as text; parse them back into dicts
        'sub_components': ast.literal_eval,
        'sensors': ast.literal_eval,
        # NOTE(review): presumably disabled because this column has empty cells,
        # which ast.literal_eval cannot parse - confirm before re-enabling.
        #'missing_WW_sensors': ast.literal_eval
    }
)

In [3]:
# inspect columns
df.columns

Index(['building_id', 'customerID', 'customer_name', 'address', 'postal_code',
       'city', 'device_type', 'device_id', 'LayoutID', 'sub_components',
       'modular_system', 't_sensor_count', 'sensors', 'missing_WW_sensors',
       'gfid'],
      dtype='object')

# DF infos

In [4]:
# Keep only the columns needed for the rest of the analysis
df = df.loc[:, [
    'building_id',
    'customerID',
    'address',
    'device_type',
    'device_id',
    'sensors',
    'sub_components',
    'missing_WW_sensors',
    'gfid',
]]

In [5]:
#df.head()

In [6]:
df.count()

building_id           2209
customerID            2209
address               2209
device_type           1120
device_id             1120
sensors               2209
sub_components        2209
missing_WW_sensors     457
gfid                  1749
dtype: int64

## Display one building's details

In [7]:
bid=420  # building_id of the example building looked up in the cells below

In [8]:
df.loc[df['building_id'] == bid, 'sub_components'].item()

{'global(1)': 'id : 988 category : NOT_SPECIFIED type : GLOBAL sub_type : NOT_SPECIFIED',
 'hp(n)(1)': 'id : 989 category : PRODUCER type : HEAT_PUMP sub_type : NOT_SPECIFIED',
 'hc(n)(1)': 'id : 990 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED',
 'hw(n)(1)': 'id : 991 category : CONSUMER type : WARM_WATER sub_type : FRESH_WATER_STATION'}

In [9]:
df.loc[df['building_id'] == bid, 'missing_WW_sensors'].item()

nan

In [10]:
df.loc[df['building_id'] == bid, 'sensors'].item()

{'t_hp(n)_secflow(1)': 'id : 7728 gfid : 369 sub_system_id : 989 unit : °C source : GREENBOX_MQTT',
 'e_g_bldg_y(None)': 'id : 19387 gfid : 460 sub_system_id : None unit : kWh source : MANUALLY_ENTERED',
 't_g_out_api(1)': 'id : 7726 gfid : 434 sub_system_id : 988 unit : °C source : OPEN_WEATHER_API',
 't_hp(n)_primflow(1)': 'id : 9510 gfid : 403 sub_system_id : 989 unit : °C source : GREENBOX_MQTT',
 't_hp(n)_primreturn(1)': 'id : 9509 gfid : 408 sub_system_id : 989 unit : °C source : GREENBOX_MQTT',
 't_hp(n)_secreturn(1)': 'id : 7732 gfid : 371 sub_system_id : 989 unit : °C source : GREENBOX_MQTT',
 't_hw(n)_circ(1)': 'id : 7731 gfid : 118 sub_system_id : 991 unit : °C source : GREENBOX_MQTT',
 't_hw(n)_pwh(1)': 'id : 7730 gfid : 119 sub_system_id : 991 unit : °C source : GREENBOX_MQTT',
 't_hc(n)_flow(1)': 'id : 7733 gfid : 112 sub_system_id : 990 unit : °C source : GREENBOX_MQTT',
 't_hw(n)_tank(1)': 'id : 7735 gfid : 124 sub_system_id : 991 unit : °C source : GREENBOX_MQTT',
 't_

In [11]:
df.loc[df['building_id'] == bid, ['device_id','customerID','address','gfid']]

Unnamed: 0,device_id,customerID,address,gfid
34,23321007,32,Rendsburger Str. 61,GFP-390


## Get rid of NaN device_type and NaN LayoutID (API buildings?)

In [12]:
# Collect the distinct device types (NaN included) present in the data
type_of_devices = set(df['device_type'])
type_of_devices

{'ECR_LW300', 'RUT956', 'RevPiConnectSE', 'RevPiCore32SE', 'WAGOPFC200', nan}

In [13]:
#df = df.dropna(subset=['device_type'])

In [14]:
df.count()

building_id           2209
customerID            2209
address               2209
device_type           1120
device_id             1120
sensors               2209
sub_components        2209
missing_WW_sensors     457
gfid                  1749
dtype: int64

# Find FWS

In [15]:
def find_fws_ids(sub_components_dict):
    """
    Collect the ids of all FRESH_WATER_STATION entries in a sub_components dictionary.

    Each value is expected to be a description string such as
    'id : 991 category : CONSUMER type : WARM_WATER sub_type : FRESH_WATER_STATION'.

    Args:
        sub_components_dict: Mapping of component name to description string.
            Anything that is not a dict (e.g. NaN) is treated as "no components".

    Returns:
        A list of int component ids, in dictionary order. Empty if none are found.
        Values that are not strings, or whose id cannot be parsed, are skipped.
    """
    # Return an empty list if the input isn't a dictionary
    if not isinstance(sub_components_dict, dict):
        return []

    found_ids = []

    for component_string in sub_components_dict.values():
        # Non-string values (None, NaN, numbers) cannot be searched with `in`;
        # skip them like any other malformed entry instead of raising TypeError.
        if not isinstance(component_string, str):
            continue
        if 'sub_type : FRESH_WATER_STATION' not in component_string:
            continue
        try:
            parts = component_string.split()
            # Tokens look like ['id', ':', '<number>', ...]: the value sits two
            # positions after the 'id' token.
            component_id = int(parts[parts.index('id') + 2])
        except (ValueError, IndexError):
            # Malformed string, skip to the next one
            continue
        found_ids.append(component_id)

    return found_ids

In [16]:
# Add a 'fws_ids' column: the list of FRESH_WATER_STATION component ids found in each row's sub_components
df['fws_ids'] = df['sub_components'].apply(find_fws_ids)

In [17]:
df.loc[df['building_id'] == bid, 'fws_ids'].item()

[991]

In [18]:
# Show the building_ids of buildings that have at least one fresh water station.
# Stored under its own name: 'buildings_with_fws' is the filtered DataFrame that gets
# exported, so reusing that name for this list would break the export if the cells
# are run out of order.
fws_building_ids = df.loc[df['fws_ids'].str.len() > 0, 'building_id'].tolist()
print(len(fws_building_ids), fws_building_ids)

58 [781, 782, 784, 785, 420, 442, 538, 562, 566, 434, 545, 1569, 792, 793, 797, 798, 799, 800, 801, 2453, 1474, 1475, 1476, 1477, 1478, 1481, 654, 392, 1401, 1404, 1405, 525, 532, 626, 700, 701, 370, 1605, 1606, 473, 474, 475, 476, 1364, 1457, 1461, 1593, 2034, 2250, 469, 472, 2431, 2470, 2382, 2383, 477, 2463, 2483]


In [19]:
# Keep only the rows of buildings that have at least one fresh water station
buildings_with_fws = df[df['fws_ids'].str.len().gt(0)]
buildings_with_fws.head()

Unnamed: 0,building_id,customerID,address,device_type,device_id,sensors,sub_components,missing_WW_sensors,gfid,fws_ids
23,781,66,Jahnstraße 14,RUT956,2420017,{'None(1)': 'id : 28260 gfid : None sub_system...,{'global(1)': 'id : 2282 category : NOT_SPECIF...,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",GFP-13262,[2286]
24,782,66,Jahnstraße 30,RUT956,2420014,{'t_b(n)_flow(1)': 'id : 18129 gfid : 71 sub_s...,{'global(1)': 'id : 2288 category : NOT_SPECIF...,,GFP-13285,"[2292, 2293]"
26,784,66,Jahnstraße 42,RUT956,2402007,{'None(1)': 'id : 18633 gfid : None sub_system...,{'global(1)': 'id : 2300 category : NOT_SPECIF...,,GFP-13331,[2304]
27,785,66,Jahnstraße 46,RUT956,2403013,{'t_hw(n)_pwh(1)': 'id : 18118 gfid : 119 sub_...,{'global(1)': 'id : 2306 category : NOT_SPECIF...,,GFP-13354,"[2310, 2311]"
34,420,32,Rendsburger Str. 61,ECR_LW300,23321007,{'t_hp(n)_secflow(1)': 'id : 7728 gfid : 369 s...,{'global(1)': 'id : 988 category : NOT_SPECIFI...,,GFP-390,[991]


# Export

In [20]:
if True:  # manual switch: set to False to skip writing the CSV exports
    out_dir = 'OUTs'
    # Create the output folder if it is missing so to_csv does not fail on a fresh checkout
    os.makedirs(out_dir, exist_ok=True)
    filename1='Building_infos_'+extraced_date+'.csv'
    filename2='Buildings_with_fws_'+extraced_date+'.csv'
    filepath1 = os.path.join(out_dir, filename1)
    filepath2 = os.path.join(out_dir, filename2)
    # Full table (including the added fws_ids column) and the FWS-only subset
    df.to_csv(filepath1, index=False)
    buildings_with_fws.to_csv(filepath2, index=False)
