# Import data

In [1]:
#imports
import os
import pandas as pd
import json
import ast
import re
from pprint import pprint
import numpy as np

In [2]:
#build df from database exctracted data
filename='Buildings_infos_2025-08-01.csv'
extraced_date=filename[-14:-4]
filepath = os.path.join('INs', filename)
df = pd.read_csv(filepath, 
    converters={
        'sub_components': ast.literal_eval,
        'sensors': ast.literal_eval,
        #'missing_WW_sensors': ast.literal_eval
    }
)

In [3]:
#inspest column
df.columns

Index(['building_id', 'customerID', 'customer_name', 'address', 'postal_code',
       'city', 'device_type', 'device_id', 'LayoutID', 'sub_components',
       'modular_system', 't_sensor_count', 'sensors', 'missing_WW_sensors',
       'gfid'],
      dtype='object')

# DF infos

In [4]:
#filter out uncesseray infos
df=df[['building_id','customerID','address','device_type','device_id','sensors','t_sensor_count', 'sub_components','missing_WW_sensors','gfid']]

In [5]:
#df.head()

In [6]:
df.count()

building_id           2215
customerID            2215
address               2215
device_type           1132
device_id             1132
sensors               2215
t_sensor_count        2215
sub_components        2215
missing_WW_sensors     457
gfid                  1756
dtype: int64

## Display one building Detail

In [88]:
bid=783 

In [89]:
df.loc[df['building_id'] == bid, 'sub_components'].item()

{'global(1)': 'id : 148820 category : NOT_SPECIFIED type : GLOBAL sub_type : NOT_SPECIFIED',
 'b(n)(1)': 'id : 148821 category : PRODUCER type : BOILER sub_type : NOT_SPECIFIED',
 'st(n)(1)': 'id : 148822 category : PRODUCER type : SOLAR_THERMAL sub_type : NOT_SPECIFIED',
 'hc(n)(1)': 'id : 148823 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED',
 'hw(n)(1)': 'id : 148824 category : CONSUMER type : WARM_WATER sub_type : NOT_SPECIFIED',
 'buffer(n)(1)': 'id : 148825 category : NOT_SPECIFIED type : BUFFER sub_type : NOT_SPECIFIED'}

In [90]:
df.loc[df['building_id'] == bid, 'missing_WW_sensors'].item()

"{'missing_gfids': [124], 'ww_subsystem_ids': [148824]}"

In [91]:
df.loc[df['building_id'] == bid, 'sensors'].item()

{'None(1)': 'id : 28262 gfid : None sub_system_id : None unit : None source : GREENBOX_MQTT',
 't_st(n)_return(1)': 'id : 25158 gfid : 444 sub_system_id : 148822 unit : °C source : GREENBOX_MQTT',
 'v_b(n)_gm(1)': 'id : 25162 gfid : 128 sub_system_id : 148821 unit : m³ source : GREENBOX_MQTT',
 't_b(n)_flow(1)': 'id : 20321 gfid : 71 sub_system_id : 148821 unit : °C source : GREENBOX_MQTT',
 't_b(n)_return(1)': 'id : 20323 gfid : 76 sub_system_id : 148821 unit : °C source : GREENBOX_MQTT',
 't_st(n)_flow(1)': 'id : 20325 gfid : 446 sub_system_id : 148822 unit : °C source : GREENBOX_MQTT',
 't_buffer(n)_tank_1(1)': 'id : 20327 gfid : 77 sub_system_id : 148825 unit : °C source : GREENBOX_MQTT',
 't_hw(n)_pwh(1)': 'id : 20329 gfid : 119 sub_system_id : 148824 unit : °C source : GREENBOX_MQTT',
 't_hc(n)_flow(1)': 'id : 20333 gfid : 112 sub_system_id : 148823 unit : °C source : GREENBOX_MQTT',
 't_hc(n)_return(1)': 'id : 20335 gfid : 117 sub_system_id : 148823 unit : °C source : GREENBOX_M

In [92]:
df.loc[df['building_id'] == bid, 't_sensor_count'].item()

10

In [93]:
df.loc[df['building_id'] == bid, ['device_id','customerID','address','gfid']]

Unnamed: 0,device_id,customerID,address,gfid
25,2420009,66,Jahnstraße 40,GFP-13308


## Get rid of nan device_type and nan LayoutID (API buildings ???) 

In [28]:
type_of_devices=set()
for dev_type in df['device_type']:
    type_of_devices.add(dev_type)
type_of_devices   

{'ECR_LW300', 'RUT956', 'RevPiConnectSE', 'RevPiCore32SE', 'WAGOPFC200'}

In [29]:
df = df.dropna(subset=['device_type'])
# get rid of customer_id 35 (GF insternal)
df = df.loc[df['customerID'] != 35]

In [30]:
df.loc[df['device_id'] == 2412011]

Unnamed: 0,building_id,customerID,address,device_type,device_id,sensors,t_sensor_count,sub_components,missing_WW_sensors,gfid,fws_ids


In [31]:
df.count()

building_id           1125
customerID            1125
address               1125
device_type           1125
device_id             1125
sensors               1125
t_sensor_count        1125
sub_components        1125
missing_WW_sensors     105
gfid                  1125
fws_ids               1125
dtype: int64

In [32]:
#df.loc[df['customerID'] != 35]

# Find FWS

In [33]:
def find_fws_ids(sub_components_dict):
    """
    Searches a sub_components dictionary for all FRESH_WATER_STATIONs.
    
    Returns:
        A list of all found component IDs. Returns an empty list if none are found.
    """
    # Return an empty list if the input isn't a dictionary
    if not isinstance(sub_components_dict, dict):
        return []
    
    found_ids = [] # Initialize an empty list to store IDs
    
    # Iterate through all component strings
    for component_string in sub_components_dict.values():
        if 'sub_type : FRESH_WATER_STATION' in component_string:
            try:
                parts = component_string.split()
                id_index = parts.index('id')
                component_id = int(parts[id_index + 2])
                # Add the found ID to the list instead of returning
                found_ids.append(component_id)
            except (ValueError, IndexError):
                # Malformed string, skip to the next one
                continue
                
    # After checking all components, return the list of found IDs
    return found_ids

In [34]:
# Apply the function to filter the DataFrame
df['fws_ids'] = df['sub_components'].apply(find_fws_ids)

In [35]:
df.loc[df['building_id'] == bid, 'fws_ids'].item()

[991]

In [36]:
# Show the building_ids
buildings_with_fws = df.loc[df['fws_ids'].str.len() > 0, 'building_id'].tolist()
buildings_with_fws.sort()
print(len(buildings_with_fws),buildings_with_fws)

56 [370, 392, 420, 434, 442, 469, 472, 473, 474, 475, 476, 477, 525, 532, 538, 545, 562, 566, 626, 654, 700, 701, 781, 782, 784, 785, 792, 793, 797, 798, 799, 800, 801, 1364, 1401, 1404, 1405, 1457, 1461, 1474, 1475, 1476, 1477, 1478, 1481, 1569, 1605, 1606, 2250, 2382, 2383, 2431, 2453, 2463, 2470, 2483]


In [37]:
buildings_with_fws=df.loc[df['fws_ids'].str.len() > 0]
buildings_with_fws.head()

Unnamed: 0,building_id,customerID,address,device_type,device_id,sensors,t_sensor_count,sub_components,missing_WW_sensors,gfid,fws_ids
23,781,66,Jahnstraße 14,RUT956,2420017,{'None(1)': 'id : 28260 gfid : None sub_system...,10,{'global(1)': 'id : 2282 category : NOT_SPECIF...,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",GFP-13262,[2286]
24,782,66,Jahnstraße 30,RUT956,2420014,{'t_b(n)_flow(1)': 'id : 18129 gfid : 71 sub_s...,11,{'global(1)': 'id : 2288 category : NOT_SPECIF...,,GFP-13285,"[2292, 2293]"
26,784,66,Jahnstraße 42,RUT956,2402007,{'None(1)': 'id : 18633 gfid : None sub_system...,11,{'global(1)': 'id : 2300 category : NOT_SPECIF...,,GFP-13331,[2304]
27,785,66,Jahnstraße 46,RUT956,2403013,{'t_hw(n)_pwh(1)': 'id : 18118 gfid : 119 sub_...,11,{'global(1)': 'id : 2306 category : NOT_SPECIF...,,GFP-13354,"[2310, 2311]"
34,420,32,Rendsburger Str. 61,ECR_LW300,23321007,{'t_hp(n)_secflow(1)': 'id : 7728 gfid : 369 s...,10,{'global(1)': 'id : 988 category : NOT_SPECIFI...,,GFP-390,[991]


# get rid of FWS and missing_WW_sensors=Nan

In [38]:
buildings_without_fws = df.loc[df['fws_ids'].str.len() == 0]
buildings_without_fws=buildings_without_fws.dropna(subset=['missing_WW_sensors'])

In [39]:
list_buildings_without_fws = buildings_without_fws['building_id'].tolist()
list_buildings_without_fws.sort()
print(len(list_buildings_without_fws),list_buildings_without_fws)

83 [326, 328, 333, 341, 342, 343, 347, 348, 349, 352, 372, 388, 390, 414, 421, 422, 423, 437, 441, 443, 446, 463, 494, 495, 508, 523, 551, 561, 568, 585, 660, 673, 689, 690, 705, 772, 783, 791, 1343, 1360, 1365, 1406, 1412, 1431, 1433, 1447, 1451, 1458, 1459, 1479, 1489, 1496, 1541, 1601, 1658, 1788, 1789, 1794, 1819, 1838, 1844, 1858, 1865, 1892, 1935, 1939, 1940, 1948, 2001, 2002, 2166, 2167, 2193, 2249, 2261, 2295, 2296, 2299, 2316, 2338, 2384, 2400, 2461]


In [40]:
print(list_buildings_without_fws[2:5].sort())

None


In [41]:
buildings_without_fws.tail()

Unnamed: 0,building_id,customerID,address,device_type,device_id,sensors,t_sensor_count,sub_components,missing_WW_sensors,gfid,fws_ids
1991,2338,174,Horstmarer Landweg 74,RUT956,2507049,{'e_g_bldg_y(1)': 'id : 36067 gfid : 460 sub_s...,5,{'global(1)': 'id : 220296 category : NOT_SPEC...,"{'missing_gfids': [118, 119, 124], 'ww_subsyst...",OPS-21587,[]
2017,2384,177,Tivolistr. 106,RUT956,2507035,{'e_g_bldg_y(1)': 'id : 36493 gfid : 460 sub_s...,9,{'global(1)': 'id : 220404 category : NOT_SPEC...,"{'missing_gfids': [118], 'ww_subsystem_ids': [...",OPS-21987,[]
2042,2400,178,Oscar-Kjellberg-Straße 3,RUT956,2505022,{'e_g_bldg_y(1)': 'id : 36519 gfid : 460 sub_s...,10,{'global(1)': 'id : 220711 category : NOT_SPEC...,"{'missing_gfids': [123], 'ww_subsystem_ids': [...",OPS-22212,[]
2171,2461,187,Reichelsweiherstr. 9,RUT956,2505050,{'e_g_bldg_y(1)': 'id : 37616 gfid : 460 sub_s...,6,{'global(1)': 'id : 220835 category : NOT_SPEC...,"{'missing_gfids': [123, 124], 'ww_subsystem_id...",OPS-23006,[]
2201,2299,167,Garmischer Straße 201,RUT956,2508003,{'t_dh_primflow(1)': 'id : 37026 gfid : 88 sub...,8,{'global(1)': 'id : 219997 category : NOT_SPEC...,"{'missing_gfids': [118, 119, 123, 124], 'ww_su...",OPS-175,[]


# Missing sensor analysis

In [42]:
df_missing_ww_s=df.dropna(subset=['missing_WW_sensors'])

In [43]:
df_missing_ww_s[['building_id','missing_WW_sensors','fws_ids']].head()

Unnamed: 0,building_id,missing_WW_sensors,fws_ids
14,772,"{'missing_gfids': [123], 'ww_subsystem_ids': [...",[]
23,781,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[2286]
25,783,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[]
35,421,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[]
36,441,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[]


In [83]:
# Define the function to check conditions
def check_conditions(row):
    """
    Checks if a row meets all the specified conditions.

    Conditions:
    - 124 is in 'missing_WW_sensors' -> 'missing_gfids' list
    - 118 and 119 are NOT in 'missing_WW_sensors' -> 'missing_gfids' list

    Returns:
        int: 1 if all conditions are met, 0 otherwise.
    """
    try:
        missing_sensors_dict = ast.literal_eval(row['missing_WW_sensors'])
        missing_gfids = missing_sensors_dict.get('missing_gfids', [])

        cond1 = 124 in missing_gfids
        cond2 = (118 not in missing_gfids) and (119 not in missing_gfids)
        
        if cond1 and cond2 :
            return 1
        else:
            return 0
    except (ValueError, SyntaxError):
        return 0

In [84]:
df['man_rec_sys'] = df.apply(check_conditions, axis=1)

In [95]:
df_man_rec=df.loc[df['man_rec_sys']== 1]
df_man_rec = df_man_rec.loc[df['fws_ids'].str.len() == 0]

In [96]:
df_man_rec[['building_id','missing_WW_sensors','fws_ids']].head()

Unnamed: 0,building_id,missing_WW_sensors,fws_ids
25,783,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[]
35,421,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[]
36,441,"{'missing_gfids': [124], 'ww_subsystem_ids': [...",[]
38,443,"{'missing_gfids': [123, 124], 'ww_subsystem_id...",[]
40,495,"{'missing_gfids': [123, 124], 'ww_subsystem_id...",[]


In [100]:
df_man_rec.columns

Index(['building_id', 'customerID', 'address', 'device_type', 'device_id',
       'sensors', 't_sensor_count', 'sub_components', 'missing_WW_sensors',
       'gfid', 'fws_ids', 'man_rec_sys'],
      dtype='object')

In [103]:
df_man_rec[['building_id','customerID', 'address','missing_WW_sensors','sensors','gfid']].to_csv('man_rec.csv', index=False)

# Export

In [34]:
if True:
    filename1='Building_infos_'+extraced_date+'.csv'
    filename2='Buildings_with_fws_'+extraced_date+'.csv'
    filename3='Buildings_nofws_missing_ww_sensors_'+extraced_date+'.csv'
    filepath1 = os.path.join('OUTs', filename1)
    filepath2 = os.path.join('OUTs', filename2)
    filepath3 = os.path.join('OUTs', filename3)
    df.to_csv(filepath1, index=False)
    buildings_with_fws.to_csv(filepath2, index=False)
    buildings_without_fws.to_csv(filepath3, index=False)
