# Analysis of Layout ID

## Relevant documents

- [Python Client Repo](https://github.com/Green-Fusion/energy-management-backend/tree/main/python_client)
- [Klemmenbelegung](https://docs.google.com/spreadsheets/d/1nkdkx2rI6nVKgoKBgkCUtfwEwuv8kptrRUXcXtfv0NM/edit?gid=247168398#gid=247168398)
- [Hypothesis for Klemmenbelegung](https://docs.google.com/spreadsheets/d/1TSTxMCgEvuoayzOfx1MUqlV0tiqsVTBRN8aldlnFXxA/edit?gid=0#gid=0)

# Import data

In [1]:
#imports
import os
import pandas as pd
import json
import ast
import re
from pprint import pprint

In [2]:
# Build the DataFrame from the data extracted from the database
filename='Buildings_infos_2025-07-30.csv'
# The file name ends with '<YYYY-MM-DD>.csv'; [-14:-4] slices out that 10-character date
extraced_date=filename[-14:-4]
filepath = os.path.join('INs', filename)
df = pd.read_csv(filepath)
# Inspect the columns
df.columns

Index(['building_id', 'customerID', 'customer_name', 'address', 'postal_code',
       'city', 'device_type', 'device_id', 'LayoutID', 'sub_components',
       'modular_system', 't_sensor_count', 'sensors', 'missing_WW_sensors',
       'gfid'],
      dtype='object')

## Filter out unnecessary info

In [3]:
# Filter out unnecessary columns
df=df[['building_id','customerID','address','device_type','device_id','t_sensor_count', 'sub_components','LayoutID','modular_system']]

In [4]:
#df.head()

In [5]:
df.count()

building_id       2209
customerID        2209
address           2209
device_type       1120
device_id         1120
t_sensor_count    2209
sub_components    2209
LayoutID          1160
modular_system    2209
dtype: int64

## Drop rows with NaN device_type or NaN LayoutID (API buildings?)

In [6]:
# Distinct values of the device_type column (a NaN entry shows up as nan in the set)
type_of_devices = set(df['device_type'])
type_of_devices

{'ECR_LW300', 'RUT956', 'RevPiConnectSE', 'RevPiCore32SE', 'WAGOPFC200', nan}

In [7]:
df = df.dropna(subset=['device_type'])
df = df.dropna(subset=['LayoutID'])

In [8]:
df.count()

building_id       1037
customerID        1037
address           1037
device_type       1037
device_id         1037
t_sensor_count    1037
sub_components    1037
LayoutID          1037
modular_system    1037
dtype: int64

# Explore data 

## Explore modular_system

In [9]:
if True: #test single buildings
    building_id = 2165#2389#1801#2317#1315#1593#1809 #1801 #
    mod_sys = df.loc[df['building_id'] == building_id, 'modular_system'].iloc[0]
    modular_system = ast.literal_eval(mod_sys)
    #print(df.loc[df['building_id'] == building_id, 'LayoutID_Tops'].iloc[0])
    pprint(modular_system)

{'buildingID': 2165,
 'id': 'modular-system----Stadtwerke--Essen--AG-----Am--Brauhaus--15--45359--Essen',
 'layoutID': 'gas:1--heat-exchanger:1--heating-circuit:1--heating-circuit:2--heating-circuit:3--heating-circuit:4--heating-circuit:5--heating-circuit:6--heating-circuit:7',
 'name': 'modular-system-entry',
 'published': True,
 'publishedAt': '2025-03-17T09:15:07.723Z',
 'subSystems': [{'connections': [{'status': 'hot',
                                  'to': {'direction': 'right',
                                         'id': 'heating-circuit:7',
                                         'subSystem': {'hydraulicLocationIndex': 7,
                                                       'name': 'heating-circuit',
                                                       'side': 'bottom'}}},
                                 {'from': {'direction': 'right',
                                           'id': 'heating-circuit:7',
                                           'subSystem': {'hydraul

## Get sub-component variations from sub_components

In [10]:
# Initialize a set to collect unique "key: value" variations
# NOTE(review): the name unique_subcomponents is reassigned by a later cell
# (result of extract_unique_subcomponents), so this set is only valid until then.
unique_subcomponents = set()

# Parse each sub_components string into a dict and process every entry
for comp_dict in df['sub_components'].apply(ast.literal_eval):
    for k, v in comp_dict.items():
        # Strip numeric index markers such as (1), (2) from the key;
        # a literal "(n)" is not matched by the pattern and stays in the key
        key_cleaned = re.sub(r'\(\d+\)', '', k)
        variation = f"{key_cleaned}: {v}"
        unique_subcomponents.add(variation)

# Display results
#for variation in sorted(unique_subcomponents):
#    print(variation)

## Get variations of sub components from LayoutID

In [11]:
def extract_words(layout_str):
    """Split a layout ID into its tokens.

    The string is cut at every '--' and each resulting piece is cut again at
    every ':', e.g. 'gas:1--heating-circuit:2' -> ['gas', '1', 'heating-circuit', '2'].

    Parameters
    ----------
    layout_str : str
        Layout ID string.

    Returns
    -------
    list of str
        All tokens in their original order.
    """
    return [
        token
        for segment in layout_str.split('--')
        for token in segment.split(':')
    ]

# Tokenize every non-null LayoutID and flatten all tokens into one Series
all_words = df['LayoutID'].dropna().apply(extract_words).explode()

# Get unique values
unique_words = all_words.unique().tolist()

# Sort the tokens in place
unique_words.sort()

# Drop pure-number tokens and every token containing 'pipe' or 'valve'
cleaned_words = [
    word for word in unique_words
    if not word.isdigit() and
    'pipe' not in word.lower() and
    'valve' not in word.lower()
]

In [12]:
cleaned_words

['1-heating-circuit',
 'buffer-tank',
 'chp',
 'district-heating',
 'gas',
 'global-separation-circuit',
 'heat-exchanger',
 'heat-pump',
 'heating-circuit',
 'hydraulic-separator',
 'local-heating-station',
 'solar-thermal',
 'warm-water']

## Get Subcomponent variations from modular_system

In [13]:
def extract_unique_subcomponents(df, column='modular_system'):
    """Collect the distinct sub-component names found in the modular-system column.

    Every entry of ``df[column]`` is parsed with ``ast.literal_eval``. For each
    item of its ``subSystems`` list, the keys of ``parameters -> subComponent``
    are read, a trailing ``--<number>`` is stripped, and the key is kept when it
    starts with one of the accepted prefixes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the modular-system strings.
    column : str, default 'modular_system'
        Name of the column to scan.

    Returns
    -------
    list of str
        Sorted, de-duplicated sub-component names.

    Notes
    -----
    Entries that cannot be parsed are reported with ``print`` and skipped.
    """
    accepted_prefixes = (
        "district-heating--",
        "local-heating-station--",
        "heating-circuit--",
        "warm-water",
    )
    found = set()

    for raw_entry in df[column]:
        try:
            # Turn the string representation of the dict into a real dict
            parsed = ast.literal_eval(raw_entry)
        except Exception as e:
            print(f"Skipping invalid entry due to error: {e}")
            continue

        for sub_system in parsed.get("subSystems", []):
            components = sub_system.get("parameters", {}).get("subComponent", {})
            if not isinstance(components, dict):
                continue

            # Drop the trailing --<number>, then keep only the accepted families
            stripped_names = (re.sub(r'--\d+$', '', name) for name in components)
            found.update(
                name for name in stripped_names
                if name.startswith(accepted_prefixes)
            )

    return sorted(found)

In [14]:
unique_subcomponents = extract_unique_subcomponents(df)
for compo in unique_subcomponents:
    print(compo)

district-heating--heat-exchanger
district-heating--pump-sec
district-heating--sec-flow-temp
district-heating--valve
heating-circuit--placeholder-cold-connection-node
heating-circuit--pump
heating-circuit--secondary-flow-temp
heating-circuit--valve
local-heating-station--heat-exchanger
local-heating-station--pump-prim
local-heating-station--pump-sec
warm-water--flow-dead-pump


# Group by LayoutID

In [15]:
unique_count = df['LayoutID'].nunique()
print(f"Number of unique LayoutID values: {unique_count}")

Number of unique LayoutID values: 219


In [16]:
# Group by LayoutID: number of rows, list of building IDs and set of sensor counts per layout
grouped = df.groupby('LayoutID').agg(
    Occurrence=('LayoutID', 'count'),
    buildingIDs=('building_id', list),
    sensor_count=('t_sensor_count', set)
).reset_index()

# Order layouts by descending occurrence; KBn is the resulting rank and starts at 1 (index + 1)
grouped = grouped.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)
grouped['KBn']=grouped.index+1
#grouped['KBn'] = grouped['KBn'].fillna(-1).astype(int)

In [17]:
# Show result
grouped.head(20)

Unnamed: 0,LayoutID,Occurrence,buildingIDs,sensor_count,KBn
0,gas:1--heating-circuit:1--warm-water:1,104,"[768, 769, 770, 778, 787, 1924, 1926, 1927, 46...",{0},1
1,district-heating:1--heating-circuit:1--warm-wa...,75,"[774, 1909, 1910, 1911, 1912, 1913, 1314, 1315...",{0},2
2,gas:1--heating-circuit:1,70,"[758, 759, 499, 1525, 1649, 1654, 1657, 1791, ...",{0},3
3,gas:1--heat-exchanger:1--heating-circuit:1--wa...,44,"[1925, 1931, 1509, 1526, 1527, 1531, 1532, 153...",{0},4
4,local-heating-station:1--heating-circuit:1--wa...,42,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{0},5
5,gas:1--heat-exchanger:1--heating-circuit:1,39,"[1524, 1795, 1560, 1638, 1639, 2455, 604, 617,...",{0},6
6,gas:1--gas:2--heat-exchanger:1--heating-circui...,26,"[510, 1528, 1529, 1530, 1539, 1773, 1777, 1778...",{0},7
7,district-heating:1--heating-circuit:1--heating...,22,"[600, 1488, 1724, 796, 1772, 1385, 1386, 1389,...",{0},8
8,gas:1--heating-circuit:1--heating-circuit:2--w...,22,"[1511, 1515, 1516, 1517, 1518, 1537, 2219, 179...",{0},9
9,district-heating:1--heating-circuit:1,20,"[762, 766, 777, 712, 450, 389, 391, 1725, 1726...",{0},10


In [18]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped.head(10)['Occurrence'].sum()
top_20_sum = grouped.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1037
top 10 LayoutIDs count: 464
top 20 LayoutIDs count: 592
Coverage top 10 LayoutIDs:  44.7 %
Coverage top 20 LayoutIDs:  57.1 %


# New Grouping

## Add KBn

In [19]:
# add KBns
df = df.merge(grouped[['LayoutID', 'KBn']], on='LayoutID', how='left')
cols = ['KBn'] + [col for col in df.columns if col != 'KBn']
df = df[cols]
df.head()

Unnamed: 0,KBn,building_id,customerID,address,device_type,device_id,t_sensor_count,sub_components,LayoutID,modular_system
0,40,756,66,Wolzogenstr.28,RUT956,2403001,0,{'global(1)': 'id : 2204 category : NOT_SPECIF...,gas:1--gas:2--heating-circuit:1--heating-circu...,"{'name': 'modular-system-entry', 'id': 'modula..."
1,31,757,66,Hagelberger Str. 26,RUT956,2406009,0,{'global(1)': 'id : 2209 category : NOT_SPECIF...,gas:1--gas:2--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
2,3,758,66,Hochstr. 8,RUT956,2420012,0,{'global(1)': 'id : 2213 category : NOT_SPECIF...,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
3,3,759,66,Planufer 82a,RUT956,2403011,0,{'global(1)': 'id : 2216 category : NOT_SPECIF...,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
4,36,761,66,Gabainstr. 13,RUT956,2440001,0,{'global(1)': 'id : 2219 category : NOT_SPECIF...,gas:1--gas:2--global-separation-circuit:1--hea...,"{'name': 'modular-system-entry', 'id': 'modula..."


## Fix layout WW -> FWS

### Get FWS systems 

In [20]:
# Ensure sub_components is a dictionary
def has_fresh_water_station(comp_str):
    """Tell whether any sub-component is declared as a fresh water station.

    Parameters
    ----------
    comp_str : str or dict
        Either the string representation of the sub-components dict (as stored
        in the ``sub_components`` column) or an already parsed dict.

    Returns
    -------
    bool
        True when at least one string value contains
        ``"sub_type : FRESH_WATER_STATION"``. False for unparsable input
        (including NaN), for input that is not a dict, and when no value matches.
    """
    if isinstance(comp_str, dict):
        comp_dict = comp_str
    else:
        try:
            comp_dict = ast.literal_eval(comp_str)
        # Only parsing failures are treated as "no station"; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return False

    if not isinstance(comp_dict, dict):
        return False

    # Non-string values are skipped so one odd entry cannot hide a real station
    return any(
        isinstance(val, str) and "sub_type : FRESH_WATER_STATION" in val
        for val in comp_dict.values()
    )

# Apply the function to filter the DataFrame
df_fresh = df[df['sub_components'].apply(has_fresh_water_station)]

# Show the building_ids
list_fws=df_fresh['building_id'].tolist()
print(len(list_fws),list_fws)

51 [781, 782, 784, 785, 420, 442, 538, 562, 566, 434, 545, 1569, 792, 793, 797, 798, 799, 800, 801, 2453, 1474, 1475, 1476, 1477, 1478, 1481, 654, 1401, 1405, 525, 532, 626, 700, 701, 370, 1605, 1606, 473, 474, 475, 1364, 1457, 1461, 2250, 469, 472, 2431, 2382, 2383, 477, 2463]


In [21]:
print(df.loc[df['building_id'] == 370, 'sub_components'].values[0])

{'global(1)': 'id : 827 category : NOT_SPECIFIED type : GLOBAL sub_type : NOT_SPECIFIED', 'hc(n)(1)': 'id : 828 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED', 'hw(n)(1)': 'id : 829 category : CONSUMER type : WARM_WATER sub_type : FRESH_WATER_STATION', 'hw(n)(2)': 'id : 830 category : CONSUMER type : WARM_WATER sub_type : FRESH_WATER_STATION'}


### Fix layout 

In [22]:
def fix_func(row):
    """Rename warm-water entries of a layout to fresh-water-station entries.

    Parameters
    ----------
    row : mapping
        Must provide ``'LayoutID'`` and ``'sub_components'`` (the latter as the
        string representation of a dict).

    Returns
    -------
    The layout unchanged when it is NaN, when ``sub_components`` cannot be
    parsed, or when no sub-component value contains
    ``"sub_type : FRESH_WATER_STATION"``. Otherwise the layout with every
    ``--``-separated part that starts with ``warm-water:<n>`` replaced by
    ``fresh-water-station:<n>``.

    NOTE(review): as soon as one fresh water station is present, all warm-water
    parts are renamed; confirm this is intended for buildings mixing both kinds.
    """
    layout = row['LayoutID']
    raw_components = row['sub_components']

    if pd.isna(layout):
        return layout

    try:
        components = ast.literal_eval(raw_components)
    except Exception:
        return layout

    # Leave the layout alone unless at least one fresh water station is declared
    station_found = False
    for description in components.values():
        if "sub_type : FRESH_WATER_STATION" in description:
            station_found = True
            break
    if not station_found:
        return layout

    def _rename(part):
        # Swap warm-water:<n> for fresh-water-station:<n>, keep everything else
        hit = re.match(r'warm-water:(\d+)', part)
        return f"fresh-water-station:{hit.group(1)}" if hit else part

    return "--".join([_rename(part) for part in layout.split("--")])

In [23]:
df['LayoutID_Tops'] = df.apply(fix_func, axis=1)

In [24]:
df_fws = df[df['building_id'].isin(list_fws)][['building_id','LayoutID_Tops','sub_components']].reset_index(drop=True)

In [25]:
for idx, row in df_fws[['building_id', 'LayoutID_Tops', 'sub_components']].iterrows():
    print(row['building_id'], row['LayoutID_Tops'])

781 gas:1--solar-thermal:1--buffer-tank:1--heating-circuit:1--fresh-water-station:1
782 gas:1--hydraulic-separator:1--heating-circuit:1--solar-thermal:1--buffer-tank:1--fresh-water-station:1
784 gas:1--hydraulic-separator:1--heating-circuit:1--solar-thermal:1--buffer-tank:1--fresh-water-station:1
785 gas:1--hydraulic-separator:1--heating-circuit:1--solar-thermal:1--buffer-tank:1--fresh-water-station:1
420 heat-pump:1--down-right-pipes:1--buffer-tank:1--heating-circuit:1--down-left-pipes:1--fresh-water-station:1
442 gas:1--empty-pipes:1--heat-exchanger:1--down-right-pipes:1--empty-pipes:2--buffer-tank:1--chp:1--heating-circuit:1--fresh-water-station:1
538 gas:1--solar-thermal:1--buffer-tank:1--heating-circuit:1--heating-circuit:2--fresh-water-station:1
562 gas:1--solar-thermal:1--buffer-tank:1--heating-circuit:1--heating-circuit:2--fresh-water-station:1
566 gas:1--gas:2--fresh-water-station:1--buffer-tank:1--heating-circuit:1
434 gas:1--gas:2--gas:3--heat-exchanger:1--heating-circuit:1-

## Get "buffer-tank" only connected to FWS

In [26]:
# Keep the FWS buildings whose layout has no solar-thermal / buffer-tank pair
# directly next to each other (either order).
# NOTE(review): df_filtered is reassigned by the following cell before it is
# read, so the result of this cell is discarded.
pattern = r'(?:solar-thermal:\d+--buffer-tank:\d+|buffer-tank:\d+--solar-thermal:\d+)'
df_filtered = df_fws[~df_fws['LayoutID_Tops'].str.contains(pattern, regex=True)]

In [27]:
df_filtered = df[
    df['LayoutID_Tops'].str.contains('buffer-tank') &
    df['LayoutID_Tops'].str.contains('fresh-water-station') &
    ~df['LayoutID_Tops'].str.contains('solar-thermal')
]

In [28]:
for idx, row in df_filtered[['building_id', 'LayoutID_Tops', 'sub_components']].iterrows():
    print(row['building_id'], row['LayoutID_Tops'])

420 heat-pump:1--down-right-pipes:1--buffer-tank:1--heating-circuit:1--down-left-pipes:1--fresh-water-station:1
442 gas:1--empty-pipes:1--heat-exchanger:1--down-right-pipes:1--empty-pipes:2--buffer-tank:1--chp:1--heating-circuit:1--fresh-water-station:1
566 gas:1--gas:2--fresh-water-station:1--buffer-tank:1--heating-circuit:1
2453 heat-pump:1--buffer-tank:1--right-down-pipes:1--gas:1--heating-circuit:1--heating-circuit:2--fresh-water-station:1
1478 heat-pump:1--gas:1--down-right-pipes:1--buffer-tank:1--heating-circuit:1--down-left-pipes:1--fresh-water-station:1
654 heat-pump:1--right-down-pipes:1--heat-pump:2--right-down-pipes:2--heat-pump:3--heat-exchanger:1--buffer-tank:1--heating-circuit:1--fresh-water-station:1
370 district-heating:1--heating-circuit:1--buffer-tank:1--fresh-water-station:1
473 heat-pump:1--gas:1--down-right-pipes:1--buffer-tank:1--heating-circuit:1--down-left-pipes:1--fresh-water-station:1
474 heat-pump:1--gas:1--down-right-pipes:1--buffer-tank:1--heating-circuit:1

### verify

In [29]:
building_id= 765#1474 #765 #has no FWS

In [30]:
df.loc[df['building_id'] == building_id, 'sub_components'].values[0]

"{'global(1)': 'id : 2224 category : NOT_SPECIFIED type : GLOBAL sub_type : NOT_SPECIFIED', 'dh(1)': 'id : 2225 category : PRODUCER type : DISTRICT_HEATING sub_type : NOT_SPECIFIED', 'hc(n)(1)': 'id : 2226 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED', 'hw(n)(1)': 'id : 2227 category : CONSUMER type : WARM_WATER sub_type : STORAGE_TANK', 'hw(n)(2)': 'id : 2228 category : CONSUMER type : WARM_WATER sub_type : STORAGE_TANK'}"

In [31]:
df.loc[df['building_id'] == building_id, 'LayoutID_Tops'].values[0]

'district-heating:1--down-right-pipes:1--heat-exchanger:1--heating-circuit:1--down-left-pipes:1--warm-water:1--down-left-pipes:2--warm-water:2'

## Combine HX with warmwater

In [32]:
def combine_HX_WW(layout):
    """Merge a heat exchanger with the warm-water part that directly follows it.

    Parameters
    ----------
    layout : str or NaN
        Layout ID made of ``--``-separated parts.

    Returns
    -------
    NaN input is returned as is. Otherwise the layout in which every pair
    ``heat-exchanger:<m>`` immediately followed by ``warm-water:<n>`` is replaced
    by the single part ``warm-water_external:<n>``. A heat exchanger that is not
    followed by warm water is kept.
    """
    if pd.isna(layout):
        return layout

    segments = layout.split('--')
    merged = []
    pos = 0
    while pos < len(segments):
        current = segments[pos]
        warm_water = None
        if pos + 1 < len(segments):
            warm_water = re.match(r'warm-water:(\d+)', segments[pos + 1])

        if warm_water and re.match(r'heat-exchanger:\d+', current):
            # Consume both parts and emit the combined one
            merged.append(f"warm-water_external:{warm_water.group(1)}")
            pos += 2
        else:
            merged.append(current)
            pos += 1

    return '--'.join(merged)

In [33]:
#appy combine_HX
df['LayoutID_Tops2'] = df['LayoutID_Tops'].apply(combine_HX_WW)
df['LayoutID_Tops'] = df['LayoutID_Tops'].apply(combine_HX_WW)


### verify

In [34]:
# Find rows where combine_HX_WW actually modified the layout.
# The previous cell stored the combined result in BOTH 'LayoutID_Tops' and
# 'LayoutID_Tops2', so comparing those two columns can never yield a difference.
# Rebuild the layout as it was before combining (fix_func output) and compare
# the combined column against that instead.
layout_before_combine = df.apply(fix_func, axis=1)
df_changed_by_combine_HX_WW = df[layout_before_combine != df['LayoutID_Tops2']].reset_index(drop=True)
print(len(list(df_changed_by_combine_HX_WW['building_id'])),list(df_changed_by_combine_HX_WW['building_id']))
#df_changed_by_combine_HX_WW[['building_id','LayoutID','LayoutID_Tops']]

0 []


In [35]:
df_changed_by_combine_HX_WW.columns

Index(['KBn', 'building_id', 'customerID', 'address', 'device_type',
       'device_id', 't_sensor_count', 'sub_components', 'LayoutID',
       'modular_system', 'LayoutID_Tops', 'LayoutID_Tops2'],
      dtype='object')

In [36]:
#### export
if True:
    # Keep only the columns that go into the report
    df_changed_by_combine_HX_WW=df_changed_by_combine_HX_WW[['building_id', 'customerID', 'address',
       'device_id', 'LayoutID','LayoutID_Tops2']]
    filename='warm-water_external_'+extraced_date+'.csv'
    # to_csv does not create missing folders, so make sure the output directory exists
    os.makedirs('OUTs', exist_ok=True)
    filepath = os.path.join('OUTs', filename)
    df_changed_by_combine_HX_WW.to_csv(filepath, index=False)
   

In [37]:
df=df.drop('LayoutID_Tops2',axis=1)

In [38]:
df.columns

Index(['KBn', 'building_id', 'customerID', 'address', 'device_type',
       'device_id', 't_sensor_count', 'sub_components', 'LayoutID',
       'modular_system', 'LayoutID_Tops'],
      dtype='object')

### Remove HX and Separators

In [39]:
def remove_HXnsep(layout):
    """Drop heat exchangers and hydraulic separation parts from a layout.

    Parameters
    ----------
    layout : str or NaN
        Layout ID made of ``--``-separated parts.

    Returns
    -------
    NaN input is returned as is. Otherwise the layout without any part that
    starts with ``global-separation-circuit:<n>``, ``hydraulic-separator:<n>``
    or ``heat-exchanger:<n>``; the remaining parts keep their order.
    """
    if pd.isna(layout):
        return layout

    removable = re.compile(
        r'(?:global-separation-circuit|hydraulic-separator|heat-exchanger):\d+'
    )
    kept = [segment for segment in layout.split('--') if removable.match(segment) is None]
    return '--'.join(kept)

In [40]:
df['LayoutID_Tops'] = df['LayoutID_Tops'].apply(remove_HXnsep)

### verify

In [41]:
# Find rows whose LayoutID_Tops differs from the original LayoutID.
# NOTE: at this point LayoutID_Tops already carries the fix_func, combine_HX_WW and
# remove_HXnsep rewrites, so this selection is not limited to rows changed by
# remove_HXnsep alone, despite the variable name.
df_changed_by_remove_HXnsep = df[df['LayoutID'] != df['LayoutID_Tops']].reset_index(drop=True)
print(len(list(df_changed_by_remove_HXnsep['building_id'])),list(df_changed_by_remove_HXnsep['building_id']))
df_changed_by_remove_HXnsep[['building_id','LayoutID','LayoutID_Tops']]

433 [761, 765, 773, 780, 781, 782, 784, 785, 788, 304, 371, 418, 419, 420, 421, 442, 495, 1925, 1928, 1929, 1931, 478, 505, 507, 510, 1006, 2149, 348, 1509, 1520, 1523, 1524, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1539, 1541, 1542, 1543, 1545, 1547, 1548, 1549, 1773, 1775, 1777, 1778, 1779, 1780, 1781, 1795, 1797, 2167, 2168, 538, 562, 566, 568, 571, 574, 575, 738, 739, 598, 601, 602, 603, 437, 326, 333, 427, 430, 431, 434, 452, 453, 454, 455, 456, 457, 545, 548, 549, 550, 552, 554, 1484, 1485, 1487, 1489, 1493, 1494, 1497, 1501, 1503, 2468, 1560, 1561, 1562, 1563, 1564, 1566, 1567, 1568, 1569, 5, 387, 388, 390, 569, 641, 1638, 1639, 438, 414, 440, 570, 792, 793, 794, 797, 798, 799, 800, 801, 2453, 2454, 2455, 1336, 1506, 1381, 1387, 1393, 1612, 1613, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 604, 613, 616, 617, 621, 623, 624, 341, 342, 343, 2244, 1474, 1475, 1476, 1477, 1478, 1481, 1753, 643, 644, 645, 646, 648, 649, 650, 651, 652, 653, 654, 655, 656

Unnamed: 0,building_id,LayoutID,LayoutID_Tops
0,761,gas:1--gas:2--global-separation-circuit:1--hea...,gas:1--gas:2--heating-circuit:1--heating-circu...
1,765,district-heating:1--down-right-pipes:1--heat-e...,district-heating:1--down-right-pipes:1--heatin...
2,773,gas:1--gas:2--global-separation-circuit:1--hea...,gas:1--gas:2--heating-circuit:1
3,780,gas:1--gas:2--gas:3--gas:4--gas:5--gas:6--glob...,gas:1--gas:2--gas:3--gas:4--gas:5--gas:6--heat...
4,781,gas:1--solar-thermal:1--buffer-tank:1--heating...,gas:1--solar-thermal:1--buffer-tank:1--heating...
...,...,...,...
428,2438,gas:1--gas:2--heat-exchanger:1--heating-circuit:1,gas:1--gas:2--heating-circuit:1
429,2440,gas:1--heat-exchanger:1--heating-circuit:1--he...,gas:1--heating-circuit:1--heating-circuit:2
430,2441,gas:1--heat-exchanger:1--heating-circuit:1,gas:1--heating-circuit:1
431,2299,district-heating:1--heating-circuit:1--heating...,district-heating:1--heating-circuit:1--heating...


## get new sensor count

In [42]:
# Number of sensors counted per subsystem type; types not listed here count as 0
sensor_count_subsystem = {
    'buffer-tank': 1,
    'chp': 2,
    'district-heating': 2,
    'gas': 2,
    'heat-pump': 4,
    'heating-circuit': 2,
    'local-heating-station': 2,
    'solar-thermal': 2,
    'fresh-water-station': 3,
    'warm-water': 4,
}

def calculate_sensor_count(layout):
    """Sum the per-subsystem sensor counts of a layout.

    Each ``--``-separated part is reduced to its subsystem name: the text before
    the first ':' and, within that, before the first '_' (so a suffixed name such
    as ``warm-water_external`` counts like ``warm-water``). The name is looked up
    in ``sensor_count_subsystem``; unknown names contribute 0.

    Parameters
    ----------
    layout : str or NaN
        Layout ID.

    Returns
    -------
    int
        Total sensor count, 0 for NaN input.
    """
    if pd.isna(layout):
        return 0

    return sum(
        sensor_count_subsystem.get(segment.split(':')[0].split('_')[0], 0)
        for segment in layout.split('--')
    )

In [43]:
# Apply the function
df['new_sensor_count'] = df['LayoutID_Tops'].apply(calculate_sensor_count)
df['count_diff'] = df['t_sensor_count']-df['new_sensor_count']

## Sensor count diff for modified LayoutID

In [44]:
# mismatch_df
df_modified = df[df['LayoutID'] != df['LayoutID_Tops']].reset_index(drop=True)
mismatch_df = df_modified.loc[df_modified['t_sensor_count'] < df_modified['new_sensor_count']].reset_index()
mismatch_df=mismatch_df[['building_id', 'KBn', 't_sensor_count', 'new_sensor_count','count_diff']].sort_values(by='KBn', ascending=True)
print(len(list(mismatch_df['building_id'])),list(mismatch_df['building_id']))

433 [1461, 1401, 477, 2431, 2382, 1605, 1531, 1532, 1536, 1781, 1549, 1547, 1534, 1535, 1683, 1702, 1867, 541, 1783, 2006, 1670, 1864, 1952, 1665, 1454, 330, 331, 579, 542, 1527, 1706, 555, 1931, 1925, 1526, 2314, 799, 486, 485, 484, 2297, 471, 646, 466, 666, 1679, 1681, 1509, 1533, 2338, 1662, 515, 1818, 458, 514, 1795, 1708, 1701, 1687, 1694, 2441, 1822, 1823, 707, 1715, 1722, 1674, 1671, 2434, 2433, 1711, 1678, 2455, 1524, 1639, 1638, 623, 1664, 656, 1878, 351, 1825, 337, 617, 604, 1560, 2285, 678, 624, 1773, 1530, 1528, 1539, 1529, 1777, 1778, 2231, 2227, 689, 2166, 2005, 1485, 1747, 1745, 2380, 657, 797, 2426, 728, 2377, 644, 2244, 801, 510, 1723, 2438, 1720, 1717, 1716, 1506, 696, 668, 1826, 1666, 2391, 1908, 1667, 730, 680, 1692, 1689, 1497, 1503, 520, 1501, 1842, 601, 616, 507, 1797, 1347, 1358, 1357, 1340, 2463, 552, 539, 1838, 1484, 650, 304, 1494, 2378, 1489, 558, 1832, 1845, 480, 481, 483, 482, 487, 488, 568, 1887, 1566, 1452, 489, 1603, 1604, 1833, 2257, 478, 743, 645, 521

## group by LayoutID_Tops

In [45]:
unique_count = df['LayoutID_Tops'].nunique()
print(f"Number of unique LayoutID values: {unique_count}")

Number of unique LayoutID values: 202


In [46]:
from collections import Counter

def most_common(series):
    """Return the most frequent value of a Series.

    Parameters
    ----------
    series : pandas.Series
        Values to count.

    Returns
    -------
    The value with the highest count as reported by
    ``collections.Counter.most_common``, or None when the Series is empty.
    """
    if not series.empty:
        (winner, _occurrences), = Counter(series).most_common(1)
        return winner
    return None

# Group by LayoutID_Tops; besides counts and ID lists, 'most_common_count' holds the
# most frequent t_sensor_count of each group (computed by most_common)
grouped_new_std = df.groupby('LayoutID_Tops').agg(
    Occurrence=('LayoutID_Tops', 'count'),
    buildingIDs=('building_id', list),
    sensor_count_set=('t_sensor_count', set),
    new_sensor_count=('new_sensor_count', set),
    Variations=('LayoutID', set),
    Variations_KBn=('KBn', set),
    most_common_count=('t_sensor_count', most_common)
).reset_index()

# Sort by Occurrence, most frequent layout first
grouped_new_std = grouped_new_std.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)

# Reorder columns
grouped_new_std = grouped_new_std[
    ['LayoutID_Tops', 'Occurrence', 'most_common_count',
     'new_sensor_count',  'Variations_KBn', 'buildingIDs', 'Variations', 'sensor_count_set']
]

# Turn the set of KBn numbers into a sorted, comma-separated string such as "KB1, KB4, KB17"
grouped_new_std['Variations_KBn'] = grouped_new_std['Variations_KBn'].apply(
    lambda s: ', '.join(f"KB{int(x)}" for x in sorted(s))
)

In [47]:
grouped_new_std[['LayoutID_Tops', 'Occurrence', 'most_common_count','new_sensor_count', 'Variations_KBn', 'buildingIDs', 'Variations','sensor_count_set']].head(20)

Unnamed: 0,LayoutID_Tops,Occurrence,most_common_count,new_sensor_count,Variations_KBn,buildingIDs,Variations,sensor_count_set
0,gas:1--heating-circuit:1--warm-water:1,155,0,{8},"KB1, KB4, KB17","[768, 769, 770, 778, 787, 304, 1924, 1925, 192...","{gas:1--heating-circuit:1--warm-water:1, gas:1...",{0}
1,gas:1--heating-circuit:1,122,0,{4},"KB3, KB6, KB13","[758, 759, 499, 507, 1524, 1525, 1649, 1654, 1...",{gas:1--global-separation-circuit:1--heating-c...,{0}
2,district-heating:1--heating-circuit:1--warm-wa...,73,0,{8},KB2,"[774, 1909, 1910, 1911, 1912, 1913, 1314, 1315...",{district-heating:1--heating-circuit:1--warm-w...,{0}
3,local-heating-station:1--heating-circuit:1--wa...,42,0,{8},KB5,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{local-heating-station:1--heating-circuit:1--w...,{0}
4,gas:1--gas:2--heating-circuit:1--warm-water:1,37,0,{10},"KB7, KB20, KB65","[478, 510, 1528, 1529, 1530, 1539, 1773, 1777,...",{gas:1--gas:2--heat-exchanger:1--heating-circu...,{0}
5,gas:1--heating-circuit:1--heating-circuit:2--w...,34,0,{10},"KB9, KB23, KB54, KB164","[1511, 1515, 1516, 1517, 1518, 1523, 1537, 154...",{gas:1--heat-exchanger:1--heating-circuit:1--h...,{0}
6,gas:1--gas:2--heating-circuit:1,32,0,{6},"KB12, KB28, KB31, KB181","[757, 773, 429, 1506, 668, 680, 696, 745, 529,...",{gas:1--gas:2--global-separation-circuit:1--he...,{0}
7,gas:1--gas:2--heating-circuit:1--heating-circu...,22,0,{12},"KB24, KB27, KB39","[1928, 1930, 506, 333, 388, 1641, 414, 651, 68...",{gas:1--gas:2--heat-exchanger:1--heating-circu...,{0}
8,district-heating:1--heating-circuit:1--heating...,22,0,{10},KB8,"[600, 1488, 1724, 796, 1772, 1385, 1386, 1389,...",{district-heating:1--heating-circuit:1--heatin...,{0}
9,district-heating:1--heating-circuit:1,20,0,{4},KB10,"[762, 766, 777, 712, 450, 389, 391, 1725, 1726...",{district-heating:1--heating-circuit:1},{0}


In [48]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped_new_std.head(10)['Occurrence'].sum()
top_20_sum = grouped_new_std.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1037
top 10 LayoutIDs count: 559
top 20 LayoutIDs count: 702
Coverage top 10 LayoutIDs:  53.9 %
Coverage top 20 LayoutIDs:  67.7 %


In [49]:
#buildings_with_unmixed_unctrl = df[df['new_LayoutID'].str.contains('heating-circuit_unmixed_unctrl', na=False)]

# Display the result
#print(buildings_with_unmixed_unctrl[['buildingID', 'new_LayoutID']])

## Extract info from modular system

In [50]:
df['sub_components_ops']=df['sub_components'].apply(ast.literal_eval)

In [51]:
def update_sub_components_ops(modular_system, sub_components_ops):
    """Label every heating circuit with a sub_type derived from its pump/valve entries.

    Parameters
    ----------
    modular_system : dict
        Parsed modular-system description. Only items of ``'subSystems'`` whose
        ``'name'`` is ``heating-circuit`` and that carry a truthy
        ``'hydraulicLocationIndex'`` are inspected; their
        ``parameters -> subComponent`` keys are searched for ``pump--<index>``
        and ``valve--<index>``.
    sub_components_ops : dict
        Maps component keys such as ``'hc(n)(1)'`` to description strings
        containing ``sub_type : <value>``.

    Returns
    -------
    dict
        A shallow copy of ``sub_components_ops`` in which the ``sub_type`` of
        every ``hc(n)(<idx>)`` entry is rewritten. The input dict is not modified.

    NOTE(review): the label mapping is kept exactly as it was (pump and valve ->
    "unmixed_uncontrolled", neither -> "mixed_controlled"). This reads inverted
    compared with what the names suggest - confirm against the domain definition.
    """
    pumps_per_index = set()
    valves_per_index = set()

    for subsystem in modular_system.get('subSystems', []):
        name = subsystem.get('name', '').lower()
        if name != 'heating-circuit':
            continue
        sub_components = subsystem.get('parameters', {}).get('subComponent', {})
        index = subsystem.get('hydraulicLocationIndex')
        if not index:
            continue

        # (?!\d) keeps index 1 from matching the keys of index 10, 11, 12, ...
        index_text = re.escape(str(index))
        pump_token = re.compile(rf'pump--{index_text}(?!\d)')
        valve_token = re.compile(rf'valve--{index_text}(?!\d)')

        for key in sub_components:
            if pump_token.search(key):
                pumps_per_index.add(index)
            if valve_token.search(key):
                valves_per_index.add(index)

    # Work on a copy so the caller's dict stays untouched
    new_sub_components_ops = sub_components_ops.copy()
    for key in list(new_sub_components_ops.keys()):
        if not key.startswith('hc(n)'):
            continue

        # Extract the circuit number from the second pair of parentheses
        match = re.search(r'hc\(n\)\((\d+)\)', key)
        if not match:
            continue

        idx = int(match.group(1))
        has_pump = idx in pumps_per_index
        has_valve = idx in valves_per_index

        if has_pump and has_valve:
            sub_type = "unmixed_uncontrolled"
        elif has_pump:
            sub_type = "mixed_uncontrolled"
        elif has_valve:
            sub_type = "unmixed_controlled"
        else:
            sub_type = "mixed_controlled"

        # Replace the sub_type value inside the description string
        old_val = new_sub_components_ops[key]
        new_sub_components_ops[key] = re.sub(
            r'sub_type\s*:\s*\S+', f'sub_type : {sub_type}', old_val
        )

    return new_sub_components_ops


def update_sub_components_ops_safe(modular_system, sub_components_ops):
    """Parse ``modular_system`` if it is a string, then run update_sub_components_ops.

    A string is first tried as JSON; when that raises ``json.JSONDecodeError`` it
    is parsed as a Python literal with ``ast.literal_eval``. Non-string input is
    passed through unchanged.

    Parameters
    ----------
    modular_system : str or dict
        Modular-system description, serialized or already parsed.
    sub_components_ops : dict
        Forwarded to ``update_sub_components_ops``.

    Returns
    -------
    The result of ``update_sub_components_ops``.
    """
    parsed_system = modular_system
    if isinstance(modular_system, str):
        try:
            parsed_system = json.loads(modular_system)
        except json.JSONDecodeError:
            # Not JSON: fall back to the Python dict representation
            parsed_system = ast.literal_eval(modular_system)
    return update_sub_components_ops(parsed_system, sub_components_ops)


# Then apply to your DataFrame:
df['sub_components_ops'] = df.apply(
    lambda row: update_sub_components_ops_safe(row['modular_system'], row['sub_components_ops']),
    axis=1
)

# Examine a building

In [52]:
building_id= 2165#765#2165#1474 #765 #has no FWS

In [53]:
print(df.loc[df['building_id'] == building_id, 'sub_components'].values[0])

{'global(1)': 'id : 101919 category : NOT_SPECIFIED type : GLOBAL sub_type : NOT_SPECIFIED', 'b(n)(1)': 'id : 219468 category : PRODUCER type : BOILER sub_type : NOT_SPECIFIED', 'hc(n)(1)': 'id : 219469 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED', 'hc(n)(2)': 'id : 219470 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED', 'hc(n)(3)': 'id : 219471 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED', 'hc(n)(4)': 'id : 219472 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED', 'hc(n)(5)': 'id : 219473 category : CONSUMER type : HEATING_CIRCUIT sub_type : NOT_SPECIFIED'}


In [54]:
df.loc[df['building_id'] == building_id, 'sub_components_ops'].values[0]

{'global(1)': 'id : 101919 category : NOT_SPECIFIED type : GLOBAL sub_type : NOT_SPECIFIED',
 'b(n)(1)': 'id : 219468 category : PRODUCER type : BOILER sub_type : NOT_SPECIFIED',
 'hc(n)(1)': 'id : 219469 category : CONSUMER type : HEATING_CIRCUIT sub_type : unmixed_uncontrolled',
 'hc(n)(2)': 'id : 219470 category : CONSUMER type : HEATING_CIRCUIT sub_type : unmixed_uncontrolled',
 'hc(n)(3)': 'id : 219471 category : CONSUMER type : HEATING_CIRCUIT sub_type : unmixed_uncontrolled',
 'hc(n)(4)': 'id : 219472 category : CONSUMER type : HEATING_CIRCUIT sub_type : unmixed_uncontrolled',
 'hc(n)(5)': 'id : 219473 category : CONSUMER type : HEATING_CIRCUIT sub_type : unmixed_uncontrolled'}

In [55]:
df.loc[df['building_id'] == building_id, 'LayoutID'].values[0]

'gas:1--heat-exchanger:1--heating-circuit:1--heating-circuit:2--heating-circuit:3--heating-circuit:4--heating-circuit:5--heating-circuit:6--heating-circuit:7'

In [56]:
df.loc[df['building_id'] == building_id, 'LayoutID_Tops'].values[0]

'gas:1--heating-circuit:1--heating-circuit:2--heating-circuit:3--heating-circuit:4--heating-circuit:5--heating-circuit:6--heating-circuit:7'

In [57]:
if True: #test single buildings
    building_id = building_id#2389#1801#2317#1315#1593#1809 #1801 #
    mod_sys = df.loc[df['building_id'] == building_id, 'modular_system'].iloc[0]
    modular_system = ast.literal_eval(mod_sys)
    #print(df.loc[df['building_id'] == building_id, 'LayoutID_Tops'].iloc[0])
    pprint(modular_system)

{'buildingID': 2165,
 'id': 'modular-system----Stadtwerke--Essen--AG-----Am--Brauhaus--15--45359--Essen',
 'layoutID': 'gas:1--heat-exchanger:1--heating-circuit:1--heating-circuit:2--heating-circuit:3--heating-circuit:4--heating-circuit:5--heating-circuit:6--heating-circuit:7',
 'name': 'modular-system-entry',
 'published': True,
 'publishedAt': '2025-03-17T09:15:07.723Z',
 'subSystems': [{'connections': [{'status': 'hot',
                                  'to': {'direction': 'right',
                                         'id': 'heating-circuit:7',
                                         'subSystem': {'hydraulicLocationIndex': 7,
                                                       'name': 'heating-circuit',
                                                       'side': 'bottom'}}},
                                 {'from': {'direction': 'right',
                                           'id': 'heating-circuit:7',
                                           'subSystem': {'hydraul

# exports

In [58]:
if True:
    # to_csv does not create missing folders, so make sure the output directory exists
    os.makedirs('OUTs', exist_ok=True)

    # One CSV per result table, all tagged with the extraction date of the input file
    filename1='Building_ID_VS_LayoutID_'+extraced_date+'.csv'
    filename2='LayoutID_occurence_'+extraced_date+'.csv'
    filename3='LayoutID_occurence_new_std_'+extraced_date+'.csv'
    filename4='mismatch_df_'+extraced_date+'.csv'
    filepath1 = os.path.join('OUTs', filename1)
    filepath2 = os.path.join('OUTs', filename2)
    filepath3 = os.path.join('OUTs', filename3)
    filepath4 = os.path.join('OUTs', filename4)
    df.to_csv(filepath1, index=False)
    grouped.to_csv(filepath2, index=False)
    grouped_new_std.to_csv(filepath3, index=False)
    mismatch_df.to_csv(filepath4, index=False)