# Analysis of Layout ID

## Relevant documents

- (Python Client Repo)[https://github.com/Green-Fusion/energy-management-backend/tree/main/python_client]
- (Klemmenbelegung)[https://docs.google.com/spreadsheets/d/1nkdkx2rI6nVKgoKBgkCUtfwEwuv8kptrRUXcXtfv0NM/edit?gid=247168398#gid=247168398]
- (Hypothesis for Klemmenbelegung)[https://docs.google.com/spreadsheets/d/1TSTxMCgEvuoayzOfx1MUqlV0tiqsVTBRN8aldlnFXxA/edit?gid=0#gid=0]

# import data

In [1]:
#imports
import os
import pandas as pd
import json
import ast
import re
from pprint import pprint

In [2]:
#build df from database exctracted data
filename='Building_device_Layout2025-07-14.csv'
extraced_date=filename[-14:-4]
filepath = os.path.join('INs', filename)
df = pd.read_csv(filepath)
#inspest column
df.columns

Index(['building_id', 'customerID', 'customer_name', 'address', 'postal_code',
       'city', 'coordinates', 'LayoutID', 'device_type', 'device_id',
       'modular_system', 'gfid'],
      dtype='object')

In [3]:
#filter out uncesseray infos
df=df[['building_id','customerID', 'customer_name','device_type', 'customer_name','address','LayoutID','modular_system']]

In [4]:
df.head()

Unnamed: 0,building_id,customerID,customer_name,device_type,customer_name.1,address,LayoutID,modular_system
0,756,66,HwS,RUT956,HwS,Wolzogenstr.28,gas:1--gas:2--heating-circuit:1--heating-circu...,"{'name': 'modular-system-entry', 'id': 'modula..."
1,757,66,HwS,RUT956,HwS,Hagelberger Str. 26,gas:1--gas:2--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
2,758,66,HwS,RUT956,HwS,Hochstr. 8,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
3,759,66,HwS,RUT956,HwS,Planufer 82a,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
4,761,66,HwS,RUT956,HwS,Gabainstr. 13,gas:1--gas:2--global-separation-circuit:1--hea...,"{'name': 'modular-system-entry', 'id': 'modula..."


In [5]:
df.count()

building_id       2014
customerID        2014
customer_name     2014
device_type       1086
customer_name     2014
address           2014
LayoutID          1138
modular_system    2014
dtype: int64

# Group by LayoutID

In [6]:
unique_count = df['LayoutID'].nunique()
print(f"Number of unique LayoutID values: {unique_count}")

Number of unique LayoutID values: 233


In [7]:
# Group by LayoutID
grouped = df.groupby('LayoutID').agg(
    Occurrence=('LayoutID', 'count'),
    buildingIDs=('building_id', list)
).reset_index()

# Add a simple index column starting from 0
grouped = grouped.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)
grouped['KBn']=grouped.index+1
#grouped['KBn'] = grouped['KBn'].fillna(-1).astype(int)

In [8]:
# Show result
grouped.head(20)

Unnamed: 0,LayoutID,Occurrence,buildingIDs,KBn
0,gas:1--heating-circuit:1--warm-water:1,129,"[768, 769, 770, 778, 787, 1924, 1926, 1927, 46...",1
1,district-heating:1--heating-circuit:1--warm-wa...,84,"[774, 1909, 1910, 1911, 1912, 1913, 1314, 1315...",2
2,gas:1--heating-circuit:1,77,"[758, 759, 499, 1525, 1649, 1654, 1657, 1791, ...",3
3,gas:1--heat-exchanger:1--heating-circuit:1--wa...,46,"[1925, 1931, 1781, 1509, 1526, 1527, 1531, 153...",4
4,local-heating-station:1--heating-circuit:1--wa...,45,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",5
5,gas:1--heat-exchanger:1--heating-circuit:1,40,"[1524, 1795, 1560, 1638, 1639, 604, 617, 623, ...",6
6,gas:1--heating-circuit:1--heating-circuit:2--w...,34,"[2219, 1511, 1515, 1516, 1517, 1518, 1537, 179...",7
7,gas:1--gas:2--heat-exchanger:1--heating-circui...,26,"[510, 1773, 1777, 1778, 1528, 1529, 1530, 1539...",8
8,district-heating:1--heating-circuit:1,22,"[762, 766, 777, 712, 450, 389, 391, 1725, 1726...",9
9,district-heating:1--heating-circuit:1--heating...,22,"[600, 1488, 1724, 796, 1772, 1385, 1386, 1389,...",10


In [9]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped.head(10)['Occurrence'].sum()
top_20_sum = grouped.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1138
top 10 LayoutIDs count: 525
top 20 LayoutIDs count: 657
Coverage top 10 LayoutIDs:  46.1 %
Coverage top 20 LayoutIDs:  57.7 %


# new standard

In [10]:
# preparation add KBns
df = df.merge(grouped[['LayoutID', 'KBn']], on='LayoutID', how='left')
cols = ['KBn'] + [col for col in df.columns if col != 'KBn']
df = df[cols]
df.head()

Unnamed: 0,KBn,building_id,customerID,customer_name,customer_name.1,device_type,customer_name.2,customer_name.3,address,LayoutID,modular_system
0,46.0,756,66,HwS,HwS,RUT956,HwS,HwS,Wolzogenstr.28,gas:1--gas:2--heating-circuit:1--heating-circu...,"{'name': 'modular-system-entry', 'id': 'modula..."
1,34.0,757,66,HwS,HwS,RUT956,HwS,HwS,Hagelberger Str. 26,gas:1--gas:2--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
2,3.0,758,66,HwS,HwS,RUT956,HwS,HwS,Hochstr. 8,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
3,3.0,759,66,HwS,HwS,RUT956,HwS,HwS,Planufer 82a,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
4,32.0,761,66,HwS,HwS,RUT956,HwS,HwS,Gabainstr. 13,gas:1--gas:2--global-separation-circuit:1--hea...,"{'name': 'modular-system-entry', 'id': 'modula..."


In [11]:
# Define substrings to remove
remove_parts = ['--heat-exchanger:1', '--global-separation-circuit:1']
# Copy LayoutID and remove unwanted parts
df['LayoutID_Tops'] = df['LayoutID']
# Loop over each part and remove it
for part in remove_parts:
    df['LayoutID_Tops'] = df['LayoutID_Tops'].str.replace(part, '', regex=False)

In [12]:
unique_count = df['LayoutID_Tops'].nunique()
print(f"Number of unique LayoutID values: {unique_count}")

Number of unique LayoutID values: 193


In [13]:
# Group by LayoutID
grouped_new_std = df.groupby('LayoutID_Tops').agg(
    Occurrence=('LayoutID_Tops', 'count'),
    buildingIDs=('building_id', list),
    Variations=('LayoutID', set),
    Variations_KBn=('KBn', set)
).reset_index()

# Add a simple index column starting from 0
grouped_new_std = grouped_new_std.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)
grouped_new_std=grouped_new_std[['LayoutID_Tops','Occurrence','Variations_KBn','buildingIDs','Variations']]

grouped_new_std['Variations_KBn'] = grouped_new_std['Variations_KBn'].apply(
    lambda s: ', '.join(f"KB{int(x)}" for x in sorted(s))
)

In [14]:
grouped_new_std.head(20)

Unnamed: 0,LayoutID_Tops,Occurrence,Variations_KBn,buildingIDs,Variations
0,gas:1--heating-circuit:1--warm-water:1,195,"KB1, KB4, KB15, KB30","[768, 769, 770, 778, 787, 304, 1924, 1925, 192...",{gas:1--global-separation-circuit:1--heating-c...
1,gas:1--heating-circuit:1,132,"KB3, KB6, KB14","[758, 759, 499, 507, 1524, 1525, 1649, 1654, 1...","{gas:1--heat-exchanger:1--heating-circuit:1, g..."
2,district-heating:1--heating-circuit:1--warm-wa...,91,"KB2, KB33","[774, 419, 1909, 1910, 1911, 1912, 1913, 1314,...",{district-heating:1--heating-circuit:1--heat-e...
3,gas:1--heating-circuit:1--heating-circuit:2--w...,51,"KB7, KB23, KB37, KB181","[348, 1775, 1779, 2219, 1511, 1515, 1516, 1517...",{gas:1--heating-circuit:1--heating-circuit:2--...
4,local-heating-station:1--heating-circuit:1--wa...,45,KB5,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{local-heating-station:1--heating-circuit:1--w...
5,gas:1--gas:2--heating-circuit:1--warm-water:1,40,"KB8, KB19, KB62","[478, 510, 1773, 1777, 1778, 1528, 1529, 1530,...",{gas:1--gas:2--heat-exchanger:1--heating-circu...
6,gas:1--gas:2--heating-circuit:1,31,"KB12, KB29, KB34","[757, 773, 429, 1506, 668, 680, 696, 745, 530,...",{gas:1--gas:2--global-separation-circuit:1--he...
7,district-heating:1--heating-circuit:1--heating...,25,"KB10, KB57","[600, 1488, 2468, 1724, 796, 1772, 1385, 1386,...",{district-heating:1--heating-circuit:1--heatin...
8,gas:1--gas:2--heating-circuit:1--heating-circu...,24,"KB22, KB26, KB50","[1928, 1930, 506, 333, 417, 388, 1641, 414, 79...",{gas:1--gas:2--heat-exchanger:1--heating-circu...
9,district-heating:1--heating-circuit:1,22,KB9,"[762, 766, 777, 712, 450, 389, 391, 1725, 1726...",{district-heating:1--heating-circuit:1}


In [15]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped_new_std.head(10)['Occurrence'].sum()
top_20_sum = grouped_new_std.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1138
top 10 LayoutIDs count: 656
top 20 LayoutIDs count: 802
Coverage top 10 LayoutIDs:  57.6 %
Coverage top 20 LayoutIDs:  70.5 %


# A new layout Id

In [16]:
if False:
    building_id = 1801
    mod_sys = df.loc[df['building_id'] == building_id, 'modular_system'].iloc[0]
    modular_system = ast.literal_eval(mod_sys)
    pprint(modular_system)

In [17]:
def transform_layout(layout, modular_system_json_str):
    if pd.isna(layout):
        return layout

    # Step 1: Apply existing replacements
    layout = re.sub(r'heating-circuit:(\d+)', r'heating-circuit_mixed_ctrl:\1', layout)
    layout = re.sub(r'district-heating:(\d+)', r'district-heating_HX_ctrl:\1', layout)
    layout = re.sub(r'local-heating-station:(\d+)', r'local-heating-station_HX_ctrlprim_ctrlsec:\1', layout)

    # Step 2: Handle 'heat-exchanger:<n>--warm-water:<n>' → 'warm-water_external:<n>'
    layout = re.sub(r'heat-exchanger:\d+--warm-water:(\d+)', r'warm-water_external:\1', layout)

    # Step 3: Replace remaining 'warm-water:<n>' → 'warm-water_internal:<n>'
    layout = re.sub(r'(?<!external:)warm-water:(\d+)', r'warm-water_internal:\1', layout)

    # Step 4: Parse modular_system JSON string safely
    if pd.isna(modular_system_json_str):
        return layout

    try:
        modular_system = ast.literal_eval(modular_system_json_str)
    except Exception:
        return layout

    # Step 5: Extract subcomponents from modular_system
    subcomponents_set = set()
    for subsystem in modular_system.get('subSystems', []):
        params = subsystem.get('parameters', {})
        subcomp = params.get('subComponent', {})
        for sc in subcomp.keys():
            if (sc.startswith("district-heating--") or
                sc.startswith("local-heating-station--") or
                sc.startswith("heating-circuit--")):
                subcomponents_set.add(sc)

    # Step 6: Apply suffix modification rules
    if any(sc.startswith("district-heating--heat-exchanger") for sc in subcomponents_set):
        layout = re.sub(r'(district-heating)_HX_ctrl', r'\1_noHX_ctrl', layout)

    if any(sc.startswith("district-heating--valve") for sc in subcomponents_set):
        layout = re.sub(r'(district-heating)_HX_ctrl', r'\1_HX_unctrl', layout)

    if any(sc.startswith("heating-circuit--pump") for sc in subcomponents_set):
        layout = re.sub(r'(heating-circuit)_mixed_ctrl', r'\1_mixed_unctrl', layout)

    if any(sc.startswith("heating-circuit--valve") for sc in subcomponents_set):
        layout = re.sub(r'(heating-circuit)_mixed_ctrl', r'\1_unmixed_ctrl', layout)

    if any(sc.startswith("local-heating-station--heat-exchanger") for sc in subcomponents_set):
        layout = re.sub(r'(local-heating-station)_HX_ctrlprim_ctrlsec', r'\1_noHX_ctrlprim_ctrlsec', layout)

    return layout


In [18]:
df['new_LayoutID'] = df.apply(lambda row: transform_layout(row['LayoutID'], row['modular_system']), axis=1)

## apply new standards

In [19]:
# Group by new_layoutID
grouped_new_layout = df.groupby('new_LayoutID').agg(
    Occurrence=('new_LayoutID', 'count'),
    buildingIDs=('building_id', list),
    LayoutID=('LayoutID', set),
    KBn=('KBn', set)
).reset_index()

# Add a simple index column starting from 0
grouped_new_layout = grouped_new_layout.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)
grouped_new_layout=grouped_new_layout[['new_LayoutID','Occurrence','KBn','buildingIDs','LayoutID']]

grouped_new_layout['KBn'] = grouped_new_layout['KBn'].apply(
    lambda s: ', '.join(f"KB{int(x)}" for x in sorted(s))
)

In [20]:
grouped_new_layout.head(20)

Unnamed: 0,new_LayoutID,Occurrence,KBn,buildingIDs,LayoutID
0,gas:1--heating-circuit_mixed_ctrl:1--warm-wate...,96,KB1,"[768, 769, 770, 778, 787, 1924, 463, 508, 512,...",{gas:1--heating-circuit:1--warm-water:1}
1,local-heating-station_noHX_ctrlprim_ctrlsec:1-...,41,KB5,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{local-heating-station:1--heating-circuit:1--w...
2,gas:1--heat-exchanger:1--heating-circuit_mixed...,39,KB4,"[1925, 1781, 1509, 1526, 1527, 1531, 1532, 153...",{gas:1--heat-exchanger:1--heating-circuit:1--w...
3,district-heating_noHX_ctrl:1--heating-circuit_...,37,KB2,"[1314, 1315, 1316, 1317, 1318, 1319, 1320, 132...",{district-heating:1--heating-circuit:1--warm-w...
4,gas:1--heating-circuit_mixed_ctrl:1,32,KB3,"[758, 759, 499, 1649, 1791, 1486, 1500, 2451, ...",{gas:1--heating-circuit:1}
5,gas:1--heating-circuit_unmixed_ctrl:1--warm-wa...,31,KB1,"[1926, 1927, 1776, 1650, 1492, 1495, 1502, 164...",{gas:1--heating-circuit:1--warm-water:1}
6,gas:1--heat-exchanger:1--heating-circuit_unmix...,30,KB6,"[1524, 1795, 1639, 604, 617, 656, 678, 514, 51...",{gas:1--heat-exchanger:1--heating-circuit:1}
7,gas:1--heating-circuit_mixed_ctrl:1--heating-c...,30,KB7,"[2219, 1511, 1515, 1516, 1517, 1518, 1537, 433...",{gas:1--heating-circuit:1--heating-circuit:2--...
8,gas:1--heating-circuit_unmixed_ctrl:1,25,KB3,"[1525, 1792, 493, 605, 606, 607, 610, 612, 614...",{gas:1--heating-circuit:1}
9,district-heating_HX_ctrl:1--heating-circuit_mi...,24,KB2,"[1910, 1911, 1912, 1913, 1505, 1382, 1384, 139...",{district-heating:1--heating-circuit:1--warm-w...


In [21]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped_new_layout.head(10)['Occurrence'].sum()
top_20_sum = grouped_new_layout.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1138
top 10 LayoutIDs count: 385
top 20 LayoutIDs count: 511
Coverage top 10 LayoutIDs:  33.8 %
Coverage top 20 LayoutIDs:  44.9 %


In [22]:
# Define substrings to remove
remove_parts = ['--heat-exchanger:1', '--global-separation-circuit:1']
# Copy LayoutID and remove unwanted parts
df['new_LayoutID_Tops'] = df['new_LayoutID']
# Loop over each part and remove it
for part in remove_parts:
    df['new_LayoutID_Tops'] = df['new_LayoutID_Tops'].str.replace(part, '', regex=False)

In [23]:
# Group by LayoutID
grouped_new_std_newlayout = df.groupby('new_LayoutID_Tops').agg(
    Occurrence=('new_LayoutID_Tops', 'count'),
    buildingIDs=('building_id', list),
    Variations=('new_LayoutID_Tops', set),
    Variations_KBn=('KBn', set)
).reset_index()

# Add a simple index column starting from 0
grouped_new_std_newlayout = grouped_new_std_newlayout.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)
grouped_new_std_newlayout=grouped_new_std_newlayout[['new_LayoutID_Tops','Occurrence','Variations_KBn','buildingIDs','Variations']]

grouped_new_std_newlayout['Variations_KBn'] = grouped_new_std_newlayout['Variations_KBn'].apply(
    lambda s: ', '.join(f"KB{int(x)}" for x in sorted(s))
)

In [24]:
grouped_new_std_newlayout.head(20)

Unnamed: 0,new_LayoutID_Tops,Occurrence,Variations_KBn,buildingIDs,Variations
0,gas:1--heating-circuit_mixed_ctrl:1--warm-wate...,142,"KB1, KB4, KB15","[768, 769, 770, 778, 787, 304, 1924, 1925, 463...",{gas:1--heating-circuit_mixed_ctrl:1--warm-wat...
1,gas:1--heating-circuit_unmixed_ctrl:1,65,"KB3, KB6, KB14","[507, 1524, 1525, 1792, 1795, 1797, 601, 1503,...",{gas:1--heating-circuit_unmixed_ctrl:1}
2,gas:1--heating-circuit_mixed_ctrl:1--heating-c...,44,"KB7, KB23, KB37","[1775, 1779, 2219, 1511, 1515, 1516, 1517, 151...",{gas:1--heating-circuit_mixed_ctrl:1--heating-...
3,gas:1--heating-circuit_unmixed_ctrl:1--warm-wa...,42,"KB1, KB4, KB15","[1926, 1927, 1931, 1776, 1534, 1650, 1484, 148...",{gas:1--heating-circuit_unmixed_ctrl:1--warm-w...
4,gas:1--heating-circuit_mixed_ctrl:1,42,"KB3, KB6, KB14","[758, 759, 499, 1649, 1791, 1486, 1497, 1500, ...",{gas:1--heating-circuit_mixed_ctrl:1}
5,local-heating-station_noHX_ctrlprim_ctrlsec:1-...,41,KB5,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{local-heating-station_noHX_ctrlprim_ctrlsec:1...
6,district-heating_noHX_ctrl:1--heating-circuit_...,37,KB2,"[1314, 1315, 1316, 1317, 1318, 1319, 1320, 132...",{district-heating_noHX_ctrl:1--heating-circuit...
7,gas:1--gas:2--heating-circuit_mixed_ctrl:1--wa...,26,"KB8, KB19, KB62","[1773, 1528, 1529, 1530, 1539, 1485, 797, 801,...",{gas:1--gas:2--heating-circuit_mixed_ctrl:1--w...
8,gas:1--heating-circuit_mixed_unctrl:1,25,"KB3, KB6, KB14","[1654, 1657, 599, 1638, 439, 491, 609, 616, 62...",{gas:1--heating-circuit_mixed_unctrl:1}
9,district-heating_HX_ctrl:1--heating-circuit_mi...,24,KB2,"[1910, 1911, 1912, 1913, 1505, 1382, 1384, 139...",{district-heating_HX_ctrl:1--heating-circuit_m...


In [25]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped_new_std_newlayout.head(10)['Occurrence'].sum()
top_20_sum = grouped_new_std_newlayout.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1138
top 10 LayoutIDs count: 488
top 20 LayoutIDs count: 617
Coverage top 10 LayoutIDs:  42.9 %
Coverage top 20 LayoutIDs:  54.2 %


## explore modular_system

In [26]:
if True: #test single buildings
    building_id = 1005#2389#1801#2317#1315#1593#1809 #1801 #
    mod_sys = df.loc[df['building_id'] == building_id, 'modular_system'].iloc[0]
    modular_system = ast.literal_eval(mod_sys)
    print(df.loc[df['building_id'] == building_id, 'new_LayoutID'].iloc[0])
    pprint(modular_system)

local-heating-station_noHX_ctrlprim_ctrlsec:1--heating-circuit_mixed_ctrl:1--warm-water_internal:1
{'buildingID': 1005,
 'id': 'modular-system-jahnstrasse-16',
 'layoutID': 'local-heating-station:1--heating-circuit:1--warm-water:1',
 'name': 'modular-system-entry',
 'published': True,
 'publishedAt': '2024-10-01T13:13:24.551Z',
 'subSystems': [{'connections': [],
                 'hydraulicLocationIndex': 1,
                 'id': 'heating-circuit-1',
                 'name': 'heating-circuit',
                 'navigation': {'back': {'hydraulicLocationIndex': None,
                                         'name': 'local-heating-station'},
                                'forward': {'hydraulicLocationIndex': 1,
                                            'name': 'warm-water'}}},
                {'connections': [{'status': 'hot',
                                  'to': {'direction': 'right',
                                         'id': 'warm-water:1',
                                 

## get subcomponent

In [27]:
def extract_unique_subcomponents(df, column='modular_system'):
    subcomponent_set = set()

    for entry in df[column]:
        try:
            # Safely convert string representation of dict to actual dict
            data = ast.literal_eval(entry)
        except Exception as e:
            print(f"Skipping invalid entry due to error: {e}")
            continue

        sub_systems = data.get("subSystems", [])
        for subsystem in sub_systems:
            parameters = subsystem.get("parameters", {})
            sub_components = parameters.get("subComponent", {})
            
            if isinstance(sub_components, dict):
                for key in sub_components.keys():
                    # Remove trailing --<number>
                    base_key = re.sub(r'--\d+$', '', key)

                    # Check prefix
                    if base_key.startswith(("district-heating--", "local-heating-station--", "heating-circuit--")):
                        subcomponent_set.add(base_key)

    return sorted(subcomponent_set)

In [28]:
unique_subcomponents = extract_unique_subcomponents(df)
for compo in unique_subcomponents:
    print(compo)

district-heating--heat-exchanger
district-heating--pump-sec
district-heating--sec-flow-temp
district-heating--valve
heating-circuit--placeholder-cold-connection-node
heating-circuit--pump
heating-circuit--secondary-flow-temp
heating-circuit--valve
local-heating-station--heat-exchanger
local-heating-station--pump-prim
local-heating-station--pump-sec


## get subSystems

In [29]:
# Sample: df['LayoutID_Tops'] = your actual column
def extract_words(layout_str):
    # Split by '--', then split each part by ':'
    parts = layout_str.split('--')
    words = []
    for part in parts:
        words.extend(part.split(':'))
    return words

# Apply and flatten the list
all_words = df['LayoutID'].dropna().apply(extract_words).explode()

# Get unique values
unique_words = all_words.unique().tolist()

# Optional: sort alphabetically or numerically
unique_words.sort()

#get rid of pipes and numbers
cleaned_words = [
    word for word in unique_words
    if not word.isdigit() and
    'pipe' not in word.lower() and
    'valve' not in word.lower()
]

In [30]:
for word in cleaned_words:
    print(word)#cleaned_words

1-heating-circuit
buffer-tank
chp
district-heating
gas
global-separation-circuit
heat-exchanger
heat-pump
heating-circuit
hydraulic-separator
local-heating-station
solar-thermal
warm-water


# exports

In [34]:
if True:
    filename1='Building_ID_VS_LayoutID_'+extraced_date+'.csv'
    filename2='LayoutID_occurence_'+extraced_date+'.csv'
    filename3='LayoutID_occurence_new_std_'+extraced_date+'.csv'
    filename4='new_LayoutID_occurence_'+extraced_date+'.csv'
    filename5='new_LayoutID_occurence_new_std_'+extraced_date+'.csv'
    filepath1 = os.path.join('OUTs', filename1)
    filepath2 = os.path.join('OUTs', filename2)
    filepath3 = os.path.join('OUTs', filename3)
    filepath4 = os.path.join('OUTs', filename4)
    filepath5 = os.path.join('OUTs', filename5)
    df.to_csv(filepath1, index=False)
    grouped.to_csv(filepath2, index=False)
    grouped_new_std.to_csv(filepath3, index=False)
    grouped_new_layout.to_csv(filepath4, index=False)
    grouped_new_std_newlayout.to_csv(filepath5, index=False)

In [35]:
grouped_new_layout.head()

Unnamed: 0,new_LayoutID,Occurrence,KBn,buildingIDs,LayoutID
0,gas:1--heating-circuit_mixed_ctrl:1--warm-wate...,96,KB1,"[768, 769, 770, 778, 787, 1924, 463, 508, 512,...",{gas:1--heating-circuit:1--warm-water:1}
1,local-heating-station_noHX_ctrlprim_ctrlsec:1-...,41,KB5,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{local-heating-station:1--heating-circuit:1--w...
2,gas:1--heat-exchanger:1--heating-circuit_mixed...,39,KB4,"[1925, 1781, 1509, 1526, 1527, 1531, 1532, 153...",{gas:1--heat-exchanger:1--heating-circuit:1--w...
3,district-heating_noHX_ctrl:1--heating-circuit_...,37,KB2,"[1314, 1315, 1316, 1317, 1318, 1319, 1320, 132...",{district-heating:1--heating-circuit:1--warm-w...
4,gas:1--heating-circuit_mixed_ctrl:1,32,KB3,"[758, 759, 499, 1649, 1791, 1486, 1500, 2451, ...",{gas:1--heating-circuit:1}
