# Analysis of Layout ID

## Relevant documents

- (Python Client Repo)[https://github.com/Green-Fusion/energy-management-backend/tree/main/python_client]
- (Klemmenbelegung)[https://docs.google.com/spreadsheets/d/1nkdkx2rI6nVKgoKBgkCUtfwEwuv8kptrRUXcXtfv0NM/edit?gid=247168398#gid=247168398]
- (Hypothesis for Klemmenbelegung)[https://docs.google.com/spreadsheets/d/1TSTxMCgEvuoayzOfx1MUqlV0tiqsVTBRN8aldlnFXxA/edit?gid=0#gid=0]

# import data

In [1]:
#imports
import os
import pandas as pd
import json
import ast
import re
from pprint import pprint

In [2]:
#build df from database exctracted data
filename='Building_device_Layout_2025-07-15.csv'
extraced_date=filename[-14:-4]
filepath = os.path.join('INs', filename)
df = pd.read_csv(filepath)
#inspest column
df.columns

Index(['building_id', 'customerID', 'customer_name', 'address', 'postal_code',
       'city', 'coordinates', 'LayoutID', 'device_type', 'device_id',
       'modular_system', 't_sensor_count', 'gfid'],
      dtype='object')

In [3]:
#filter out uncesseray infos
df=df[['building_id','customerID', 'customer_name','device_type', 'customer_name','address','t_sensor_count','LayoutID','modular_system']]

In [4]:
df.head()

Unnamed: 0,building_id,customerID,customer_name,device_type,customer_name.1,address,t_sensor_count,LayoutID,modular_system
0,756,66,HwS,RUT956,HwS,Wolzogenstr.28,8,gas:1--gas:2--heating-circuit:1--heating-circu...,"{'name': 'modular-system-entry', 'id': 'modula..."
1,757,66,HwS,RUT956,HwS,Hagelberger Str. 26,4,gas:1--gas:2--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
2,758,66,HwS,RUT956,HwS,Hochstr. 8,7,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
3,759,66,HwS,RUT956,HwS,Planufer 82a,4,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
4,761,66,HwS,RUT956,HwS,Gabainstr. 13,8,gas:1--gas:2--global-separation-circuit:1--hea...,"{'name': 'modular-system-entry', 'id': 'modula..."


In [5]:
df.count()

building_id       2068
customerID        2068
customer_name     2068
device_type       1087
customer_name     2068
address           2068
t_sensor_count    2068
LayoutID          1138
modular_system    2068
dtype: int64

## get rid of API building and no Layout ID

In [6]:
type_of_devices=set()
for dev_type in df['device_type']:
    type_of_devices.add(dev_type)
type_of_devices   

{'ECR_LW300', 'RUT956', 'RevPiConnectSE', 'RevPiCore32SE', 'WAGOPFC200', nan}

In [7]:
df = df.dropna(subset=['device_type'])
df = df.dropna(subset=['LayoutID'])

In [8]:
df.count()

building_id       1010
customerID        1010
customer_name     1010
device_type       1010
customer_name     1010
address           1010
t_sensor_count    1010
LayoutID          1010
modular_system    1010
dtype: int64

# Group by LayoutID

In [9]:
unique_count = df['LayoutID'].nunique()
print(f"Number of unique LayoutID values: {unique_count}")

Number of unique LayoutID values: 219


In [10]:
# Group by LayoutID
grouped = df.groupby('LayoutID').agg(
    Occurrence=('LayoutID', 'count'),
    buildingIDs=('building_id', list),
    sensor_count=('t_sensor_count', set)
).reset_index()

# Add a simple index column starting from 0
grouped = grouped.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)
grouped['KBn']=grouped.index+1
#grouped['KBn'] = grouped['KBn'].fillna(-1).astype(int)

In [11]:
# Show result
grouped.head(20)

Unnamed: 0,LayoutID,Occurrence,buildingIDs,sensor_count,KBn
0,gas:1--heating-circuit:1--warm-water:1,100,"[768, 769, 770, 778, 787, 1924, 1926, 1927, 46...","{0, 3, 4, 6, 7, 8, 9, 15}",1
1,district-heating:1--heating-circuit:1--warm-wa...,73,"[774, 1909, 1910, 1911, 1912, 1913, 1314, 1315...","{8, 9, 6, 7}",2
2,gas:1--heating-circuit:1,70,"[758, 759, 499, 1525, 1649, 1654, 1657, 1791, ...","{2, 3, 4, 7, 8}",3
3,gas:1--heat-exchanger:1--heating-circuit:1--wa...,44,"[1925, 1931, 1781, 1509, 1526, 1527, 1531, 153...","{8, 5}",4
4,local-heating-station:1--heating-circuit:1--wa...,42,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...","{8, 5, 6, 7}",5
5,gas:1--heat-exchanger:1--heating-circuit:1,38,"[1524, 1795, 1560, 1638, 1639, 604, 617, 623, ...","{3, 4, 6}",6
6,gas:1--gas:2--heat-exchanger:1--heating-circui...,25,"[510, 1773, 1777, 1778, 1528, 1529, 1530, 1539...","{8, 10, 11, 12, 13}",7
7,district-heating:1--heating-circuit:1--heating...,22,"[600, 1488, 1724, 796, 1772, 1385, 1386, 1389,...","{8, 9, 10, 13}",8
8,gas:1--heating-circuit:1--heating-circuit:2--w...,22,"[2219, 1511, 1515, 1516, 1517, 1518, 1537, 179...","{8, 9, 10, 12, 14}",9
9,district-heating:1--heating-circuit:1,20,"[762, 766, 777, 712, 450, 389, 391, 1725, 1726...","{3, 4, 5, 14}",10


In [12]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped.head(10)['Occurrence'].sum()
top_20_sum = grouped.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1010
top 10 LayoutIDs count: 456
top 20 LayoutIDs count: 578
Coverage top 10 LayoutIDs:  45.1 %
Coverage top 20 LayoutIDs:  57.2 %


# new standard

In [13]:
# preparation add KBns
df = df.merge(grouped[['LayoutID', 'KBn']], on='LayoutID', how='left')
cols = ['KBn'] + [col for col in df.columns if col != 'KBn']
df = df[cols]
df.head()

Unnamed: 0,KBn,building_id,customerID,customer_name,customer_name.1,device_type,customer_name.2,customer_name.3,address,t_sensor_count,LayoutID,modular_system
0,41,756,66,HwS,HwS,RUT956,HwS,HwS,Wolzogenstr.28,8,gas:1--gas:2--heating-circuit:1--heating-circu...,"{'name': 'modular-system-entry', 'id': 'modula..."
1,34,757,66,HwS,HwS,RUT956,HwS,HwS,Hagelberger Str. 26,4,gas:1--gas:2--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
2,3,758,66,HwS,HwS,RUT956,HwS,HwS,Hochstr. 8,7,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
3,3,759,66,HwS,HwS,RUT956,HwS,HwS,Planufer 82a,4,gas:1--heating-circuit:1,"{'name': 'modular-system-entry', 'id': 'modula..."
4,36,761,66,HwS,HwS,RUT956,HwS,HwS,Gabainstr. 13,8,gas:1--gas:2--global-separation-circuit:1--hea...,"{'name': 'modular-system-entry', 'id': 'modula..."


In [14]:
# remove HX and GSC and combine HX with warmwater

In [15]:
def transform_to_tops(layout):
    if pd.isna(layout):
        return layout

    parts = layout.split('--')

    new_parts = []
    skip_next = False

    for i in range(len(parts)):
        if skip_next:
            skip_next = False
            continue

        part = parts[i]

        # Rule 1: heat-exchanger directly before warm-water → skip heat-exchanger, modify warm-water
        if re.match(r'heat-exchanger:\d+', part):
            if i + 1 < len(parts) and re.match(r'warm-water:(\d+)', parts[i + 1]):
                match = re.match(r'warm-water:(\d+)', parts[i + 1])
                new_parts.append(f"warm-water_external:{match.group(1)}")
                skip_next = True
            # Else → skip this heat-exchanger entirely
            continue

        # Rule 2: remove global-separation-circuit:<n>
        elif re.match(r'global-separation-circuit:\d+', part):
            continue

        # Rule 3: remove hydraulic-separator:<n>
        elif re.match(r'hydraulic-separator:\d+', part):
            continue

        else:
            new_parts.append(part)

    return '--'.join(new_parts)

In [16]:
# Apply to your DataFrame
df['LayoutID_Tops'] = df['LayoutID'].apply(transform_to_tops)

## get new sensor count

In [17]:
sensor_count_subsystem={'buffer-tank': 1,
 'chp': 2,
 'district-heating': 2,
 'gas': 2,
 'heat-pump': 4,
 'heating-circuit': 2,
 'local-heating-station': 2,
 'solar-thermal': 2,
 'warm-water': 4}

def calculate_sensor_count(layout):
    if pd.isna(layout):
        return 0

    total = 0
    parts = layout.split('--')
    for part in parts:
        subsystem = part.split(':')[0]
        # Remove suffixes like _mixed_ctrl or _external if present
        subsystem_clean = subsystem.split('_')[0]
        count = sensor_count_subsystem.get(subsystem_clean, 0)
        total += count
    return total

In [18]:
# Apply the function
df['new_sensor_count'] = df['LayoutID_Tops'].apply(calculate_sensor_count)
df['count_diff'] = df['t_sensor_count']-df['new_sensor_count']

In [32]:
# mismatch_df
#mismatch_df = df.loc[(df['KBn'] == 3) & (df['t_sensor_count'] != df['new_sensor_count'])].reset_index() #per KBn
mismatch_df = df.loc[df['t_sensor_count'] < df['new_sensor_count']].reset_index()
mismatch_df[['building_id','index', 'KBn', 't_sensor_count', 'new_sensor_count','count_diff']]

Unnamed: 0,building_id,index,KBn,t_sensor_count,new_sensor_count,count_diff
0,757,1,34,4,6,-2
1,761,4,36,8,10,-2
2,766,8,10,3,4,-1
3,774,15,2,7,8,-1
4,777,18,10,3,4,-1
...,...,...,...,...,...,...
277,1762,983,5,7,8,-1
278,2460,992,91,8,9,-1
279,2299,1002,69,8,12,-4
280,2296,1007,2,6,8,-2


In [33]:
mismatch_df.describe()

Unnamed: 0,index,KBn,building_id,customerID,t_sensor_count,new_sensor_count,count_diff
count,282.0,282.0,282.0,282.0,282.0,282.0,282.0
mean,454.457447,58.152482,1275.691489,77.265957,8.024823,10.475177,-2.450355
std,264.794569,62.815281,640.084815,46.410096,4.09184,4.744102,2.262022
min,1.0,1.0,5.0,1.0,0.0,4.0,-21.0
25%,230.75,7.0,621.25,39.0,6.0,8.0,-2.75
50%,426.5,36.0,1404.5,61.5,8.0,10.0,-2.0
75%,669.5,87.5,1792.75,115.0,10.0,12.0,-1.0
max,1009.0,219.0,2468.0,187.0,22.0,27.0,-1.0


## group by LayoutID_Tops

In [20]:
unique_count = df['LayoutID_Tops'].nunique()
print(f"Number of unique LayoutID values: {unique_count}")

Number of unique LayoutID values: 181


In [21]:
from collections import Counter

def most_common(series):
    if series.empty:
        return None
    return Counter(series).most_common(1)[0][0]

# Group by LayoutID_Tops with additional 'most_common_sensor_count'
grouped_new_std = df.groupby('LayoutID_Tops').agg(
    Occurrence=('LayoutID_Tops', 'count'),
    buildingIDs=('building_id', list),
    sensor_count_set=('t_sensor_count', set),
    new_sensor_count=('new_sensor_count', set),
    Variations=('LayoutID', set),
    Variations_KBn=('KBn', set),
    most_common_count=('t_sensor_count', most_common)
).reset_index()

# Sort by Occurrence
grouped_new_std = grouped_new_std.sort_values(by='Occurrence', ascending=False).reset_index(drop=True)

# Reorder columns
grouped_new_std = grouped_new_std[
    ['LayoutID_Tops', 'Occurrence', 'most_common_count',
     'new_sensor_count',  'Variations_KBn', 'buildingIDs', 'Variations', 'sensor_count_set']
]

# Format Variations_KBn nicely
grouped_new_std['Variations_KBn'] = grouped_new_std['Variations_KBn'].apply(
    lambda s: ', '.join(f"KB{int(x)}" for x in sorted(s))
)

In [22]:
grouped_new_std[['LayoutID_Tops', 'Occurrence', 'most_common_count','new_sensor_count', 'Variations_KBn', 'buildingIDs', 'Variations','sensor_count_set']].head(20)

Unnamed: 0,LayoutID_Tops,Occurrence,most_common_count,new_sensor_count,Variations_KBn,buildingIDs,Variations,sensor_count_set
0,gas:1--heating-circuit:1--warm-water:1,156,8,{8},"KB1, KB4, KB15","[768, 769, 770, 778, 787, 304, 1924, 1925, 192...",{gas:1--global-separation-circuit:1--heating-c...,"{0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 22}"
1,gas:1--heating-circuit:1,121,4,{4},"KB3, KB6, KB13","[758, 759, 499, 507, 1524, 1525, 1649, 1654, 1...","{gas:1--heat-exchanger:1--heating-circuit:1, g...","{2, 3, 4, 5, 6, 7, 8}"
2,district-heating:1--heating-circuit:1--warm-wa...,73,8,{8},KB2,"[774, 1909, 1910, 1911, 1912, 1913, 1314, 1315...",{district-heating:1--heating-circuit:1--warm-w...,"{8, 9, 6, 7}"
3,local-heating-station:1--heating-circuit:1--wa...,42,8,{8},KB5,"[509, 790, 1005, 1628, 1629, 1630, 1631, 672, ...",{local-heating-station:1--heating-circuit:1--w...,"{8, 5, 6, 7}"
4,gas:1--gas:2--heating-circuit:1--warm-water:1,39,10,{10},"KB7, KB18, KB54","[478, 510, 1773, 1777, 1778, 1528, 1529, 1530,...",{gas:1--gas:2--heating-circuit:1--warm-water:1...,"{8, 9, 10, 11, 12, 13}"
5,gas:1--heating-circuit:1--heating-circuit:2--w...,37,10,{10},"KB9, KB20, KB45, KB164","[1775, 1779, 2219, 1511, 1515, 1516, 1517, 151...",{gas:1--global-separation-circuit:1--heating-c...,"{8, 9, 10, 11, 12, 14}"
6,gas:1--gas:2--heating-circuit:1,30,6,{6},"KB12, KB27, KB34, KB181","[757, 773, 429, 1506, 668, 680, 696, 745, 529,...",{gas:1--gas:2--hydraulic-separator:1--heating-...,"{4, 5, 6, 7, 8, 16}"
7,district-heating:1--heating-circuit:1--heating...,22,10,{10},KB8,"[600, 1488, 1724, 796, 1772, 1385, 1386, 1389,...",{district-heating:1--heating-circuit:1--heatin...,"{8, 9, 10, 13}"
8,gas:1--gas:2--heating-circuit:1--heating-circu...,22,12,{12},"KB19, KB25, KB58","[1928, 1930, 506, 333, 388, 1641, 414, 798, 80...",{gas:1--gas:2--heat-exchanger:1--heating-circu...,"{10, 11, 12, 13, 14}"
9,district-heating:1--heating-circuit:1,20,4,{4},KB10,"[762, 766, 777, 712, 450, 389, 391, 1725, 1726...",{district-heating:1--heating-circuit:1},"{3, 4, 5, 14}"


In [23]:
# Sum the number of occurrences for top 10 and top 20
top_10_sum = grouped_new_std.head(10)['Occurrence'].sum()
top_20_sum = grouped_new_std.head(20)['Occurrence'].sum()
print(f"total LayoutID: {df.LayoutID.count()}")
print(f"top 10 LayoutIDs count: {top_10_sum}")
print(f"top 20 LayoutIDs count: {top_20_sum}")

print(f"Coverage top 10 LayoutIDs: {100*top_10_sum/df.LayoutID.count(): .1f} %")
print(f"Coverage top 20 LayoutIDs: {100*top_20_sum/df.LayoutID.count(): .1f} %")

total LayoutID: 1010
top 10 LayoutIDs count: 562
top 20 LayoutIDs count: 695
Coverage top 10 LayoutIDs:  55.6 %
Coverage top 20 LayoutIDs:  68.8 %


## explore modular_system

In [24]:
if True: #test single buildings
    building_id = 1801#2389#1801#2317#1315#1593#1809 #1801 #
    mod_sys = df.loc[df['building_id'] == building_id, 'modular_system'].iloc[0]
    modular_system = ast.literal_eval(mod_sys)
    print(df.loc[df['building_id'] == building_id, 'LayoutID_Tops'].iloc[0])
    pprint(modular_system)

district-heating:1--heating-circuit:2
{'buildingID': 1801,
 'id': 'modular-system-mock-1',
 'layoutID': 'district-heating:1--heat-exchanger:1-heating-circuit:1--heating-circuit:2',
 'name': 'modular-system-entry',
 'published': True,
 'publishedAt': '2024-11-25T14:26:41.896Z',
 'subSystems': [{'connections': [{'status': 'cold',
                                  'to': {'direction': 'bottom',
                                         'id': 'district-heating:1',
                                         'subSystem': {'hydraulicLocationIndex': None,
                                                       'name': 'district-heating',
                                                       'side': 'right'}}},
                                 {'from': {'direction': 'bottom',
                                           'id': 'district-heating:1',
                                           'subSystem': {'hydraulicLocationIndex': None,
                                                         'name': '

In [25]:
#buildings_with_unmixed_unctrl = df[df['new_LayoutID'].str.contains('heating-circuit_unmixed_unctrl', na=False)]

# Display the result
#print(buildings_with_unmixed_unctrl[['buildingID', 'new_LayoutID']])

## get subcomponent

In [26]:
def extract_unique_subcomponents(df, column='modular_system'):
    subcomponent_set = set()

    for entry in df[column]:
        try:
            # Safely convert string representation of dict to actual dict
            data = ast.literal_eval(entry)
        except Exception as e:
            print(f"Skipping invalid entry due to error: {e}")
            continue

        sub_systems = data.get("subSystems", [])
        for subsystem in sub_systems:
            parameters = subsystem.get("parameters", {})
            sub_components = parameters.get("subComponent", {})
            
            if isinstance(sub_components, dict):
                for key in sub_components.keys():
                    # Remove trailing --<number>
                    base_key = re.sub(r'--\d+$', '', key)

                    # Check prefix
                    if base_key.startswith(("district-heating--", "local-heating-station--", "heating-circuit--")):
                        subcomponent_set.add(base_key)

    return sorted(subcomponent_set)

In [27]:
unique_subcomponents = extract_unique_subcomponents(df)
for compo in unique_subcomponents:
    print(compo)

district-heating--heat-exchanger
district-heating--pump-sec
district-heating--sec-flow-temp
district-heating--valve
heating-circuit--placeholder-cold-connection-node
heating-circuit--pump
heating-circuit--secondary-flow-temp
heating-circuit--valve
local-heating-station--heat-exchanger
local-heating-station--pump-prim
local-heating-station--pump-sec


## get subSystems

In [28]:
# Sample: df['LayoutID_Tops'] = your actual column
def extract_words(layout_str):
    # Split by '--', then split each part by ':'
    parts = layout_str.split('--')
    words = []
    for part in parts:
        words.extend(part.split(':'))
    return words

# Apply and flatten the list
all_words = df['LayoutID'].dropna().apply(extract_words).explode()

# Get unique values
unique_words = all_words.unique().tolist()

# Optional: sort alphabetically or numerically
unique_words.sort()

#get rid of pipes and numbers
cleaned_words = [
    word for word in unique_words
    if not word.isdigit() and
    'pipe' not in word.lower() and
    'valve' not in word.lower()
]

In [29]:
cleaned_words

['1-heating-circuit',
 'buffer-tank',
 'chp',
 'district-heating',
 'gas',
 'global-separation-circuit',
 'heat-exchanger',
 'heat-pump',
 'heating-circuit',
 'hydraulic-separator',
 'local-heating-station',
 'solar-thermal',
 'warm-water']

# exports

In [30]:
if True:
    filename1='Building_ID_VS_LayoutID_'+extraced_date+'.csv'
    filename2='LayoutID_occurence_'+extraced_date+'.csv'
    filename3='LayoutID_occurence_new_std_'+extraced_date+'.csv'
    filepath1 = os.path.join('OUTs', filename1)
    filepath2 = os.path.join('OUTs', filename2)
    filepath3 = os.path.join('OUTs', filename3)
    df.to_csv(filepath1, index=False)
    grouped.to_csv(filepath2, index=False)
    grouped_new_std.to_csv(filepath3, index=False)