In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

This notebook reads three sheets from the market characterization workbook and multiplies their values together to get the initial count of every piece of equipment (condition) in the potential study.

In [11]:
# TODO: add data-cleaning helper functions here

In [12]:
#@task
def read_in_market_characterization_sheets(file_path: str = './001_input/02_market_characterization.xlsx') -> tuple:
    """Read the three market characterization sheets from an Excel workbook.

    Parameters
    ----------
    file_path : str
        Path to the workbook. It must contain the sheets
        'customer_building_type_count', 'equipment_count_per_building'
        and 'efficiency_level_breakout'; the first row of each sheet is
        used as the header.

    Returns
    -------
    tuple
        ``(customer_building_type_count, equipment_count_per_building,
        efficiency_level_breakout)`` as DataFrames, in that order.

    Raises
    ------
    ValueError
        If any of the three sheets lacks a 'single_family' or
        'multi_family' column. The message names the first offending sheet.
    """
    # Sheets to load, in the order they are returned.
    sheets = ['customer_building_type_count', 'equipment_count_per_building', 'efficiency_level_breakout']

    # Passing a list as sheet_name makes read_excel return a dict keyed by sheet name.
    all_dfs = pd.read_excel(file_path, sheet_name=sheets, header=0)

    # Columns every sheet must provide.
    required_columns = {"single_family", "multi_family"}

    for sheet_name in sheets:
        if not required_columns.issubset(all_dfs[sheet_name].columns):
            # Sort the names: iterating a set of strings has no stable order,
            # which would make the error text differ from run to run.
            raise ValueError(
                f"Sheet '{sheet_name}' must contain {', '.join(sorted(required_columns))} columns"
            )

    return tuple(all_dfs[sheet_name] for sheet_name in sheets)

In [13]:
# Load the three sheets from the default workbook path; the tuple is
# unpacked in the order the reader function returns it.
(
    customer_building_type_count,
    equipment_count_per_building,
    efficiency_level_breakout,
) = read_in_market_characterization_sheets()

In [14]:
# Parameters
# Columns treated as per-building-type counts/shares in every sheet; all
# other columns are carried through as identifiers.
# NOTE(review): the "_li" suffix presumably marks low-income segments — confirm.
building_types = [
    "single_family",
    "multi_family",
    "single_family_li",
    "multi_family_li",
]

Combine the first and second sheets: the number of customers in each building type is multiplied by the number of pieces of equipment each customer has for that building type.

In [15]:
# Create a Cartesian product: each row in customer_building_type_count × all rows in equipment_count_per_building
# This ensures we have utility information for each equipment/condition combo.
#
# The cross join is built positionally (repeat/tile) instead of with nested
# iterrows() loops: iterrows() does row-by-row Python work and does not
# preserve column dtypes, and an empty input used to yield a frame with no
# columns at all, which broke the merge in the following step.

n_cust_rows = len(customer_building_type_count)
n_equip_rows = len(equipment_count_per_building)

# Row order matches a nested loop with customer rows outermost and equipment
# rows innermost: each customer row is repeated once per equipment row, and
# the equipment table is tiled once per customer row.
cust_rep = customer_building_type_count.iloc[
    np.repeat(np.arange(n_cust_rows), n_equip_rows)
].reset_index(drop=True)
equip_rep = equipment_count_per_building.iloc[
    np.tile(np.arange(n_equip_rows), n_cust_rows)
].reset_index(drop=True)

combined_columns = {}

# Copy non-building-type columns from the customer sheet (e.g., electric_utility, gas_utility)
for col in customer_building_type_count.columns:
    if col not in building_types:
        combined_columns[col] = cust_rep[col]

# Copy non-building-type columns from the equipment sheet (e.g., competition_group, subgroup, condition_name).
# On a name clash the equipment value wins but the column keeps the position
# it first received from the customer sheet.
for col in equipment_count_per_building.columns:
    if col not in building_types:
        combined_columns[col] = equip_rep[col]

# Multiply each building type count: equipment_count × customer_building_count.
# A building-type column missing from either sheet is treated as 0.
for building_type in building_types:
    equip_count = equip_rep[building_type] if building_type in equip_rep.columns else 0
    cust_count = cust_rep[building_type] if building_type in cust_rep.columns else 0
    combined_columns[building_type] = equip_count * cust_count

# Passing the index explicitly lets scalar 0 columns broadcast and keeps the
# column set intact even when there are no rows.
equipment_count_total = pd.DataFrame(combined_columns, index=cust_rep.index)

In [16]:
# Join the efficiency-level breakout onto the combined equipment counts.
# Building-type columns exist on both sides, so they come out of the merge
# suffixed "_eff" (efficiency_level_breakout) and "_equip" (equipment_count_total).
# NOTE(review): the join keys are competition_group + subgroup; condition_name
# is NOT a join key — confirm this is intended.
join_keys = ["competition_group", "subgroup"]
merged_df = efficiency_level_breakout.merge(
    equipment_count_total,
    on=join_keys,
    suffixes=("_eff", "_equip"),
)

# Year-one value per building type = efficiency-side value × equipment-side value.
for building_type in building_types:
    eff_values = merged_df[f"{building_type}_eff"]
    equip_values = merged_df[f"{building_type}_equip"]
    merged_df[f"{building_type}_year_one"] = eff_values * equip_values

# Drop every column whose name ends with "_eff" or "_equip"; only the
# "_year_one" products and the unsuffixed columns remain in df_yr1.
cols_to_drop = [col for col in merged_df.columns if col.endswith(("_eff", "_equip"))]
df_yr1 = merged_df.drop(columns=cols_to_drop)


In [17]:
# Persist the year-one table twice: a pickle for later Python steps and an
# Excel copy for manual inspection. The ./001_output directory must already exist.
pickle_path = "./001_output/df_yr1_001.pkl"
excel_path = "./001_output/df_yr1_001.xlsx"
df_yr1.to_pickle(pickle_path)
df_yr1.to_excel(excel_path)