Get community building samples from EUSS and renormalize the sample weights

In [11]:
import pandas as pd
from pathlib import Path
import numpy as np
import getpass

import add_envelope_metrics as er
# Look up the OS account name running the notebook; the "set path" cell
# branches on `user` to pick the EUSS data directory.
user = getpass.getuser()
print(f"The user is {user}")

The user is ylou2


In [12]:
# Community to process. Names listed by the original author:
#   louisville, san_jose, columbia, north_birmingham, jackson_county, duluth, lawrence
# TODO change community name
community = "north_birmingham"

In [13]:
# set path
# Pick the EUSS results directory for whoever is running the notebook.
if user == "ylou2":
    euss_dir = 'data_/euss_res_final_2018_550k_20220901/' # TODO
elif user == "lliu2":
    euss_dir = "/Volumes/Lixi_Liu/euss_aws"
else:
    # Stop here with an actionable message. Only printing would leave `euss_dir`
    # undefined and the next statement would fail with an unrelated NameError.
    raise ValueError(f"new user={user}, set euss_dir path here with an elif statement")

euss_dir = Path(euss_dir).resolve()
output_dir = Path(".").resolve() / "data_" / "community_building_samples" / community
# parents=True so a fresh checkout (no data_/community_building_samples yet) works;
# exist_ok=True keeps the cell re-runnable.
output_dir.mkdir(parents=True, exist_ok=True)

print(f"euss_dir: {euss_dir}")
print(f"output_dir: {output_dir}")

euss_dir: C:\Users\ylou2\Desktop\resstock\euss_cleap\data_\euss_res_final_2018_550k_20220901
output_dir: C:\Users\ylou2\Desktop\resstock\euss_cleap\data_\community_building_samples\north_birmingham


In [14]:
# Downsampled building table for the selected community, keyed by integer
# building_id so it can be joined onto each EUSS results table.
building_id_weight = (
    pd.read_csv(f'data_/downsampled_buildings_id/euss1_2018_results_up00__downsampled_method1__{community}.csv')
    .astype({"building_id": int})
    .set_index("building_id")
)
print(f"building_id_weight for {community} has {len(building_id_weight)} rows")

# Per-upgrade tables are collected here and concatenated in a later cell.
df = []

# Iterate upgrades 10 -> 0 on purpose: when the loop finishes, `community_up`
# and `output_file` refer to up00, which the "Add envelope metrics to baseline"
# cell picks up by name.
for i in range(10, -1, -1):
    euss_up = pd.read_parquet(euss_dir / f'results_up{i:02d}.parquet', engine='pyarrow')
    euss_up["building_id"] = euss_up["building_id"].astype(int)

    # Right join: keep exactly the buildings listed in building_id_weight.
    community_up = euss_up.join(building_id_weight, on="building_id", how="right")
    n_downselected = len(community_up)

    # drop failed sims
    community_up = community_up[community_up["completed_status"].eq("Success")].reset_index(drop=True)

    output_file = output_dir / f"up{i:02d}.parquet"
    community_up.to_parquet(output_file)
    df.append(community_up)

    print(f" - for up{i:02d}, community_up has {n_downselected} rows after downselection, and {len(community_up)} rows after dropping failed sims")

print("all done!")


building_id_weight for north_birmingham has 1158 rows
 - for up10, community_up has 1158 rows after downselection, and 1158 rows after dropping failed sims
 - for up09, community_up has 1158 rows after downselection, and 1158 rows after dropping failed sims
 - for up08, community_up has 1158 rows after downselection, and 1158 rows after dropping failed sims
 - for up07, community_up has 1158 rows after downselection, and 613 rows after dropping failed sims
 - for up06, community_up has 1158 rows after downselection, and 1149 rows after dropping failed sims
 - for up05, community_up has 1158 rows after downselection, and 1157 rows after dropping failed sims
 - for up04, community_up has 1158 rows after downselection, and 1155 rows after dropping failed sims
 - for up03, community_up has 1158 rows after downselection, and 1155 rows after dropping failed sims
 - for up02, community_up has 1158 rows after downselection, and 1148 rows after dropping failed sims
 - for up01, community_up has

### Add envelope metrics to baseline

In [15]:
# NOTE: `community_up` and `output_file` are not defined in this cell. They are
# whatever the loading loop left behind on its last iteration (i == 0, i.e. the
# up00 table and up00.parquet). Re-run that loop before re-running this cell.
er.setup_logging(community, output_dir / f"output__envelope_rating__{community}.log")
# presumably returns the table with envelope rating columns added — confirm in add_envelope_metrics
community_up = er.add_envelope_ratings(community_up, community)
# Overwrites the parquet file the loop wrote for up00 with the returned table.
# NOTE(review): the up00 table was appended to `df` before this call; whether that
# copy carries the ratings depends on whether add_envelope_ratings mutates in place — confirm.
community_up.to_parquet(output_file)


2023-09-07 21:41:17,968 - INFO - Adding envelope metrics to north_birmingham
2023-09-07 21:41:24,187 - INFO - 
                       code-compliant above-average below-average
Frame Wall (R-value)            >= 20    < 20, >= 8           < 8
Masonry Wall (R-value)           >= 8     < 8, >= 8           < 8
Ceiling/Roof (R-value)          >= 49   < 49, >= 12          < 12
Air Leakage (ACH50)              <= 3    > 3, <= 17          > 17
2023-09-07 21:41:24,207 - INFO - 
combined_envelope_rating  wall_rating     ceiling_roof_rating  infiltration_rating
above-average             above-average   above-average        above-average          340
                                                               code-compliant           5
                                          code-compliant       above-average            9
                          code-compliant  above-average        above-average           43
                                                               code-compliant     

### Get summary

In [16]:
def weighted_mean(x, metric: str):
    """Sample-weighted mean of ``x[metric]`` as a one-element Series.

    Parameters
    ----------
    x : DataFrame (or groupby group) containing ``metric`` and "sample_weight".
    metric : name of the column to average.

    Returns
    -------
    pd.Series with a single entry labelled ``"{metric} wt_mean"``. The value is 0
    when the metric column has no non-null values (kept from the original
    behaviour).

    Only rows where both the metric and the weight are present contribute.
    Previously the numerator skipped rows with a missing metric while the
    denominator still counted their weights, biasing the mean toward zero.
    """
    values = x[metric]
    weights = x["sample_weight"]
    if values.notna().sum() == 0:
        res = 0
    else:
        # Positional boolean mask: same rows feed numerator and denominator.
        usable = (values.notna() & weights.notna()).to_numpy()
        res = (values[usable] * weights[usable]).sum() / weights[usable].sum()
    col_name = f"{metric} wt_mean"
    return pd.Series({col_name: res})

def weighted_percentile(x, metric: str, percents: int):
    """Sample-weighted percentile of ``x[metric]`` as a one-element Series.

    ``percents`` is forwarded unchanged to ``_weighted_percentile`` (callers in
    this notebook pass fractions such as 0.25) and is also embedded in the
    label, ``"{metric} wt_p{percents}"``. Weights come from ``x["sample_weight"]``.
    """
    value = _weighted_percentile(x[metric], percents, weights=x["sample_weight"])
    return pd.Series({f"{metric} wt_p{percents}": value})

def _weighted_percentile(data, percents: int, weights=None):
    """
    perc : percentile in 0 - 1
    """
    # dropna
    idx = ~np.isnan(data) & ~np.isnan(weights)
    data = np.array(data)[idx]
    if len(data) == 0:
        return np.nan
    if weights is None:
        return np.percentile(data, percents)
    weights = np.array(weights)[idx]
    ix = np.argsort(data)
    data = data[ix] # sort data
    weights = weights[ix] # sort weights
    cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights) # 'like' a CDF function
    return np.interp(percents, cdf, data)


In [17]:
# Stack the per-upgrade tables collected by the loading loop into one frame.
# The isinstance guard keeps this cell re-runnable: once `df` is already the
# combined DataFrame, calling pd.concat on it again would raise a TypeError.
if not isinstance(df, pd.DataFrame):
    df = pd.concat(df, axis=0)
# Rows without an upgrade name are labelled "Baseline".
df["apply_upgrade.upgrade_name"] = df["apply_upgrade.upgrade_name"].fillna("Baseline")

In [18]:
# Per-upgrade summary for the total energy use column: sample count, summed
# sample weight, weighted mean, and weighted 25th / 75th percentiles.
metric = 'report_simulation_output.energy_use_total_m_btu'
dfgb = df.groupby(["apply_upgrade.upgrade_name"])
pd.concat(
    [
        dfgb["sample_weight"].agg("count").rename("n_samples"),
        dfgb["sample_weight"].agg("sum").rename("n_represented"),
        # groupby.apply forwards extra keyword arguments to the function
        dfgb.apply(weighted_mean, metric=metric),
        dfgb.apply(weighted_percentile, metric=metric, percents=0.25),
        dfgb.apply(weighted_percentile, metric=metric, percents=0.75),
    ],
    axis=1,
)

Unnamed: 0_level_0,n_samples,n_represented,report_simulation_output.energy_use_total_m_btu wt_mean,report_simulation_output.energy_use_total_m_btu wt_p0.25,report_simulation_output.energy_use_total_m_btu wt_p0.75
apply_upgrade.upgrade_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,1158,12445.0,75.385937,41.481512,98.03718
Basic enclosure,1144,12318.166937,59.768032,36.709187,76.602376
Enhanced enclosure,1148,12340.998249,57.1637,35.97471,73.153886
Heat pump water heaters,1149,12368.694344,69.719602,37.479341,90.724393
"Heat pumps, high-efficiency, electric backup",1155,12433.212256,43.146241,28.414643,55.261429
"Heat pumps, min-efficiency, electric backup",1155,12433.212256,55.115922,35.316459,70.296526
"Heat pumps, min-efficiency, existing heating as backup",1157,12421.781716,61.219991,36.429033,78.695125
"Whole-home electrification, high efficiency",1158,12445.0,34.240805,23.195264,43.37663
"Whole-home electrification, high efficiency + basic enclosure package (packages 1 & 8)",1158,12445.0,30.45142,20.91583,38.604771
"Whole-home electrification, high efficiency + enhanced enclosure package (packages 2 & 8)",1158,12445.0,29.914181,20.230896,38.047975


In [19]:
# Per-upgrade summary for the total electricity use column: sample count,
# summed sample weight, weighted mean, and weighted 25th / 75th percentiles.
metric = 'report_simulation_output.fuel_use_electricity_total_m_btu'
dfgb = df.groupby(["apply_upgrade.upgrade_name"])
pd.concat(
    [
        dfgb["sample_weight"].agg("count").rename("n_samples"),
        dfgb["sample_weight"].agg("sum").rename("n_represented"),
        # groupby.apply forwards extra keyword arguments to the function
        dfgb.apply(weighted_mean, metric=metric),
        dfgb.apply(weighted_percentile, metric=metric, percents=0.25),
        dfgb.apply(weighted_percentile, metric=metric, percents=0.75),
    ],
    axis=1,
)

Unnamed: 0_level_0,n_samples,n_represented,report_simulation_output.fuel_use_electricity_total_m_btu wt_mean,report_simulation_output.fuel_use_electricity_total_m_btu wt_p0.25,report_simulation_output.fuel_use_electricity_total_m_btu wt_p0.75
apply_upgrade.upgrade_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,1158,12445.0,48.884882,29.371278,61.605286
Basic enclosure,1144,12318.166937,41.421595,25.985415,52.953251
Enhanced enclosure,1148,12340.998249,40.332706,25.452894,50.901756
Heat pump water heaters,1149,12368.694344,47.170746,28.553035,58.155272
"Heat pumps, high-efficiency, electric backup",1155,12433.212256,36.277911,24.907712,46.094542
"Heat pumps, min-efficiency, electric backup",1155,12433.212256,48.249339,31.878224,60.892838
"Heat pumps, min-efficiency, existing heating as backup",1157,12421.781716,45.203973,30.561132,56.936911
"Whole-home electrification, high efficiency",1158,12445.0,33.731704,22.932491,42.915697
"Whole-home electrification, high efficiency + basic enclosure package (packages 1 & 8)",1158,12445.0,29.942313,20.70082,38.252037
"Whole-home electrification, high efficiency + enhanced enclosure package (packages 2 & 8)",1158,12445.0,29.405076,20.035852,37.742843


In [20]:
# Per-upgrade summary for the total natural gas use column: sample count,
# summed sample weight, weighted mean, and weighted 25th / 75th percentiles.
metric = 'report_simulation_output.fuel_use_natural_gas_total_m_btu'
dfgb = df.groupby(["apply_upgrade.upgrade_name"])
pd.concat(
    [
        dfgb["sample_weight"].agg("count").rename("n_samples"),
        dfgb["sample_weight"].agg("sum").rename("n_represented"),
        # groupby.apply forwards extra keyword arguments to the function
        dfgb.apply(weighted_mean, metric=metric),
        dfgb.apply(weighted_percentile, metric=metric, percents=0.25),
        dfgb.apply(weighted_percentile, metric=metric, percents=0.75),
    ],
    axis=1,
)

Unnamed: 0_level_0,n_samples,n_represented,report_simulation_output.fuel_use_natural_gas_total_m_btu wt_mean,report_simulation_output.fuel_use_natural_gas_total_m_btu wt_p0.25,report_simulation_output.fuel_use_natural_gas_total_m_btu wt_p0.75
apply_upgrade.upgrade_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,1158,12445.0,25.807744,0.0,46.299963
Basic enclosure,1144,12318.166937,17.813685,0.0,31.83008
Enhanced enclosure,1148,12340.998249,16.341939,0.0,28.499566
Heat pump water heaters,1149,12368.694344,22.056539,0.0,38.285416
"Heat pumps, high-efficiency, electric backup",1155,12433.212256,6.536476,0.0,11.512533
"Heat pumps, min-efficiency, electric backup",1155,12433.212256,6.534936,0.0,11.512533
"Heat pumps, min-efficiency, existing heating as backup",1157,12421.781716,15.492019,0.0,26.624412
"Whole-home electrification, high efficiency",1158,12445.0,0.509106,0.0,0.0
"Whole-home electrification, high efficiency + basic enclosure package (packages 1 & 8)",1158,12445.0,0.509106,0.0,0.0
"Whole-home electrification, high efficiency + enhanced enclosure package (packages 2 & 8)",1158,12445.0,0.509106,0.0,0.0


In [21]:
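# TODO: update TOU / tiered files without rerunning buildstock-query
# NOTE(review): the snippet below is disabled scratch code. Its `df1.join(...)` has no `on="idx"`,
# so it would match on df1's existing index rather than the "idx" key — confirm before re-enabling.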
# import pandas as pd
# file1 = "/Users/lliu2/Documents/GitHub/resstock/euss_cleap/data_/community_building_samples_with_upgrade_cost_and_bill/north_birmingham/processed_upgrade_results.parquet"
# file2 = "/Users/lliu2/Documents/GitHub/resstock/euss_cleap/data_/community_building_samples_with_upgrade_cost_and_bill/north_birmingham/processed_upgrade_results_tiered.parquet"
# df1 = pd.read_parquet(file1)
# df2 = pd.read_parquet(file2)

# df2_cols = df2.columns
# new_cols = [col for col in df2.columns if col not in df1.columns]
# df1["idx"] = df1["building_id"].astype(str).str.cat(df1["upgrade_name"])
# df2["idx"] = df2["building_id"].astype(str).str.cat(df2["upgrade_name"])

# df3 = df1.join(df2.set_index(["idx"])[new_cols], how="left").drop(columns=["idx"])

# print(set(df3.columns)-(set(df2_cols)))
# print(set(df2_cols)-(set(df3.columns)))
# assert len(df1) == len(df3)

# df3.to_parquet(file2)