Get community building samples from EUSS and renormalize the sample weights

In [41]:
import pandas as pd
from pathlib import Path
import numpy as np
import getpass

import add_envelope_metrics as er
# Look up the login name of whoever is running the notebook; the path-setup
# cell below branches on this value to choose a per-user EUSS data directory.
user = getpass.getuser()
print(f"The user is {user}")

The user is lliu2


In [42]:
# Community to process. The name is interpolated into the downsampled building-ID
# CSV filename and the output directory in later cells.
# Known names (presumably the full set of valid options -- confirm a matching CSV exists):
# ["louisville", "san_jose", "columbia", "north_birmingham", "jackson_county", "duluth",]
community = 'louisville' # TODO change community name

In [43]:
# set path
# Each known user maps to the location of their local copy of the EUSS results.
if user == "ylou2":
    euss_dir = 'data_/euss_res_final_2018_550k_20220901/' # TODO
elif user == "lliu2":
    euss_dir = "/Volumes/Lixi_Liu/euss_aws"
else:
    # Fail immediately with an actionable message; merely printing here would leave
    # `euss_dir` undefined and surface as an unrelated NameError two lines further down.
    raise ValueError(f"new user={user}, set euss_dir path here with an elif statement")

euss_dir = Path(euss_dir).resolve()
output_dir = Path(".").resolve() / "data_" / "community_building_samples" / community
# parents=True so a fresh checkout (no data_/community_building_samples yet) still works.
output_dir.mkdir(parents=True, exist_ok=True)

print(f"euss_dir: {euss_dir}")
print(f"output_dir: {output_dir}")

euss_dir: /Volumes/Lixi_Liu/euss_aws
output_dir: /Users/lliu2/Documents/GitHub/resstock/euss_cleap/data_/community_building_samples/louisville


In [44]:
# Downsampled building IDs (with their community weights) for the chosen community,
# indexed by integer building_id so they can be joined onto the EUSS results.
building_id_weight = (
    pd.read_csv(f'data_/downsampled_buildings_id/euss1_2018_results_up00__downsampled_method1__{community}.csv')
    .astype({'building_id': int})
    .set_index('building_id')
)

print(f"building_id_weight for {community} has {len(building_id_weight)} rows")

# Collect one filtered frame per upgrade. Upgrades are visited from 10 down to 0 so that
# the loop variables left behind (`community_up`, `output_file`) refer to the baseline (up00).
df = []
for i in range(10, -1, -1):
    euss_up = pd.read_parquet(euss_dir / f'results_up{i:02d}.parquet', engine='pyarrow')
    euss_up["building_id"] = euss_up["building_id"].astype(int)

    # Right join keeps exactly the community's building IDs, attaching the weight columns.
    community_up = euss_up.join(building_id_weight, on="building_id", how="right")
    n_downselected = len(community_up)

    # drop failed sims
    is_success = community_up["completed_status"] == "Success"
    community_up = community_up[is_success].reset_index(drop=True)

    output_file = output_dir / f"up{i:02d}.parquet"
    community_up.to_parquet(output_file)
    df.append(community_up)

    print(f" - for up{i:02d}, community_up has {n_downselected} rows after downselection, and {len(community_up)} rows after dropping failed sims")

print("all done!")


building_id_weight for louisville has 1403 rows
 - for up10, community_up has 1403 rows after downselection, and 1403 rows after dropping failed sims
 - for up09, community_up has 1403 rows after downselection, and 1403 rows after dropping failed sims
 - for up08, community_up has 1403 rows after downselection, and 1403 rows after dropping failed sims
 - for up07, community_up has 1403 rows after downselection, and 995 rows after dropping failed sims
 - for up06, community_up has 1403 rows after downselection, and 1400 rows after dropping failed sims
 - for up05, community_up has 1403 rows after downselection, and 1396 rows after dropping failed sims
 - for up04, community_up has 1403 rows after downselection, and 1399 rows after dropping failed sims
 - for up03, community_up has 1403 rows after downselection, and 1399 rows after dropping failed sims
 - for up02, community_up has 1403 rows after downselection, and 1382 rows after dropping failed sims
 - for up01, community_up has 1403 

### Add envelope metrics to baseline

In [45]:
# Add envelope ratings to the baseline results and overwrite the baseline parquet file.
# `community_up` and `output_file` are whatever the loading loop left behind on its final
# iteration; that loop runs the upgrades in reverse so it finishes on the baseline (up00).
# Guard against running this cell with stale or out-of-order state, which would otherwise
# silently rate (and overwrite) the wrong upgrade's results.
if output_file.name != "up00.parquet":
    raise RuntimeError(
        f"expected the baseline file up00.parquet but output_file is {output_file}; "
        "re-run the loading cell before this one"
    )
er.setup_logging(community, output_dir / f"output__envelope_rating__{community}.log")
community_up = er.add_envelope_ratings(community_up, community)
community_up.to_parquet(output_file)


2023-08-16 22:51:37,118 - INFO - Adding envelope metrics to louisville
2023-08-16 22:51:39,209 - INFO - 
                       code-compliant above-average below-average
Frame Wall (R-value)            >= 30    < 30, >= 7           < 7
Masonry Wall (R-value)           >= 8     < 8, >= 5           < 5
Ceiling/Roof (R-value)          >= 60   < 60, >= 14          < 14
Air Leakage (ACH50)              <= 3    > 3, <= 20          > 20
2023-08-16 22:51:39,217 - INFO - 
combined_envelope_rating  wall_rating     ceiling_roof_rating  infiltration_rating
above-average             above-average   above-average        above-average          399
                                                               code-compliant           9
                          code-compliant  above-average        above-average           21
                                                               code-compliant           1
below-average             above-average   above-average        below-average           8

### Get summary

In [46]:
def weighted_mean(x: pd.DataFrame, metric: str) -> pd.Series:
    """Sample-weighted mean of one column of a frame.

    Parameters
    ----------
    x : DataFrame holding the `metric` column and a "sample_weight" column
        (typically one group passed in by `groupby.apply`).
    metric : name of the column to average.

    Returns
    -------
    One-element Series labelled "<metric> wt_mean". The value is 0 when the
    metric column has no non-null entries.
    """
    has_value = x[metric].notna()
    if not has_value.any():
        res = 0
    else:
        # Restrict BOTH numerator and denominator to rows that have a metric value.
        # Summing all weights in the denominator while NaN rows drop out of the
        # numerator would bias the mean towards zero.
        values = x.loc[has_value, metric]
        weights = x.loc[has_value, "sample_weight"]
        res = (values * weights).sum() / weights.sum()
    col_name = f"{metric} wt_mean"
    return pd.Series({col_name: res})

def weighted_percentile(x: pd.DataFrame, metric: str, percents: float) -> pd.Series:
    """Sample-weighted percentile of one column of a frame.

    Parameters
    ----------
    x : DataFrame holding the `metric` column and a "sample_weight" column
        (typically one group passed in by `groupby.apply`).
    metric : name of the column to summarise.
    percents : requested percentile as a fraction in 0 - 1 (e.g. 0.25), not 0 - 100.
        The value is embedded verbatim in the output label.

    Returns
    -------
    One-element Series labelled "<metric> wt_p<percents>".
    """
    col_name = f"{metric} wt_p{percents}"
    res = _weighted_percentile(x[metric], percents, weights=x["sample_weight"])
    return pd.Series({col_name: res})

def _weighted_percentile(data, percents: int, weights=None):
    """
    perc : percentile in 0 - 1
    """
    # dropna
    idx = ~np.isnan(data) & ~np.isnan(weights)
    data = np.array(data)[idx]
    if len(data) == 0:
        return np.nan
    if weights is None:
        return np.percentile(data, percents)
    weights = np.array(weights)[idx]
    ix = np.argsort(data)
    data = data[ix] # sort data
    weights = weights[ix] # sort weights
    cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights) # 'like' a CDF function
    return np.interp(percents, cdf, data)


In [47]:
# Stack the per-upgrade frames collected by the loading loop into one long table.
# The isinstance guard makes this cell safe to re-run: once `df` has been rebound to a
# DataFrame, passing it to pd.concat again would raise instead of being a no-op.
if isinstance(df, list):
    df = pd.concat(df, axis=0)
# Baseline rows have no upgrade name, so label them explicitly for the group-bys below.
df["apply_upgrade.upgrade_name"] = df["apply_upgrade.upgrade_name"].fillna("Baseline")

  df = pd.concat(df, axis=0)


In [48]:
# Per-upgrade summary of total site energy use: sample count, represented dwelling
# count, weighted mean and weighted quartiles.
metric = 'report_simulation_output.energy_use_total_m_btu'
dfgb = df.groupby(["apply_upgrade.upgrade_name"])
summary_columns = [
    dfgb["sample_weight"].count().rename("n_samples"),
    dfgb["sample_weight"].sum().rename("n_represented"),
    dfgb.apply(weighted_mean, metric=metric),
    dfgb.apply(weighted_percentile, metric=metric, percents=0.25),
    dfgb.apply(weighted_percentile, metric=metric, percents=0.75),
]
pd.concat(summary_columns, axis=1)

Unnamed: 0_level_0,n_samples,n_represented,report_simulation_output.energy_use_total_m_btu wt_mean,report_simulation_output.energy_use_total_m_btu wt_p0.25,report_simulation_output.energy_use_total_m_btu wt_p0.75
apply_upgrade.upgrade_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,1403,357900.0,99.050604,51.73,132.3245
Basic enclosure,1370,349481.824661,77.321003,43.793,100.918
Enhanced enclosure,1382,352542.97933,72.563901,42.064,94.428
Heat pump water heaters,1400,357134.711333,92.997565,47.546,124.8275
"Heat pumps, high-efficiency, electric backup",1399,356879.61511,49.409461,31.81075,63.7455
"Heat pumps, min-efficiency, electric backup",1399,356879.61511,64.649485,39.784,82.923
"Heat pumps, min-efficiency, existing heating as backup",1396,356114.326443,77.568355,42.2165,101.115
"Whole-home electrification, high efficiency",1403,357900.0,38.683488,24.3915,49.39825
"Whole-home electrification, high efficiency + basic enclosure package (packages 1 & 8)",1403,357900.0,33.565047,22.072,42.30275
"Whole-home electrification, high efficiency + enhanced enclosure package (packages 2 & 8)",1403,357900.0,32.903318,21.7345,41.739


In [49]:
# Per-upgrade summary of total electricity use: sample count, represented dwelling
# count, weighted mean and weighted quartiles.
metric = 'report_simulation_output.fuel_use_electricity_total_m_btu'
dfgb = df.groupby(["apply_upgrade.upgrade_name"])
summary_columns = [
    dfgb["sample_weight"].count().rename("n_samples"),
    dfgb["sample_weight"].sum().rename("n_represented"),
    dfgb.apply(weighted_mean, metric=metric),
    dfgb.apply(weighted_percentile, metric=metric, percents=0.25),
    dfgb.apply(weighted_percentile, metric=metric, percents=0.75),
]
pd.concat(summary_columns, axis=1)

Unnamed: 0_level_0,n_samples,n_represented,report_simulation_output.fuel_use_electricity_total_m_btu wt_mean,report_simulation_output.fuel_use_electricity_total_m_btu wt_p0.25,report_simulation_output.fuel_use_electricity_total_m_btu wt_p0.75
apply_upgrade.upgrade_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,1403,357900.0,45.090349,25.595,55.4985
Basic enclosure,1370,349481.824661,39.364696,23.921,49.185
Enhanced enclosure,1382,352542.97933,37.987543,23.102,47.616
Heat pump water heaters,1400,357134.711333,44.186119,25.7145,53.8865
"Heat pumps, high-efficiency, electric backup",1399,356879.61511,40.62097,26.03125,51.6815
"Heat pumps, min-efficiency, electric backup",1399,356879.61511,55.867255,34.20525,71.31125
"Heat pumps, min-efficiency, existing heating as backup",1396,356114.326443,47.229978,30.3735,59.6785
"Whole-home electrification, high efficiency",1403,357900.0,38.434326,24.272,49.32075
"Whole-home electrification, high efficiency + basic enclosure package (packages 1 & 8)",1403,357900.0,33.315881,21.98125,42.09675
"Whole-home electrification, high efficiency + enhanced enclosure package (packages 2 & 8)",1403,357900.0,32.654151,21.526,41.40275


In [50]:
# Per-upgrade summary of total natural gas use: sample count, represented dwelling
# count, weighted mean and weighted quartiles.
metric = 'report_simulation_output.fuel_use_natural_gas_total_m_btu'
dfgb = df.groupby(["apply_upgrade.upgrade_name"])
summary_columns = [
    dfgb["sample_weight"].count().rename("n_samples"),
    dfgb["sample_weight"].sum().rename("n_represented"),
    dfgb.apply(weighted_mean, metric=metric),
    dfgb.apply(weighted_percentile, metric=metric, percents=0.25),
    dfgb.apply(weighted_percentile, metric=metric, percents=0.75),
]
pd.concat(summary_columns, axis=1)

Unnamed: 0_level_0,n_samples,n_represented,report_simulation_output.fuel_use_natural_gas_total_m_btu wt_mean,report_simulation_output.fuel_use_natural_gas_total_m_btu wt_p0.25,report_simulation_output.fuel_use_natural_gas_total_m_btu wt_p0.75
apply_upgrade.upgrade_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,1403,357900.0,53.459465,0.0,86.1075
Basic enclosure,1370,349481.824661,37.600947,0.0,61.094
Enhanced enclosure,1382,352542.97933,34.236521,0.0,54.528
Heat pump water heaters,1400,357134.711333,48.363874,0.0,76.9075
"Heat pumps, high-efficiency, electric backup",1399,356879.61511,8.673839,0.0,14.65925
"Heat pumps, min-efficiency, electric backup",1399,356879.61511,8.667723,0.0,14.63075
"Heat pumps, min-efficiency, existing heating as backup",1396,356114.326443,30.052929,0.0,46.858
"Whole-home electrification, high efficiency",1403,357900.0,0.249167,0.0,0.0
"Whole-home electrification, high efficiency + basic enclosure package (packages 1 & 8)",1403,357900.0,0.249167,0.0,0.0
"Whole-home electrification, high efficiency + enhanced enclosure package (packages 2 & 8)",1403,357900.0,0.249167,0.0,0.0


In [49]:
# TODO: update TOU / tiered files without rerunning buildstock-query
# import pandas as pd
# file1 = "/Users/lliu2/Documents/GitHub/resstock/euss_cleap/data_/community_building_samples_with_upgrade_cost_and_bill/north_birmingham/processed_upgrade_results.parquet"
# file2 = "/Users/lliu2/Documents/GitHub/resstock/euss_cleap/data_/community_building_samples_with_upgrade_cost_and_bill/north_birmingham/processed_upgrade_results_tiered.parquet"
# df1 = pd.read_parquet(file1)
# df2 = pd.read_parquet(file2)

# df2_cols = df2.columns
# new_cols = [col for col in df2.columns if col not in df1.columns]
# df1["idx"] = df1["building_id"].astype(str).str.cat(df1["upgrade_name"])
# df2["idx"] = df2["building_id"].astype(str).str.cat(df2["upgrade_name"])

# df3 = df1.join(df2.set_index(["idx"])[new_cols], how="left").drop(columns=["idx"])

# print(set(df3.columns)-(set(df2_cols)))
# print(set(df2_cols)-(set(df3.columns)))
# assert len(df1) == len(df3)

# df3.to_parquet(file2)