# Explore representative value(s) for each panel bin label

In [11]:
from pathlib import Path
import pandas as pd
import numpy as np

In [5]:
def undummify_input_data(df, prefix_sep="__"):
    """Collapse dummy columns and attach floor-area bin labels derived from ``sqft``.

    Columns whose name contains ``prefix_sep`` are handed to ``undummify``;
    all remaining columns are appended unchanged on the right of the result.
    Two columns are then written from ``df["sqft"]``:

    * ``geometry_floor_area`` -- one of nine fine bin labels.  Anything below
      500 (negative values included) is labelled ``"0-499"``; rows whose
      ``sqft`` is missing receive no label from this step.
    * ``geometry_floor_area_bin`` -- the fine label mapped onto four coarse bins.

    Args:
        df: Input frame; must contain a ``sqft`` column.
        prefix_sep: Separator that marks a column name as a dummy column.

    Returns:
        A new DataFrame: the undummified columns, the floor-area columns, then
        the untouched columns of ``df``.
    """
    dummy_cols = [name for name in df.columns if prefix_sep in name]
    passthrough_cols = [name for name in df.columns if name not in dummy_cols]
    collapsed = undummify(df[dummy_cols], prefix_sep=prefix_sep)

    # Fine floor-area bins; every label begins with that bin's lower edge.
    area_col = "geometry_floor_area"
    labels = ["0-499", "500-749", "750-999", "1000-1499", "1500-1999", "2000-2499", "2500-2999", "3000-3999", "4000+"]
    lower_edges = [int(label.split("-")[0].split("+")[0]) for label in labels]
    sqft = df["sqft"]

    # Label the open-ended top bin first, then walk downwards.  Each pass
    # relabels every row below the next bin's lower edge, so the last write a
    # row receives is the label of the bin it actually falls in.
    collapsed.loc[sqft >= lower_edges[-1], area_col] = labels[-1]
    for pos in range(len(labels) - 2, -1, -1):
        collapsed.loc[sqft < lower_edges[pos + 1], area_col] = labels[pos]

    # Coarse bins, written as coarse label -> the fine labels it absorbs.
    coarse_members = {
        "0-1499": ("0-499", "500-749", "750-999", "1000-1499"),
        "1500-2499": ("1500-1999", "2000-2499"),
        "2500-3999": ("2500-2999", "3000-3999"),
        "4000+": ("4000+",),
    }
    fine_to_coarse = {
        fine: coarse
        for coarse, members in coarse_members.items()
        for fine in members
    }
    collapsed["geometry_floor_area_bin"] = collapsed["geometry_floor_area"].map(fine_to_coarse)

    return pd.concat([collapsed, df[passthrough_cols]], axis=1)


def undummify(df, prefix_sep="__"):
    """Collapse one-hot ("dummy") columns back into one column per variable.

    A column named ``<prefix><prefix_sep><level>`` is treated as the indicator
    of ``level`` for the variable ``prefix``.  For every prefix, the level
    whose indicator column holds the row-wise maximum is selected (``idxmax``,
    so ties resolve to the first such column).  Columns whose name does not
    contain ``prefix_sep`` are passed through unchanged.

    Args:
        df: DataFrame holding dummy and/or plain columns (string column names).
        prefix_sep: Separator between the variable prefix and the level name.

    Returns:
        DataFrame with one column per distinct prefix, in order of first
        appearance in ``df``.  An input without columns yields an empty
        DataFrame that keeps ``df``'s index.
    """
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            # Pick this variable's dummies by exact "<prefix><sep>" match.
            # ``df.filter(like=col)`` is a substring match, so a prefix such
            # as "a" would also pull in "ba__*" columns and idxmax could
            # return a level that belongs to a different variable.
            stem = col + prefix_sep
            dummy_cols = [name for name in df.columns if name.startswith(stem)]
            undummified = (
                df[dummy_cols]
                .idxmax(axis=1)
                # maxsplit=1 keeps level names that themselves contain the separator
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])

    # pd.concat raises on an empty list; hand back an empty frame instead.
    if not series_list:
        return pd.DataFrame(index=df.index)

    return pd.concat(series_list, axis=1)


def create_categorical_columns_for_input_data(dfd):
    """Turn ``geometry_floor_area`` into an ordered categorical, in place.

    Categories run from the smallest floor-area bin to the largest.  Values
    that are not one of the listed labels become missing.

    Args:
        dfd: DataFrame with a ``geometry_floor_area`` column; it is modified.

    Returns:
        The same ``dfd`` object that was passed in.
    """
    floor_area_order = [
        '0-499',
        '500-749',
        '750-999',
        '1000-1499',
        '1500-1999',
        '2000-2499',
        '2500-2999',
        '3000-3999',
        '4000+',
    ]
    dfd["geometry_floor_area"] = pd.Categorical(
        dfd["geometry_floor_area"],
        categories=floor_area_order,
        ordered=True,
    )
    return dfd

def save_filtered_df_to_file(df, original_filename: Path, output_dir: Path | None):
    new_file = original_filename.stem + "__filtered.csv"
    if output_dir is None:
        new_filename = original_filename.parent / new_file
    else:
        new_filename = output_dir / new_file

    df.to_csv(new_filename, index=False)



### [1] Lookup table (by fuel type and floor area)

In [7]:
# Electric-heating training set: read the CSV, rename the "lt_100" bin label
# to "<100", collapse the dummy columns and order the floor-area bins.
model_training_file = Path("model_20240517/train_data_with_continuous_panel_amp_134078.csv")
dfd1 = pd.read_csv(model_training_file, header=0)
dfd1["panel_amp_pre_bin_7"] = dfd1["panel_amp_pre_bin_7"].replace("lt_100", "<100")
dfd1 = create_categorical_columns_for_input_data(undummify_input_data(dfd1))

# Describe panel_amp_pre per bin, leaving out the single-value bins 100 / 125 / 200.
cond1 = ~dfd1["panel_amp_pre_bin_7"].isin(["100", "125", "200"])
(
    dfd1[cond1]
    .groupby(["panel_amp_pre_bin_7"])["panel_amp_pre"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
panel_amp_pre_bin_7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
101-124,2.0,120.0,0.0,120.0,120.0,120.0,120.0,120.0
126-199,10.0,150.0,0.0,150.0,150.0,150.0,150.0,150.0
201+,11.0,274.545455,80.94611,220.0,225.0,225.0,325.0,400.0
<100,5.0,88.0,4.472136,80.0,90.0,90.0,90.0,90.0


In [8]:
# Non-electric-heating training set: read the CSV, rename the "lt_100" bin
# label to "<100", collapse the dummy columns and order the floor-area bins.
model_training_file = Path("model_20240517/train_data_with_continuous_panel_amp_238518.csv")
dfd2 = pd.read_csv(model_training_file, header=0)
dfd2["panel_amp_pre_bin_7"] = dfd2["panel_amp_pre_bin_7"].replace("lt_100", "<100")
dfd2 = create_categorical_columns_for_input_data(undummify_input_data(dfd2))

# Describe panel_amp_pre per bin, leaving out the single-value bins 100 / 125 / 200.
cond2 = ~dfd2["panel_amp_pre_bin_7"].isin(["100", "125", "200"])
(
    dfd2[cond2]
    .groupby(["panel_amp_pre_bin_7"])["panel_amp_pre"]
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
panel_amp_pre_bin_7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
101-124,106.0,116.509434,4.688883,110.0,110.0,120.0,120.0,120.0
126-199,1034.0,153.805609,10.074741,130.0,150.0,150.0,150.0,195.0
201+,714.0,330.364146,196.709985,205.0,225.0,250.0,400.0,2000.0
<100,438.0,69.417808,17.668868,30.0,60.0,60.0,90.0,90.0


In [9]:
# For 201+, pool electric (n=11) and non-electric (n=714) rows, then segment by geometry floor area.
# NOTE: cond1 / cond2 are reassigned here and no longer hold the earlier "range bins only" masks.
cond1 = dfd1["panel_amp_pre_bin_7"].isin(["201+"])
cond2 = dfd2["panel_amp_pre_bin_7"].isin(["201+"])
res = (
    pd.concat([
        dfd1.loc[cond1, ["panel_amp_pre_bin_7", "geometry_floor_area", "panel_amp_pre"]],
        dfd2.loc[cond2, ["panel_amp_pre_bin_7", "geometry_floor_area", "panel_amp_pre"]],
    ], axis=0)
    # geometry_floor_area is categorical.  observed=False keeps bins that have
    # no rows (count 0, NaN statistics) in the table; it is spelled out because
    # relying on the implicit default for categorical groupers is deprecated.
    .groupby(["panel_amp_pre_bin_7", "geometry_floor_area"], observed=False)["panel_amp_pre"]
    .agg(["count", "mean", "median"])
)

# standardize to nearest 25 (exact halves round to the even multiple, e.g. 312.5 -> 300)
res["mean_standardized"] = (res["mean"]/25).round()*25
res["median_standardized"] = (res["median"]/25).round()*25
res

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,median,mean_standardized,median_standardized
panel_amp_pre_bin_7,geometry_floor_area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
201+,0-499,0,,,,
201+,500-749,13,322.692308,250.0,325.0,250.0
201+,750-999,34,386.029412,312.5,375.0,300.0
201+,1000-1499,159,324.528302,250.0,325.0,250.0
201+,1500-1999,180,293.361111,240.0,300.0,250.0
201+,2000-2499,116,332.241379,245.0,325.0,250.0
201+,2500-2999,78,325.320513,245.0,325.0,250.0
201+,3000-3999,79,376.012658,300.0,375.0,300.0
201+,4000+,66,356.893939,400.0,350.0,400.0


### [2] Weighted standardized values
Pick the standard panel sizes that serve as each bin's representative values.
Each entry is written as value (rank by frequency, 1st = most common), e.g.:
- 126-199:   ([150.0 (1st), 175.0 (2nd)])


In [14]:
# Snap every panel_amp_pre value to one of the standard sizes listed for its
# bin: the size with the smallest relative deviation |size / value - 1| wins.
standard_sizes = {
    "<100": np.array([30, 60, 70, 90]),
    "100": np.array([100]),
    "101-124": np.array([120]),
    "125": np.array([125]),
    "126-199": np.array([150, 175]),
    "200": np.array([200]),
    # 225..375 in steps of 25, followed by 400..1000 in steps of 100
    "201+": np.concatenate([
        np.arange(225, 400, 25),
        np.arange(400, 1100, 100),
    ]),
}

# Stack the electric and non-electric frames, keeping only the columns used below.
dfd = pd.concat(
    [
        frame[["panel_amp_pre_bin_7", "sqft", "geometry_floor_area", "panel_amp_pre"]]
        for frame in (dfd1, dfd2)
    ],
    axis=0,
)

for lab, std_sizes in standard_sizes.items():
    cond = dfd["panel_amp_pre_bin_7"] == lab
    dfd.loc[cond, "panel_amp_pre_std"] = dfd.loc[cond, "panel_amp_pre"].apply(
        lambda amps: std_sizes[np.abs(std_sizes / amps - 1).argmin()]
    )

dfd

Unnamed: 0,panel_amp_pre_bin_7,sqft,geometry_floor_area,panel_amp_pre,panel_amp_pre_std
0,200,1908.0,1500-1999,200,200.0
1,200,924.0,750-999,200,200.0
2,200,1194.0,1000-1499,200,200.0
3,125,1459.0,1000-1499,125,125.0
4,200,1053.0,1000-1499,200,200.0
...,...,...,...,...,...
24299,200,1204.0,1000-1499,200,200.0
24300,125,2150.0,2000-2499,125,125.0
24301,100,1040.0,1000-1499,100,100.0
24302,201+,2650.0,2500-2999,210,225.0


In [16]:
# Per bin: share of rows at each standardized size, packed as a
# (list of sizes, list of probabilities) tuple named "panel_amp_dist".
gb = ["panel_amp_pre_bin_7"]
panel_value_dist = (
    (
        # rows per (bin, standardized size) divided by rows per bin
        dfd.groupby(gb + ["panel_amp_pre_std"])["sqft"].count()
        / dfd.groupby(gb)["sqft"].count()
    )
    .rename("prob")
    .reset_index(level="panel_amp_pre_std")
    .groupby(level=gb)
    .agg({"panel_amp_pre_std": list, "prob": list})
    .agg(tuple, axis=1)
    .rename("panel_amp_dist")
)
panel_value_dist

panel_amp_pre_bin_7
100                                         ([100.0], [1.0])
101-124                                     ([120.0], [1.0])
125                                         ([125.0], [1.0])
126-199    ([150.0, 175.0], [0.8419540229885057, 0.158045...
200                                         ([200.0], [1.0])
201+       ([225.0, 250.0, 275.0, 300.0, 325.0, 350.0, 37...
<100       ([30.0, 60.0, 70.0, 90.0], [0.0361173814898419...
Name: panel_amp_dist, dtype: object

In [17]:
# Save the per-bin distribution as a CSV in the folder of `model_training_file`.
# NOTE(review): `model_training_file` holds whatever an earlier cell assigned last
# (the non-electric training file in this notebook) -- only its parent folder is used.
panel_value_dist.to_csv(model_training_file.parent / "weighted_standardized_panel_bin_values.csv")