In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

In [21]:
base_dir = "../logs/ensemble/base_ensemble/01_27_2025-12_43_22"

In [22]:
os.listdir(base_dir)

['xgb', 'rf', 'lgb']

In [23]:
cv_res = pd.read_csv(os.path.join(base_dir, 'rf/cv_report.csv'))
cv_res

Unnamed: 0,col,precision,recall,f1,support
0,Active_Power_Sensor,0.174897,0.273752,0.213434,
1,Air_Flow_Sensor,0.182297,0.272370,0.218411,
2,Air_Flow_Setpoint,0.098485,0.144444,0.117117,
3,Air_Temperature_Sensor,0.264496,0.298428,0.280440,
4,Air_Temperature_Setpoint,0.232540,0.277473,0.253027,
...,...,...,...,...,...
90,Zone_Air_Dewpoint_Sensor,0.301282,0.261111,0.279762,
91,Zone_Air_Humidity_Sensor,0.318627,0.300926,0.309524,
92,Zone_Air_Humidity_Setpoint,0.000000,0.000000,0.000000,
93,Zone_Air_Temperature_Sensor,0.289678,0.294362,0.292001,


In [302]:
import pandas as pd
from typing import List

def merge_mean(df_list):
    """
    Given a list of DataFrames with identical columns and index,
    return one DataFrame whose numeric columns are the mean of all
    those numeric columns across the list.

    The "filename" column is assumed to be the same or can be taken
    from the first DataFrame.
    """
    # Make a copy of the first DataFrame to initialize the output
    merged_df = df_list[0].copy()

    # Identify all numeric columns (excluding "filename")
    numeric_cols = merged_df.columns.drop("filename")

    # Sum up the numeric columns from all other DataFrames
    for df in df_list[1:]:
        merged_df[numeric_cols] += df[numeric_cols]

    # Divide by the number of DataFrames to get the mean
    merged_df[numeric_cols] /= len(df_list)

    return merged_df


def weighted_merge_mean(df_list, weights: List[dict]):
    """
    Given a list of DataFrames with identical columns and index,
    return one DataFrame whose numeric columns are the mean of all
    those numeric columns across the list.

    The "filename" column is assumed to be the same or can be taken
    from the first DataFrame.
    """
    # Make a copy of the first DataFrame to initialize the output
    merged_df = df_list[0].copy()

    # Identify all numeric columns (excluding "filename")
    numeric_cols = merged_df.columns.drop("filename")

    # Sum up the numeric columns from all other DataFrames
    for idx, df in enumerate(df_list):
        for col in numeric_cols:
            if idx == 0:
                merged_df[col] = df[col] * weights[idx][col]
            else:
                merged_df[col] += df[col] * weights[idx][col]
    
    for col in numeric_cols:
        merged_df[col] = merged_df[col].round(3)
        merged_df[col] = merged_df[col].fillna(0.0)
        
    return merged_df



In [36]:
def check_pred_num(_final_res, thr=0.4):
    # Exclude 'filename' column if it exists
    filtered_df = _final_res.drop(columns=['filename'], errors='ignore')

    return (filtered_df >= thr).sum(axis=1)

In [37]:
m_res = merge_mean(avg)

In [38]:
check_pred_num(m_res).value_counts()

2     85307
3     74616
4     58612
1     52155
5     31619
6      9940
7      2610
8       726
9       119
10       16
Name: count, dtype: int64

# Ensemble

### Get importance of each model of each class according to CV

In [206]:
LABEL_NAMES = [
    'Active_Power_Sensor', 'Air_Flow_Sensor',
    'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
    'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
    'Average_Zone_Air_Temperature_Sensor',
    'Chilled_Water_Differential_Temperature_Sensor',
    'Chilled_Water_Return_Temperature_Sensor',
    'Chilled_Water_Supply_Flow_Sensor',
    'Chilled_Water_Supply_Temperature_Sensor', 'Command',
    'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
    'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
    'Cooling_Temperature_Setpoint', 'Current_Sensor',
    'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
    'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
    'Differential_Pressure_Setpoint',
    'Differential_Supply_Return_Water_Temperature_Sensor',
    'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
    'Discharge_Air_Temperature_Setpoint',
    'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
    'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
    'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
    'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
    'Heating_Supply_Air_Temperature_Deadband_Setpoint',
    'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
    'Hot_Water_Return_Temperature_Sensor',
    'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
    'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
    'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
    'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
    'Outside_Air_Humidity_Sensor',
    'Outside_Air_Lockout_Temperature_Setpoint',
    'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
    'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
    'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
    'Reactive_Power_Sensor', 'Reset_Setpoint',
    'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
    'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
    'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
    'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
    'Supply_Air_Static_Pressure_Sensor',
    'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
    'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
    'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
    'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
    'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
    'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
    'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
    'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
    'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [209]:
xgb_val = pd.read_csv("../logs/ensemble/high_prec_val/xgb.csv")
lgb_val = pd.read_csv("../logs/ensemble/high_prec_val/lgb.csv")
rf_val = pd.read_csv("../logs/ensemble/high_prec_val/rf.csv")

In [457]:
[g_df.drop(columns=["fold_idx", "dataset_idx"]) for _, g_df in xgb_val.groupby("dataset_idx")][0]

train_sets = []
for _, g_df in xgb_val.groupby("dataset_idx"):
    g_df = g_df.drop(columns=["fold_idx", "dataset_idx"])
    g_df = g_df.rename(columns={col: f"{col}_S1" for col in g_df.columns})
    train_sets.append(g_df)

train_sets[0]

[       0_Absolute energy_S1  0_Area under the curve_S1  0_Autocorrelation_S1   
 0                     795.0                      828.0                 741.0  \
 1                     801.0                      857.0                 734.0   
 2                     893.0                      899.0                 875.0   
 3                     969.0                      974.0                   0.0   
 4                     939.0                      952.0                   1.0   
 ...                     ...                        ...                   ...   
 31834                 998.0                      998.0                 994.0   
 31835                 478.0                      527.0                   1.0   
 31836                 860.0                      866.0                   0.0   
 31837                 921.0                      921.0                   0.0   
 31838                 576.0                      645.0                   0.0   
 
        0_Average power_S1

In [247]:
from datetime import datetime
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

LABEL_TIERS = 5

def get_test_agg(test_preds_list):
    test_level_agg = []
    for _level in tqdm(range(LABEL_TIERS), desc=f"[{datetime.now()}] Aggregating predictions"):
        _level_res = pd.concat(test_preds_list[_level], axis=1).groupby(level=0, axis=1).mean()
        assert not _level_res.isna().values.any()

        for col in _level_res.columns:
            _level_res = _level_res.rename(columns={col: f"{col}_{_level}"})

        test_level_agg.append(_level_res)

    return test_level_agg

def get_stacked_res(test_level_agg):
    stacked = np.stack(
        test_level_agg[i].idxmax(axis=1).apply(lambda x: x[:-2])
        for i in range(LABEL_TIERS)
    ).transpose()

    for row in tqdm(stacked, desc=f"[{datetime.now()}] Postprocessing Nones"):
        # Find first occurrence of 'None' if any
        none_idx = np.where(row == 'None')[0]
        if len(none_idx) > 0:
            # Set all elements after first None to None
            first_none = none_idx[0]
            row[first_none:] = 'None'
            
    return stacked

def post_processing(test_preds, columnlist, listtestfile):
    stackedfinalresult = pd.DataFrame(columns=['filename'])
    stackedfinalresult['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

    for labelname in columnlist:
        stackedfinalresult[labelname] = 0

    for i in tqdm(range(len(test_preds)), desc=f"[{datetime.now()}] Preparing final result file"):
        predlist = test_preds[i].tolist()
        predlist = [x for x in predlist if x != 'None']
        for predlabelname in predlist:
            stackedfinalresult.loc[i, predlabelname] = 1

    stackedfinalresult = stackedfinalresult.assign(**{col: stackedfinalresult[col].astype(float) for col in stackedfinalresult.columns if col != "filename"})
    
    return stackedfinalresult

def evaluate(label_df, pred_df):
    report = []
    for col in label_df:
        if col == "filename": continue

        col_eval = pd.DataFrame({"label": label_df[col], "pred": pred_df[col]})
        col_eval = col_eval[col_eval["label"] != 0]
        col_eval['label'] = (col_eval['label'] > 0).astype(int)
        col_eval['pred'] = (col_eval['pred'] > 0.5).astype(int)

        # Compute precision, recall, and f1-score
        precision, recall, f1, support = precision_recall_fscore_support(
            col_eval["label"],
            col_eval["pred"],
            average="binary",
            zero_division=0  # to handle divisions by zero if any
        )

        
        try:
            auroc = roc_auc_score(
                col_eval["label"],
                col_eval["pred"],
            )
        except:
            print(f"{col} auroc failed, replace with 1")
            auroc = 1
        
        # Add results to the report list
        report.append({
            "col": col,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "auroc": auroc,
            "support": support
        })
    
    # Convert the report list to a DataFrame
    report_df = pd.DataFrame(report)
    
    # Calculate averages (mean) for precision, recall, f1, and sum for support
    avg_row = {
        "col": "AVERAGE",
        "precision": report_df["precision"].mean(),
        "recall": report_df["recall"].mean(),
        "f1": report_df["f1"].mean(),
        "auroc": auroc,
        "support": report_df["support"].sum()
    }

    print(f"[{datetime.now()}] Avg Precision: {avg_row['precision']:.3f}, Recall {avg_row['recall']:.3f}, F1 {avg_row['f1']}, AUROC {avg_row['auroc']}")
    report_df = pd.concat([report_df, pd.DataFrame([avg_row])], ignore_index=True)

    return report_df

def evaluation(train_y: pd.DataFrame, val_preds, label_start_col_idx=90):
    """
    Macro precision, recall and f1 evaluation
    """
    # Get columns to split tiers
    col_tiers = [[] for _ in range(LABEL_TIERS)]

    for col in list(val_preds.columns)[label_start_col_idx:]:
        col_tiers[int(col[-1])].append(col)

    # keep the label related columns
    # one more column for dataset_idx, one more for fold_idx
    val_preds = [
        g_df.sort_values("fold_idx").iloc[:, label_start_col_idx:]
        for _, g_df in val_preds.groupby("dataset_idx")
    ]

    # Prepare a list of list
    # The first level is tiers, and the second level is prediction from each dataset
    val_preds_list = []
    for cols in col_tiers:
        tier_preds = []
        for dataset_preds in val_preds:
            _df = dataset_preds[cols]
            tier_preds.append(_df.rename(columns={col: col[:-2] for col in _df.columns}).reset_index(drop=True))

        val_preds_list.append(tier_preds)

    # Use the same pipeline as test prediction to prepare final result
    val_level_pred_list = get_test_agg(val_preds_list)
    stacked_val_res = get_stacked_res(val_level_pred_list)
    val_final_res = post_processing(stacked_val_res, LABEL_NAMES, list(train_y['filename']))

    report = evaluate(train_y, val_final_res)

    return report

In [248]:
train_y = pd.read_csv("../downloads/train_y_v0.1.0.csv")

In [249]:
xgb_report = evaluation(train_y, xgb_val)

[2025-01-27 20:14:26.846086] Aggregating predictions: 100%|██████████| 5/5 [00:00<00:00,  6.67it/s]
[2025-01-27 20:14:27.808415] Postprocessing Nones: 100%|██████████| 31839/31839 [00:00<00:00, 131334.97it/s]
[2025-01-27 20:14:28.109296] Preparing final result file: 100%|██████████| 31839/31839 [00:07<00:00, 4333.89it/s]


Cooling_Demand_Sensor auroc failed, replace with 1
Heating_Demand_Sensor auroc failed, replace with 1
[2025-01-27 20:14:37.594993] Avg Precision: 0.818, Recall 0.805, F1 0.7985722210101738, AUROC 0.9870692868465244


In [250]:
lgb_report = evaluation(train_y, lgb_val)

[2025-01-27 20:14:59.583396] Aggregating predictions: 100%|██████████| 5/5 [00:00<00:00,  6.94it/s]
[2025-01-27 20:15:00.510327] Postprocessing Nones: 100%|██████████| 31839/31839 [00:00<00:00, 132390.52it/s]
[2025-01-27 20:15:00.807818] Preparing final result file: 100%|██████████| 31839/31839 [00:07<00:00, 4279.29it/s]


Cooling_Demand_Sensor auroc failed, replace with 1
Heating_Demand_Sensor auroc failed, replace with 1
[2025-01-27 20:15:10.292808] Avg Precision: 0.359, Recall 0.345, F1 0.34084651283427275, AUROC 0.7203836888114631


In [251]:
rf_report = evaluation(train_y, rf_val)

[2025-01-27 20:15:14.956017] Aggregating predictions: 100%|██████████| 5/5 [00:00<00:00,  6.89it/s]
[2025-01-27 20:15:15.892715] Postprocessing Nones: 100%|██████████| 31839/31839 [00:00<00:00, 128907.08it/s]
[2025-01-27 20:15:16.197170] Preparing final result file: 100%|██████████| 31839/31839 [00:06<00:00, 4585.87it/s]


Cooling_Demand_Sensor auroc failed, replace with 1
Heating_Demand_Sensor auroc failed, replace with 1
[2025-01-27 20:15:25.228423] Avg Precision: 0.799, Recall 0.802, F1 0.7913839227920957, AUROC 0.990047298993989


In [433]:
check_level_report = [[] for _ in range(5)]

for col in lgb_report['col']:
    if get_tier(col) != None:
        check_level_report[get_tier(col)-1].append(lgb_report[lgb_report['col'] == col])

for i in range(5):
    print(i, f"precision: {pd.concat(check_level_report[i])['precision'].mean():.3f}, "
          f"recall: {pd.concat(check_level_report[i])['recall'].mean():.3f}, "
          f"f1: {pd.concat(check_level_report[i])['f1'].mean():.3f}, "
    )

0 precision: 0.766, recall: 0.710, f1: 0.711, 
1 precision: 0.242, recall: 0.226, f1: 0.224, 
2 precision: 0.518, recall: 0.565, f1: 0.536, 
3 precision: 0.221, recall: 0.220, f1: 0.209, 
4 precision: 0.595, recall: 0.443, f1: 0.502, 


In [437]:
check_level_report = [[] for _ in range(5)]

for col in rec_lgb_report['col']:
    if get_tier(col) != None:
        check_level_report[get_tier(col)-1].append(rec_lgb_report[rec_lgb_report['col'] == col])

for i in range(5):
    print(i, f"precision: {pd.concat(check_level_report[i])['precision'].mean():.3f}, "
          f"recall: {pd.concat(check_level_report[i])['recall'].mean():.3f}, "
          f"f1: {pd.concat(check_level_report[i])['f1'].mean():.3f}, "
    )

0 precision: 0.765, recall: 0.709, f1: 0.710, 
1 precision: 0.229, recall: 0.244, f1: 0.219, 
2 precision: 0.453, recall: 0.564, f1: 0.000, 
3 precision: 0.155, recall: 0.194, f1: 0.000, 
4 precision: 0.517, recall: 0.516, f1: 0.000, 


In [447]:
rec_xgb_report = pd.read_csv("../logs/0127_recall/xgb_report.csv")
rec_lgb_report = pd.read_csv("../logs/0127_recall/lgb_report.csv")
rec_rf_report = pd.read_csv("../logs/0127_recall/rf_report.csv")

rec_xgb_report['f1'] = rec_xgb_report['f1'] / 2
rec_lgb_report['f1'] = rec_lgb_report['f1'] / 2
rec_rf_report['f1'] = rec_rf_report['f1'] / 2

# # get_tier
for col in rec_rf_report['col']:
    if get_tier(col) and get_tier(col) > 2:
        rec_xgb_report.loc[(rec_xgb_report['col'] == col), 'f1'] = 0.0
        rec_lgb_report.loc[(rec_lgb_report['col'] == col), 'f1'] = 0.0
        rec_rf_report.loc[(rec_rf_report['col'] == col), 'f1'] = 0.0

In [448]:
# List of reports and their corresponding model names
reports = [
    (lgb_report, 'f1_lgb'),
    (rf_report, 'f1_rf'),
    (xgb_report, 'f1_xgb'),
    (rec_xgb_report, 'f1_rec_xgb'),
    (rec_lgb_report, 'f1_rec_lgb'),
    (rec_rf_report, 'f1_rec_rf'),
    # Add more reports here as needed
]

model_cols = [r[1] for r in reports]

# Initialize the merged DataFrame with the first report
norm_weight = reports[0][0][['col', 'f1']].rename(columns={'f1': reports[0][1]})

# Merge the remaining reports in a loop
for report, col_name in reports[1:]:
    norm_weight = pd.merge(
        norm_weight,
        report[['col', 'f1']].rename(columns={'f1': col_name}),
        on=['col']
    )

# Calculate the sum of weights for normalization
weight_sum = norm_weight[model_cols].sum(axis=1)

# Normalize the weights
for col in model_cols:
    norm_weight[col] = norm_weight[col] / weight_sum

norm_weight

Unnamed: 0,col,f1_lgb,f1_rf,f1_xgb,f1_rec_xgb,f1_rec_lgb,f1_rec_rf
0,Active_Power_Sensor,0.164282,0.417352,0.418366,0.000000,0.000000,0.000000
1,Air_Flow_Sensor,0.288351,0.355153,0.356496,0.000000,0.000000,0.000000
2,Air_Flow_Setpoint,0.239896,0.376007,0.384097,0.000000,0.000000,0.000000
3,Air_Temperature_Sensor,0.323457,0.335554,0.340990,0.000000,0.000000,0.000000
4,Air_Temperature_Setpoint,0.315986,0.341222,0.342792,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
90,Zone_Air_Dewpoint_Sensor,0.029086,0.311217,0.311217,0.152971,0.039900,0.155609
91,Zone_Air_Humidity_Sensor,0.121877,0.267461,0.268059,0.134037,0.074835,0.133731
92,Zone_Air_Humidity_Setpoint,0.132640,0.358908,0.508453,0.000000,0.000000,0.000000
93,Zone_Air_Temperature_Sensor,0.156163,0.421572,0.422266,0.000000,0.000000,0.000000


In [365]:
from zipfile import ZipFile
zipftest = ZipFile('../downloads/test_X_v0.1.0.zip', 'r')
listtestfile = zipftest.namelist()[1:]

In [366]:
columnlist = [
    'Active_Power_Sensor', 'Air_Flow_Sensor',
    'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
    'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
    'Average_Zone_Air_Temperature_Sensor',
    'Chilled_Water_Differential_Temperature_Sensor',
    'Chilled_Water_Return_Temperature_Sensor',
    'Chilled_Water_Supply_Flow_Sensor',
    'Chilled_Water_Supply_Temperature_Sensor', 'Command',
    'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
    'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
    'Cooling_Temperature_Setpoint', 'Current_Sensor',
    'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
    'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
    'Differential_Pressure_Setpoint',
    'Differential_Supply_Return_Water_Temperature_Sensor',
    'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
    'Discharge_Air_Temperature_Setpoint',
    'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
    'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
    'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
    'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
    'Heating_Supply_Air_Temperature_Deadband_Setpoint',
    'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
    'Hot_Water_Return_Temperature_Sensor',
    'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
    'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
    'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
    'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
    'Outside_Air_Humidity_Sensor',
    'Outside_Air_Lockout_Temperature_Setpoint',
    'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
    'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
    'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
    'Reactive_Power_Sensor', 'Reset_Setpoint',
    'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
    'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
    'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
    'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
    'Supply_Air_Static_Pressure_Sensor',
    'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
    'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
    'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
    'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
    'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
    'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
    'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
    'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
    'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [367]:
import numpy as np

In [443]:
def get_pred_res(path: str):
    pred_idx = np.load(path)
    pred_res = np.zeros((len(listtestfile), len(columnlist)))
    pred_res[pred_idx[0], pred_idx[1]] = 1.0

    pred_filename = pd.DataFrame(columns=['filename'])
    pred_filename['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

    pred_res = pd.DataFrame(data=pred_res, columns=columnlist)

    pred_res = pd.concat([pred_filename, pred_res], axis=1)

    return pred_res

In [369]:
xgb_prec = get_pred_res("../logs/0127_xgb_base.npy")
lgb_prec = get_pred_res("../logs/0127_lgb_base.npy")
rf_prec = get_pred_res("../logs/0127_rf_base.npy")
xgb_recall = get_pred_res("../logs/0127_recall/0127_xgb_recall.npy")
lgb_recall = get_pred_res("../logs/0127_recall/0127_lgb_recall.npy")
rf_recall = get_pred_res("../logs/0127_recall/0127_rf_recall.npy")

In [449]:
weighted_res = weighted_merge_mean(
    df_list=[xgb_prec, rf_prec, lgb_prec, xgb_recall, rf_recall, lgb_recall],
    weights=[
        dict(norm_weight[['col', 'f1_xgb']].values),
        dict(norm_weight[['col', 'f1_rf']].values),
        dict(norm_weight[['col', 'f1_lgb']].values),
        dict(norm_weight[['col', 'f1_rec_xgb']].values),
        dict(norm_weight[['col', 'f1_rec_rf']].values),
        dict(norm_weight[['col', 'f1_rec_lgb']].values)
    ]
)

In [445]:
ensemble_res = merge_mean([xgb_prec, rf_prec, lgb_prec])
# ensemble_res = merge_mean([xgb_pred_res, rf_pred_res])
# ensemble_res.to_csv("../logs/submit/0128_ensemble_xgb_rf.csv")

In [450]:
pd.DataFrame({
    'lgb': check_pred_num(lgb_prec, thr=0.5).value_counts(),
    'rf': check_pred_num(rf_prec, thr=0.5).value_counts(),
    'xgb': check_pred_num(xgb_prec, thr=0.5).value_counts(),
    'lgb_rec': check_pred_num(lgb_recall, thr=0.5).value_counts(),
    'rf_rec': check_pred_num(rf_recall, thr=0.5).value_counts(),
    'xgb_rec': check_pred_num(xgb_recall, thr=0.5).value_counts(),
    'ensemble_res': check_pred_num(ensemble_res, thr=0.5).value_counts(),
    'weighted_res': check_pred_num(weighted_res, thr=0.5).value_counts(),
})

Unnamed: 0,lgb,rf,xgb,lgb_rec,rf_rec,xgb_rec,ensemble_res,weighted_res
0,1399.0,2852,3137,23954.0,28885.0,32338.0,3947,7922.0
1,111113.0,104233,100659,117959.0,109788.0,102480.0,111547,115614.0
2,85842.0,87210,82777,71267.0,73395.0,68476.0,85080,80120.0
3,77629.0,69986,76259,66559.0,58108.0,66451.0,69235,68032.0
4,33536.0,37394,37805,30760.0,34399.0,35460.0,35024,33517.0
5,5808.0,12645,13446,5221.0,11144.0,10513.0,10243,10072.0
6,368.0,1029,1256,,1.0,2.0,514,398.0
7,23.0,246,275,,,,99,29.0
8,2.0,95,81,,,,26,15.0
9,,21,23,,,,4,1.0


In [310]:
pd.DataFrame({
    'lgb': check_pred_num(lgb_pred_res, thr=0.5).value_counts(),
    'rf': check_pred_num(rf_pred_res, thr=0.5).value_counts(),
    'xgb': check_pred_num(xgb_pred_res, thr=0.5).value_counts(),
    'ensemble_res': check_pred_num(ensemble_res, thr=0.5).value_counts(),
    'weighted_res': check_pred_num(weighted_res, thr=0.5).value_counts(),
    # 'recall_weighted_res': check_pred_num(recall_weighted_res, thr=0.5).value_counts(),
})

Unnamed: 0,lgb,rf,xgb,ensemble_res,weighted_res,recall_weighted_res
0,1399.0,2852,3137,3947,3946,3946
1,111113.0,104233,100659,111547,111160,111159
2,85842.0,87210,82777,85080,85029,85010
3,77629.0,69986,76259,69235,69430,69380
4,33536.0,37394,37805,35024,35137,35120
5,5808.0,12645,13446,10243,10337,10328
6,368.0,1029,1256,514,546,642
7,23.0,246,275,99,101,101
8,2.0,95,81,26,29,29
9,,21,23,4,4,4


In [373]:
ensemble_res.to_csv("../logs/submit/0128_prec_recall_ensemble.csv", index=False)

In [398]:
weighted_res.to_csv("../logs/submit/0128_prec_div1_recall_div3__ensemble_weighted.csv", index=False)

In [353]:
# weighted_res = pd.read_csv("../logs/submit/0127_ensemble_weighted.csv")
weighted_res.isna().any().value_counts()

False    95
Name: count, dtype: int64

# Occurence post filtering

In [399]:
LEVEL_LABLES = [
    ['Alarm', 'Command', 'Parameter', 'Sensor', 'Setpoint', 'Status'],
    ['Reset_Setpoint',
    'Usage_Sensor',
    'Pressure_Sensor',
    'Flow_Setpoint',
    'Static_Pressure_Setpoint',
    'Angle_Sensor',
    'Humidity_Setpoint',
    'Temperature_Sensor',
    'Temperature_Setpoint',
    'Supply_Air_Humidity_Sensor',
    'Outside_Air_CO2_Sensor',
    'Differential_Pressure_Setpoint',
    'Damper_Position_Setpoint',
    'Heating_Demand_Setpoint',
    'Cooling_Demand_Setpoint',
    'Current_Sensor',
    'Wind_Speed_Sensor',
    'Flow_Sensor',
    'Dew_Point_Setpoint',
    'Zone_Air_Dewpoint_Sensor',
    'Power_Sensor',
    'Position_Sensor',
    'Solar_Radiance_Sensor',
    'Duration_Sensor',
    'Time_Setpoint',
    'Discharge_Air_Dewpoint_Sensor',
    'Wind_Direction_Sensor',
    'Voltage_Sensor',
    'Zone_Air_Humidity_Sensor',
    'Demand_Sensor',
    'Speed_Setpoint',
    'Rain_Sensor',
    'Frequency_Sensor',
    'Outside_Air_Humidity_Sensor',
    'Outside_Air_Enthalpy_Sensor'],
    ['Air_Flow_Sensor',
    'Water_Temperature_Setpoint',
    'Water_Flow_Sensor',
    'Electrical_Power_Sensor',
    'Zone_Air_Humidity_Setpoint',
    'Heating_Temperature_Setpoint',
    'Air_Flow_Setpoint',
    'Energy_Usage_Sensor',
    'Supply_Air_Static_Pressure_Setpoint',
    'Air_Temperature_Sensor',
    'Valve_Position_Sensor',
    'Cooling_Temperature_Setpoint',
    'Water_Temperature_Sensor',
    'Load_Current_Sensor',
    'Damper_Position_Sensor',
    'Static_Pressure_Sensor',
    'Air_Temperature_Setpoint',
    'Thermal_Power_Sensor',
    'Differential_Pressure_Sensor'],
    ['Supply_Air_Temperature_Sensor',
    'Discharge_Air_Temperature_Sensor',
    'Discharge_Water_Temperature_Sensor',
    'Zone_Air_Temperature_Sensor',
    'Supply_Air_Static_Pressure_Sensor',
    'Outside_Air_Temperature_Setpoint',
    'Supply_Air_Temperature_Setpoint',
    'Chilled_Water_Supply_Flow_Sensor',
    'Chilled_Water_Supply_Temperature_Sensor',
    'Peak_Power_Demand_Sensor',
    'Room_Air_Temperature_Setpoint',
    'Hot_Water_Supply_Temperature_Sensor',
    'Active_Power_Sensor',
    'Min_Air_Temperature_Setpoint',
    'Return_Air_Temperature_Sensor',
    'Hot_Water_Flow_Sensor',
    'Chilled_Water_Differential_Temperature_Sensor',
    'Filter_Differential_Pressure_Sensor',
    'Max_Air_Temperature_Setpoint',
    'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
    'Outside_Air_Temperature_Sensor',
    'Heating_Supply_Air_Temperature_Deadband_Setpoint',
    'Discharge_Air_Temperature_Setpoint',
    'Return_Water_Temperature_Sensor',
    'Reactive_Power_Sensor'],
    ['Low_Outside_Air_Temperature_Enable_Setpoint',
    'Cooling_Demand_Sensor',
    'Chilled_Water_Return_Temperature_Sensor',
    'Average_Zone_Air_Temperature_Sensor',
    'Warmest_Zone_Air_Temperature_Sensor',
    'Heating_Demand_Sensor',
    'Differential_Supply_Return_Water_Temperature_Sensor',
    'Hot_Water_Return_Temperature_Sensor',
    'Outside_Air_Lockout_Temperature_Setpoint']
]

In [400]:
tiers = {i+1: LEVEL_LABLES[i] for i in range(len(LEVEL_LABLES))}

def get_tier(label):
    for tier_num, tier_list in tiers.items():
        if label in tier_list:
            return tier_num
    return None  # Handle cases where the label isn't found in any tier

In [315]:
train_y = pd.read_csv("../downloads/train_y_v0.1.0.csv")

In [322]:
stackedfinalresult = weighted_res.copy()

In [330]:
check = stackedfinalresult
occurence = []

for col1 in tqdm(list(check.columns)[1:]):
    for col2 in list(check.columns)[1:]:
        if col1 != col2:
            oc = check[[col1, col2]]
            oc = oc[(oc[col1] >= 0.5) & (oc[col2] >= 0.5) & ~(oc[col1] == oc[col2])]
            occurence.append([col1, col2, len(oc)])

100%|██████████| 94/94 [00:27<00:00,  3.43it/s]


In [331]:
tst_oc = pd.DataFrame(occurence)
tst_oc = tst_oc[tst_oc[2] > 0]
tst_oc

Unnamed: 0,0,1,2
2,Active_Power_Sensor,Air_Temperature_Sensor,1
5,Active_Power_Sensor,Angle_Sensor,13
16,Active_Power_Sensor,Current_Sensor,20
19,Active_Power_Sensor,Demand_Sensor,11
23,Active_Power_Sensor,Differential_Supply_Return_Water_Temperature_S...,1
...,...,...,...
8727,Zone_Air_Temperature_Sensor,Temperature_Setpoint,56
8730,Zone_Air_Temperature_Sensor,Usage_Sensor,2
8733,Zone_Air_Temperature_Sensor,Warmest_Zone_Air_Temperature_Sensor,15
8735,Zone_Air_Temperature_Sensor,Water_Temperature_Sensor,4


In [336]:
trn_oc_check = train_y.copy()
trn_occurence = []

for col1 in tqdm(list(trn_oc_check.columns)[1:]):
    for col2 in list(trn_oc_check.columns)[1:]:
        if col1 != col2:
            oc = trn_oc_check[[col1, col2]]
            oc = oc[(oc[col1] == 1) & (oc[col2] == 1)]
            trn_occurence.append([col1, col2, len(oc)])

100%|██████████| 94/94 [00:12<00:00,  7.70it/s]


In [337]:
trn_oc = pd.DataFrame(trn_occurence)
trn_oc = trn_oc[trn_oc[2] > 0]

In [338]:
trn_oc.values[:, :2].shape

oc_map = {}
for src, tgt in trn_oc.values[:, :2]:
    if src not in oc_map:
        oc_map[src] = [tgt]
    else:
        oc_map[src].append(tgt)

In [339]:
rm_label = []
for row in tqdm(tst_oc.values, total=len(tst_oc)):
    if oc_map.get(row[0]) and row[1] not in oc_map.get(row[0]):
        rm_label.append((row[1], row[0]))
        # if get_tier(row[0]) < get_tier(row[1]):
        #     rm_label.append((row[0], row[1]))
        # else:
        #     rm_label.append((row[1], row[0]))

100%|██████████| 1760/1760 [00:00<00:00, 275344.09it/s]


In [341]:
# filtered_res = weighted_res.copy()
# remove_record = []
# for src, tgt in tqdm(rm_label):
#     remove_size = len(filtered_res[(filtered_res[src] == 1) & (filtered_res[tgt] == 1)])

#     # Skip if it removes too much
#     skipped = False
#     if remove_size > len(filtered_res[filtered_res[tgt] == 1]) * 0.01:
#         skipped = True

#     # Count how many rows match the condition
#     remove_record.append((src, tgt, remove_size, len(filtered_res[filtered_res[tgt] == 1]), skipped))
    
#     if not skipped:
#         # Properly update the matching rows in the original DataFrame
#         filtered_res.loc[(filtered_res[src] == 1) & (filtered_res[tgt] == 1), tgt] = 0.0

filtered_res = weighted_res.copy()
remove_record = []
for src, tgt in tqdm(rm_label):
    remove_size = len(filtered_res[~(filtered_res[src] == filtered_res[tgt])])

    # Count how many rows match the condition
    remove_record.append((src, tgt, remove_size))

    # Properly update the matching rows in the original DataFrame
    filtered_res.loc[(filtered_res[src] > filtered_res[tgt]), tgt] = 0.0
    filtered_res.loc[(filtered_res[src] < filtered_res[tgt]), src] = 0.0

100%|██████████| 1320/1320 [00:40<00:00, 32.52it/s]


In [342]:
rm_check = pd.DataFrame(remove_record)

In [343]:
rm_check

Unnamed: 0,0,1,2
0,Air_Temperature_Sensor,Active_Power_Sensor,34717
1,Angle_Sensor,Active_Power_Sensor,11396
2,Current_Sensor,Active_Power_Sensor,25262
3,Demand_Sensor,Active_Power_Sensor,28310
4,Differential_Supply_Return_Water_Temperature_S...,Active_Power_Sensor,11769
...,...,...,...
1315,Supply_Air_Temperature_Setpoint,Zone_Air_Temperature_Sensor,12127
1316,Temperature_Setpoint,Zone_Air_Temperature_Sensor,28205
1317,Usage_Sensor,Zone_Air_Temperature_Sensor,13946
1318,Water_Temperature_Sensor,Zone_Air_Temperature_Sensor,15645


In [170]:
rm_check[4].value_counts()

4
False    1341
True      457
Name: count, dtype: int64

In [171]:
rm_check[[1, 2]].groupby(1).agg('sum').sort_values(2, ascending=False)

Unnamed: 0_level_0,2
1,Unnamed: 1_level_1
Electrical_Power_Sensor,9003
Position_Sensor,7686
Damper_Position_Sensor,7045
Sensor,6864
Peak_Power_Demand_Sensor,5236
...,...
Discharge_Air_Temperature_Setpoint,2
Hot_Water_Flow_Sensor,2
Solar_Radiance_Sensor,0
Wind_Direction_Sensor,0


In [None]:
# 1     100659
# 2      82777
# 3      76259
# 4      37805
# 5      13446
# 0       3137
# 6       1256
# 7        275
# 8         81
# 9         23
# 10         2

# 1    103552
# 2     86048
# 3     75239
# 4     35331
# 5     12143
# 0      3137
# 6       270

In [346]:
pd.DataFrame({
    'lgb': check_pred_num(lgb_pred_res, thr=0.5).value_counts(),
    'rf': check_pred_num(rf_pred_res, thr=0.5).value_counts(),
    'xgb': check_pred_num(xgb_pred_res, thr=0.5).value_counts(),
    'ensemble_res': check_pred_num(ensemble_res, thr=0.5).value_counts(),
    'weighted_res': check_pred_num(weighted_res, thr=0.5).value_counts(),
    'recall_weighted_res': check_pred_num(recall_weighted_res, thr=0.5).value_counts(),
    'filtered_res': check_pred_num(filtered_res, thr=0.5).value_counts(),
})

Unnamed: 0,lgb,rf,xgb,ensemble_res,weighted_res,recall_weighted_res,filtered_res
0,1399.0,2852,3137,3947,3946,3946,3946.0
1,111113.0,104233,100659,111547,111160,111159,119632.0
2,85842.0,87210,82777,85080,85029,85010,88436.0
3,77629.0,69986,76259,69235,69430,69380,62611.0
4,33536.0,37394,37805,35024,35137,35120,32456.0
5,5808.0,12645,13446,10243,10337,10328,8626.0
6,368.0,1029,1256,514,546,642,11.0
7,23.0,246,275,99,101,101,2.0
8,2.0,95,81,26,29,29,
9,,21,23,4,4,4,


In [345]:
filtered_res.to_csv("../logs/submit/0127_ensemble_weigthed_post_filter_v2.csv", index=False)

In [52]:
semi_pred_res

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X20367.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
