In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from typing import List

def merge_mean(df_list):
    """
    Average the value columns of several prediction DataFrames element-wise.

    Every column except "filename" is treated as a value column. The frames
    are expected to share the same columns and index; the "filename" column
    (when present) is kept from the first frame.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        Frames to average. None of them is modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the first frame whose value columns hold the mean over
        all frames in ``df_list``.

    Raises
    ------
    ValueError
        If ``df_list`` is empty.
    """
    if len(df_list) == 0:
        raise ValueError("merge_mean() requires at least one DataFrame")

    # Work on a copy so the caller's first frame is left untouched
    merged_df = df_list[0].copy()

    # Everything but "filename" is averaged. A frame without that column is
    # accepted as-is, mirroring check_pred_num which also ignores its absence.
    value_cols = merged_df.columns.drop("filename", errors="ignore")

    # Accumulate the remaining frames on top of the first one
    for df in df_list[1:]:
        merged_df[value_cols] += df[value_cols]

    # Divide the running sum by the number of frames to get the mean
    merged_df[value_cols] /= len(df_list)

    return merged_df


def weighted_merge_mean(df_list, weights: List[dict]):
    """
    Combine several prediction DataFrames into a per-column weighted sum.

    For every column except "filename" the result is
    ``sum_i df_list[i][col] * weights[i][col]``, rounded to 3 decimals, with
    any remaining NaN replaced by 0.0. The weights are applied as given (no
    normalisation happens here), so the output is a weighted *mean* only if
    the weights of each column already sum to 1 across the frames.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        Frames with identical columns and index. None of them is modified.
    weights : list of dict
        ``weights[i]`` maps each value-column name to the weight applied to
        ``df_list[i]``. Entries beyond ``len(df_list)`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A copy of the first frame (so "filename", when present, comes from
        it) whose value columns hold the weighted sum.

    Raises
    ------
    ValueError
        If ``df_list`` is empty or there are fewer weight dicts than frames.
    KeyError
        If a weight dict has no entry for one of the value columns.
    """
    if len(df_list) == 0:
        raise ValueError("weighted_merge_mean() requires at least one DataFrame")
    if len(weights) < len(df_list):
        raise ValueError(
            "weighted_merge_mean() got %d DataFrames but only %d weight dicts"
            % (len(df_list), len(weights))
        )

    # Work on a copy so the caller's first frame is left untouched
    merged_df = df_list[0].copy()

    # Everything but "filename" is combined; its absence is tolerated
    value_cols = merged_df.columns.drop("filename", errors="ignore")

    weighted_total = None
    for df, frame_weights in zip(df_list, weights):
        # Explicit per-column lookup so a missing weight raises KeyError
        # instead of silently becoming NaN through label alignment
        col_weights = pd.Series(
            [frame_weights[col] for col in value_cols],
            index=value_cols,
            dtype=float,
        )
        # Scale every column of this frame by its own weight in one step
        contribution = df[value_cols].mul(col_weights, axis=1)
        if weighted_total is None:
            weighted_total = contribution
        else:
            weighted_total = weighted_total + contribution

    # NaN weights propagate into the sum; such cells are reported as 0.0
    merged_df[value_cols] = weighted_total.round(3).fillna(0.0)

    return merged_df



In [3]:
def check_pred_num(_final_res, thr=0.4):
    """
    Count, for every row, how many score columns reach the threshold.

    Parameters
    ----------
    _final_res : pandas.DataFrame
        Score table; a "filename" column, if present, is left out of the count.
    thr : float, default 0.4
        A cell counts when its value is greater than or equal to ``thr``.

    Returns
    -------
    pandas.Series
        One count per row, indexed like ``_final_res``.
    """
    # Leave the identifier column out; a frame without it is fine too
    scores = _final_res.drop(columns=['filename'], errors='ignore')

    reaches_threshold = scores.ge(thr)
    return reaches_threshold.sum(axis=1)

# Ensemble

### Derive per-class model weights from each model's cross-validation F1 score

In [4]:
# Class label names (94 entries), one per line.
# The same labels, in the same order, are repeated in `columnlist` further down.
LABEL_NAMES = [
    'Active_Power_Sensor',
    'Air_Flow_Sensor',
    'Air_Flow_Setpoint',
    'Air_Temperature_Sensor',
    'Air_Temperature_Setpoint',
    'Alarm',
    'Angle_Sensor',
    'Average_Zone_Air_Temperature_Sensor',
    'Chilled_Water_Differential_Temperature_Sensor',
    'Chilled_Water_Return_Temperature_Sensor',
    'Chilled_Water_Supply_Flow_Sensor',
    'Chilled_Water_Supply_Temperature_Sensor',
    'Command',
    'Cooling_Demand_Sensor',
    'Cooling_Demand_Setpoint',
    'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
    'Cooling_Temperature_Setpoint',
    'Current_Sensor',
    'Damper_Position_Sensor',
    'Damper_Position_Setpoint',
    'Demand_Sensor',
    'Dew_Point_Setpoint',
    'Differential_Pressure_Sensor',
    'Differential_Pressure_Setpoint',
    'Differential_Supply_Return_Water_Temperature_Sensor',
    'Discharge_Air_Dewpoint_Sensor',
    'Discharge_Air_Temperature_Sensor',
    'Discharge_Air_Temperature_Setpoint',
    'Discharge_Water_Temperature_Sensor',
    'Duration_Sensor',
    'Electrical_Power_Sensor',
    'Energy_Usage_Sensor',
    'Filter_Differential_Pressure_Sensor',
    'Flow_Sensor',
    'Flow_Setpoint',
    'Frequency_Sensor',
    'Heating_Demand_Sensor',
    'Heating_Demand_Setpoint',
    'Heating_Supply_Air_Temperature_Deadband_Setpoint',
    'Heating_Temperature_Setpoint',
    'Hot_Water_Flow_Sensor',
    'Hot_Water_Return_Temperature_Sensor',
    'Hot_Water_Supply_Temperature_Sensor',
    'Humidity_Setpoint',
    'Load_Current_Sensor',
    'Low_Outside_Air_Temperature_Enable_Setpoint',
    'Max_Air_Temperature_Setpoint',
    'Min_Air_Temperature_Setpoint',
    'Outside_Air_CO2_Sensor',
    'Outside_Air_Enthalpy_Sensor',
    'Outside_Air_Humidity_Sensor',
    'Outside_Air_Lockout_Temperature_Setpoint',
    'Outside_Air_Temperature_Sensor',
    'Outside_Air_Temperature_Setpoint',
    'Parameter',
    'Peak_Power_Demand_Sensor',
    'Position_Sensor',
    'Power_Sensor',
    'Pressure_Sensor',
    'Rain_Sensor',
    'Reactive_Power_Sensor',
    'Reset_Setpoint',
    'Return_Air_Temperature_Sensor',
    'Return_Water_Temperature_Sensor',
    'Room_Air_Temperature_Setpoint',
    'Sensor',
    'Setpoint',
    'Solar_Radiance_Sensor',
    'Speed_Setpoint',
    'Static_Pressure_Sensor',
    'Static_Pressure_Setpoint',
    'Status',
    'Supply_Air_Humidity_Sensor',
    'Supply_Air_Static_Pressure_Sensor',
    'Supply_Air_Static_Pressure_Setpoint',
    'Supply_Air_Temperature_Sensor',
    'Supply_Air_Temperature_Setpoint',
    'Temperature_Sensor',
    'Temperature_Setpoint',
    'Thermal_Power_Sensor',
    'Time_Setpoint',
    'Usage_Sensor',
    'Valve_Position_Sensor',
    'Voltage_Sensor',
    'Warmest_Zone_Air_Temperature_Sensor',
    'Water_Flow_Sensor',
    'Water_Temperature_Sensor',
    'Water_Temperature_Setpoint',
    'Wind_Direction_Sensor',
    'Wind_Speed_Sensor',
    'Zone_Air_Dewpoint_Sensor',
    'Zone_Air_Humidity_Sensor',
    'Zone_Air_Humidity_Setpoint',
    'Zone_Air_Temperature_Sensor',
]

In [5]:
# Per-model report tables; later cells read their 'col' and 'f1' columns
xgb_report, lgb_report, rf_report = (
    pd.read_csv("../logs/0127_recall/xgb_report.csv"),
    pd.read_csv("../logs/0127_recall/lgb_report.csv"),
    pd.read_csv("../logs/0127_recall/rf_report.csv"),
)

In [8]:
# Ascending sort on F1, then keep the last 20 rows (the highest F1 values)
lgb_report.sort_values('f1').tail(20)

Unnamed: 0,col,precision,recall,f1,support
77,Temperature_Sensor,0.608854,0.59767,0.60321,
1,Air_Flow_Sensor,0.562628,0.80826,0.663438,
84,Warmest_Zone_Air_Temperature_Sensor,0.777778,0.583333,0.666667,
69,Static_Pressure_Sensor,0.645933,0.692308,0.668317,
57,Power_Sensor,0.699976,0.646285,0.67206,
86,Water_Temperature_Sensor,0.666667,0.701493,0.683636,
83,Voltage_Sensor,0.788356,0.684426,0.732724,
30,Electrical_Power_Sensor,0.635852,0.870697,0.734971,
15,Cooling_Supply_Air_Temperature_Deadband_Setpoint,0.842105,0.666667,0.744186,
45,Low_Outside_Air_Temperature_Enable_Setpoint,0.926829,0.633333,0.752475,


In [6]:
# Reports and the name of the weight column each one contributes
reports = [
    (lgb_report, 'f1_lgb'),
    (rf_report, 'f1_rf'),
    (xgb_report, 'f1_xgb'),
    # Add more reports here as needed
]

model_cols = [r[1] for r in reports]

# Initialize the merged DataFrame with the first report
norm_weight = reports[0][0][['col', 'f1']].rename(columns={'f1': reports[0][1]})

# Merge the remaining reports in a loop (one row per class in 'col')
for report, col_name in reports[1:]:
    norm_weight = pd.merge(
        norm_weight,
        report[['col', 'f1']].rename(columns={'f1': col_name}),
        on=['col']
    )

# A missing F1 would turn into a NaN weight, and a NaN weight makes the whole
# weighted sum of that class NaN (later filled with 0). Treat it as
# "no contribution from this model" instead.
norm_weight[model_cols] = norm_weight[model_cols].fillna(0.0)

# Calculate the sum of weights for normalization
weight_sum = norm_weight[model_cols].sum(axis=1)

# Classes for which no model has a positive F1: 0/0 below would give NaN
no_signal = weight_sum == 0

# Normalize the weights so they sum to 1 per class
for col in model_cols:
    norm_weight[col] = norm_weight[col] / weight_sum

# With no evidence favouring any model, fall back to equal weights rather
# than NaN weights that would silently zero the class in the ensemble
norm_weight.loc[no_signal, model_cols] = 1.0 / len(model_cols)

norm_weight

Unnamed: 0,col,f1_lgb,f1_rf,f1_xgb
0,Active_Power_Sensor,0.132866,0.432550,0.434584
1,Air_Flow_Sensor,0.264937,0.368485,0.366578
2,Air_Flow_Setpoint,0.233539,0.369500,0.396962
3,Air_Temperature_Sensor,0.320380,0.338186,0.341434
4,Air_Temperature_Setpoint,0.317729,0.339608,0.342663
...,...,...,...,...
90,Zone_Air_Dewpoint_Sensor,0.114496,0.446536,0.438968
91,Zone_Air_Humidity_Sensor,0.218431,0.390338,0.391231
92,Zone_Air_Humidity_Setpoint,0.000000,0.500000,0.500000
93,Zone_Air_Temperature_Sensor,0.147381,0.425285,0.427335


In [7]:
from zipfile import ZipFile
# Only the member names are needed, so the archive is opened in a `with`
# block and closed right after listing them (avoids leaking the file handle).
with ZipFile('../downloads/test_X_v0.1.0.zip', 'r') as zipftest:
    # The first entry is skipped — presumably the archive's top-level
    # directory record rather than a data file; TODO confirm.
    listtestfile = zipftest.namelist()[1:]

In [8]:
# Column names of the prediction table built by get_pred_res. Order matters:
# the saved column indices are positions into this list.
# NOTE: same entries, in the same order, as LABEL_NAMES — keep the two in sync.
columnlist = [
    'Active_Power_Sensor',
    'Air_Flow_Sensor',
    'Air_Flow_Setpoint',
    'Air_Temperature_Sensor',
    'Air_Temperature_Setpoint',
    'Alarm',
    'Angle_Sensor',
    'Average_Zone_Air_Temperature_Sensor',
    'Chilled_Water_Differential_Temperature_Sensor',
    'Chilled_Water_Return_Temperature_Sensor',
    'Chilled_Water_Supply_Flow_Sensor',
    'Chilled_Water_Supply_Temperature_Sensor',
    'Command',
    'Cooling_Demand_Sensor',
    'Cooling_Demand_Setpoint',
    'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
    'Cooling_Temperature_Setpoint',
    'Current_Sensor',
    'Damper_Position_Sensor',
    'Damper_Position_Setpoint',
    'Demand_Sensor',
    'Dew_Point_Setpoint',
    'Differential_Pressure_Sensor',
    'Differential_Pressure_Setpoint',
    'Differential_Supply_Return_Water_Temperature_Sensor',
    'Discharge_Air_Dewpoint_Sensor',
    'Discharge_Air_Temperature_Sensor',
    'Discharge_Air_Temperature_Setpoint',
    'Discharge_Water_Temperature_Sensor',
    'Duration_Sensor',
    'Electrical_Power_Sensor',
    'Energy_Usage_Sensor',
    'Filter_Differential_Pressure_Sensor',
    'Flow_Sensor',
    'Flow_Setpoint',
    'Frequency_Sensor',
    'Heating_Demand_Sensor',
    'Heating_Demand_Setpoint',
    'Heating_Supply_Air_Temperature_Deadband_Setpoint',
    'Heating_Temperature_Setpoint',
    'Hot_Water_Flow_Sensor',
    'Hot_Water_Return_Temperature_Sensor',
    'Hot_Water_Supply_Temperature_Sensor',
    'Humidity_Setpoint',
    'Load_Current_Sensor',
    'Low_Outside_Air_Temperature_Enable_Setpoint',
    'Max_Air_Temperature_Setpoint',
    'Min_Air_Temperature_Setpoint',
    'Outside_Air_CO2_Sensor',
    'Outside_Air_Enthalpy_Sensor',
    'Outside_Air_Humidity_Sensor',
    'Outside_Air_Lockout_Temperature_Setpoint',
    'Outside_Air_Temperature_Sensor',
    'Outside_Air_Temperature_Setpoint',
    'Parameter',
    'Peak_Power_Demand_Sensor',
    'Position_Sensor',
    'Power_Sensor',
    'Pressure_Sensor',
    'Rain_Sensor',
    'Reactive_Power_Sensor',
    'Reset_Setpoint',
    'Return_Air_Temperature_Sensor',
    'Return_Water_Temperature_Sensor',
    'Room_Air_Temperature_Setpoint',
    'Sensor',
    'Setpoint',
    'Solar_Radiance_Sensor',
    'Speed_Setpoint',
    'Static_Pressure_Sensor',
    'Static_Pressure_Setpoint',
    'Status',
    'Supply_Air_Humidity_Sensor',
    'Supply_Air_Static_Pressure_Sensor',
    'Supply_Air_Static_Pressure_Setpoint',
    'Supply_Air_Temperature_Sensor',
    'Supply_Air_Temperature_Setpoint',
    'Temperature_Sensor',
    'Temperature_Setpoint',
    'Thermal_Power_Sensor',
    'Time_Setpoint',
    'Usage_Sensor',
    'Valve_Position_Sensor',
    'Voltage_Sensor',
    'Warmest_Zone_Air_Temperature_Sensor',
    'Water_Flow_Sensor',
    'Water_Temperature_Sensor',
    'Water_Temperature_Setpoint',
    'Wind_Direction_Sensor',
    'Wind_Speed_Sensor',
    'Zone_Air_Dewpoint_Sensor',
    'Zone_Air_Humidity_Sensor',
    'Zone_Air_Humidity_Setpoint',
    'Zone_Air_Temperature_Sensor',
]

In [10]:
def get_pred_res(path: str, filenames=None, columns=None):
    """
    Load saved positive-prediction indices and expand them into a 0/1 table.

    The ``.npy`` file at ``path`` is read as ``pred_idx``; ``pred_idx[0]``
    holds row positions and ``pred_idx[1]`` column positions. Each
    (row, column) pair is set to 1.0 and every other cell stays 0.0.

    Parameters
    ----------
    path : str
        Location of the ``.npy`` index file.
    filenames : sequence of str, optional
        One archive member name per output row. Only the part after the last
        "/" is kept. Defaults to the notebook-level ``listtestfile``.
    columns : sequence of str, optional
        Names of the prediction columns, in index order. Defaults to the
        notebook-level ``columnlist``.

    Returns
    -------
    pandas.DataFrame
        A "filename" column followed by one float column per entry of
        ``columns``.
    """
    # Fall back to the notebook globals only when the caller gives nothing,
    # so the function can also be used (and tested) without them
    if filenames is None:
        filenames = listtestfile
    if columns is None:
        columns = columnlist
    filenames = list(filenames)
    columns = list(columns)

    pred_idx = np.load(path)

    # Dense table of zeros with ones at the saved (row, column) positions
    onehot = np.zeros((len(filenames), len(columns)))
    onehot[pred_idx[0], pred_idx[1]] = 1.0

    pred_res = pd.DataFrame(data=onehot, columns=columns)

    # Strip any directory prefix from the member names and put them first
    pred_res.insert(0, 'filename', [name.split("/")[-1] for name in filenames])

    return pred_res

In [11]:
# 0/1 prediction tables of the three base models
xgb_res, lgb_res, rf_res = (
    get_pred_res("../logs/0127_xgb_base.npy"),
    get_pred_res("../logs/0127_lgb_base.npy"),
    get_pred_res("../logs/0127_rf_base.npy"),
)

In [12]:
# Weight columns are listed in the same order as the frames in df_list,
# so each model's predictions are scaled by that model's own weights
weighted_res = weighted_merge_mean(
    df_list=[xgb_res, rf_res, lgb_res],
    weights=[
        dict(norm_weight[['col', weight_col]].values)
        for weight_col in ('f1_xgb', 'f1_rf', 'f1_lgb')
    ],
)

In [13]:
# Unweighted ensemble: plain mean of the three models' predictions
ensemble_res = merge_mean(df_list=[xgb_res, rf_res, lgb_res])

In [14]:
# For each result set: how many rows have 0, 1, 2, ... columns at or above 0.5
pd.DataFrame({
    label: check_pred_num(preds, thr=0.5).value_counts()
    for label, preds in (
        ('lgb', lgb_res),
        ('rf', rf_res),
        ('xgb', xgb_res),
        ('ensemble_res', ensemble_res),
        ('weighted_res', weighted_res),
    )
})

Unnamed: 0,lgb,rf,xgb,ensemble_res,weighted_res
0,1399.0,2852,3137,3947,3945
1,111113.0,104233,100659,111547,111071
2,85842.0,87210,82777,85080,84866
3,77629.0,69986,76259,69235,69568
4,33536.0,37394,37805,35024,35160
5,5808.0,12645,13446,10243,10305
6,368.0,1029,1256,514,670
7,23.0,246,275,99,101
8,2.0,95,81,26,29
9,,21,23,4,4


In [15]:
# Write the mean ensemble without the index column
ensemble_res.to_csv(path_or_buf="../logs/submit/ensemble_res.csv", index=False)

In [16]:
# Write the weighted ensemble without the index column
weighted_res.to_csv(path_or_buf="../logs/submit/weighted_ensemble_res.csv", index=False)

In [17]:
# Per column: does it contain any NaN? Then tally the True/False answers
weighted_res.isna().any(axis=0).value_counts()

False    95
Name: count, dtype: int64