In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

# Data split

In [22]:
def create_folds(train_y, n_splits=5):
    # Create a label array for stratification
    # We'll use the first non-zero label for each row as the stratification target
    stratify_labels = []
    for _, row in train_y.iterrows():
        labels = row[train_y.columns != 'filename'].values
        # Get first non-negative label, or 0 if all negative
        first_positive = next((i for i, x in enumerate(labels) if x >= 0), 0)
        stratify_labels.append(first_positive)
    
    # Create StratifiedKFold object
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Generate fold indices
    folds = []
    for train_idx, val_idx in skf.split(train_y, stratify_labels):
        folds.append({
            'train': train_idx,
            'val': val_idx
        })
    
    return folds

In [23]:
train_y = pd.read_csv("../downloads/train_y_v0.1.0.csv")

In [24]:
folds = create_folds(train_y)

# Prepare features

### Prepare pre-extracted features

In [5]:
raw_train_X_full = pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_full_v3.csv")
raw_train_X_1 = pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split1_2_v3.csv")
raw_train_X_2 = pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split2_2_v3.csv")

In [6]:
raw_test_X = pd.read_csv("../downloads/test_features_full_v3.csv")

In [7]:
pre_feature_list = ['0_Absolute energy',
 '0_Area under the curve',
 '0_Autocorrelation',
 '0_Average power',
 '0_Centroid',
 '0_ECDF Percentile Count_0',
 '0_ECDF Percentile Count_1',
 '0_ECDF Percentile_0',
 '0_ECDF Percentile_1',
 '0_ECDF_0',
 '0_ECDF_1',
 '0_ECDF_2',
 '0_ECDF_3',
 '0_ECDF_4',
 '0_ECDF_5',
 '0_ECDF_6',
 '0_ECDF_7',
 '0_ECDF_8',
 '0_ECDF_9',
 '0_Entropy',
 '0_Histogram mode',
 '0_Interquartile range',
 '0_Kurtosis',
 '0_Max',
 '0_Mean',
 '0_Mean absolute deviation',
 '0_Mean absolute diff',
 '0_Mean diff',
 '0_Median',
 '0_Median absolute deviation',
 '0_Median absolute diff',
 '0_Median diff',
 '0_Min',
 '0_Negative turning points',
 '0_Neighbourhood peaks',
 '0_Peak to peak distance',
 '0_Positive turning points',
 '0_Root mean square',
 '0_Signal distance',
 '0_Skewness',
 '0_Slope',
 '0_Standard deviation',
 '0_Sum absolute diff',
 '0_Variance',
 '0_Zero crossing rate',
 '0_Fundamental frequency',
 '0_Human range energy',
 '0_Max power spectrum',
 '0_Maximum frequency',
 '0_Median frequency',
 '0_Power bandwidth',
 '0_Wavelet entropy',
 'value_median',
 'value_mean',
 'value_qmean',
 'value_max',
 'value_min',
 'value_maxmin',
 'value_diffmax',
 'value_diffmin',
 'value_diffmean',
 'value_diffqmean',
 'value_diffmedian',
 'value_diffmaxmin',
 'time_diffmean',
 'time_diffqmean',
 'time_diffmax',
 'time_diffmin',
 'time_diffmedian',
 'value_std',
 'value_var',
 'value_diffstd',
 'value_diffvar',
 'time_diffstd',
 'time_diffvar',
 'time_burstiness',
 'time_total',
 'time_event_density',
 'time_entropy',
 'time_slope'
]

In [8]:
train_X_full = raw_train_X_full[pre_feature_list]
train_X_1 = raw_train_X_1[pre_feature_list]
train_X_2 = raw_train_X_2[pre_feature_list]
test_X = raw_test_X[pre_feature_list]

# Prepare labels

In [9]:
def get_active_labels_np(row):
    """More efficient version using numpy"""
    arr = row.to_numpy() # convert to numpy array
    indices = np.where(arr == 1)[0] # get indices where value is 1
    labels = row.index[indices].tolist() # get labels from indices
    return labels

labelhir = train_y.apply(get_active_labels_np, axis=1).tolist()

In [10]:
level_labels = [list(train_y.columns[1:]), [], [], [], []]

for k in range(0, 4):
    check_labels = level_labels[k]
    label_len = len(check_labels)
    idx_is_subset_of_col = pd.DataFrame(0, index=check_labels, columns=check_labels)
    is_subset = []

    for i in tqdm(range(label_len)):
        for j in range(label_len):
            src_lb, tgt_lb = check_labels[i], check_labels[j]
            src = train_y[train_y[src_lb] == 1]
            tgt = train_y[(train_y[src_lb] == 1) & (train_y[tgt_lb] == 1)]

            idx_is_subset_of_col.loc[src_lb, tgt_lb] = len(src) <= len(tgt)
            if len(src) <= len(tgt) and src_lb != tgt_lb:
                is_subset.append([src_lb, tgt_lb])

    remove_label = set([s[0] for s in is_subset])
    print(f"Level {k}")
    print(is_subset)
    print(remove_label)
    print()
    
    for rl in remove_label:
        level_labels[k].remove(rl)
        level_labels[k+1].append(rl)

100%|██████████| 94/94 [00:14<00:00,  6.63it/s]


Level 0
[['Active_Power_Sensor', 'Electrical_Power_Sensor'], ['Active_Power_Sensor', 'Power_Sensor'], ['Active_Power_Sensor', 'Sensor'], ['Air_Flow_Sensor', 'Flow_Sensor'], ['Air_Flow_Sensor', 'Sensor'], ['Air_Flow_Setpoint', 'Flow_Setpoint'], ['Air_Flow_Setpoint', 'Setpoint'], ['Air_Temperature_Sensor', 'Sensor'], ['Air_Temperature_Sensor', 'Temperature_Sensor'], ['Air_Temperature_Setpoint', 'Setpoint'], ['Air_Temperature_Setpoint', 'Temperature_Setpoint'], ['Angle_Sensor', 'Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Chilled_Water_Return_Temperature_Sensor',

100%|██████████| 88/88 [00:10<00:00,  8.11it/s]


Level 1
[['Filter_Differential_Pressure_Sensor', 'Differential_Pressure_Sensor'], ['Filter_Differential_Pressure_Sensor', 'Pressure_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Air_Temperature_Sensor', 'Temperature_Sensor'], ['Differential_Supply_Return_Water_Temperature_Sensor', 'Temperature_Sensor'], ['Differential_Supply_Return_Water_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Differential_Supply_Return_Water_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Reactive_Power_Sensor', 'Electrical_Power_Sensor'], ['Reactive_Power_Sensor', 'Power_Sensor'], ['Return_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Return_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Water_Flow_Sensor', 'Flow_Sensor'], ['Air_Flow_Setpoint', 'Flow_Setpoint'], ['Air_Temperature_Setpoint', 'Temperature_Setpoint'], ['H

100%|██████████| 53/53 [00:03<00:00, 14.61it/s]


Level 2
[['Discharge_Air_Temperature_Setpoint', 'Air_Temperature_Setpoint'], ['Filter_Differential_Pressure_Sensor', 'Differential_Pressure_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Hot_Water_Return_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Hot_Water_Return_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Discharge_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Min_Air_Temperature_Setpoint', 'Air_Temperature_Setpoint'], ['Average_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Differential_Supply_Return_Water_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Differential_Supply_Return_Water_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Outside_Air_Temperature_Setpoint', 'Air_Temperature_Setpoint'],

100%|██████████| 34/34 [00:01<00:00, 24.92it/s]

Level 3
[['Hot_Water_Return_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Differential_Supply_Return_Water_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Low_Outside_Air_Temperature_Enable_Setpoint', 'Outside_Air_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Discharge_Air_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Hot_Water_Return_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Filter_Differential_Pressure_Sensor'], ['Cooling_Demand_Sensor', 'Warmest_Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Discharge_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Min_Air_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Average_Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Differential_Supply_Return_Water_Temperature_Sensor'], ['C




In [11]:
tiers = {
    1: level_labels[0],
    2: level_labels[1],
    3: level_labels[2],
    4: level_labels[3],
    5: level_labels[4]
}

def get_tier(label):
    for tier_num, tier_list in tiers.items():
        if label in tier_list:
            return tier_num
    return None  # Handle cases where the label isn't found in any tier

def sort_labels(labels):
    return sorted(labels, key=lambda label: (get_tier(label) or float('inf'), label))


In [12]:
sorted_labelhir = [sort_labels(labels) for labels in labelhir]

In [13]:
label_hier = np.array(
    sorted_labelhir,
    dtype=object,
)

In [14]:
padded_label = pd.Series(label_hier).apply(lambda x: x + ['None'] * (5 - len(x)) if len(x) < 5 else x)

In [15]:
# Count Nones at each level
for i in range(5):
    none_count = sum(padded_label.apply(lambda x: x[i] == 'None'))
    print(f"Level {i+1}: {none_count} None values out of {len(padded_label)} total ({none_count/len(padded_label):.2%})")

Level 1: 0 None values out of 31839 total (0.00%)
Level 2: 12321 None values out of 31839 total (38.70%)
Level 3: 20247 None values out of 31839 total (63.59%)
Level 4: 27216 None values out of 31839 total (85.48%)
Level 5: 30936 None values out of 31839 total (97.16%)


# Model Training

In [163]:
from typing import List

def train_random_forest(
    train_X: List[pd.DataFrame],
    _label,
    folds,
    model_class,
    params: dict,
    none_ratio_thr: float,
):
    """
    Train random forest models using k-fold cross validation
    
    Args:
        train_X: Training features DataFrame
        _label: Array of labels
        folds: List of dictionaries containing train/val indices
        drop_none: Whether to drop samples with "None" labels
        
    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation predictions)
    """
    classifiers = []
    scores = []
    val_predictions = []  # List to store validation predictions

    for f_idx, fold in enumerate(folds):
        # Prepare train and validation data for this fold
        train_X_fold_list = []
        train_y_fold_list = []
        for trn_x in train_X:
            train_X_fold_list.append(trn_x.iloc[fold['train']])
            train_y_fold_list.append(_label[fold['train']])

        train_X_fold = pd.concat(train_X_fold_list)
        train_y_fold = np.concatenate(train_y_fold_list)

        valid_X_fold_list = []
        valid_y_fold_list = []
        for trn_x in train_X:
            valid_X_fold_list.append(trn_x.iloc[fold['val']])
            valid_y_fold_list.append(_label[fold['val']])

        val_X_fold = pd.concat(valid_X_fold_list)
        val_y_fold = np.concatenate(valid_y_fold_list)
        
        print(f"Train size: {len(train_X_fold)}, Valid size: {len(val_X_fold)}")
        
        # Check the train_y_fold. If more than 30% of samples are labeled "None",
        # randomly sample from the "None" to make that ratio no more than 30%.
        none_mask = (train_y_fold == "None")
        none_count = np.sum(none_mask)
        total_samples = len(train_y_fold)
        none_ratio = none_count / total_samples if total_samples > 0 else 0

        if none_ratio > none_ratio_thr:
            # Calculate how many "None" labels we should keep (30% of total)
            max_none_to_keep = int(none_ratio_thr * (total_samples - none_count))

            # Randomly choose which "None" labels to keep
            none_indices = np.where(none_mask)[0]

            # Fix the random seed before shuffling for reproducibility
            rng = np.random.RandomState(f_idx)
            rng.shuffle(none_indices)
            
            keep_none_indices = none_indices[:max_none_to_keep]

            # Indices of all non-"None" labels
            other_indices = np.where(~none_mask)[0]

            # Combine indices to keep and then sort
            new_indices = np.concatenate([keep_none_indices, other_indices])
            new_indices = np.sort(new_indices)  # Sort so we can index the DataFrame consistently

            # Subset the training data
            train_X_fold = train_X_fold.iloc[new_indices]
            train_y_fold = train_y_fold[new_indices]

            print(f"Sampled: none-ratio: {none_ratio}, removed: {none_count - max_none_to_keep}")

        # Create and train Random Forest model
        model = model_class(**params)
        model.fit(train_X_fold, train_y_fold)
        
        classifiers.append(model)
        
        # Calculate score and save predictions on validation set
        val_preds = model.predict(val_X_fold)
        val_proba = model.predict_proba(val_X_fold)
        score = np.mean(val_preds == val_y_fold)
        scores.append(score)
        val_predictions.append({
            'true_labels': val_y_fold,
            'predicted_labels': val_preds,
            'predicted_proba': val_proba,
            'fold_indices': fold['val'],
            'classes': model.classes_
        })
        print(f"Fold score: {score:.4f}")

    print(f"Average score: {np.mean(scores)}")
    return classifiers, scores, val_predictions

### Train the high precision model by allowing None prediction

In [164]:
prec_classifiers = []
prec_scores = []
prec_val_predictions = []

params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': 8  # Use all available cores
}

model_cls = RandomForestClassifier

training_set = [train_X_full, train_X_1, train_X_2]
none_ratio_thr_list = [0.1, 0.15, 0.35, 0.75, 0.85]

for i in range(5):
    print(f"Training level {i}")
    level_params = params.copy()
    level_params['class_weight'] = {c: 1 for c in level_labels[i]}
    
    _classifiers, _scores, _val_predictions = train_random_forest(
        training_set,
        np.array([x[i] for x in padded_label]),
        folds,
        params=params,
        model_class=model_cls,
        none_ratio_thr=none_ratio_thr_list[i]
    )
    prec_classifiers.append(_classifiers)
    prec_scores.append(_scores)
    prec_val_predictions.append(_val_predictions)

Training level 0
Train size: 76413, Valid size: 19104
Fold score: 0.8118
Train size: 76413, Valid size: 19104
Fold score: 0.8115
Train size: 76413, Valid size: 19104
Fold score: 0.8078
Train size: 76413, Valid size: 19104
Fold score: 0.8080
Train size: 76416, Valid size: 19101
Fold score: 0.8120
Average score: 0.8102118509015419
Training level 1
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.3871854265635429, removed: 22562
Fold score: 0.8661
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.3870676455576931, removed: 22552
Fold score: 0.8597
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.3870283852224098, removed: 22549
Fold score: 0.8597
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.38679282321071023, removed: 22528
Fold score: 0.8604
Train size: 76416, Valid size: 19101
Sampled: none-ratio: 0.3868168969849246, removed: 22531
Fold score: 0.8630
Average score: 0.861794274826833
Training level 2
Train size: 76413, Valid size: 19104
Samp

In [165]:
from sklearn.metrics import classification_report, f1_score

eval_all_true_labels = []
eval_all_pred_labels = []

for _eval_level_id in range(5):
    # Stack all 5 levels' predictions and true labels together
    all_true_labels = np.concatenate([_fold['true_labels'] for _fold in prec_val_predictions[_eval_level_id]])
    all_predicted_labels = np.concatenate([_fold['predicted_labels'] for _fold in prec_val_predictions[_eval_level_id]])

    eval_all_true_labels.append(all_true_labels)
    eval_all_pred_labels.append(all_predicted_labels)

    print(f1_score(all_true_labels, all_predicted_labels, average='macro'))

0.6817774031537279
0.710281020649466
0.6913475487047043
0.737877078869601
0.5400968978893542


In [None]:
0.6817774031537279
0.710281020649466
0.6913475487047043
0.737877078869601
0.5400968978893542

0.683075249732744
0.7118744574641389
0.695832274816091
0.7429248141647662
0.5474381779137976

In [180]:
num_dataset = 3
prob_agg = []

for i, _level_pred in enumerate(prec_val_predictions):
    # get prediction for each data set of each fold with label names
    val_dataset = [[] for _ in range(num_dataset)]

    # list of predicted dataset
    val_dataset_pred_res = []

    # For each fold, get the prediction results
    for j, _fold_pred in tqdm(enumerate(_level_pred), total=len(_level_pred), desc=f"Processing folds for level {i}"):
        pred_labels = _fold_pred['classes'] # num_dataset * validation size
        pred_proba = _fold_pred['predicted_proba'] # num_dataset * validation size
        fold_idx = _fold_pred['fold_indices'] # validation size

        # The prediction results are the stack of `num_dataset` datasets, split them
        split_pred_proba = np.array_split(pred_proba, num_dataset)

        # for each prediction results, produce a dict[label: proba] for each sample
        # and a list of fold index to sort the prediction result later
        for k in range(num_dataset):
            labeled_proba = []
            for proba in split_pred_proba[k]:
                labeled_proba.append(dict(zip(pred_labels, proba)))
                
            val_dataset[k].append({
                "pred_proba": labeled_proba,
                "fold_indices": fold_idx
            })

    # concate each fold prediction result to produce a prediction for the whole dataset
    for _nd in range(num_dataset):
        # concate all folds prediction
        _val_pred_probas = sum([_val_fold_pred['pred_proba'] for _val_fold_pred in val_dataset[_nd]], [])
        _val_pred_index = sum([list(_val_fold_pred['fold_indices']) for _val_fold_pred in val_dataset[_nd]], [])

        # sort the predictions according to fold indicies
        sorted_val_pred_probas = [None] * len(_val_pred_probas)
        for curr_idx, f_idx in enumerate(_val_pred_index):
            sorted_val_pred_probas[curr_idx] = _val_pred_probas[f_idx]
        
        # store the sorted results for one dataset
        val_dataset_pred_res.append(sorted_val_pred_probas)
    
    prob_agg.append(val_dataset_pred_res)


casade_feat_ds = []
for _dataset_id in tqdm(range(num_dataset)):
    level_pred_results = []

    # prediction results in each level
    for i in range(5):
        level_pred_res = pd.DataFrame(prob_agg[i][_dataset_id])
        for col in level_pred_res.columns:
            level_pred_res.rename(columns={col: f"{col}_{i}"}, inplace=True)

        level_pred_results.append(level_pred_res)
    
    casade_feat_ds.append(pd.concat(level_pred_results, axis=1))

Processing folds for level 0: 100%|██████████| 5/5 [00:00<00:00,  8.99it/s]
Processing folds for level 1: 100%|██████████| 5/5 [00:02<00:00,  1.78it/s]
Processing folds for level 2: 100%|██████████| 5/5 [00:01<00:00,  2.81it/s]
Processing folds for level 3: 100%|██████████| 5/5 [00:02<00:00,  2.30it/s]
Processing folds for level 4: 100%|██████████| 5/5 [00:00<00:00,  5.02it/s]
100%|██████████| 3/3 [00:05<00:00,  1.95s/it]


In [174]:
casade_feat_cols = ["Sensor_0", "Setpoint_0"]

In [175]:
cascade_prec_classifiers = []
cascade_prec_scores = []
cascade_prec_val_predictions = []

params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': 8  # Use all available cores
}

model_cls = RandomForestClassifier

training_set = [
    pd.concat([train_X_full, casade_feat_ds[0][casade_feat_cols]]),
    pd.concat([train_X_1, casade_feat_ds[1][casade_feat_cols]]),
    pd.concat([train_X_2, casade_feat_ds[2][casade_feat_cols]]),
]
none_ratio_thr_list = [0.1, 0.15, 0.35, 0.75, 0.85]

for i in range(5):
    print(f"Training level {i}")
    _classifiers, _scores, _val_predictions = train_random_forest(
        training_set,
        np.array([x[i] for x in padded_label]),
        folds,
        params=params,
        model_class=model_cls,
        none_ratio_thr=none_ratio_thr_list[i]
    )
    cascade_prec_classifiers.append(_classifiers)
    cascade_prec_scores.append(_scores)
    cascade_prec_val_predictions.append(_val_predictions)

Training level 0
Train size: 76413, Valid size: 19104
Fold score: 0.8101
Train size: 76413, Valid size: 19104
Fold score: 0.8127
Train size: 76413, Valid size: 19104
Fold score: 0.8117
Train size: 76413, Valid size: 19104
Fold score: 0.8097
Train size: 76416, Valid size: 19101
Fold score: 0.8117
Average score: 0.8111749901147011
Training level 1
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.3871854265635429, removed: 22562
Fold score: 0.8653
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.3870676455576931, removed: 22552
Fold score: 0.8608
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.3870283852224098, removed: 22549
Fold score: 0.8594
Train size: 76413, Valid size: 19104
Sampled: none-ratio: 0.38679282321071023, removed: 22528
Fold score: 0.8604
Train size: 76416, Valid size: 19101
Sampled: none-ratio: 0.3868168969849246, removed: 22531
Fold score: 0.8644
Average score: 0.8620665118824846
Training level 2
Train size: 76413, Valid size: 19104
Sam

In [176]:
from sklearn.metrics import classification_report, f1_score

eval_all_true_labels = []
eval_all_pred_labels = []

for _eval_level_id in range(5):
    # Stack all 5 levels' predictions and true labels together
    all_true_labels = np.concatenate([_fold['true_labels'] for _fold in cascade_prec_val_predictions[_eval_level_id]])
    all_predicted_labels = np.concatenate([_fold['predicted_labels'] for _fold in cascade_prec_val_predictions[_eval_level_id]])

    eval_all_true_labels.append(all_true_labels)
    eval_all_pred_labels.append(all_predicted_labels)

    print(f1_score(all_true_labels, all_predicted_labels, average='macro'))

0.683075249732744
0.7118744574641389
0.695832274816091
0.7429248141647662
0.5474381779137976


# Cascade Inference

In [178]:
len(prec_classifiers[0])

5

In [191]:
from typing import List

def prepare_cascade_feat(predictions):
    prob_agg = []

    for i, _level_pred in enumerate(predictions):
        fold_pred_list = []

        # For each fold, get the prediction results
        for j, _fold_pred in tqdm(enumerate(_level_pred), total=len(_level_pred), desc=f"Processing folds for level {i}"):
            pred_labels = _fold_pred['classes'] # num_dataset * validation size
            pred_proba = _fold_pred['predicted_proba'] # num_dataset * validation size
            
            labeled_proba = []
            for proba in pred_proba:
                labeled_proba.append(dict(zip(pred_labels, proba)))
        
            fold_pred_list.append(labeled_proba)

        # Averaging predicted probability for each level
        sample_size = len(fold_pred_list[0])
        avg_fold_pred_proba = []
        for sample_idx in tqdm(range(sample_size), desc=f"Averaging fold results"):
            avg_proba = {}

            for _fold_id in range(len(fold_pred_list)):
                for pred_key, pred_val in fold_pred_list[_fold_id][sample_idx].items():
                    if pred_key not in avg_proba:
                        avg_proba[pred_key] = []
                    elif pred_key in avg_proba:
                        avg_proba[pred_key].append(pred_val)
                    else:
                        raise Exception("You are not supposed to be here...")
            
            for _k in avg_proba:
                avg_proba[_k] = sum(avg_proba[_k]) / len(avg_proba[_k])
            
            avg_fold_pred_proba.append(avg_proba)
        
        prob_agg.append(avg_fold_pred_proba)

    level_pred_results = []

    # prediction results in each level
    for i in range(5):
        level_pred_res = pd.DataFrame(prob_agg[i])
        for col in level_pred_res.columns:
            level_pred_res.rename(columns={col: f"{col}_{i}"}, inplace=True)

        level_pred_results.append(level_pred_res)
    
    return pd.concat(level_pred_results, axis=1)

def cascade_random_forest_inference(
    test_data: pd.DataFrame,
    base_classifier_list: List,
    cascade_classifier_list: List,
    cascade_cols: List[str]
):
    base_predictions = []  # List to store validation predictions

    # Base (1st) stage classification
    for level_models in tqdm(base_classifier_list, desc=f"Inferencing different levels for base classifier"):
        level_base_pred = []
        for f_idx, model in enumerate(level_models):
            val_preds = model.predict(test_data)
            val_proba = model.predict_proba(test_data)

            level_base_pred.append({
                'predicted_labels': val_preds,
                'predicted_proba': val_proba,
                'classes': model.classes_
            })
        
        base_predictions.append(level_base_pred)

    base_pred_df = prepare_cascade_feat(base_predictions)
    
    cascade_feat = base_pred_df[cascade_cols]
    cascade_input = pd.concat([test_data, cascade_feat], axis=1)

    # Cascade (2nd) stage classification
    cascade_level_predictions = []
    for level_models in tqdm(cascade_classifier_list, desc=f"Inferencing different levels for cascade classifier"):
        level_cascade_pred = []
        for f_idx, model in enumerate(level_models):
            val_preds = model.predict(cascade_input)
            val_proba = model.predict_proba(cascade_input)

            level_cascade_pred.append({
                'predicted_labels': val_preds,
                'predicted_proba': val_proba,
                'classes': model.classes_
            })

        cascade_level_predictions.append(level_cascade_pred)

    cascade_pred_df = prepare_cascade_feat(cascade_level_predictions)

    return base_pred_df, cascade_pred_df

In [194]:
cliped_test_X = np.clip(test_X, a_min=None, a_max=np.finfo(np.float32).max)

base_pred_df, cascade_pred_df = cascade_random_forest_inference(
    test_data=cliped_test_X,
    base_classifier_list=prec_classifiers,
    cascade_classifier_list=cascade_prec_classifiers,
    cascade_cols=casade_feat_cols
)

Inferencing different levels for base classifier: 100%|██████████| 5/5 [01:54<00:00, 22.95s/it]
Processing folds for level 0: 100%|██████████| 5/5 [00:08<00:00,  1.76s/it]
Processing folds for level 1: 100%|██████████| 5/5 [00:46<00:00,  9.22s/it]
Processing folds for level 2: 100%|██████████| 5/5 [00:29<00:00,  5.98s/it]
Processing folds for level 3: 100%|██████████| 5/5 [00:37<00:00,  7.42s/it]
Processing folds for level 4: 100%|██████████| 5/5 [00:17<00:00,  3.46s/it]
Inferencing different levels for cascade classifier: 100%|██████████| 5/5 [04:04<00:00, 48.95s/it]
Processing folds for level 0: 100%|██████████| 5/5 [00:19<00:00,  3.98s/it]
Processing folds for level 1: 100%|██████████| 5/5 [01:34<00:00, 18.92s/it]
Processing folds for level 2: 100%|██████████| 5/5 [01:08<00:00, 13.77s/it]
Processing folds for level 3: 100%|██████████| 5/5 [01:21<00:00, 16.30s/it]
Processing folds for level 4: 100%|██████████| 5/5 [00:48<00:00,  9.70s/it]


In [236]:
def prepare_final_results(df):
    res = df.copy()

    # 1) Strip off the suffix "_some_number" from each column
    res.columns = res.columns.str.replace(r'_\d+$', '', regex=True)

    # 2) Group columns by their new names and average any duplicates
    res = res.groupby(level=0, axis=1).mean()

    res['Cooling_Demand_Sensor'] = 0.0
    res['Heating_Demand_Sensor'] = 0.0
    res = res.drop(columns="None")

    return res.astype('float32').round(2)


In [260]:
res_base = prepare_final_results(base_pred_df)
res_casc = prepare_final_results(cascade_pred_df)

In [259]:
zipftest = ZipFile('../downloads/test_X_v0.1.0.zip', 'r')
listtestfile = zipftest.namelist()[1:]
filename_col = pd.DataFrame(data=listtestfile, columns=["filename"])
filename_col['filename'] = filename_col['filename'].apply(lambda x: x.split("/")[1])

In [264]:
final_res_base = pd.concat([filename_col, res_base], axis=1)
final_res_base.to_csv("../logs/submit/5_fold_rf_base_0122.csv", index=False)

In [266]:
pd.read_csv("../logs/submit/5_fold_rf_base_0122.csv")

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor,Cooling_Demand_Sensor,Heating_Demand_Sensor
0,test_X20367.pkl,0.03,0.05,0.02,0.00,0.17,0.00,0.0,0.03,0.01,...,0.04,0.01,0.00,0.01,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,...,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.00,0.00,0.00,0.00,0.10,0.00,0.0,0.00,0.00,...,0.34,0.02,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.02,0.03,0.01,0.04,0.00,0.00,0.0,0.02,0.00,...,0.03,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.00,0.00,0.00,0.30,0.00,0.05,0.0,0.00,0.00,...,0.01,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.00,0.25,0.09,0.00,0.00,0.00,0.0,0.00,0.01,...,0.01,0.00,0.03,0.00,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.02,0.04,0.04,0.00,0.00,0.00,0.0,0.00,0.00,...,0.00,0.00,0.01,0.00,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.04,0.00,0.00,0.00,0.00,0.08,0.0,0.00,0.00,...,0.08,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.00,0.00,0.00,0.00,0.00,0.16,0.0,0.00,0.00,...,0.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0


In [247]:
res_casc

Unnamed: 0,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,Chilled_Water_Return_Temperature_Sensor,...,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor,Cooling_Demand_Sensor,Heating_Demand_Sensor
0,0.03,0.05,0.02,0.01,0.14,0.00,0.0,0.01,0.02,0.01,...,0.02,0.01,0.0,0.0,0.0,0.00,0.0,0.01,0.0,0.0
1,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,...,0.00,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0
2,0.00,0.00,0.00,0.00,0.12,0.00,0.0,0.00,0.00,0.31,...,0.29,0.01,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0
3,0.02,0.06,0.02,0.01,0.00,0.00,0.0,0.01,0.00,0.00,...,0.04,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0
4,0.00,0.00,0.00,0.26,0.00,0.04,0.0,0.00,0.00,0.00,...,0.00,0.00,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631435,0.11,0.05,0.01,0.32,0.21,0.43,0.0,0.04,0.00,0.01,...,0.01,0.01,0.0,0.0,0.0,0.05,0.0,0.41,0.0,0.0
631436,0.11,0.05,0.01,0.32,0.21,0.43,0.0,0.04,0.00,0.01,...,0.01,0.01,0.0,0.0,0.0,0.05,0.0,0.41,0.0,0.0
631437,0.11,0.05,0.01,0.32,0.21,0.43,0.0,0.04,0.00,0.01,...,0.01,0.01,0.0,0.0,0.0,0.05,0.0,0.41,0.0,0.0
631438,0.11,0.05,0.01,0.32,0.21,0.43,0.0,0.04,0.00,0.01,...,0.01,0.01,0.0,0.0,0.0,0.05,0.0,0.41,0.0,0.0


In [246]:
res_casc['filename'] = listtestfile
res_casc.to_csv("../logs/submit/5_fold_rf_casc_0122.csv", index=False)

ValueError: Length of values (315720) does not match length of index (631440)

In [421]:
cliped_text_X = np.clip(test_X, a_min=None, a_max=np.finfo(np.float32).max)

In [422]:
def make_predictions_with_models(classifiers, test_data):
    """
    Make probability predictions using multiple classifier models
    
    Args:
        classifiers: List of trained classifier models
        test_data: Test data to make predictions on
        
    Returns:
        List of probability predictions from each classifier
    """
    test_preds_all = []
    for clf in tqdm(classifiers):
        pred = clf.predict_proba(test_data)
        test_preds_all.append(pred)
    return test_preds_all

In [423]:
def align_and_combine_predictions(classifiers, test_preds_all, test_data, threshold=0.0):
    """
    Aligns predictions from multiple classifiers and combines them through averaging
    
    Args:
        classifiers: List of trained classifier models
        test_preds_all: List of probability predictions from each classifier
        test_data: Test data used for predictions
        threshold: Minimum probability threshold for making predictions
        
    Returns:
        Final class predictions after aligning and combining probabilities
    """
    # Get the common classes across all classifiers
    all_classes = classifiers[0].classes_
    test_preds_aligned = []

    # Make predictions with each fold's model and align them 
    for i, clf in tqdm(enumerate(classifiers)):
        pred = test_preds_all[i]
        # Create a mapping to align predictions with common classes
        pred_dict = {_cls: idx for idx, _cls in enumerate(clf.classes_)}
        aligned_pred = np.zeros((len(test_data), len(all_classes)))
        
        for i, _cls in enumerate(all_classes):
            if _cls in pred_dict:
                aligned_pred[:, i] = pred[:, pred_dict[_cls]]
        
        test_preds_aligned.append(aligned_pred)

    # Stack and average the aligned predictions
    test_preds_all = np.stack(test_preds_aligned)
    test_preds_proba = test_preds_all.mean(axis=0)

    # Get max probabilities for each prediction
    max_probs = np.max(test_preds_proba, axis=1)
    
    # Convert probabilities to class predictions, using threshold
    test_preds = np.array(['None'] * len(test_data), dtype=object)
    confident_mask = max_probs >= threshold
    test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]
    
    return test_preds

In [424]:
test_preds_list = []
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models(prec_classifiers[i], cliped_text_X)
    test_preds_list.append(align_and_combine_predictions(prec_classifiers[i], test_preds_all, cliped_text_X))

Predicting level 0


100%|██████████| 5/5 [00:08<00:00,  1.75s/it]
5it [00:00, 53.19it/s]


Predicting level 1


100%|██████████| 5/5 [00:19<00:00,  3.94s/it]
5it [00:01,  2.61it/s]


Predicting level 2


100%|██████████| 5/5 [00:11<00:00,  2.34s/it]
5it [00:00,  6.38it/s]


Predicting level 3


100%|██████████| 5/5 [00:13<00:00,  2.60s/it]
5it [00:01,  4.00it/s]


Predicting level 4


100%|██████████| 5/5 [00:06<00:00,  1.29s/it]
5it [00:00, 19.56it/s]


In [425]:
# Convert to array and process None values
stacked = np.stack(test_preds_list).transpose()
for row in tqdm(stacked):
    # Find first occurrence of 'None' if any
    none_idx = np.where(row == 'None')[0]
    if len(none_idx) > 0:
        # Set all elements after first None to None
        first_none = none_idx[0]
        row[first_none:] = 'None'
        
stacked

100%|██████████| 315720/315720 [00:02<00:00, 121793.19it/s]


array([['Sensor', 'Pressure_Sensor', 'None', 'None', 'None'],
       ['Sensor', 'Flow_Sensor', 'Water_Flow_Sensor',
        'Chilled_Water_Supply_Flow_Sensor', 'None'],
       ['Setpoint', 'Temperature_Setpoint', 'None', 'None', 'None'],
       ...,
       ['Sensor', 'None', 'None', 'None', 'None'],
       ['Sensor', 'None', 'None', 'None', 'None'],
       ['Alarm', 'None', 'None', 'None', 'None']], dtype=object)

In [200]:
columnlist = ['Active_Power_Sensor', 'Air_Flow_Sensor',
       'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
       'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
       'Average_Zone_Air_Temperature_Sensor',
       'Chilled_Water_Differential_Temperature_Sensor',
       'Chilled_Water_Return_Temperature_Sensor',
       'Chilled_Water_Supply_Flow_Sensor',
       'Chilled_Water_Supply_Temperature_Sensor', 'Command',
       'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
       'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
       'Cooling_Temperature_Setpoint', 'Current_Sensor',
       'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
       'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
       'Differential_Pressure_Setpoint',
       'Differential_Supply_Return_Water_Temperature_Sensor',
       'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
       'Discharge_Air_Temperature_Setpoint',
       'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
       'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
       'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
       'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
       'Heating_Supply_Air_Temperature_Deadband_Setpoint',
       'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
       'Hot_Water_Return_Temperature_Sensor',
       'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
       'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
       'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
       'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
       'Outside_Air_Humidity_Sensor',
       'Outside_Air_Lockout_Temperature_Setpoint',
       'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
       'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
       'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
       'Reactive_Power_Sensor', 'Reset_Setpoint',
       'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
       'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
       'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
       'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
       'Supply_Air_Static_Pressure_Sensor',
       'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
       'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
       'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
       'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
       'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
       'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
       'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
       'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
       'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [427]:
zipftest = ZipFile('../downloads/test_X_v0.1.0.zip', 'r')

In [428]:
listtestfile = zipftest.namelist()[1:]

In [429]:
stackedfinalresult = pd.DataFrame(columns=['filename'])
stackedfinalresult['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

for labelname in columnlist:
    stackedfinalresult[labelname] = 0

test_preds = stacked
for i in tqdm(range(len(test_preds))):
    # stackedfinalresult.loc[i, test_preds[i]] = 1
    predlist = test_preds[i].tolist()
    predlist = [x for x in predlist if x != 'None']
    for predlabelname in predlist:
    	stackedfinalresult.loc[i, predlabelname] = 1

100%|██████████| 315720/315720 [01:08<00:00, 4587.14it/s]


In [430]:
stackedfinalresult = stackedfinalresult.assign(**{col: stackedfinalresult[col].astype(float) for col in stackedfinalresult.columns if col != "filename"})

In [431]:
stackedfinalresult.to_csv("../logs/submit/hier_rf_slide_aug_2_label_fix_2_unfixed.csv", index=False)

In [333]:
stackedfinalresult

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X20367.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [432]:
target_cols = ["Power_Sensor", "Demand_Sensor", "Electrical_Power_Sensor", "Air_Temperature_Setpoint", "Cooling_Temperature_Setpoint", "Heating_Temperature_Setpoint"]
# target_cols = ["Active_Power_Sensor", "Air_Flow_Sensor", "Air_Flow_Setpoint", "Air_Temperature_Sensor", "Air_Temperature_Setpoint", "Alarm"]

In [251]:
fixed_label_res = pd.read_csv("../logs/submit/hier_rf_slide_aug_2_label_fix_2_unfixed.csv")
fixed_label_res

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X20367.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [434]:
# 0.519
for col in target_cols:
    print(col, fixed_label_res[col].value_counts().iloc[1])

Power_Sensor 48746
Demand_Sensor 12087
Electrical_Power_Sensor 44809
Air_Temperature_Setpoint 8542
Cooling_Temperature_Setpoint 1662
Heating_Temperature_Setpoint 1061


In [435]:
unfixed_label_res = pd.read_csv("../logs/submit/hier_rf_slide_aug_2_label_fix_2_power_to_demand.csv")

In [436]:
# 0.512
for col in target_cols:
    print(col, unfixed_label_res[col].value_counts().iloc[1])

Power_Sensor 45111
Demand_Sensor 12087
Electrical_Power_Sensor 44843
Air_Temperature_Setpoint 7950
Cooling_Temperature_Setpoint 1638
Heating_Temperature_Setpoint 1549
