In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

In [11]:
train_y = pd.read_csv("../downloads/train_y_v0.1.0.csv")

# Prepare features

### Prepare pre-extracted features

In [2]:
raw_train_X_full = pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_full_v3.csv")
raw_train_X_1 = pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split1_2_v3.csv")
raw_train_X_2 = pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split2_2_v3.csv")

In [3]:
raw_test_X = pd.read_csv("../downloads/test_features_full_v3.csv")

In [4]:
pre_feature_list = ['0_Absolute energy',
 '0_Area under the curve',
 '0_Autocorrelation',
 '0_Average power',
 '0_Centroid',
 '0_ECDF Percentile Count_0',
 '0_ECDF Percentile Count_1',
 '0_ECDF Percentile_0',
 '0_ECDF Percentile_1',
 '0_ECDF_0',
 '0_ECDF_1',
 '0_ECDF_2',
 '0_ECDF_3',
 '0_ECDF_4',
 '0_ECDF_5',
 '0_ECDF_6',
 '0_ECDF_7',
 '0_ECDF_8',
 '0_ECDF_9',
 '0_Entropy',
 '0_Histogram mode',
 '0_Interquartile range',
 '0_Kurtosis',
 '0_Max',
 '0_Mean',
 '0_Mean absolute deviation',
 '0_Mean absolute diff',
 '0_Mean diff',
 '0_Median',
 '0_Median absolute deviation',
 '0_Median absolute diff',
 '0_Median diff',
 '0_Min',
 '0_Negative turning points',
 '0_Neighbourhood peaks',
 '0_Peak to peak distance',
 '0_Positive turning points',
 '0_Root mean square',
 '0_Signal distance',
 '0_Skewness',
 '0_Slope',
 '0_Standard deviation',
 '0_Sum absolute diff',
 '0_Variance',
 '0_Zero crossing rate',
 '0_Fundamental frequency',
 '0_Human range energy',
 '0_Max power spectrum',
 '0_Maximum frequency',
 '0_Median frequency',
 '0_Power bandwidth',
 '0_Wavelet entropy',
 'value_median',
 'value_mean',
 'value_qmean',
 'value_max',
 'value_min',
 'value_maxmin',
 'value_diffmax',
 'value_diffmin',
 'value_diffmean',
 'value_diffqmean',
 'value_diffmedian',
 'value_diffmaxmin',
 'time_diffmean',
 'time_diffqmean',
 'time_diffmax',
 'time_diffmin',
 'time_diffmedian',
 'value_std',
 'value_var',
 'value_diffstd',
 'value_diffvar',
 'time_diffstd',
 'time_diffvar',
 'time_burstiness',
 'time_total',
 'time_event_density',
 'time_entropy',
 'time_slope'
]

In [5]:
train_X_full = raw_train_X_full[pre_feature_list]
train_X_1 = raw_train_X_1[pre_feature_list]
train_X_2 = raw_train_X_2[pre_feature_list]
test_X = raw_test_X[pre_feature_list]

### Add spectrogram clustering feature

In [6]:
# trn_zipf = ZipFile("../downloads/train_X_v0.1.0.zip")
# trn_filenames = list(trn_zipf.namelist()[1:])

In [7]:
# spec_cluster_dist = torch.load("../downloads/distances_to_centers.pt")

In [8]:
# max_indices = spec_cluster_dist.argmax(dim=1)
# spec_cluster = torch.zeros_like(spec_cluster_dist)
# spec_cluster[torch.arange(spec_cluster_dist.size(0)), max_indices] = 1

# spec_trn_dist = spec_cluster[:len(train_y)]
# spec_tst_dist = spec_cluster[len(train_y):]

In [9]:
# spec_feat_cols = [f"dist_to_spec_cluster_{i}" for i in range(6)]

In [12]:
# spec_trn_feat = pd.DataFrame(spec_trn_dist, columns=spec_feat_cols)
# spec_trn_feat['index'] = trn_filenames

# spec_trn_feat['index'] = spec_trn_feat['index'].apply(lambda x: x.split("/train_X")[-1].split(".")[0]).astype(int)
# spec_trn_feat = spec_trn_feat.sort_values(by='index')
# spec_trn_feat = spec_trn_feat.drop(columns='index').reset_index(drop=True)

In [13]:
# # Add spec cluster into train feature
# train_X_full = pd.concat([train_X_full, spec_trn_feat], axis=1)
# train_X_1 = pd.concat([train_X_1, spec_trn_feat], axis=1)
# train_X_2 = pd.concat([train_X_2, spec_trn_feat], axis=1)

In [14]:
# test_X = pd.concat([
#     test_X,
#     pd.DataFrame(spec_tst_dist, columns=spec_feat_cols)
# ], axis=1)

In [15]:
# feature_list = pre_feature_list + spec_feat_cols

# Prepare labels

In [16]:
def get_active_labels_np(row):
    """More efficient version using numpy"""
    arr = row.to_numpy() # convert to numpy array
    indices = np.where(arr == 1)[0] # get indices where value is 1
    labels = row.index[indices].tolist() # get labels from indices
    return labels

labelhir = train_y.apply(get_active_labels_np, axis=1).tolist()

In [17]:
level_labels = [list(train_y.columns[1:]), [], [], [], []]

for k in range(0, 4):
    check_labels = level_labels[k]
    label_len = len(check_labels)
    idx_is_subset_of_col = pd.DataFrame(0, index=check_labels, columns=check_labels)
    is_subset = []

    for i in tqdm(range(label_len)):
        for j in range(label_len):
            src_lb, tgt_lb = check_labels[i], check_labels[j]
            src = train_y[train_y[src_lb] == 1]
            tgt = train_y[(train_y[src_lb] == 1) & (train_y[tgt_lb] == 1)]

            idx_is_subset_of_col.loc[src_lb, tgt_lb] = len(src) <= len(tgt)
            if len(src) <= len(tgt) and src_lb != tgt_lb:
                is_subset.append([src_lb, tgt_lb])

    remove_label = set([s[0] for s in is_subset])
    print(f"Level {k}")
    print(is_subset)
    print(remove_label)
    print()
    
    for rl in remove_label:
        level_labels[k].remove(rl)
        level_labels[k+1].append(rl)

100%|██████████| 94/94 [00:14<00:00,  6.45it/s]


Level 0
[['Active_Power_Sensor', 'Electrical_Power_Sensor'], ['Active_Power_Sensor', 'Power_Sensor'], ['Active_Power_Sensor', 'Sensor'], ['Air_Flow_Sensor', 'Flow_Sensor'], ['Air_Flow_Sensor', 'Sensor'], ['Air_Flow_Setpoint', 'Flow_Setpoint'], ['Air_Flow_Setpoint', 'Setpoint'], ['Air_Temperature_Sensor', 'Sensor'], ['Air_Temperature_Sensor', 'Temperature_Sensor'], ['Air_Temperature_Setpoint', 'Setpoint'], ['Air_Temperature_Setpoint', 'Temperature_Setpoint'], ['Angle_Sensor', 'Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Chilled_Water_Return_Temperature_Sensor',

100%|██████████| 88/88 [00:11<00:00,  7.60it/s]


Level 1
[['Water_Temperature_Setpoint', 'Temperature_Setpoint'], ['Min_Air_Temperature_Setpoint', 'Temperature_Setpoint'], ['Min_Air_Temperature_Setpoint', 'Air_Temperature_Setpoint'], ['Discharge_Water_Temperature_Sensor', 'Temperature_Sensor'], ['Discharge_Water_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Outside_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Outside_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Low_Outside_Air_Temperature_Enable_Setpoint', 'Temperature_Setpoint'], ['Low_Outside_Air_Temperature_Enable_Setpoint', 'Air_Temperature_Setpoint'], ['Low_Outside_Air_Temperature_Enable_Setpoint', 'Outside_Air_Temperature_Setpoint'], ['Load_Current_Sensor', 'Current_Sensor'], ['Heating_Temperature_Setpoint', 'Temperature_Setpoint'], ['Hot_Water_Flow_Sensor', 'Water_Flow_Sensor'], ['Hot_Water_Flow_Sensor', 'Flow_Sensor'], ['Supply_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Supply_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Air_Temperature_Set

100%|██████████| 53/53 [00:04<00:00, 12.79it/s]


Level 2
[['Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Heating_Supply_Air_Temperature_Deadband_Setpoint', 'Heating_Temperature_Setpoint'], ['Heating_Supply_Air_Temperature_Deadband_Setpoint', 'Air_Temperature_Setpoint'], ['Hot_Water_Supply_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Hot_Water_Return_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Hot_Water_Return_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Cooling_Supply_Air_Temperature_Deadband_Setpoint', 'Cooling_Temperature_Setpoint'], ['Cooling_Supply_Air_Temperature_Deadband_Setpoint', 'Air_Temperature_Setpoint'], ['Min_Air_Temperature_Setpoint', 'Air_Temperature_Setpoint'], ['Discharge_Water_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Supply_Air_Temperature_Setpoint', 'Air_Temperature_Setpoint'], ['Outside_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Chilled_Water_Supply_Flow_Sensor', 'Water_Fl

100%|██████████| 34/34 [00:01<00:00, 19.56it/s]

Level 3
[['Hot_Water_Return_Temperature_Sensor', 'Return_Water_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Heating_Supply_Air_Temperature_Deadband_Setpoint'], ['Cooling_Demand_Sensor', 'Hot_Water_Supply_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Hot_Water_Return_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Cooling_Supply_Air_Temperature_Deadband_Setpoint'], ['Cooling_Demand_Sensor', 'Min_Air_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Discharge_Water_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Supply_Air_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Outside_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Chilled_Water_Supply_Flow_Sensor'], ['Cooling_Demand_Sensor', 'Outside_Air_Lockout_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Discharge_Air_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Peak_Power_Demand_Sensor'], ['Cooling_Demand_Sensor', 'Filter_Differential_Pressure_Sensor'], 




In [18]:
tiers = {
    1: level_labels[0],
    2: level_labels[1],
    3: level_labels[2],
    4: level_labels[3],
    5: level_labels[4]
}

def get_tier(label):
    for tier_num, tier_list in tiers.items():
        if label in tier_list:
            return tier_num
    return None  # Handle cases where the label isn't found in any tier

def sort_labels(labels):
    return sorted(labels, key=lambda label: (get_tier(label) or float('inf'), label))


In [19]:
sorted_labelhir = [sort_labels(labels) for labels in labelhir]

In [20]:
label_hier = np.array(
    sorted_labelhir,
    dtype=object,
)

In [21]:
padded_label = pd.Series(label_hier).apply(lambda x: x + ['None'] * (5 - len(x)) if len(x) < 5 else x)

In [22]:
# Count Nones at each level
for i in range(5):
    none_count = sum(padded_label.apply(lambda x: x[i] == 'None'))
    print(f"Level {i+1}: {none_count} None values out of {len(padded_label)} total ({none_count/len(padded_label):.2%})")

Level 1: 0 None values out of 31839 total (0.00%)
Level 2: 12321 None values out of 31839 total (38.70%)
Level 3: 20247 None values out of 31839 total (63.59%)
Level 4: 27216 None values out of 31839 total (85.48%)
Level 5: 30936 None values out of 31839 total (97.16%)


# Model Training

In [40]:
from typing import List

def train_random_forest(
    train_X: List[pd.DataFrame],
    _label,
    model_class,
    params: dict,
    none_ratio_thr: float,
):
    """
    Train random forest models without k-fold cross validation
    
    Args:
        train_X: Training features DataFrame
        _label: Array of labels
        folds: List of dictionaries containing train/val indices
        
    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation predictions)
    """
    
    # Prepare train and validation data for this fold
    train_X_fold_list = []
    train_y_fold_list = []
    for trn_x in train_X:
        train_X_fold_list.append(trn_x)
        train_y_fold_list.append(_label)

    train_X_fold = pd.concat(train_X_fold_list)
    train_y_fold = np.concatenate(train_y_fold_list)
    
    print(f"Train size: {len(train_X_fold)}")
    
    # Check the train_y_fold. If more than 30% of samples are labeled "None",
    # randomly sample from the "None" to make that ratio no more than 30%.
    none_mask = (train_y_fold == "None")
    none_count = np.sum(none_mask)
    total_samples = len(train_y_fold)
    none_ratio = none_count / total_samples if total_samples > 0 else 0

    if none_ratio > none_ratio_thr:
        # Calculate how many "None" labels we should keep (30% of total)
        max_none_to_keep = int(none_ratio_thr * (total_samples - none_count))

        # Randomly choose which "None" labels to keep
        none_indices = np.where(none_mask)[0]

        # Fix the random seed before shuffling for reproducibility
        rng = np.random.RandomState(42)
        rng.shuffle(none_indices)
        
        keep_none_indices = none_indices[:max_none_to_keep]

        # Indices of all non-"None" labels
        other_indices = np.where(~none_mask)[0]

        # Combine indices to keep and then sort
        new_indices = np.concatenate([keep_none_indices, other_indices])
        new_indices = np.sort(new_indices)  # Sort so we can index the DataFrame consistently

        # Subset the training data
        train_X_fold = train_X_fold.iloc[new_indices]
        train_y_fold = train_y_fold[new_indices]

        print(f"Sampled: none-ratio: {none_ratio}, removed: {none_count - max_none_to_keep}")

    # Create and train Random Forest model
    model = model_class(**params)
    model.fit(train_X_fold, train_y_fold)
    
    return model

### Train the high precision model by allowing None prediction

In [41]:
prec_classifiers = []

params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': 8  # Use all available cores
}

model_cls = RandomForestClassifier

none_ratio_thr_list = [0.1, 0.15, 0.35, 0.75, 0.85]

for i in range(5):
    print(f"Training level {i}")
    _classifier = train_random_forest(
        [train_X_full, train_X_1, train_X_2],
        np.array([x[i] for x in padded_label]),
        params=params,
        model_class=model_cls,
        none_ratio_thr=none_ratio_thr_list[i]
    )
    prec_classifiers.append(_classifier)

Training level 0
Train size: 95517
Training level 1
Train size: 95517
Sampled: none-ratio: 0.3869782342410252, removed: 28180
Training level 2
Train size: 95517
Sampled: none-ratio: 0.635918213511731, removed: 48570
Training level 3
Train size: 95517
Sampled: none-ratio: 0.8548007161028927, removed: 71247
Training level 4
Train size: 95517
Sampled: none-ratio: 0.9716385564873269, removed: 90506


In [42]:
cliped_text_X = np.clip(test_X, a_min=None, a_max=np.finfo(np.float32).max)

In [43]:
def make_predictions_with_models(classifiers, test_data):
    """
    Make probability predictions using multiple classifier models
    
    Args:
        classifiers: List of trained classifier models
        test_data: Test data to make predictions on
        
    Returns:
        List of probability predictions from each classifier
    """
    test_preds_all = []
    for clf in tqdm(classifiers):
        pred = clf.predict_proba(test_data)
        test_preds_all.append(pred)
    return test_preds_all

In [44]:
def align_and_combine_predictions(classifiers, test_preds_all, test_data, threshold=0.0):
    """
    Aligns predictions from multiple classifiers and combines them through averaging
    
    Args:
        classifiers: List of trained classifier models
        test_preds_all: List of probability predictions from each classifier
        test_data: Test data used for predictions
        threshold: Minimum probability threshold for making predictions
        
    Returns:
        Final class predictions after aligning and combining probabilities
    """
    # Get the common classes across all classifiers
    all_classes = classifiers[0].classes_
    test_preds_aligned = []

    # Make predictions with each fold's model and align them 
    for i, clf in tqdm(enumerate(classifiers)):
        pred = test_preds_all[i]
        # Create a mapping to align predictions with common classes
        pred_dict = {_cls: idx for idx, _cls in enumerate(clf.classes_)}
        aligned_pred = np.zeros((len(test_data), len(all_classes)))
        
        for i, _cls in enumerate(all_classes):
            if _cls in pred_dict:
                aligned_pred[:, i] = pred[:, pred_dict[_cls]]
        
        test_preds_aligned.append(aligned_pred)

    # Stack and average the aligned predictions
    test_preds_all = np.stack(test_preds_aligned)
    test_preds_proba = test_preds_all.mean(axis=0)

    # Get max probabilities for each prediction
    max_probs = np.max(test_preds_proba, axis=1)
    
    # Convert probabilities to class predictions, using threshold
    test_preds = np.array(['None'] * len(test_data), dtype=object)
    confident_mask = max_probs >= threshold
    test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]
    
    return test_preds

In [45]:
test_preds_list = []
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models([prec_classifiers[i]], cliped_text_X)
    test_preds_list.append(align_and_combine_predictions([prec_classifiers[i]], test_preds_all, cliped_text_X))

Predicting level 0


100%|██████████| 1/1 [00:01<00:00,  1.79s/it]
1it [00:00, 70.47it/s]


Predicting level 1


100%|██████████| 1/1 [00:03<00:00,  3.92s/it]
1it [00:00,  3.02it/s]


Predicting level 2


100%|██████████| 1/1 [00:02<00:00,  2.47s/it]
1it [00:00,  5.49it/s]


Predicting level 3


100%|██████████| 1/1 [00:02<00:00,  2.40s/it]
1it [00:00,  5.17it/s]


Predicting level 4


100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
1it [00:00, 21.05it/s]


In [46]:
# Convert to array and process None values
stacked = np.stack(test_preds_list).transpose()
for row in tqdm(stacked):
    # Find first occurrence of 'None' if any
    none_idx = np.where(row == 'None')[0]
    if len(none_idx) > 0:
        # Set all elements after first None to None
        first_none = none_idx[0]
        row[first_none:] = 'None'
        
stacked

100%|██████████| 315720/315720 [00:02<00:00, 106443.35it/s]


array([['Sensor', 'Current_Sensor', 'None', 'None', 'None'],
       ['Sensor', 'Flow_Sensor', 'Water_Flow_Sensor',
        'Chilled_Water_Supply_Flow_Sensor', 'None'],
       ['Setpoint', 'Temperature_Setpoint', 'None', 'None', 'None'],
       ...,
       ['Sensor', 'Position_Sensor', 'None', 'None', 'None'],
       ['Sensor', 'Position_Sensor', 'None', 'None', 'None'],
       ['Alarm', 'None', 'None', 'None', 'None']], dtype=object)

In [47]:
columnlist = ['Active_Power_Sensor', 'Air_Flow_Sensor',
       'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
       'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
       'Average_Zone_Air_Temperature_Sensor',
       'Chilled_Water_Differential_Temperature_Sensor',
       'Chilled_Water_Return_Temperature_Sensor',
       'Chilled_Water_Supply_Flow_Sensor',
       'Chilled_Water_Supply_Temperature_Sensor', 'Command',
       'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
       'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
       'Cooling_Temperature_Setpoint', 'Current_Sensor',
       'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
       'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
       'Differential_Pressure_Setpoint',
       'Differential_Supply_Return_Water_Temperature_Sensor',
       'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
       'Discharge_Air_Temperature_Setpoint',
       'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
       'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
       'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
       'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
       'Heating_Supply_Air_Temperature_Deadband_Setpoint',
       'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
       'Hot_Water_Return_Temperature_Sensor',
       'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
       'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
       'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
       'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
       'Outside_Air_Humidity_Sensor',
       'Outside_Air_Lockout_Temperature_Setpoint',
       'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
       'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
       'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
       'Reactive_Power_Sensor', 'Reset_Setpoint',
       'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
       'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
       'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
       'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
       'Supply_Air_Static_Pressure_Sensor',
       'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
       'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
       'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
       'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
       'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
       'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
       'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
       'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
       'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [48]:
zipftest = ZipFile('../downloads/test_X_v0.1.0.zip', 'r')

In [49]:
listtestfile = zipftest.namelist()[1:]

In [None]:
stackedfinalresult = pd.DataFrame(columns=['filename'])
stackedfinalresult['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

for labelname in columnlist:
    stackedfinalresult[labelname] = 0

test_preds = stacked
for i in tqdm(range(len(test_preds))):
    # stackedfinalresult.loc[i, test_preds[i]] = 1
    predlist = test_preds[i].tolist()
    predlist = [x for x in predlist if x != 'None']
    for predlabelname in predlist:
    	stackedfinalresult.loc[i, predlabelname] = 1

 11%|█▏        | 35571/315720 [00:08<01:02, 4464.15it/s]

In [36]:
stackedfinalresult = stackedfinalresult.assign(**{col: stackedfinalresult[col].astype(float) for col in stackedfinalresult.columns if col != "filename"})

In [37]:
stackedfinalresult.to_csv("../logs/submit/hier_rf_slide_aug_2_no_cv_dyn_none_thr.csv", index=False)

In [38]:
stackedfinalresult

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X20367.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
