In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

# Data split

In [844]:
def create_folds(train_y, n_splits=10):
    # Create a label array for stratification
    # We'll use the first non-zero label for each row as the stratification target
    stratify_labels = []
    for _, row in train_y.iterrows():
        labels = row[train_y.columns != 'filename'].values
        # Get first non-negative label, or 0 if all negative
        first_positive = next((i for i, x in enumerate(labels) if x >= 0), 0)
        stratify_labels.append(first_positive)
    
    # Create StratifiedKFold object
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Generate fold indices
    folds = []
    for train_idx, val_idx in skf.split(train_y, stratify_labels):
        folds.append({
            'train': train_idx,
            'val': val_idx
        })
    
    return folds

In [845]:
train_y = pd.read_csv("../downloads/train_y_v0.1.0.csv")

In [846]:
folds = create_folds(train_y)

# Prepare features

### Prepare pre-extracted features

In [850]:
raw_train_sets = [
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_full_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split1_2_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split2_2_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split1_3_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split2_3_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split3_3_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split1_4_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split2_4_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split3_4_v3.csv"),
    pd.read_csv("../downloads/train_data_features_v3_fixed/train_features_split4_4_v3.csv")
]

In [851]:
raw_test_X = pd.read_csv("../downloads/test_features_full_v3.csv")

In [852]:
pre_feature_list = ['0_Absolute energy',
 '0_Area under the curve',
 '0_Autocorrelation',
 '0_Average power',
 '0_Centroid',
 '0_ECDF Percentile Count_0',
 '0_ECDF Percentile Count_1',
 '0_ECDF Percentile_0',
 '0_ECDF Percentile_1',
 '0_ECDF_0',
 '0_ECDF_1',
 '0_ECDF_2',
 '0_ECDF_3',
 '0_ECDF_4',
 '0_ECDF_5',
 '0_ECDF_6',
 '0_ECDF_7',
 '0_ECDF_8',
 '0_ECDF_9',
 '0_Entropy',
 '0_Histogram mode',
 '0_Interquartile range',
 '0_Kurtosis',
 '0_Max',
 '0_Mean',
 '0_Mean absolute deviation',
 '0_Mean absolute diff',
 '0_Mean diff',
 '0_Median',
 '0_Median absolute deviation',
 '0_Median absolute diff',
 '0_Median diff',
 '0_Min',
 '0_Negative turning points',
 '0_Neighbourhood peaks',
 '0_Peak to peak distance',
 '0_Positive turning points',
 '0_Root mean square',
 '0_Signal distance',
 '0_Skewness',
 '0_Slope',
 '0_Standard deviation',
 '0_Sum absolute diff',
 '0_Variance',
 '0_Zero crossing rate',
 '0_Fundamental frequency',
 '0_Human range energy',
 '0_Max power spectrum',
 '0_Maximum frequency',
 '0_Median frequency',
 '0_Power bandwidth',
 '0_Wavelet entropy',
 'value_median',
 'value_mean',
 'value_qmean',
 'value_max',
 'value_min',
 'value_maxmin',
 'value_diffmax',
 'value_diffmin',
 'value_diffmean',
 'value_diffqmean',
 'value_diffmedian',
 'value_diffmaxmin',
 'time_diffmean',
 'time_diffqmean',
 'time_diffmax',
 'time_diffmin',
 'time_diffmedian',
 'value_std',
 'value_var',
 'value_diffstd',
 'value_diffvar',
 'time_diffstd',
 'time_diffvar',
 'time_burstiness',
 'time_total',
 'time_event_density',
 'time_entropy',
 'time_slope'
]

In [853]:
train_sets = [trn[pre_feature_list] for trn in raw_train_sets]
test_X = raw_test_X[pre_feature_list]

### Add spectrogram clustering feature

In [854]:
# trn_zipf = ZipFile("../downloads/train_X_v0.1.0.zip")
# trn_filenames = list(trn_zipf.namelist()[1:])

In [855]:
# spec_cluster_dist = torch.load("../downloads/distances_to_centers.pt")

In [856]:
# max_indices = spec_cluster_dist.argmax(dim=1)
# spec_cluster = torch.zeros_like(spec_cluster_dist)
# spec_cluster[torch.arange(spec_cluster_dist.size(0)), max_indices] = 1

# spec_trn_dist = spec_cluster[:len(train_y)]
# spec_tst_dist = spec_cluster[len(train_y):]

In [857]:
# spec_feat_cols = [f"dist_to_spec_cluster_{i}" for i in range(6)]

In [858]:
# spec_trn_feat = pd.DataFrame(spec_trn_dist, columns=spec_feat_cols)
# spec_trn_feat['index'] = trn_filenames

# spec_trn_feat['index'] = spec_trn_feat['index'].apply(lambda x: x.split("/train_X")[-1].split(".")[0]).astype(int)
# spec_trn_feat = spec_trn_feat.sort_values(by='index')
# spec_trn_feat = spec_trn_feat.drop(columns='index').reset_index(drop=True)

In [859]:
# # Add spec cluster into train feature
# train_X_full = pd.concat([train_X_full, spec_trn_feat], axis=1)
# train_X_1 = pd.concat([train_X_1, spec_trn_feat], axis=1)
# train_X_2 = pd.concat([train_X_2, spec_trn_feat], axis=1)

In [860]:
# test_X = pd.concat([
#     test_X,
#     pd.DataFrame(spec_tst_dist, columns=spec_feat_cols)
# ], axis=1)

In [861]:
# feature_list = pre_feature_list + spec_feat_cols

# Prepare labels

In [862]:
def get_active_labels_np(row):
    """More efficient version using numpy"""
    arr = row.to_numpy() # convert to numpy array
    indices = np.where(arr == 1)[0] # get indices where value is 1
    labels = row.index[indices].tolist() # get labels from indices
    return labels

labelhir = train_y.apply(get_active_labels_np, axis=1).tolist()

In [863]:
# # Get a tier dict
# ontology_list = list(train_y.columns[1:])

# from collections import defaultdict

# def build_tree(onto):
#     """
#     Build a tree so that each term has at most one parent.
#     The parent is determined by the longest existing term that is a substring of the child.
#     """
#     # Sort terms by length so that broader terms are processed (and assigned) first
#     sorted_onto = sorted(onto, key=len)
    
#     # Dictionaries for storing parent-child relationships
#     parent_map = {}             # term -> parent
#     children_map = defaultdict(list)  # parent -> [children]

#     processed = []
    
#     for term in sorted_onto:
#         # Find all processed terms that are substrings of 'term'
#         potential_parents = [p for p in processed if p in term]
        
#         if not potential_parents:
#             # No parent found; this term is at the root
#             parent_map[term] = None
#             children_map[None].append(term)
#         else:
#             # Pick the longest parent (closest match)
#             parent = max(potential_parents, key=len)
#             parent_map[term] = parent
#             children_map[parent].append(term)
        
#         processed.append(term)
    
#     return parent_map, children_map

# """
# Re-built hierachical labels
# """
# level_labels = [[], [], [], [], []]

# def print_tree(children_map, root=None, depth=0):
#     """
#     Recursively print the tree structure with indentation.
#     'root=None' means we are listing top-level (root) terms first.
#     """
    
#     if root is None:
#         # For all top-level terms
#         for child in sorted(children_map[root]):
#             print_tree(children_map, child, depth)
#     else:
#         # print("  " * depth + root)
#         level_labels[depth].append(root)
#         for child in sorted(children_map[root]):
#             print_tree(children_map, child, depth + 1)

# parent_map, children_map = build_tree(ontology_list)
# print_tree(children_map)

In [864]:
level_labels = [list(train_y.columns[1:]), [], [], [], []]

for k in range(0, 4):
    check_labels = level_labels[k]
    label_len = len(check_labels)
    idx_is_subset_of_col = pd.DataFrame(0, index=check_labels, columns=check_labels)
    is_subset = []

    for i in tqdm(range(label_len)):
        for j in range(label_len):
            src_lb, tgt_lb = check_labels[i], check_labels[j]
            src = train_y[train_y[src_lb] == 1]
            tgt = train_y[(train_y[src_lb] == 1) & (train_y[tgt_lb] == 1)]

            idx_is_subset_of_col.loc[src_lb, tgt_lb] = len(src) <= len(tgt)
            if len(src) <= len(tgt) and src_lb != tgt_lb:
                is_subset.append([src_lb, tgt_lb])

    remove_label = set([s[0] for s in is_subset])
    print(f"Level {k}")
    print(is_subset)
    print(remove_label)
    print()
    
    for rl in remove_label:
        level_labels[k].remove(rl)
        level_labels[k+1].append(rl)

100%|██████████| 94/94 [00:15<00:00,  6.24it/s]


Level 0
[['Active_Power_Sensor', 'Electrical_Power_Sensor'], ['Active_Power_Sensor', 'Power_Sensor'], ['Active_Power_Sensor', 'Sensor'], ['Air_Flow_Sensor', 'Flow_Sensor'], ['Air_Flow_Sensor', 'Sensor'], ['Air_Flow_Setpoint', 'Flow_Setpoint'], ['Air_Flow_Setpoint', 'Setpoint'], ['Air_Temperature_Sensor', 'Sensor'], ['Air_Temperature_Sensor', 'Temperature_Sensor'], ['Air_Temperature_Setpoint', 'Setpoint'], ['Air_Temperature_Setpoint', 'Temperature_Setpoint'], ['Angle_Sensor', 'Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Temperature_Sensor'], ['Average_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Chilled_Water_Return_Temperature_Sensor',

100%|██████████| 88/88 [00:13<00:00,  6.63it/s]


Level 1
[['Chilled_Water_Differential_Temperature_Sensor', 'Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Zone_Air_Humidity_Setpoint', 'Humidity_Setpoint'], ['Active_Power_Sensor', 'Power_Sensor'], ['Active_Power_Sensor', 'Electrical_Power_Sensor'], ['Filter_Differential_Pressure_Sensor', 'Pressure_Sensor'], ['Filter_Differential_Pressure_Sensor', 'Differential_Pressure_Sensor'], ['Cooling_Temperature_Setpoint', 'Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Solar_Radiance_Sensor'], ['Cooling_Demand_Sensor', 'Chilled_Water_Differential_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Zone_Air_Humidity_Setpoint'], ['Cooling_Demand_Sensor', 'Active_Power_Sensor'], ['Cooling_Demand_Sensor', 'Damper_Position_Setpoint'], ['Cooling_Demand_Sensor', 'Filter_Differential_Pressure_Sensor'], ['Cooling_Demand_Sensor', 'Cooling_Temperature_Setpoint'], ['Cooling_Demand_Sensor', 'Flow_Setpoint'], ['Cooling_Demand_Sensor', 'Air_Flow_Setpoin

100%|██████████| 53/53 [00:04<00:00, 12.62it/s]


Level 2
[['Return_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Supply_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Chilled_Water_Differential_Temperature_Sensor', 'Water_Temperature_Sensor'], ['Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Filter_Differential_Pressure_Sensor', 'Differential_Pressure_Sensor'], ['Active_Power_Sensor', 'Electrical_Power_Sensor'], ['Cooling_Supply_Air_Temperature_Deadband_Setpoint', 'Cooling_Temperature_Setpoint'], ['Cooling_Supply_Air_Temperature_Deadband_Setpoint', 'Air_Temperature_Setpoint'], ['Warmest_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Warmest_Zone_Air_Temperature_Sensor', 'Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Return_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Electrical_Power_Sensor'], ['Cooling_Demand_Sensor', 'Supply_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Load_Current_Sensor'], ['Cooling_Demand_Sensor', 'Chilled_Water_Differential_Temperature_Sensor'],

100%|██████████| 34/34 [00:01<00:00, 19.41it/s]

Level 3
[['Warmest_Zone_Air_Temperature_Sensor', 'Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Return_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Supply_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Chilled_Water_Differential_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Active_Power_Sensor'], ['Cooling_Demand_Sensor', 'Filter_Differential_Pressure_Sensor'], ['Cooling_Demand_Sensor', 'Cooling_Supply_Air_Temperature_Deadband_Setpoint'], ['Cooling_Demand_Sensor', 'Warmest_Zone_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Hot_Water_Return_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Hot_Water_Supply_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Discharge_Air_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Chilled_Water_Supply_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Chilled_Water_Return_Temperature_Sensor'], ['Cooling_Demand_Sensor', 'Heating_Demand_Sensor'], ['Cooling_Demand_




In [865]:
tiers = {
    1: level_labels[0],
    2: level_labels[1],
    3: level_labels[2],
    4: level_labels[3],
    5: level_labels[4]
}

def get_tier(label):
    for tier_num, tier_list in tiers.items():
        if label in tier_list:
            return tier_num
    return None  # Handle cases where the label isn't found in any tier

def sort_labels(labels):
    return sorted(labels, key=lambda label: (get_tier(label) or float('inf'), label))


In [866]:
sorted_labelhir = [sort_labels(labels) for labels in labelhir]

In [867]:
label_hier = np.array(
    sorted_labelhir,
    dtype=object,
)

In [868]:
padded_label = pd.Series(label_hier).apply(lambda x: x + ['None'] * (5 - len(x)) if len(x) < 5 else x)

In [869]:
# Count Nones at each level
for i in range(5):
    none_count = sum(padded_label.apply(lambda x: x[i] == 'None'))
    print(f"Level {i+1}: {none_count} None values out of {len(padded_label)} total ({none_count/len(padded_label):.2%})")

Level 1: 0 None values out of 31839 total (0.00%)
Level 2: 12321 None values out of 31839 total (38.70%)
Level 3: 20247 None values out of 31839 total (63.59%)
Level 4: 27216 None values out of 31839 total (85.48%)
Level 5: 30936 None values out of 31839 total (97.16%)


In [870]:
# Level 1: 0 None values out of 31839 total (0.00%)
# Level 2: 12321 None values out of 31839 total (38.70%)
# Level 3: 20247 None values out of 31839 total (63.59%)
# Level 4: 27216 None values out of 31839 total (85.48%)
# Level 5: 30936 None values out of 31839 total (97.16%

# Model Training

In [887]:
from typing import List

def train_random_forest(
    train_X: List[pd.DataFrame],
    _label,
    folds,
    model_class,
    params: dict,
    none_ratio_thr: float,
):
    """
    Train random forest models using k-fold cross validation
    
    Args:
        train_X: Training features DataFrame
        _label: Array of labels
        folds: List of dictionaries containing train/val indices
        
    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation predictions)
    """
    classifiers = []
    val_feat_df_list = []

    for f_idx, fold in enumerate(folds):
        # Prepare train and validation data for this fold
        train_X_fold_list = []
        train_y_fold_list = []
        for trn_x in train_X:
            train_X_fold_list.append(trn_x.iloc[fold['train']])
            train_y_fold_list.append(_label[fold['train']])

        train_X_fold = pd.concat(train_X_fold_list)
        train_y_fold = np.concatenate(train_y_fold_list)

        valid_X_fold_list = []
        valid_y_fold_list = []
        for trn_x in train_X:
            valid_X_fold_list.append(trn_x.iloc[fold['val']])
            valid_y_fold_list.append(_label[fold['val']])

        val_X_fold = pd.concat(valid_X_fold_list)
        val_y_fold = np.concatenate(valid_y_fold_list)
        
        # Check the train_y_fold. If more than 30% of samples are labeled "None",
        # randomly sample from the "None" to make that ratio no more than 30%.
        none_mask = (train_y_fold == "None")
        none_count = np.sum(none_mask)
        total_samples = len(train_y_fold)
        none_ratio = none_count / total_samples if total_samples > 0 else 0

        if none_ratio > none_ratio_thr:
            # Calculate how many "None" labels we should keep (30% of total)
            max_none_to_keep = int(none_ratio_thr * (total_samples - none_count))

            # Randomly choose which "None" labels to keep
            none_indices = np.where(none_mask)[0]

            # Fix the random seed before shuffling for reproducibility
            rng = np.random.RandomState(f_idx)
            rng.shuffle(none_indices)
            
            keep_none_indices = none_indices[:max_none_to_keep]

            # Indices of all non-"None" labels
            other_indices = np.where(~none_mask)[0]

            # Combine indices to keep and then sort
            new_indices = np.concatenate([keep_none_indices, other_indices])
            new_indices = np.sort(new_indices)  # Sort so we can index the DataFrame consistently

            # Subset the training data
            train_X_fold = train_X_fold.iloc[new_indices]
            train_y_fold = train_y_fold[new_indices]

            # print(f"Sampled: none-ratio: {none_ratio}, removed: {none_count - max_none_to_keep}")

        print(f"Train size: {len(train_X_fold)}, Valid size: {len(val_X_fold)}")
        
        # Create and train Random Forest model
        model = model_class(**params)
        model.fit(train_X_fold, train_y_fold)
        
        classifiers.append(model)
        
        # Calculate score and save predictions on validation set
        val_preds = model.predict_proba(val_X_fold)
        val_pred_df = pd.DataFrame(data=val_preds, columns=model.classes_)

        val_fold_info = []
        for _f in range(len(train_X)):
            f_info = pd.DataFrame(data=fold['val'], columns=["fold_idx"])
            f_info['dataset_idx'] = _f
            val_fold_info.append(f_info)

        val_fold_idx = pd.concat(val_fold_info)
        
        val_feat_df = pd.concat([
            val_fold_idx.reset_index(drop=True),
            val_X_fold.reset_index(drop=True),
            val_pred_df,
        ], axis=1)

        val_feat_df_list.append(val_feat_df)

    return classifiers, val_feat_df_list

def setup_prev_level_prediction(predictions, fold_num, num_datasets):
    new_train_level_x = pd.concat([predictions[i] for i in range(fold_num)]).sort_values(['dataset_idx', 'fold_idx'])
    return [
        new_train_level_x[new_train_level_x['dataset_idx'] == i] \
            .drop(columns=['dataset_idx', 'fold_idx']) \
            .reset_index(drop=True)
        for i in range(num_datasets)
    ]

### Train the high precision model by allowing None prediction

In [888]:
prec_classifiers = []
prec_val_predictions = []

params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': 8  # Use all available cores
}

model_cls = RandomForestClassifier

none_ratio_thr_list = [0.1, 0.15, 0.35, 0.75, 0.85]

# train_input = [train_X_full, train_X_1, train_X_2]
train_input = train_sets

for i in range(5):
    print(f"Training level {i}")
    _classifiers, _val_predictions = train_random_forest(
        train_input,
        np.array([x[i] for x in padded_label]),
        folds,
        params=params,
        model_class=model_cls,
        none_ratio_thr=none_ratio_thr_list[i]
    )
    prec_classifiers.append(_classifiers)
    prec_val_predictions.append(_val_predictions)

    train_input = setup_prev_level_prediction(_val_predictions, fold_num=len(folds), num_datasets=len(train_input))
    

Training level 0
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286550, Valid size: 31840
Train size: 286560, Valid size: 31830
Training level 1
Train size: 202009, Valid size: 31840
Train size: 201951, Valid size: 31840
Train size: 202009, Valid size: 31840
Train size: 201974, Valid size: 31840
Train size: 201997, Valid size: 31840
Train size: 201997, Valid size: 31840
Train size: 202066, Valid size: 31840
Train size: 202020, Valid size: 31840
Train size: 202055, Valid size: 31840
Train size: 202032, Valid size: 31830
Training level 2
Train size: 140818, Valid size: 31840
Train size: 140994, Valid size: 31840
Train size: 141142, Valid size: 31840
Train size: 140791, Valid size: 31840
Train size: 140940, Valid size: 31840

In [934]:
cascade_train_set = []
fold_num = len(folds)

for _level_idx in tqdm(range(5)):
    new_train_level_x = pd.concat([prec_val_predictions[_level_idx][i] for i in range(fold_num)]).sort_values(['dataset_idx', 'fold_idx'])
    cascade_train_set.append([
        new_train_level_x[new_train_level_x['dataset_idx'] == i] \
            # .drop(columns=['dataset_idx', 'fold_idx']) \
            .drop(columns=pre_feature_list) \
            .reset_index(drop=True)
        for i in range(len(train_sets))
    ])

100%|██████████| 5/5 [00:06<00:00,  1.22s/it]


In [None]:
from sklearn.svm import SVC

ensemble_classifiers = []
ensemble_val_predictions = []

params = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': 8  # Use all available cores
}

model_cls = SVC

none_ratio_thr_list = [0.1, 0.15, 0.35, 0.75, 0.85]

# train_input = [train_X_full, train_X_1, train_X_2]
train_input = train_sets

for i in range(5):
    print(f"Training level {i}")
    _classifiers, _val_predictions = train_random_forest(
        train_input,
        np.array([x[i] for x in padded_label]),
        folds,
        params=params,
        model_class=model_cls,
        none_ratio_thr=none_ratio_thr_list[i]
    )
    ensemble_classifiers.append(_classifiers)
    ensemble_val_predictions.append(_val_predictions)

    train_input = setup_prev_level_prediction(_val_predictions, fold_num=len(folds), num_datasets=len(train_input))
    

In [936]:
cascade_train_set[4][0]

Unnamed: 0,fold_idx,dataset_idx,Alarm,Command,Parameter,Sensor,Setpoint,Status,Angle_Sensor,Cooling_Demand_Setpoint,...,Chilled_Water_Return_Temperature_Sensor,Cooling_Supply_Air_Temperature_Deadband_Setpoint,Differential_Supply_Return_Water_Temperature_Sensor,Heating_Supply_Air_Temperature_Deadband_Setpoint,Hot_Water_Return_Temperature_Sensor,Low_Outside_Air_Temperature_Enable_Setpoint,None,Outside_Air_Lockout_Temperature_Setpoint,Peak_Power_Demand_Sensor,Warmest_Zone_Air_Temperature_Sensor
0,0,0,0.00,0.0,1.00,0.00,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,1.00,0.0,0.00,0.0
1,1,0,0.00,0.0,0.30,0.00,0.70,0.00,0.0,0.0,...,0.0,0.01,0.00,0.0,0.0,0.0,0.99,0.0,0.00,0.0
2,2,0,0.00,0.0,0.00,0.99,0.01,0.00,0.0,0.0,...,0.0,0.00,0.01,0.0,0.0,0.0,0.98,0.0,0.01,0.0
3,3,0,0.00,0.0,0.00,1.00,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.99,0.0,0.01,0.0
4,4,0,0.00,0.0,0.03,0.03,0.90,0.04,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.98,0.0,0.02,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31834,31834,0,0.00,0.0,0.00,1.00,0.00,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,1.00,0.0,0.00,0.0
31835,31835,0,0.09,0.0,0.00,0.05,0.00,0.86,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,1.00,0.0,0.00,0.0
31836,31836,0,0.00,0.0,0.26,0.50,0.24,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,1.00,0.0,0.00,0.0
31837,31837,0,0.00,0.0,0.00,0.00,1.00,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,1.00,0.0,0.00,0.0


: 

In [None]:
import copy

cascade_train_level_agg = []
for _level in tqdm(range(5)):
    _level_feat = copy.deepcopy(level_labels[_level])
    if _level > 0:
        _level_feat = _level_feat + ["None"]

    _level_res = pd.concat(cascade_train_set[_level][_level_feat], axis=1).groupby(level=0, axis=1).mean()
    assert not _level_res.isna().values.any()

    for col in _level_res.columns:
        _level_res = _level_res.rename(columns={col: f"{col}_{_level}"})

    cascade_train_level_agg.append(_level_res)

Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Average power,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,...,time_total,time_event_density,time_entropy,time_slope,Alarm,Command,Parameter,Sensor,Setpoint,Status
0,3.579300e+06,2.002655e+04,17.0,5.326999e+03,336.834861,30.0,30.0,30.00,30.00,0.000250,...,2418900.499,0.001655,11.910432,605.860232,0.00,0.0,1.00,0.00,0.00,0.00
1,4.051012e+06,3.009676e+04,16.0,3.014589e+03,672.251192,22.5,22.5,22.50,22.50,0.000125,...,4837688.953,0.001659,12.930601,603.250804,0.00,0.0,0.30,0.00,0.70,0.00
2,6.988878e+07,6.261729e+04,32.0,1.040152e+05,371.619154,805.0,3220.0,31.36,131.52,0.000248,...,2418874.283,0.001664,11.965989,601.617578,0.00,0.0,0.00,0.99,0.01,0.00
3,1.183314e+11,5.159254e+06,1.0,8.805016e+07,706.907352,3834.0,3834.0,3834.00,3834.00,0.000124,...,4838073.366,0.001665,12.970361,600.376320,0.00,0.0,0.00,1.00,0.00,0.00
4,4.404025e+08,3.141691e+05,2.0,3.292679e+05,672.200909,234.0,234.0,234.00,234.00,0.000124,...,4837577.658,0.001664,12.972059,600.985625,0.00,0.0,0.03,0.03,0.90,0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31834,3.301126e+17,6.064403e+09,923.0,4.913830e+14,365.597667,804.0,3219.0,8306232.00,9759328.00,0.000249,...,2418491.301,0.001664,11.969393,601.652408,0.00,0.0,0.00,1.00,0.00,0.00
31835,4.013000e+03,6.705734e+02,2.0,5.972521e+00,335.720320,1.0,1.0,1.00,1.00,0.000249,...,2418877.932,0.001662,11.963712,601.982900,0.09,0.0,0.00,0.05,0.00,0.86
31836,2.020000e+07,3.358332e+04,1.0,6.014890e+04,167.916622,100.0,100.0,100.00,100.00,0.000495,...,1208999.676,0.001671,10.978559,598.986246,0.00,0.0,0.26,0.50,0.24,0.00
31837,1.827000e+08,1.007500e+05,1.0,5.440197e+05,167.916709,300.0,300.0,300.00,300.00,0.000493,...,1209000.308,0.001679,10.979803,595.854442,0.00,0.0,0.00,0.00,1.00,0.00


In [890]:
cliped_test_X = np.clip(test_X, a_min=None, a_max=np.finfo(np.float32).max)

In [891]:
# def make_predictions_with_models(classifiers, test_data):
#     """
#     Make probability predictions using multiple classifier models
    
#     Args:
#         classifiers: List of trained classifier models
#         test_data: Test data to make predictions on
        
#     Returns:
#         List of probability predictions from each classifier
#     """
#     test_preds_all = []
#     for clf in tqdm(classifiers):
#         pred = clf.predict_proba(test_data)
#         test_preds_all.append(pred)
#     return test_preds_all

In [892]:
# def align_and_combine_predictions(classifiers, test_preds_all, test_data, threshold=0.0):
#     """
#     Aligns predictions from multiple classifiers and combines them through averaging
    
#     Args:
#         classifiers: List of trained classifier models
#         test_preds_all: List of probability predictions from each classifier
#         test_data: Test data used for predictions
#         threshold: Minimum probability threshold for making predictions
        
#     Returns:
#         Final class predictions after aligning and combining probabilities
#     """
#     # Get the common classes across all classifiers
#     all_classes = classifiers[0].classes_
#     test_preds_aligned = []

#     # Make predictions with each fold's model and align them 
#     for i, clf in tqdm(enumerate(classifiers)):
#         pred = test_preds_all[i]
#         # Create a mapping to align predictions with common classes
#         pred_dict = {_cls: idx for idx, _cls in enumerate(clf.classes_)}
#         aligned_pred = np.zeros((len(test_data), len(all_classes)))
        
#         for i, _cls in enumerate(all_classes):
#             if _cls in pred_dict:
#                 aligned_pred[:, i] = pred[:, pred_dict[_cls]]
        
#         test_preds_aligned.append(aligned_pred)

#     # Stack and average the aligned predictions
#     test_preds_all = np.stack(test_preds_aligned)
#     test_preds_proba = test_preds_all.mean(axis=0)

#     # Get max probabilities for each prediction
#     max_probs = np.max(test_preds_proba, axis=1)
    
#     # Convert probabilities to class predictions, using threshold
#     test_preds = np.array(['None'] * len(test_data), dtype=object)
#     confident_mask = max_probs >= threshold
#     test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]
    
#     return test_preds

In [893]:
def make_predictions_with_models(classifiers, test_data):
    test_preds_all = []
    for clf in tqdm(classifiers):
        pred = clf.predict_proba(test_data)
        test_preds_all.append(pd.DataFrame(data=pred, columns=clf.classes_))
    return test_preds_all

In [894]:
test_preds_list = []
test_input = cliped_test_X
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models(prec_classifiers[i], test_input)

    _level_res = pd.concat(test_preds_all, axis=1).groupby(level=0, axis=1).mean()
    test_input = pd.concat([test_input, _level_res], axis=1)
    
    test_preds_list.append(test_preds_all)
    # test_preds_list.append(align_and_combine_predictions(prec_classifiers[i], test_preds_all, cliped_text_X))

Predicting level 0


100%|██████████| 10/10 [00:23<00:00,  2.38s/it]


Predicting level 1


100%|██████████| 10/10 [00:45<00:00,  4.56s/it]


Predicting level 2


100%|██████████| 10/10 [00:30<00:00,  3.08s/it]


Predicting level 3


100%|██████████| 10/10 [00:31<00:00,  3.11s/it]


Predicting level 4


100%|██████████| 10/10 [00:17<00:00,  1.78s/it]


In [895]:
test_level_agg = []
for _level in tqdm(range(5)):
    _level_res = pd.concat(test_preds_list[_level], axis=1).groupby(level=0, axis=1).mean()
    assert not _level_res.isna().values.any()

    for col in _level_res.columns:
        _level_res = _level_res.rename(columns={col: f"{col}_{_level}"})

    test_level_agg.append(_level_res)

100%|██████████| 5/5 [00:15<00:00,  3.12s/it]


In [896]:
stacked = np.stack(
    test_level_agg[i].idxmax(axis=1).apply(lambda x: x[:-2])
    for i in range(5)
).transpose()

for row in tqdm(stacked):
    # Find first occurrence of 'None' if any
    none_idx = np.where(row == 'None')[0]
    if len(none_idx) > 0:
        # Set all elements after first None to None
        first_none = none_idx[0]
        row[first_none:] = 'None'
        
stacked

100%|██████████| 315720/315720 [00:02<00:00, 134189.17it/s]


array([['Sensor', 'Power_Sensor', 'None', 'None', 'None'],
       ['Sensor', 'Flow_Sensor', 'Water_Flow_Sensor',
        'Chilled_Water_Supply_Flow_Sensor', 'None'],
       ['Setpoint', 'Temperature_Setpoint', 'Air_Temperature_Setpoint',
        'Outside_Air_Temperature_Setpoint',
        'Outside_Air_Lockout_Temperature_Setpoint'],
       ...,
       ['Sensor', 'Position_Sensor', 'None', 'None', 'None'],
       ['Sensor', 'Position_Sensor', 'None', 'None', 'None'],
       ['Alarm', 'None', 'None', 'None', 'None']], dtype=object)

In [913]:
pd.Series(list(stacked))

0                  [Sensor, Power_Sensor, None, None, None]
1         [Sensor, Flow_Sensor, Water_Flow_Sensor, Chill...
2         [Setpoint, Temperature_Setpoint, Air_Temperatu...
3               [Sensor, Position_Sensor, None, None, None]
4                          [Status, None, None, None, None]
                                ...                        
315715              [Sensor, Flow_Sensor, None, None, None]
315716    [Sensor, Current_Sensor, Load_Current_Sensor, ...
315717          [Sensor, Position_Sensor, None, None, None]
315718          [Sensor, Position_Sensor, None, None, None]
315719                      [Alarm, None, None, None, None]
Length: 315720, dtype: object

In [6]:
columnlist = ['Active_Power_Sensor', 'Air_Flow_Sensor',
       'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
       'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
       'Average_Zone_Air_Temperature_Sensor',
       'Chilled_Water_Differential_Temperature_Sensor',
       'Chilled_Water_Return_Temperature_Sensor',
       'Chilled_Water_Supply_Flow_Sensor',
       'Chilled_Water_Supply_Temperature_Sensor', 'Command',
       'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
       'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
       'Cooling_Temperature_Setpoint', 'Current_Sensor',
       'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
       'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
       'Differential_Pressure_Setpoint',
       'Differential_Supply_Return_Water_Temperature_Sensor',
       'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
       'Discharge_Air_Temperature_Setpoint',
       'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
       'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
       'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
       'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
       'Heating_Supply_Air_Temperature_Deadband_Setpoint',
       'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
       'Hot_Water_Return_Temperature_Sensor',
       'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
       'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
       'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
       'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
       'Outside_Air_Humidity_Sensor',
       'Outside_Air_Lockout_Temperature_Setpoint',
       'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
       'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
       'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
       'Reactive_Power_Sensor', 'Reset_Setpoint',
       'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
       'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
       'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
       'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
       'Supply_Air_Static_Pressure_Sensor',
       'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
       'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
       'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
       'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
       'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
       'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
       'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
       'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
       'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [4]:
zipftest = ZipFile('../downloads/test_X_v0.1.0.zip', 'r')
listtestfile = zipftest.namelist()[1:]

In [899]:
stackedfinalresult = pd.DataFrame(columns=['filename'])
stackedfinalresult['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

for labelname in columnlist:
    stackedfinalresult[labelname] = 0

test_preds = stacked
for i in tqdm(range(len(test_preds))):
    # stackedfinalresult.loc[i, test_preds[i]] = 1
    predlist = test_preds[i].tolist()
    predlist = [x for x in predlist if x != 'None']
    for predlabelname in predlist:
    	stackedfinalresult.loc[i, predlabelname] = 1

100%|██████████| 315720/315720 [01:05<00:00, 4795.05it/s]


In [900]:
stackedfinalresult = stackedfinalresult.assign(**{col: stackedfinalresult[col].astype(float) for col in stackedfinalresult.columns if col != "filename"})

In [901]:
import copy

def prepare_final_results_thr(df, filename_col, thr=None):
    res = df.copy()

    if thr != None:
        print(f"Applying threshold {thr}")
        res = (res >= thr).astype(float)

    res['Cooling_Demand_Sensor'] = 0.0
    res['Heating_Demand_Sensor'] = 0.0
    
    res = pd.concat([filename_col, res], axis=1)

    return res

tqdm.pandas()

def prepare_final_results_level(df, filename_col, level_labels):
    _level_label_tiers = copy.deepcopy(level_labels)
    res = df.copy()

    res['Cooling_Demand_Sensor'] = 0.0
    res['Heating_Demand_Sensor'] = 0.0
    
    level_res = []
    curr_none = pd.Series([0.0] * len(df))

    for gid, group_of_columns in enumerate(_level_label_tiers):
        if gid > 0:
            group_of_columns.append(f"None_{gid}")
            
        # Subset the columns that belong to this level
        subset = res[group_of_columns]
        
        # Find the maximum value in each row
        row_max = subset.max(axis=1)
        
        # Compare each column's value to the row's maximum
        # If equal, set to 1.0, otherwise 0.0
        # (We use np.isclose or == depending on data numeric precision.)
        subset_bin = subset.apply(lambda col: ((col == row_max) & (col >= 0.01)).astype(float), axis=0)

        if gid > 0:
            curr_none = curr_none + subset_bin[f"None_{gid}"]
            print(f"Num prediction in level {gid}: {len(curr_none[curr_none == 0])}")
            
        subset_bin = subset_bin.apply(lambda col: ((col != 0) & (col + curr_none == col)).astype(float), axis=0)

        # collect them into a list if you want to concatenate later
        level_res.append(subset_bin)
    
    res = pd.concat(level_res, axis=1)
    
    res = pd.concat([filename_col, res], axis=1)

    res = res.drop(columns=[f"None_{i}" for i in range(1,5)])

    return res

In [902]:
# final_res = pd.concat(test_level_agg, axis=1).groupby(level=0, axis=1).mean()
# final_res = prepare_final_results_level(final_res, filename_col, level_labels=level_labels)

In [903]:
def check_pred_num(_final_res, thr=0.4):
    # Exclude 'filename' column if it exists
    filtered_df = _final_res.drop(columns=['filename'], errors='ignore')

    return (filtered_df >= thr).sum(axis=1)

In [904]:
"""  
1    115878
2    108085
3     61757
4     23471
5      6523
6         5
7         1

1    117837
2     86889
3     65924
4     34575
5     10495

1    116064
2     92604
3     62640
4     32632
5     11780
"""
check_pred_num(stackedfinalresult, thr=0.35).value_counts()

1    115818
2     89293
3     67651
4     32487
5     10471
Name: count, dtype: int64

In [905]:
stackedfinalresult.to_csv("../logs/submit/0123_baseline_dev_original_prev_level_full_aug.csv", index=False)

In [909]:
stackedfinalresult

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X20367.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [1]:
import numpy as np

pred_idx = np.load("../logs/0127_xgb_base.npy")

In [7]:
pred_res = np.zeros((len(listtestfile), len(columnlist)))
pred_res[pred_idx[0], pred_idx[1]] = 1.0

In [8]:
semi_pred_filename = pd.DataFrame(columns=['filename'])
semi_pred_filename['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

semi_pred_res = pd.DataFrame(data=pred_res, columns=columnlist)

semi_pred_res = pd.concat([semi_pred_filename, semi_pred_res], axis=1)

In [9]:
semi_pred_res.to_csv("../logs/submit/0124_semi_supervise_balanced_thr090.csv", index=False)

In [34]:
semi_pred_res

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X20367.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_X103084.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_X6910.pkl,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_X66332.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,test_X38528.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X325790.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315716,test_X61444.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315717,test_X221284.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315718,test_X115827.pkl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
