In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

In [2]:
# Output model file name (.pkl)
model_name = 'model_10012025_featsel_aug.pkl'

In [3]:
# Output submission file name (.csv)
submission_name = "hier_lgbrfvote_improve_tier_10_fold_neg_sample_thr_0110_featsel_aug_leo.csv"

In [29]:
# Input locations (relative paths): directory of training feature CSVs,
# the training label CSV, and the test feature CSV.
train_dir = "../downloads/train_data_features_v3_fixed"
train_y_path = "../downloads/train_y_v0.1.0.csv"
test_feat_path = "../downloads/test_features_full_v3.csv"

# Data split

In [94]:
def create_folds(train_y, n_splits=10):
    """Split the rows of ``train_y`` into stratified cross-validation folds.

    Each row's complete label vector (every column except ``filename``) is
    converted to a string, and that string is the class used for
    stratification, so rows with the same label combination are spread
    across the folds.

    Args:
        train_y: DataFrame of labels; a ``filename`` column is excluded from
            the stratification key.
        n_splits: Number of folds to generate.

    Returns:
        A list with one dict per fold, holding the positional row indices
        under the keys ``'train'`` and ``'val'``.
    """
    # Stratification key: the stringified label vector of every row
    stratify_labels = [
        str(row[train_y.columns != 'filename'].values)
        for _, row in tqdm(train_y.iterrows(), total=len(train_y))
    ]

    # Shuffled splitter with a fixed seed so the folds are reproducible
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # One {'train', 'val'} index pair per generated split
    return [
        {'train': train_idx, 'val': val_idx}
        for train_idx, val_idx in skf.split(train_y, stratify_labels)
    ]

In [89]:
# Hand-written list of feature column names: "0_*" signal statistics
# (presumably TSFEL output — confirm) plus value_*/time_* summary features.
# NOTE: `feature_list` is reassigned further down from the columns of the first
# training CSV, so this literal is superseded when the cells run top to bottom.
feature_list = ['0_Absolute energy',
 '0_Area under the curve',
 '0_Autocorrelation',
 '0_Average power',
 '0_Centroid',
 '0_ECDF Percentile Count_0',
 '0_ECDF Percentile Count_1',
 '0_ECDF Percentile_0',
 '0_ECDF Percentile_1',
 '0_ECDF_0',
 '0_ECDF_1',
 '0_ECDF_2',
 '0_ECDF_3',
 '0_ECDF_4',
 '0_ECDF_5',
 '0_ECDF_6',
 '0_ECDF_7',
 '0_ECDF_8',
 '0_ECDF_9',
 '0_Entropy',
 '0_Histogram mode',
 '0_Interquartile range',
 '0_Kurtosis',
 '0_Max',
 '0_Mean',
 '0_Mean absolute deviation',
 '0_Mean absolute diff',
 '0_Mean diff',
 '0_Median',
 '0_Median absolute deviation',
 '0_Median absolute diff',
 '0_Median diff',
 '0_Min',
 '0_Negative turning points',
 '0_Neighbourhood peaks',
 '0_Peak to peak distance',
 '0_Positive turning points',
 '0_Root mean square',
 '0_Signal distance',
 '0_Skewness',
 '0_Slope',
 '0_Standard deviation',
 '0_Sum absolute diff',
 '0_Variance',
 '0_Zero crossing rate',
 '0_Fundamental frequency',
 '0_Human range energy',
 '0_Max power spectrum',
 '0_Maximum frequency',
 '0_Median frequency',
 '0_Power bandwidth',
 '0_Wavelet entropy',
 'value_median',
 'value_mean',
 'value_qmean',
 'value_max',
 'value_min',
 'value_maxmin',
 'value_diffmax',
 'value_diffmin',
 'value_diffmean',
 'value_diffqmean',
 'value_diffmedian',
 'value_diffmaxmin',
 'time_diffmean',
 'time_diffqmean',
 'time_diffmax',
 'time_diffmin',
 'time_diffmedian',
 'value_std',
 'value_var',
 'value_diffstd',
 'value_diffvar',
 'time_diffstd',
 'time_diffvar',
 'time_burstiness',
 'time_total',
 'time_event_density',
 'time_entropy']

In [90]:
# Build the full path of every entry in train_dir (sorted by name) and keep
# only the first six; the bare name on the last line displays the result.
train_files = [os.path.join(train_dir, f) for f in sorted(os.listdir(train_dir))]
train_files = train_files[:6]
train_files

['../downloads/train_data_features_v3_fixed/train_features_full_v3.csv',
 '../downloads/train_data_features_v3_fixed/train_features_split1_2_v3.csv',
 '../downloads/train_data_features_v3_fixed/train_features_split1_3_v3.csv',
 '../downloads/train_data_features_v3_fixed/train_features_split1_4_v3.csv',
 '../downloads/train_data_features_v3_fixed/train_features_split2_2_v3.csv',
 '../downloads/train_data_features_v3_fixed/train_features_split2_3_v3.csv']

In [91]:
# Read every selected training CSV into its own DataFrame.
train_sets = [pd.read_csv(f) for f in train_files]

# Rebuild feature_list from the columns of the first CSV (this replaces any
# earlier value of feature_list), excluding every column whose name contains
# "LPCC" or "MFCC".
feature_list = train_sets[0].columns.tolist()
feature_list = [item for item in feature_list if "LPCC" not in item]
feature_list = [item for item in feature_list if "MFCC" not in item]

# Restrict every training table to the same column selection.
train_sets = [_df[feature_list] for _df in train_sets]

In [92]:
train_y = pd.read_csv(train_y_path)

In [95]:
# Build 5 stratified folds over the rows of train_y (create_folds defaults to 10).
folds = create_folds(train_y, n_splits=5)

100%|██████████| 31839/31839 [00:15<00:00, 2094.77it/s]


# Prepare features

In [96]:
# Load the test features and keep the columns named in feature_list.
# NOTE(review): feature_list comes from the training CSV columns and appears to
# still contain "Unnamed: 0" here (train_X drops that column separately), so
# test_X would carry one column that train_X does not — confirm this matches
# what the models were trained on.
test_X = pd.read_csv(test_feat_path)[feature_list]

In [97]:
# Stack all loaded training tables row-wise (fresh 0..n-1 index) and drop the
# "Unnamed: 0" column.
# NOTE(review): the CV folds are built from the rows of train_y and are used as
# positional indices into train_X; if the concatenated files hold more rows
# than train_y, rows beyond len(train_y) are never selected — confirm the
# intended alignment between train_X and the labels.
train_X = pd.concat(train_sets, ignore_index=True).drop(columns=["Unnamed: 0"])

# Prepare labels

In [98]:
def get_active_labels_np(row):
    """Return the index labels of ``row`` whose value equals 1.

    Args:
        row: A pandas Series (for example one row of a label table).

    Returns:
        A list with the index entries at the positions where the value is 1,
        in their original order.
    """
    # Positions of the entries equal to 1, then the index labels at those positions
    hit_positions = np.flatnonzero(row.to_numpy() == 1)
    return row.index[hit_positions].tolist()

# One list per row of train_y with the names of the columns whose value is 1.
labelhir = train_y.apply(get_active_labels_np, axis=1).tolist()

In [99]:
# Label names used to build the tier hierarchy: every column of train_y after the first.
ontology_list = list(train_y.columns[1:])

from collections import defaultdict

def build_tree(onto):
    """
    Arrange the terms into a tree in which every term has at most one parent.

    A term's parent is the longest previously processed term that occurs as a
    substring of it; terms without such a match are attached to the root
    (parent ``None``). Terms are processed from shortest to longest.

    Returns:
        tuple: (parent_map, children_map) where parent_map maps term -> parent
        and children_map maps parent -> list of children.
    """
    parent_map = {}                   # term -> parent
    children_map = defaultdict(list)  # parent -> [children]
    seen = []

    # Shortest terms first, so broader terms exist before their descendants
    for term in sorted(onto, key=len):
        candidates = [earlier for earlier in seen if earlier in term]
        # Longest matching candidate is the closest ancestor; None marks a root term
        parent = max(candidates, key=len) if candidates else None

        parent_map[term] = parent
        children_map[parent].append(term)
        seen.append(term)

    return parent_map, children_map

# Rebuilt hierarchical labels: level_labels[d] collects the terms found at depth d
level_labels = [[] for _ in range(5)]

def print_tree(children_map, root=None, depth=0):
    """
    Walk the tree depth-first and record every term in ``level_labels``.

    ``root=None`` starts at the top-level terms, which are recorded at
    ``depth``; each term's children are recorded one level deeper. Children
    are visited in sorted order. Nothing is printed.
    """
    if root is not None:
        # Record this term at its depth; its children live one level below
        level_labels[depth].append(root)
        depth = depth + 1

    for child in sorted(children_map[root]):
        print_tree(children_map, child, depth)

# Build the parent/child maps from the label names, then walk the tree to
# fill level_labels with the terms found at each depth.
parent_map, children_map = build_tree(ontology_list)
print_tree(children_map)

In [100]:
# Tier number (1-5) -> list of labels collected at that depth of the tree
tiers = {tier: level_labels[tier - 1] for tier in range(1, 6)}

def get_tier(label):
    """Return the number of the first tier whose label list contains ``label``, or None."""
    return next(
        (tier_num for tier_num, members in tiers.items() if label in members),
        None,  # label not present in any tier
    )

def sort_labels(labels):
    """Return the labels sorted by tier number, then alphabetically; labels without a tier go last."""
    def _rank(label):
        # A missing tier (None) is ranked after every real tier
        return (get_tier(label) or float('inf'), label)

    return sorted(labels, key=_rank)


In [101]:
sorted_labelhir = [sort_labels(labels) for labels in labelhir]

In [102]:
label_hier = np.array(
    sorted_labelhir,
    dtype=object,
)

In [103]:
len(label_hier)

31839

In [104]:
# Last entry of each row's sorted label list (raises IndexError for a row with no labels).
_label = np.array([x[-1] for x in label_hier])

In [105]:
len(_label)

31839

In [106]:
# Right-pad every label list with the string 'None' up to length 5; longer lists are left as they are.
# NOTE(review): padding is positional — slot i holds the i-th label after tier
# sorting, which equals tier i+1 only if each row has exactly one label per
# tier with no gaps; confirm this holds for the data.
padded_label = pd.Series(label_hier).apply(lambda x: x + ['None'] * (5 - len(x)) if len(x) < 5 else x)

In [107]:
len(padded_label)

31839

In [108]:
# For each of the five label slots, report how many rows hold the 'None' padding
for i in range(5):
    none_count = sum(labels[i] == 'None' for labels in padded_label)
    print(f"Level {i+1}: {none_count} None values out of {len(padded_label)} total ({none_count/len(padded_label):.2%})")

Level 1: 0 None values out of 31839 total (0.00%)
Level 2: 12321 None values out of 31839 total (38.70%)
Level 3: 20247 None values out of 31839 total (63.59%)
Level 4: 27216 None values out of 31839 total (85.48%)
Level 5: 30936 None values out of 31839 total (97.16%)


# Model Training

In [67]:
def train_lgbm_classifier(train_X, _label, folds, drop_none=False, max_none_ratio=0.4):
    """
    Train one LightGBM classifier per cross-validation fold.

    For every fold the function optionally drops "None"-labelled samples,
    down-samples the "None" class of the training split when it dominates,
    fits an ``LGBMClassifier`` and scores it (plain accuracy) on the
    validation split.

    Args:
        train_X: Training features DataFrame, indexed positionally by the fold indices
        _label: NumPy array of labels, indexed positionally by the fold indices
        folds: List of dictionaries containing train/val indices
        drop_none: Whether to drop samples with "None" labels from both splits
        max_none_ratio: When the share of "None" labels in a training split
            exceeds this value, only ``int(max_none_ratio * n_other)`` randomly
            chosen "None" samples are kept, where ``n_other`` is the number of
            non-"None" samples in that split

    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation
        predictions). Each validation entry is a dict with 'true_labels',
        'predicted_labels' and 'fold_indices'; the three are aligned row for
        row, also when ``drop_none`` removed validation samples.
    """
    classifiers = []
    scores = []
    val_predictions = []  # List to store validation predictions

    # Define LightGBM parameters
    params = {
        'verbose': -1,
        'n_estimators': 100,
        'learning_rate': 0.1,
        'random_state': 42,
        'n_jobs': 8,  # Fixed at 8 worker threads
        'objective': 'multiclass',
        'num_class': len(set(_label))  # Number of distinct labels in the full label array
    }

    for f_idx, fold in enumerate(folds):
        # Prepare train and validation data for this fold
        train_X_fold = train_X.iloc[fold['train']]
        train_y_fold = _label[fold['train']]
        val_X_fold = train_X.iloc[fold['val']]
        val_y_fold = _label[fold['val']]
        val_indices = fold['val']

        if drop_none:
            train_mask = train_y_fold != "None"
            val_mask = val_y_fold != "None"

            # Count the dropped rows before filtering; afterwards the lengths
            # equal the mask sums and the difference would always be zero
            dropped_train = len(train_mask) - int(np.sum(train_mask))
            dropped_val = len(val_mask) - int(np.sum(val_mask))

            # Remove samples with "None" labels from training set
            train_X_fold = train_X_fold[train_mask]
            train_y_fold = train_y_fold[train_mask]

            # Remove samples with "None" labels from validation set, keeping
            # the reported indices aligned with the remaining labels
            val_X_fold = val_X_fold[val_mask]
            val_y_fold = val_y_fold[val_mask]
            val_indices = np.asarray(fold['val'])[val_mask]
            print(f"Dropped train: {dropped_train}, val: {dropped_val}")

        # If "None" makes up more than max_none_ratio of the training labels,
        # randomly keep only a limited number of "None" samples.
        none_mask = (train_y_fold == "None")
        none_count = np.sum(none_mask)
        total_samples = len(train_y_fold)
        none_ratio = none_count / total_samples if total_samples > 0 else 0

        if none_ratio > max_none_ratio:
            # Number of "None" samples to keep, relative to the non-"None" count
            max_none_to_keep = int(max_none_ratio * (total_samples - none_count))

            # Shuffle the "None" positions with a per-fold seed for reproducibility
            none_indices = np.where(none_mask)[0]
            rng = np.random.RandomState(f_idx)
            rng.shuffle(none_indices)
            keep_none_indices = none_indices[:max_none_to_keep]

            # Positions of all non-"None" labels
            other_indices = np.where(~none_mask)[0]

            # Sorted so features and labels are subset in the same, original order
            new_indices = np.sort(np.concatenate([keep_none_indices, other_indices]))

            # Subset the training data
            train_X_fold = train_X_fold.iloc[new_indices]
            train_y_fold = train_y_fold[new_indices]

            removed = none_count - len(keep_none_indices)
            print(f"Sampled: none-ratio: {none_ratio}, removed: {removed}")

        # Create and train LightGBM model
        model = LGBMClassifier(**params)
        model.fit(train_X_fold, train_y_fold)

        classifiers.append(model)

        # Calculate accuracy and save predictions on validation set
        val_preds = model.predict(val_X_fold)
        score = np.mean(val_preds == val_y_fold)
        scores.append(score)
        val_predictions.append({
            'true_labels': val_y_fold,
            'predicted_labels': val_preds,
            'fold_indices': val_indices
        })
        print(f"Fold score: {score:.4f}")

    print(f"Average score: {np.mean(scores)}")
    return classifiers, scores, val_predictions

### Train the high precision model by allowing None prediction

In [79]:
# Per-level results: entry i holds the per-fold classifiers / scores /
# validation predictions returned for label slot i.
prec_lgbm_classifiers = []
prec_scores = []
prec_lgbm_val_predictions = []

# Train one set of fold models per label slot; "None" padding is kept as a
# regular class (drop_none=False).
for i in range(5):
    print(f"Training level {i}")
    _classifiers, _scores, _val_predictions = train_lgbm_classifier(train_X, np.array([x[i] for x in padded_label]), folds, drop_none=False)
    prec_lgbm_classifiers.append(_classifiers)
    prec_scores.append(_scores)
    prec_lgbm_val_predictions.append(_val_predictions)

Training level 0
Fold score: 0.8278
Fold score: 0.8268
Fold score: 0.8282
Fold score: 0.8298
Fold score: 0.8291
Average score: 0.8283457395018686
Training level 1
Fold score: 0.2406
Fold score: 0.4173
Fold score: 0.2397


KeyboardInterrupt: 

In [None]:
# Clamp the test features to the finite float32 range.
max_value = np.finfo(np.float32).max
cliped_text_X = np.clip(test_X, -max_value, max_value)

# NOTE(review): np.clip has already mapped +/-inf to +/-max_value, so the
# replace below finds no infinities left and they end up as +/-float32 max
# rather than 0.0 — confirm which of the two is intended.
cliped_text_X = cliped_text_X.replace([np.inf, -np.inf], 0.0)  # Replace infinity
cliped_text_X = cliped_text_X.fillna(0.0)  # Replace NaN

# Sanity checks on the cleaned table
print("Checking for NaN values:", np.any(np.isnan(cliped_text_X)))
print("Checking for infinite values:", np.any(np.isinf(cliped_text_X)))
print("Maximum value:", np.max(cliped_text_X))
print("Minimum value:", np.min(cliped_text_X))

In [None]:
def make_predictions_with_models(classifiers, test_data):
    """
    Collect the class-probability output of every classifier for the same data.

    Args:
        classifiers: Iterable of fitted models exposing ``predict_proba``
        test_data: Data passed unchanged to each model's ``predict_proba``

    Returns:
        List with one ``predict_proba`` result per classifier, in input order
    """
    # tqdm only adds a progress bar around the iteration
    return [clf.predict_proba(test_data) for clf in tqdm(classifiers)]

In [None]:
def align_and_combine_predictions(classifiers, test_preds_all, test_data, threshold=0.0):
    """
    Aligns predictions from multiple classifiers and combines them through averaging

    Each classifier's probability columns are mapped onto one shared class
    list (the sorted union of every classifier's ``classes_``); a class a
    classifier never saw contributes probability 0 for that classifier. The
    aligned matrices are averaged and the most probable class is returned per
    row, or the string 'None' when its averaged probability is below
    ``threshold``.

    Args:
        classifiers: List of trained classifier models (each exposing ``classes_``)
        test_preds_all: List of probability predictions, one per classifier,
            with columns ordered like that classifier's ``classes_``
        test_data: Test data used for predictions (only its length is used)
        threshold: Minimum probability threshold for making predictions

    Returns:
        Object array with the final class prediction for every row

    Raises:
        ValueError: If ``classifiers`` is empty
    """
    if len(classifiers) == 0:
        raise ValueError("classifiers must contain at least one fitted model")

    # Shared class list: union over all folds, so a class missing from the
    # first fold's model is not silently discarded
    all_classes = np.unique(np.concatenate([np.asarray(clf.classes_) for clf in classifiers]))
    class_pos = {_cls: col for col, _cls in enumerate(all_classes)}

    # Re-order each fold's probability columns onto the shared class list
    test_preds_aligned = []
    for fold_idx, clf in tqdm(enumerate(classifiers), total=len(classifiers)):
        pred = np.asarray(test_preds_all[fold_idx])
        aligned_pred = np.zeros((len(test_data), len(all_classes)))

        for src_col, _cls in enumerate(clf.classes_):
            aligned_pred[:, class_pos[_cls]] = pred[:, src_col]

        test_preds_aligned.append(aligned_pred)

    # Stack and average the aligned predictions
    test_preds_proba = np.stack(test_preds_aligned).mean(axis=0)

    # Get max probabilities for each prediction
    max_probs = np.max(test_preds_proba, axis=1)

    # Convert probabilities to class predictions, using threshold
    test_preds = np.array(['None'] * len(test_data), dtype=object)
    confident_mask = max_probs >= threshold
    test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]

    return test_preds

In [None]:
# Predict every label level with the per-fold LightGBM models trained above.
test_preds = []
# NOTE(review): thr_list is defined but not passed on, so every level uses the
# default threshold of 0.0 — confirm whether threshold=thr_list[i] was intended.
thr_list = [0.0, 0.2, 0.3, 0.4, 0.5]
for i in range(5):
    print(f"Predicting level {i}")
    # `prec_classifiers` is not defined anywhere in the notebook; the models
    # trained above live in `prec_lgbm_classifiers`.
    test_preds_all = make_predictions_with_models(prec_lgbm_classifiers[i], cliped_text_X)
    test_preds.append(align_and_combine_predictions(prec_lgbm_classifiers[i], test_preds_all, cliped_text_X))

In [None]:
# import pickle

# # Loop through levels
# for level in range(len(prec_lgbm_classifiers)):  # Iterate over levels (5 in this case)
#     classifiers = prec_lgbm_classifiers[level]  # List of classifiers for this level
#     for fold_idx, classifier in enumerate(classifiers):  # Iterate over folds
#         # Define the file path for this classifier
#         file_path = f"./lgbm_classifier_level_{level}_fold_{fold_idx}.pkl"
        
#         # Save the classifier using pickle
#         with open(file_path, "wb") as f:
#             pickle.dump(classifier, f)
#         print(f"Saved LGBM classifier for level {level}, fold {fold_idx} to {file_path}")

In [34]:
# prec_xgb_classifiers = []
# prec_scores = []
# prec_xgb_val_predictions = []

# for i in range(5):
#     print(f"Training level {i}")
#     _classifiers, _scores, _val_predictions = train_xgboost(train_X, np.array([x[i] for x in padded_label]), folds, drop_none=False)
#     prec_xgb_classifiers.append(_classifiers)
#     prec_scores.append(_scores)
#     prec_xgb_val_predictions.append(_val_predictions)

Training level 0
Fold score: 0.8322
Fold score: 0.8350
Fold score: 0.8347
Fold score: 0.8357
Fold score: 0.8348
Fold score: 0.8358
Fold score: 0.8349
Fold score: 0.8354
Fold score: 0.8347
Fold score: 0.8350
Average score: 0.834837777568391
Training level 1
Fold score: 0.9135
Fold score: 0.9145
Fold score: 0.9160
Fold score: 0.9141
Fold score: 0.9167
Fold score: 0.9170
Fold score: 0.9147
Fold score: 0.9171
Fold score: 0.9142
Fold score: 0.9161
Average score: 0.9153679449731461
Training level 2
Sampled: none-ratio: 0.6358588872486922, removed: 244813
Fold score: 0.8957
Sampled: none-ratio: 0.6359007646108371, removed: 244818
Fold score: 0.8963
Sampled: none-ratio: 0.6359740499945908, removed: 244827
Fold score: 0.8984
Sampled: none-ratio: 0.6358937850504797, removed: 244817
Fold score: 0.8970
Sampled: none-ratio: 0.6358658668090497, removed: 244814
Fold score: 0.8998
Sampled: none-ratio: 0.6360298864774507, removed: 244833
Fold score: 0.8982
Sampled: none-ratio: 0.636012437576557, remove

In [41]:
# import pickle

# # Loop through levels
# for level in range(len(prec_xgb_classifiers)):  # Iterate over levels (5 in this case)
#     classifiers = prec_xgb_classifiers[level]  # List of classifiers for this level
#     for fold_idx, classifier in enumerate(classifiers):  # Iterate over folds
#         # Define the file path for this classifier
#         file_path = f"./xgb_classifier_level_{level}_fold_{fold_idx}.pkl"
        
#         # Save the classifier using pickle
#         with open(file_path, "wb") as f:
#             pickle.dump(classifier, f)
#         print(f"Saved XGBoost classifier for level {level}, fold {fold_idx} to {file_path}")

Saved XGBoost classifier for level 0, fold 0 to ./xgb_classifier_level_0_fold_0.pkl
Saved XGBoost classifier for level 0, fold 1 to ./xgb_classifier_level_0_fold_1.pkl
Saved XGBoost classifier for level 0, fold 2 to ./xgb_classifier_level_0_fold_2.pkl
Saved XGBoost classifier for level 0, fold 3 to ./xgb_classifier_level_0_fold_3.pkl
Saved XGBoost classifier for level 0, fold 4 to ./xgb_classifier_level_0_fold_4.pkl
Saved XGBoost classifier for level 0, fold 5 to ./xgb_classifier_level_0_fold_5.pkl
Saved XGBoost classifier for level 0, fold 6 to ./xgb_classifier_level_0_fold_6.pkl
Saved XGBoost classifier for level 0, fold 7 to ./xgb_classifier_level_0_fold_7.pkl
Saved XGBoost classifier for level 0, fold 8 to ./xgb_classifier_level_0_fold_8.pkl
Saved XGBoost classifier for level 0, fold 9 to ./xgb_classifier_level_0_fold_9.pkl
Saved XGBoost classifier for level 1, fold 0 to ./xgb_classifier_level_1_fold_0.pkl
Saved XGBoost classifier for level 1, fold 1 to ./xgb_classifier_level_1_fol

In [None]:
#RF Confusion

In [None]:
# # Create confusion matrix
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# from sklearn.metrics import classification_report

In [None]:
# # Stack all 5 folds' predictions and true labels together
# all_true_labels = np.concatenate([_fold['true_labels'] for _fold in prec_rf_val_predictions[2]])
# all_predicted_labels = np.concatenate([_fold['predicted_labels'] for _fold in prec_rf_val_predictions[2]])

# # Compute confusion matrix
# cm = confusion_matrix(all_true_labels, all_predicted_labels)

# # Create a figure with larger size
# plt.figure(figsize=(20, 16))

# # Create heatmap
# sns.heatmap(cm, 
#             xticklabels=np.unique(all_true_labels),
#             yticklabels=np.unique(all_true_labels),
#             annot=True, 
#             fmt='d',
#             cmap='Blues')

# plt.title('Confusion Matrix - Fold 0')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.xticks(rotation=90)
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()

# # Print classification report for more detailed metrics

# print("\nClassification Report:")
# print(classification_report(all_true_labels, all_predicted_labels))

In [None]:
#with open('random_forest_data.pkl', 'rb') as f:
#    data = pickle.load(f)
#prec_rf_classifiers = data['classifiers1']
#prec_lgbm_classifiers = data['classifiers2']

In [None]:
#lgbm confusion

In [None]:
# # Stack all 5 folds' predictions and true labels together
# all_true_labels = np.concatenate([_fold['true_labels'] for _fold in prec_lgbm_val_predictions[2]])
# all_predicted_labels = np.concatenate([_fold['predicted_labels'] for _fold in prec_lgbm_val_predictions[2]])

# # Compute confusion matrix
# cm = confusion_matrix(all_true_labels, all_predicted_labels)

# # Create a figure with larger size
# plt.figure(figsize=(20, 16))

# # Create heatmap
# sns.heatmap(cm, 
#             xticklabels=np.unique(all_true_labels),
#             yticklabels=np.unique(all_true_labels),
#             annot=True, 
#             fmt='d',
#             cmap='Blues')

# plt.title('Confusion Matrix - Fold 0')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.xticks(rotation=90)
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()

# # Print classification report for more detailed metrics

# print("\nClassification Report:")
# print(classification_report(all_true_labels, all_predicted_labels))

In [42]:
# Clamp the test features to the finite float32 range; +/-inf become
# +/-max_value, NaN values are left untouched by the clip.
max_value = np.finfo(np.float32).max
cliped_text_X = np.clip(test_X, -max_value, max_value)
#cliped_text_X = np.clip(test_X, a_min=None, a_max=np.finfo(np.float32).max)
#cliped_text_X = np.nan_to_num(test_X, nan=0.0, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min)

In [43]:
cliped_text_X

Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Average power,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,...,time_diffmax,time_diffmin,time_diffmedian,time_diffstd,time_diffvar,time_burstiness,time_total,time_event_density,time_entropy,time_slope
0,8.440782e+04,1.503495e+03,122.0,2.512760e+02,350.938229,8.060000e+02,3.225000e+03,4.693100e+00,4.693100e+00,0.000248,...,1800.804,44.334,300.0210,37.923387,1.438183e+03,-0.775550,1209300.493,0.003334,11.969387,300.164008
1,1.474633e+17,5.765358e+09,2.0,1.097309e+14,671.841696,4.294968e+06,4.294968e+06,4.294968e+06,4.294968e+06,0.000125,...,2795.428,432.977,599.9825,67.312288,4.530944e+03,-0.799636,4837908.512,0.001654,12.960075,604.925698
2,4.051532e+05,4.752845e+03,1.0,1.211762e+03,167.236650,1.423650e+01,1.423650e+01,1.423650e+01,1.423650e+01,0.000500,...,1257.186,474.635,599.6460,34.498242,1.190129e+03,-0.891520,1203661.837,0.001663,10.964569,601.461213
3,1.639011e+07,3.919227e+04,1014.0,7.490392e+03,449.800951,1.464000e+03,5.857000e+03,0.000000e+00,7.471000e+01,0.000137,...,380113.304,187.020,599.9890,4449.767580,1.980043e+07,0.741404,4837597.819,0.001514,12.184404,671.192168
4,4.044000e+03,6.717552e+02,1.0,6.018561e+00,337.714842,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,0.000247,...,1202.283,228.718,599.9110,32.974216,1.087299e+03,-0.895506,2418916.993,0.001672,11.979174,598.376379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,5.784299e+07,3.735436e+04,31.0,1.722380e+05,166.327444,4.040000e+02,1.616000e+03,0.000000e+00,2.983040e+02,0.000495,...,1187.473,179.672,599.9310,28.724209,8.250802e+02,-0.908410,1208994.487,0.001672,10.978171,598.241785
315716,3.649651e+07,5.168745e+04,28.0,5.432153e+04,319.448996,8.050000e+02,3.220000e+03,3.396420e+01,1.113395e+02,0.000248,...,2180.024,460.211,599.9415,38.317201,1.468208e+03,-0.880144,2418699.333,0.001664,11.972423,600.968511
315717,0.000000e+00,0.000000e+00,1.0,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000261,...,76196.536,517.170,600.2930,1227.229968,1.506093e+06,0.320393,2418610.571,0.001584,11.711933,619.810285
315718,0.000000e+00,0.000000e+00,1.0,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000247,...,1796.102,3.384,7.1840,297.064473,8.824730e+04,-0.002320,1209004.416,0.003352,11.037622,298.463481


In [44]:
# NOTE(review): the table was already clipped to +/-float32 max, so no
# infinities remain for this replace to act on (the printed max/min of
# +/-3.4028e+38 agree) — confirm whether infinities should become 0.0 instead.
cliped_text_X = cliped_text_X.replace([np.inf, -np.inf], 0.0)  # Replace infinity
cliped_text_X = cliped_text_X.fillna(0.0)  # Replace NaN

In [45]:
print("Checking for NaN values:", np.any(np.isnan(cliped_text_X)))
print("Checking for infinite values:", np.any(np.isinf(cliped_text_X)))
print("Maximum value:", np.max(cliped_text_X))
print("Minimum value:", np.min(cliped_text_X))

Checking for NaN values: False
Checking for infinite values: False
Maximum value: 3.4028234663852886e+38
Minimum value: -3.4028234663852886e+38


In [46]:
cliped_text_X[cliped_text_X.isnull().any(axis=1)].index

Index([], dtype='int64')

Load in the downloaded classifiers

RF

In [47]:
import pickle

# Load the pre-trained random-forest classifiers: 5 levels x 10 folds, one
# pickle file per model.
# NOTE: pickle.load can execute arbitrary code while loading — only use model
# files that come from a trusted source.

# Initialize the 2D list to store classifiers
prec_rf_classifiers = []

# Loop through each level
for level in range(5):
    level_classifiers = []  # List to store classifiers for the current level
    
    # Loop through each fold for the current level
    for fold in range(10):
        # Construct the filename
        file_path = f'./rf-models-default-params/rf_classifier_level_{level}_fold_{fold}.pkl'
        
        # Load the classifier
        with open(file_path, 'rb') as f:
            classifier = pickle.load(f)
        
        # Add the classifier to the list for the current level
        level_classifiers.append(classifier)
    
    # Append the list of classifiers for the current level to the main list
    prec_rf_classifiers.append(level_classifiers)

# The `prec_rf_classifiers` list now contains all classifiers organized by level and fold
print("All classifiers successfully loaded into `prec_rf_classifiers`.")

All classifiers successfully loaded into `prec_rf_classifiers`.


In [48]:
len(prec_rf_classifiers)

5

In [49]:
prec_rf_classifiers

[[RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42)],
 [RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_sta

LGBM

In [50]:
import pickle

# Build the per-level, per-fold model list used as the "LGBM" member of the
# ensemble. Only levels 0 and 4 are loaded from LGBM pickles; levels 1-3 reuse
# the already loaded random-forest models.
# NOTE: pickle.load can execute arbitrary code while loading — only use model
# files that come from a trusted source.

# Initialize the 2D list to store classifiers
prec_lgbm_classifiers = []

# Loop through each level
for level in range(5):
    level_classifiers = []  # List to store classifiers for the current level
    
    # Loop through each fold for the current level
    for fold in range(10):
        # Levels 1-3: take the RF classifier of the same level/fold and skip loading
        if (level == 1) or (level == 2) or (level == 3):
            level_classifiers.append(prec_rf_classifiers[level][fold])
            continue
        else:
            file_path = f'./lgbm-models-default-params/lgbm_classifier_level_{level}_fold_{fold}.pkl'
        
        # Load the classifier
        with open(file_path, 'rb') as f:
            classifier = pickle.load(f)
        
        # Add the classifier to the list for the current level
        level_classifiers.append(classifier)
    
    # Append the list of classifiers for the current level to the main list
    prec_lgbm_classifiers.append(level_classifiers)

# The `prec_lgbm_classifiers` list now contains all classifiers organized by level and fold
print("All classifiers successfully loaded into `prec_lgbm_classifiers`.")

All classifiers successfully loaded into `prec_lgbm_classifiers`.


In [51]:
prec_lgbm_classifiers

[[LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
 

In [52]:
len(prec_lgbm_classifiers)

5

XGB

In [39]:
import pickle

# Load the pre-trained XGBoost classifiers: 5 levels x 10 folds, one pickle
# file per model.
# NOTE: pickle.load can execute arbitrary code while loading — only use model
# files that come from a trusted source.

# Initialize the 2D list to store classifiers
prec_xgb_classifiers = []

# Loop through each level
for level in range(5):
    level_classifiers = []  # List to store classifiers for the current level

    # Loop through each fold for the current level
    for fold in range(10):
        # Construct the filename
        file_path = f'./xgb-models-400-estimators-0.3-lr/xgb_classifier_level_{level}_fold_{fold}.pkl'

        # Load the classifier
        with open(file_path, 'rb') as f:
            classifier = pickle.load(f)

        # Add the classifier to the list for the current level
        level_classifiers.append(classifier)

    # Append the list of classifiers for the current level to the main list
    prec_xgb_classifiers.append(level_classifiers)

# The `prec_xgb_classifiers` list now contains all classifiers organized by level and fold
# (the message previously named a non-existent `xgb_rf_classifiers` variable)
print("All classifiers successfully loaded into `prec_xgb_classifiers`.")

All classifiers successfully loaded into `xgb_rf_classifiers`.


In [53]:
prec_xgb_classifiers

[[XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric='logloss',
                feature_types=None, gamma=0, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=0.3, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
                max_leaves=None, min_child_weight=1, missing=nan,
                monotone_constraints=None, multi_strategy=None, n_estimators=400,
                n_jobs=8, num_parallel_tree=None, objective='multi:softprob', ...),
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               

In [54]:
len(prec_xgb_classifiers)

5

In [55]:
def make_predictions_with_models_vote(classifiers1, classifiers2, classifiers3, test_data):
    """
    Average the probability predictions of three classifier sets, fold by fold.

    For every fold index ``i`` the classifiers ``classifiers1[i]``,
    ``classifiers2[i]`` and ``classifiers3[i]`` each call ``predict_proba`` on
    ``test_data``; the three probability arrays are averaged element-wise with
    equal weights.

    Args:
        classifiers1: Sequence of fitted classifiers exposing ``predict_proba``,
            one per fold.
        classifiers2: Second sequence of fitted classifiers, same length.
        classifiers3: Third sequence of fitted classifiers, same length.
        test_data: Data passed unchanged to every ``predict_proba`` call.

    Returns:
        List with one averaged probability array per fold.

    Raises:
        ValueError: If the three sequences do not have the same length.

    Note:
        The average is positional: column ``j`` of each model's output is
        assumed to describe the same class. This function does not verify it.
    """
    n_folds = len(classifiers1)
    if len(classifiers2) != n_folds or len(classifiers3) != n_folds:
        # Pairing fold i of one set with fold i of another only makes sense
        # when every set provides the same number of folds.
        raise ValueError(
            "classifier lists must have the same length, got "
            f"{n_folds}, {len(classifiers2)} and {len(classifiers3)}"
        )

    test_preds_all = []
    fold_models = zip(classifiers1, classifiers2, classifiers3)
    for clf1, clf2, clf3 in tqdm(fold_models, total=n_folds):
        pred1 = clf1.predict_proba(test_data)
        pred2 = clf2.predict_proba(test_data)
        pred3 = clf3.predict_proba(test_data)
        test_preds_all.append((pred1 + pred2 + pred3) / 3.0)
    return test_preds_all

In [56]:
def align_and_combine_predictions(classifiers1, classifiers2, classifiers3, test_preds_all, test_data, threshold=0.0):
    """
    Align per-fold probability predictions to one class order, average them
    across folds and convert the result to class labels.

    Args:
        classifiers1: Unused; kept so existing call sites keep working.
        classifiers2: Per-fold fitted classifiers. ``classifiers2[k].classes_``
            gives the column order of ``test_preds_all[k]``, and the classes of
            ``classifiers2[0]`` define the reference order of the output.
        classifiers3: Unused; kept so existing call sites keep working.
        test_preds_all: One 2-D probability array per fold (rows are samples,
            columns follow that fold's ``classes_``).
        test_data: Only its length is used, as the number of samples.
        threshold: Minimum averaged probability the best class must reach;
            samples below it are labelled with the string 'None'.

    Returns:
        Object array with one entry per sample: the class with the highest
        fold-averaged probability, or 'None' when that probability is below
        ``threshold``.

    Note:
        Classes absent from ``classifiers2[0].classes_`` are dropped, and a
        fold that lacks a reference class contributes probability 0 for it.
    """
    # Reference class order, taken from the first fold only. np.asarray keeps
    # the fancy indexing at the end valid even if classes_ is a plain list.
    all_classes = np.asarray(classifiers2[0].classes_)
    n_samples = len(test_data)
    test_preds_aligned = []

    # Re-order every fold's probability columns to the reference class order.
    for fold_idx, clf in tqdm(enumerate(classifiers2), total=len(classifiers2)):
        pred = test_preds_all[fold_idx]
        # Column position of each class in this fold's prediction array
        pred_dict = {_cls: idx for idx, _cls in enumerate(clf.classes_)}
        aligned_pred = np.zeros((n_samples, len(all_classes)))

        # A separate index name keeps the fold index intact inside this loop.
        for col, _cls in enumerate(all_classes):
            if _cls in pred_dict:
                aligned_pred[:, col] = pred[:, pred_dict[_cls]]

        test_preds_aligned.append(aligned_pred)

    # Stack and average the aligned predictions across folds
    test_preds_proba = np.stack(test_preds_aligned).mean(axis=0)

    # Highest averaged probability of each sample
    max_probs = np.max(test_preds_proba, axis=1)

    # Start from 'None' everywhere and fill in the confident predictions
    test_preds = np.full(n_samples, 'None', dtype=object)
    confident_mask = max_probs >= threshold
    test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]

    return test_preds

# NOTE(review): this loop looks like a leftover from an earlier single-model
# pipeline. It passes three positional arguments to
# align_and_combine_predictions, whose definition requires five
# (classifiers1, classifiers2, classifiers3, test_preds_all, test_data), and
# `thr_list` is never read. The three-model voting loop in the following code
# cell appears to supersede it -- confirm and remove.
test_preds = []
thr_list = [0.0, 0.2, 0.3, 0.4, 0.5]
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models(prec_classifiers[i], cliped_text_X)
    test_preds.append(align_and_combine_predictions(prec_classifiers[i], test_preds_all, cliped_text_X))

In [None]:
# Run a full garbage-collection pass to release unreferenced objects.
import gc
gc.collect()

In [57]:
# Predict each of the 5 levels: average the three classifier sets
# (prec_rf_classifiers, prec_lgbm_classifiers, prec_xgb_classifiers) fold by
# fold, then align/average across folds and keep one label array per level.
test_preds = []
# NOTE(review): thr_list is defined but never passed on, so every level runs
# with align_and_combine_predictions' default threshold=0.0 -- confirm intended.
thr_list = [0.0, 0.2, 0.3, 0.4, 0.5]
for i in range(5):
    print(f"Predicting level {i}")
    # test_preds_all is rebound on every iteration; after the loop it holds the last level only.
    test_preds_all = make_predictions_with_models_vote(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], cliped_text_X)
    test_preds.append(align_and_combine_predictions(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_preds_all, cliped_text_X))
    # Disabled alternative: the same two calls on test_X instead of cliped_text_X.
    #test_preds_all = make_predictions_with_models_vote(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_X)
    #test_preds.append(align_and_combine_predictions(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_preds_all, test_X))

Predicting level 0


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:39<00:00,  9.92s/it]
10it [00:00, 56.21it/s]


Predicting level 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [05:26<00:00, 32.67s/it]
10it [00:05,  1.90it/s]


Predicting level 2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [04:05<00:00, 24.54s/it]
10it [00:02,  4.09it/s]


Predicting level 3


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:30<00:00, 15.05s/it]
10it [00:01,  5.51it/s]


Predicting level 4


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:26<00:00,  8.69s/it]
10it [00:00, 28.38it/s]


In [58]:
# Spot check: averaged class probabilities of the first sample from the first fold.
# test_preds_all holds only the last level processed by the prediction loop.
test_preds_all[0][0]

array([3.35642519e-03, 4.55702914e-06, 1.00267337e-02, 5.71873737e-06,
       8.75914799e-01, 9.72787444e-02, 1.00679583e-02, 3.33763706e-03,
       7.42273891e-06])

In [59]:
# Arrange the per-level predictions as one row per sample and one column per
# level, then force every level after the first 'None' to be 'None' as well.
stacked = np.stack(test_preds).transpose()

# A cumulative OR along the level axis is True from the first 'None' onwards,
# which replaces the per-row Python loop with a single vectorized pass.
none_from_here = np.logical_or.accumulate(
    np.asarray(stacked == 'None', dtype=bool), axis=1
)
stacked[none_from_here] = 'None'

stacked

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 315720/315720 [00:03<00:00, 83791.68it/s]


array([['Sensor', 'Power_Sensor', 'Electrical_Power_Sensor', 'None',
        'None'],
       ['Sensor', 'Flow_Sensor', 'Chilled_Water_Supply_Flow_Sensor',
        'Water_Flow_Sensor', 'None'],
       ['Sensor', 'Temperature_Setpoint', 'Power_Sensor',
        'Electrical_Power_Sensor', 'Peak_Power_Demand_Sensor'],
       ...,
       ['Sensor', 'None', 'None', 'None', 'None'],
       ['Sensor', 'Power_Sensor', 'None', 'None', 'None'],
       ['Alarm', 'None', 'None', 'None', 'None']], dtype=object)

In [60]:
# # Assuming `number_to_label` is already defined from the previous code
# # Create a function to map numbers to labels
# def map_numbers_to_labels(array, mapping):
#     for row in tqdm(array):
#         for i, value in enumerate(row):
#             # If the value is numeric, map it to the label
#             if isinstance(value, (int, np.integer)):
#                 row[i] = mapping.get(value, value)  # Keep value if not in mapping

# # Example mapping dictionary (you should replace this with your actual mapping)
# number_to_label = {i: label for i, label in enumerate(train_y.columns[1:].tolist())}

# # Apply the mapping to the array
# map_numbers_to_labels(stacked, number_to_label)

# # Output the processed array
# stacked


In [61]:
# Label names used as the columns of the submission frame, in output order;
# the cell that builds `stackedfinalresult` creates one 0/1 column per entry.
# NOTE(review): presumably mirrors train_y.columns[1:] -- confirm against the training labels.
columnlist = ['Active_Power_Sensor', 'Air_Flow_Sensor',
       'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
       'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
       'Average_Zone_Air_Temperature_Sensor',
       'Chilled_Water_Differential_Temperature_Sensor',
       'Chilled_Water_Return_Temperature_Sensor',
       'Chilled_Water_Supply_Flow_Sensor',
       'Chilled_Water_Supply_Temperature_Sensor', 'Command',
       'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
       'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
       'Cooling_Temperature_Setpoint', 'Current_Sensor',
       'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
       'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
       'Differential_Pressure_Setpoint',
       'Differential_Supply_Return_Water_Temperature_Sensor',
       'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
       'Discharge_Air_Temperature_Setpoint',
       'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
       'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
       'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
       'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
       'Heating_Supply_Air_Temperature_Deadband_Setpoint',
       'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
       'Hot_Water_Return_Temperature_Sensor',
       'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
       'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
       'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
       'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
       'Outside_Air_Humidity_Sensor',
       'Outside_Air_Lockout_Temperature_Setpoint',
       'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
       'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
       'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
       'Reactive_Power_Sensor', 'Reset_Setpoint',
       'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
       'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
       'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
       'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
       'Supply_Air_Static_Pressure_Sensor',
       'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
       'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
       'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
       'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
       'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
       'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
       'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
       'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
       'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [62]:
# Open the test archive read-only; in this part of the notebook it is only
# queried for its member names.
zipftest = ZipFile('./test_X_v0.1.0.zip', 'r')

In [63]:
# Member names of the test archive, without the first entry.
# NOTE(review): assumes entry 0 is the top-level directory record -- confirm.
listtestfile = zipftest.namelist()[1:]
# The names are all that is read from the archive here, so release the file
# handle instead of leaving it open for the rest of the kernel session.
zipftest.close()

In [64]:
# Build the submission frame: one row per test file, a 'filename' column
# followed by one 0/1 indicator column per label in `columnlist`.
test_preds = stacked  # (n_samples, n_levels) array of predicted labels / 'None'

# Keep only the base name of every archive member.
filenames = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])
if len(filenames) != len(test_preds):
    # Row i of the predictions is written next to file i, so the counts must agree.
    raise ValueError(
        f"{len(filenames)} test files but {len(test_preds)} prediction rows"
    )

# Every predicted label except the 'None' placeholder must be a known column;
# a row-wise `.loc[i, label] = 1` would silently append a new column instead.
unknown_labels = set(test_preds.ravel()) - set(columnlist) - {'None'}
if unknown_labels:
    raise ValueError(
        f"Predicted labels missing from columnlist: {sorted(map(str, unknown_labels))}"
    )

# Fill the indicator matrix one level (column of test_preds) at a time rather
# than one DataFrame cell at a time.
column_of = {label: pos for pos, label in enumerate(columnlist)}
indicator = np.zeros((len(test_preds), len(columnlist)), dtype=np.int64)
for level in range(test_preds.shape[1]):
    # Map each label to its column position; 'None' has no entry and becomes NaN.
    positions = pd.Series(test_preds[:, level]).map(column_of)
    labelled = positions.notna().to_numpy()
    indicator[np.flatnonzero(labelled), positions[labelled].to_numpy(dtype=np.int64)] = 1

stackedfinalresult = pd.DataFrame(indicator, columns=columnlist)
stackedfinalresult.insert(0, 'filename', filenames)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 315720/315720 [03:51<00:00, 1363.27it/s]


In [65]:
# Cast every label column to float (the 'filename' column keeps its dtype),
# then write the submission as a gzip-compressed CSV without the index.
# NOTE(review): the output name is hard-coded and differs from `submission_name`
# set at the top of the notebook -- confirm which one is intended.
label_dtypes = {col: float for col in stackedfinalresult.columns if col != "filename"}
stackedfinalresult = stackedfinalresult.astype(label_dtypes)
stackedfinalresult.to_csv("13-01.csv.gz", index=False, compression='gzip')