In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import time
from zipfile import ZipFile
import warnings
import pickle
import torch
from torch.utils.data import Dataset
import tsfel
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode, LocalClassifierPerLevel
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

# Data split

In [3]:
def create_folds(train_y, n_splits=10):
    # Create a label array for stratification
    # We'll use the first non-zero label for each row as the stratification target
    stratify_labels = []
    for _, row in train_y.iterrows():
        labels = row[train_y.columns != 'filename'].values
        # Get first non-negative label, or 0 if all negative
        first_positive = next((i for i, x in enumerate(labels) if x >= 0), 0)
        stratify_labels.append(first_positive)
    
    # Create StratifiedKFold object
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Generate fold indices
    folds = []
    for train_idx, val_idx in skf.split(train_y, stratify_labels):
        folds.append({
            'train': train_idx,
            'val': val_idx
        })
    
    return folds

In [4]:
feature_list = ['0_Absolute energy',
 '0_Area under the curve',
 '0_Autocorrelation',
 '0_Average power',
 '0_Centroid',
 '0_ECDF Percentile Count_0',
 '0_ECDF Percentile Count_1',
 '0_ECDF Percentile_0',
 '0_ECDF Percentile_1',
 '0_ECDF_0',
 '0_ECDF_1',
 '0_ECDF_2',
 '0_ECDF_3',
 '0_ECDF_4',
 '0_ECDF_5',
 '0_ECDF_6',
 '0_ECDF_7',
 '0_ECDF_8',
 '0_ECDF_9',
 '0_Entropy',
 '0_Histogram mode',
 '0_Interquartile range',
 '0_Kurtosis',
 '0_Max',
 '0_Mean',
 '0_Mean absolute deviation',
 '0_Mean absolute diff',
 '0_Mean diff',
 '0_Median',
 '0_Median absolute deviation',
 '0_Median absolute diff',
 '0_Median diff',
 '0_Min',
 '0_Negative turning points',
 '0_Neighbourhood peaks',
 '0_Peak to peak distance',
 '0_Positive turning points',
 '0_Root mean square',
 '0_Signal distance',
 '0_Skewness',
 '0_Slope',
 '0_Standard deviation',
 '0_Sum absolute diff',
 '0_Variance',
 '0_Zero crossing rate',
 '0_Fundamental frequency',
 '0_Human range energy',
 '0_Max power spectrum',
 '0_Maximum frequency',
 '0_Median frequency',
 '0_Power bandwidth',
 '0_Wavelet entropy',
 'value_median',
 'value_mean',
 'value_qmean',
 'value_max',
 'value_min',
 'value_maxmin',
 'value_diffmax',
 'value_diffmin',
 'value_diffmean',
 'value_diffqmean',
 'value_diffmedian',
 'value_diffmaxmin',
 'time_diffmean',
 'time_diffqmean',
 'time_diffmax',
 'time_diffmin',
 'time_diffmedian',
 'value_std',
 'value_var',
 'value_diffstd',
 'value_diffvar',
 'time_diffstd',
 'time_diffvar',
 'time_burstiness',
 'time_total',
 'time_event_density',
 'time_entropy']

In [5]:
#combineaugmented
train_feat_dir = "../downloads/train_data_features_v3_fixed"

train_0 = pd.read_csv(f"{train_feat_dir}/train_features_full_v3.csv", index_col=0)#[feature_list]
feature_list = train_0.columns.tolist()
feature_list = [item for item in feature_list if "LPCC" not in item]
feature_list = [item for item in feature_list if "MFCC" not in item]

train_0 = train_0[feature_list]
train_1 = pd.read_csv(f"{train_feat_dir}/train_features_split1_2_v3.csv")[feature_list]
train_2 = pd.read_csv(f"{train_feat_dir}/train_features_split2_2_v3.csv")[feature_list]
train_3 = pd.read_csv(f"{train_feat_dir}/train_features_split1_3_v3.csv")[feature_list]
train_4 = pd.read_csv(f"{train_feat_dir}/train_features_split2_3_v3.csv")[feature_list]
train_5 = pd.read_csv(f"{train_feat_dir}/train_features_split3_3_v3.csv")[feature_list]
train_6 = pd.read_csv(f"{train_feat_dir}/train_features_split1_4_v3.csv")[feature_list]
train_7 = pd.read_csv(f"{train_feat_dir}/train_features_split2_4_v3.csv")[feature_list]
train_8 = pd.read_csv(f"{train_feat_dir}/train_features_split3_4_v3.csv")[feature_list]
train_9 = pd.read_csv(f"{train_feat_dir}/train_features_split4_4_v3.csv")[feature_list]

In [6]:
train_y = pd.read_csv("../downloads/train_y_v0.1.0.csv")

In [7]:
#post process it dont forget!!!!
train_y.loc[((train_y['Peak_Power_Demand_Sensor'] == 1)), 
            'Demand_Sensor'] = -1

#train_y.loc[((train_y['Energy_Usage_Sensor'] == 1)), 
#            'Usage_Sensor'] = -1

#'Setpoint', 'Temperature_Setpoint', 'Air_Temperature_Setpoint', 'Cooling_Temperature_Setpoint', 'Cooling_Supply_Air_Temperature_Deadband_Setpoint'
train_y.loc[((train_y['Cooling_Supply_Air_Temperature_Deadband_Setpoint'] == 1)), 
            'Air_Temperature_Setpoint'] = -1
train_y.loc[((train_y['Heating_Supply_Air_Temperature_Deadband_Setpoint'] == 1)), 
            'Air_Temperature_Setpoint'] = -1

In [8]:
train_y = pd.concat([train_y] * 10, ignore_index=True)

In [9]:
folds = create_folds(train_y)

# Prepare features

In [10]:
train_X = pd.concat([train_0, train_1, train_2, train_3, 
                     train_4, train_5, train_6, train_7, 
                     train_8, train_9], ignore_index=True)
train_X

Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Average power,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,...,time_diffmax,time_diffmin,time_diffmedian,time_diffstd,time_diffvar,time_burstiness,time_total,time_event_density,time_entropy,time_slope
0,3.579300e+06,2.002655e+04,17.0,5.326999e+03,336.834861,30.0,30.0,30.00,30.00,0.000250,...,29709.534,57.065,599.9990,461.649413,2.131202e+05,-0.133925,2418900.499,0.001655,11.910432,605.860232
1,4.051012e+06,3.009676e+04,16.0,3.014589e+03,672.251192,22.5,22.5,22.50,22.50,0.000125,...,29705.420,35.112,600.0000,352.215834,1.240560e+05,-0.262351,4837688.953,0.001659,12.930601,603.250804
2,6.988878e+07,6.261729e+04,32.0,1.040152e+05,371.619154,805.0,3220.0,31.36,131.52,0.000248,...,7200.115,279.083,600.0275,108.255439,1.171924e+04,-0.694783,2418874.283,0.001664,11.965989,601.617578
3,1.183314e+11,5.159254e+06,1.0,8.805016e+07,706.907352,3834.0,3834.0,3834.00,3834.00,0.000124,...,7770.911,285.595,600.0135,85.323742,7.280141e+03,-0.751199,4838073.366,0.001665,12.970361,600.376320
4,4.404025e+08,3.141691e+05,2.0,3.292679e+05,672.200909,234.0,234.0,234.00,234.00,0.000124,...,4802.769,305.036,599.9700,51.713450,2.674281e+03,-0.841547,4837577.658,0.001664,12.972059,600.985625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318385,9.928028e+16,1.661917e+09,2.0,5.927706e+14,85.590344,201.0,804.0,9760520.00,10122320.00,0.000994,...,1180.751,304.968,600.0490,23.566671,5.553880e+02,-0.924407,602946.628,0.001668,9.971950,599.315545
318386,1.002000e+03,1.671441e+02,2.0,5.970967e+00,84.170978,1.0,1.0,1.00,1.00,0.000994,...,1258.058,473.015,600.0940,31.148107,9.702045e+02,-0.901471,604123.302,0.001665,9.971447,600.586043
318387,5.050000e+06,8.366623e+03,1.0,6.035888e+04,41.833115,100.0,100.0,100.00,100.00,0.001980,...,639.331,288.833,599.8335,26.006854,6.763564e+02,-0.916594,301198.426,0.001677,8.975652,599.109134
318388,4.581000e+07,2.524991e+04,1.0,5.442792e+05,42.083178,300.0,300.0,300.00,300.00,0.001965,...,607.430,57.118,599.9915,41.760417,1.743932e+03,-0.869134,302998.882,0.001680,8.982900,595.866076


In [11]:
del train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8, train_9

# Prepare labels

In [12]:
def get_active_labels_np(row):
    """More efficient version using numpy"""
    arr = row.to_numpy() # convert to numpy array
    indices = np.where(arr == 1)[0] # get indices where value is 1
    labels = row.index[indices].tolist() # get labels from indices
    return labels

labelhir = train_y.apply(get_active_labels_np, axis=1).tolist()

In [13]:
# Get a tier dict
ontology_list = list(train_y.columns[1:])

from collections import defaultdict

def build_tree(onto):
    """
    Build a tree so that each term has at most one parent.
    The parent is determined by the longest existing term that is a substring of the child.
    """
    # Sort terms by length so that broader terms are processed (and assigned) first
    sorted_onto = sorted(onto, key=len)
    
    # Dictionaries for storing parent-child relationships
    parent_map = {}             # term -> parent
    children_map = defaultdict(list)  # parent -> [children]

    processed = []
    
    for term in sorted_onto:
        # Find all processed terms that are substrings of 'term'
        potential_parents = [p for p in processed if p in term]
        
        if not potential_parents:
            # No parent found; this term is at the root
            parent_map[term] = None
            children_map[None].append(term)
        else:
            # Pick the longest parent (closest match)
            parent = max(potential_parents, key=len)
            parent_map[term] = parent
            children_map[parent].append(term)
        
        processed.append(term)
    
    return parent_map, children_map

"""
Re-built hierachical labels
"""
level_labels = [[], [], [], [], []]

def print_tree(children_map, root=None, depth=0):
    """
    Recursively print the tree structure with indentation.
    'root=None' means we are listing top-level (root) terms first.
    """
    
    if root is None:
        # For all top-level terms
        for child in sorted(children_map[root]):
            print_tree(children_map, child, depth)
    else:
        # print("  " * depth + root)
        level_labels[depth].append(root)
        for child in sorted(children_map[root]):
            print_tree(children_map, child, depth + 1)

parent_map, children_map = build_tree(ontology_list)
print_tree(children_map)

In [14]:
tiers = {
    1: level_labels[0],
    2: level_labels[1],
    3: level_labels[2],
    4: level_labels[3],
    5: level_labels[4]
}

def get_tier(label):
    for tier_num, tier_list in tiers.items():
        if label in tier_list:
            return tier_num
    return None  # Handle cases where the label isn't found in any tier

def sort_labels(labels):
    return sorted(labels, key=lambda label: (get_tier(label) or float('inf'), label))


In [15]:
sorted_labelhir = [sort_labels(labels) for labels in labelhir]

In [16]:
label_hier = np.array(
    sorted_labelhir,
    dtype=object,
)

In [17]:
len(label_hier)

318390

In [18]:
_label = np.array([x[-1] for x in label_hier])

In [19]:
len(_label)

318390

In [20]:
padded_label = pd.Series(label_hier).apply(lambda x: x + ['None'] * (5 - len(x)) if len(x) < 5 else x)

In [21]:
len(padded_label)

318390

In [22]:
listlabl = [] 
for i in range(len(padded_label)):
    tierli = padded_label[i].copy()
    tierli.append(''.join(tierli))
    listlabl.append(tierli)
pdlistlabl = pd.DataFrame(listlabl, columns=['tier0', 'tier1', 'tier2', 'tier3', 'tier4', 'combine'])
pdlistlabl = pdlistlabl.drop_duplicates(subset="combine").reset_index()
pdlistlabl

Unnamed: 0,index,tier0,tier1,tier2,tier3,tier4,combine
0,0,Parameter,,,,,ParameterNoneNoneNoneNone
1,1,Setpoint,Temperature_Setpoint,Cooling_Temperature_Setpoint,,,SetpointTemperature_SetpointCooling_Temperatur...
2,2,Sensor,Current_Sensor,Load_Current_Sensor,,,SensorCurrent_SensorLoad_Current_SensorNoneNone
3,3,Sensor,Power_Sensor,Electrical_Power_Sensor,,,SensorPower_SensorElectrical_Power_SensorNoneNone
4,4,Setpoint,Speed_Setpoint,,,,SetpointSpeed_SetpointNoneNoneNone
...,...,...,...,...,...,...,...
86,6699,Sensor,Power_Sensor,,,,SensorPower_SensorNoneNoneNone
87,7929,Setpoint,Cooling_Demand_Setpoint,,,,SetpointCooling_Demand_SetpointNoneNoneNone
88,10899,Sensor,Temperature_Sensor,,,,SensorTemperature_SensorNoneNoneNone
89,13579,Sensor,Flow_Sensor,,,,SensorFlow_SensorNoneNoneNone


In [23]:
# Count Nones at each level
for i in range(5):
    none_count = sum(padded_label.apply(lambda x: x[i] == 'None'))
    print(f"Level {i+1}: {none_count} None values out of {len(padded_label)} total ({none_count/len(padded_label):.2%})")

Level 1: 0 None values out of 318390 total (0.00%)
Level 2: 123210 None values out of 318390 total (38.70%)
Level 3: 202470 None values out of 318390 total (63.59%)
Level 4: 272160 None values out of 318390 total (85.48%)
Level 5: 314700 None values out of 318390 total (98.84%)


# Model Training

In [24]:
def train_random_forest(train_X, _label, folds, drop_none=False):
    """
    Train random forest models using k-fold cross validation
    
    Args:
        train_X: Training features DataFrame
        _label: Array of labels
        folds: List of dictionaries containing train/val indices
        drop_none: Whether to drop samples with "None" labels
        
    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation predictions)
    """
    classifiers = []
    scores = []
    val_predictions = []  # List to store validation predictions

    # Define Random Forest parameters
    params = {
        'n_estimators': 100,
        'random_state': 42,
        'n_jobs': 8  # Use all available cores
    }

    for f_idx, fold in enumerate(folds):
        # Prepare train and validation data for this fold
        train_X_fold = train_X.iloc[fold['train']]
        train_y_fold = _label[fold['train']]
        val_X_fold = train_X.iloc[fold['val']]
        val_y_fold = _label[fold['val']]
        
        if drop_none:
            # Remove samples with "None" labels from training set
            train_mask = train_y_fold != "None"
            train_X_fold = train_X_fold[train_mask]
            train_y_fold = train_y_fold[train_mask]
            
            # Remove samples with "None" labels from validation set
            val_mask = val_y_fold != "None"
            val_X_fold = val_X_fold[val_mask]
            val_y_fold = val_y_fold[val_mask]
            print(f"Dropped train: {len(train_X_fold) - sum(train_mask)}, val: {len(val_X_fold) - sum(val_mask)}")
        
        # Check the train_y_fold. If more than 30% of samples are labeled "None",
        # randomly sample from the "None" to make that ratio no more than 30%.
        none_mask = (train_y_fold == "None")
        none_count = np.sum(none_mask)
        total_samples = len(train_y_fold)
        none_ratio = none_count / total_samples if total_samples > 0 else 0

        if none_ratio > 0.4:
            # Calculate how many "None" labels we should keep (30% of total)
            max_none_to_keep = int(0.4 * (total_samples - none_count))

            # Randomly choose which "None" labels to keep
            none_indices = np.where(none_mask)[0]

            # Fix the random seed before shuffling for reproducibility
            rng = np.random.RandomState(f_idx)
            rng.shuffle(none_indices)
            
            keep_none_indices = none_indices[:max_none_to_keep]

            # Indices of all non-"None" labels
            other_indices = np.where(~none_mask)[0]

            # Combine indices to keep and then sort
            new_indices = np.concatenate([keep_none_indices, other_indices])
            new_indices = np.sort(new_indices)  # Sort so we can index the DataFrame consistently

            # Subset the training data
            train_X_fold = train_X_fold.iloc[new_indices]
            train_y_fold = train_y_fold[new_indices]

            print(f"Sampled: none-ratio: {none_ratio}, removed: {total_samples - max_none_to_keep}")

        # Create and train Random Forest model
        model = RandomForestClassifier(**params)
        model.fit(train_X_fold, train_y_fold)
        
        classifiers.append(model)
        
        # Calculate score and save predictions on validation set
        val_preds = model.predict(val_X_fold)
        score = np.mean(val_preds == val_y_fold)
        scores.append(score)
        val_predictions.append({
            'true_labels': val_y_fold,
            'predicted_labels': val_preds,
            'fold_indices': fold['val']
        })
        print(f"Fold score: {score:.4f}")

    print(f"Average score: {np.mean(scores)}")
    return classifiers, scores, val_predictions

In [25]:
def train_lgbm_classifier(train_X, _label, folds, drop_none=False):
    """
    Train LightGBM models using k-fold cross validation

    Args:
        train_X: Training features DataFrame
        _label: Array of labels
        folds: List of dictionaries containing train/val indices
        drop_none: Whether to drop samples with "None" labels

    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation predictions)
    """
    classifiers = []
    scores = []
    val_predictions = []  # List to store validation predictions

    # Define LightGBM parameters
    params = {
        'verbose':-1,
        'n_estimators': 100,
        'learning_rate': 0.1,
        'random_state': 42,
        'n_jobs': 8,  # Use all available cores
        'objective': 'multiclass',
        'num_class': len(set(_label))  # Set number of classes based on unique labels
    }

    for f_idx, fold in enumerate(folds):
        # Prepare train and validation data for this fold
        train_X_fold = train_X.iloc[fold['train']]
        train_y_fold = _label[fold['train']]
        val_X_fold = train_X.iloc[fold['val']]
        val_y_fold = _label[fold['val']]

        if drop_none:
            # Remove samples with "None" labels from training set
            train_mask = train_y_fold != "None"
            train_X_fold = train_X_fold[train_mask]
            train_y_fold = train_y_fold[train_mask]

            # Remove samples with "None" labels from validation set
            val_mask = val_y_fold != "None"
            val_X_fold = val_X_fold[val_mask]
            val_y_fold = val_y_fold[val_mask]
            print(f"Dropped train: {len(train_X_fold) - sum(train_mask)}, val: {len(val_X_fold) - sum(val_mask)}")

        # Check the train_y_fold. If more than 30% of samples are labeled "None",
        # randomly sample from the "None" to make that ratio no more than 30%.
        none_mask = (train_y_fold == "None")
        none_count = np.sum(none_mask)
        total_samples = len(train_y_fold)
        none_ratio = none_count / total_samples if total_samples > 0 else 0

        if none_ratio > 0.4:
            # Calculate how many "None" labels we should keep (30% of total)
            max_none_to_keep = int(0.4 * (total_samples - none_count))

            # Randomly choose which "None" labels to keep
            none_indices = np.where(none_mask)[0]

            # Fix the random seed before shuffling for reproducibility
            rng = np.random.RandomState(f_idx)
            rng.shuffle(none_indices)

            keep_none_indices = none_indices[:max_none_to_keep]

            # Indices of all non-"None" labels
            other_indices = np.where(~none_mask)[0]

            # Combine indices to keep and then sort
            new_indices = np.concatenate([keep_none_indices, other_indices])
            new_indices = np.sort(new_indices)  # Sort so we can index the DataFrame consistently

            # Subset the training data
            train_X_fold = train_X_fold.iloc[new_indices]
            train_y_fold = train_y_fold[new_indices]

            print(f"Sampled: none-ratio: {none_ratio}, removed: {total_samples - max_none_to_keep}")

        # Create and train LightGBM model
        model = LGBMClassifier(**params)
        model.fit(train_X_fold, train_y_fold)

        classifiers.append(model)

        # Calculate score and save predictions on validation set
        val_preds = model.predict(val_X_fold)
        score = np.mean(val_preds == val_y_fold)
        scores.append(score)
        val_predictions.append({
            'true_labels': val_y_fold,
            'predicted_labels': val_preds,
            'fold_indices': fold['val']
        })
        print(f"Fold score: {score:.4f}")

    print(f"Average score: {np.mean(scores)}")
    return classifiers, scores, val_predictions

In [26]:
def train_xgboost(train_X, _label, folds, drop_none=False):
    """
    Train XGBoost models using k-fold cross-validation

    Args:
        train_X: Training features DataFrame
        _label: Array of labels
        folds: List of dictionaries containing train/val indices
        drop_none: Whether to drop samples with "None" labels

    Returns:
        tuple: (list of trained classifiers, list of scores, list of validation predictions)
    """
    from xgboost import XGBClassifier
    
    classifiers = []
    scores = []
    val_predictions = []  # List to store validation predictions

    # Define XGBoost parameters
    params = {
        'n_estimators': 400,       # Number of trees
        'learning_rate': 0.3,     # Default learning rate
        'max_depth': 6,           # Maximum depth of trees
        'min_child_weight': 1,    # Minimum sum of weights in a child node
        'subsample': 0.8,         # Fraction of samples per tree
        'colsample_bytree': 0.8,  # Fraction of features per tree
        'gamma': 0,               # Minimum loss reduction for split
        'reg_alpha': 0,           # L1 regularization term
        'reg_lambda': 1,          # L2 regularization term
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'n_jobs': 8
    }

    for f_idx, fold in enumerate(folds):
        # Prepare train and validation data for this fold
        train_X_fold = train_X.iloc[fold['train']]
        train_y_fold = _label[fold['train']]
        val_X_fold = train_X.iloc[fold['val']]
        val_y_fold = _label[fold['val']]
        
        if drop_none:
            # Remove samples with "None" labels from training set
            train_mask = train_y_fold != "None"
            train_X_fold = train_X_fold[train_mask]
            train_y_fold = train_y_fold[train_mask]
            
            # Remove samples with "None" labels from validation set
            val_mask = val_y_fold != "None"
            val_X_fold = val_X_fold[val_mask]
            val_y_fold = val_y_fold[val_mask]
            print(f"Dropped train: {len(train_X_fold) - sum(train_mask)}, val: {len(val_X_fold) - sum(val_mask)}")

        # Check the train_y_fold. If more than 40% of samples are labeled "None",
        # randomly sample from the "None" to make that ratio no more than 40%.
        none_mask = (train_y_fold == "None")
        none_count = np.sum(none_mask)
        total_samples = len(train_y_fold)
        none_ratio = none_count / total_samples if total_samples > 0 else 0

        if none_ratio > 0.4:
            # Calculate how many "None" labels we should keep (40% of total)
            max_none_to_keep = int(0.4 * (total_samples - none_count))

            # Randomly choose which "None" labels to keep
            none_indices = np.where(none_mask)[0]

            # Fix the random seed before shuffling for reproducibility
            rng = np.random.RandomState(f_idx)
            rng.shuffle(none_indices)
            
            keep_none_indices = none_indices[:max_none_to_keep]

            # Indices of all non-"None" labels
            other_indices = np.where(~none_mask)[0]

            # Combine indices to keep and then sort
            new_indices = np.concatenate([keep_none_indices, other_indices])
            new_indices = np.sort(new_indices)  # Sort so we can index the DataFrame consistently

            # Subset the training data
            train_X_fold = train_X_fold.iloc[new_indices]
            train_y_fold = train_y_fold[new_indices]

            print(f"Sampled: none-ratio: {none_ratio}, removed: {total_samples - max_none_to_keep}")

        # Map train_y_fold to numeric values
        unique_classes = np.unique(train_y_fold)
        class_to_int = {cls: idx for idx, cls in enumerate(unique_classes)}
        int_to_class = {idx: cls for cls, idx in class_to_int.items()}
        train_y_fold = np.array([class_to_int[label] for label in train_y_fold])
        val_y_fold_numeric = np.array([class_to_int[label] for label in val_y_fold])

        # Create and train XGBoost model
        model = XGBClassifier(**params)
        model.fit(train_X_fold, train_y_fold)
        
        classifiers.append(model)
        
        # Calculate score and save predictions on validation set
        val_preds = model.predict(val_X_fold)
        val_preds_labels = np.array([int_to_class[pred] for pred in val_preds])
        score = np.mean(val_preds_labels == val_y_fold)
        scores.append(score)
        val_predictions.append({
            'true_labels': val_y_fold,
            'predicted_labels': val_preds_labels,
            'fold_indices': fold['val']
        })
        print(f"Fold score: {score:.4f}")

    print(f"Average score: {np.mean(scores)}")
    return classifiers, scores, val_predictions


### Train the high precision model by allowing None prediction

**Clearing memory**

In [29]:
# !pip install psutil



In [32]:
# !pip install pympler

Collecting pympler
  Downloading Pympler-1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading Pympler-1.1-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.8/165.8 kB[0m [31m227.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pympler
Successfully installed pympler-1.1


In [33]:
from pympler import muppy, summary

# Get a summary of all objects in memory
all_objects = muppy.get_objects()
memory_summary = summary.summarize(all_objects)

# Print the summary
summary.print_(memory_summary)


                        types |   # objects |   total size
                numpy.ndarray |         180 |    688.70 MB
  pandas.core.frame.DataFrame |           3 |    641.53 MB
                         list |     1296805 |    140.27 MB
                          str |      554889 |     71.07 MB
                         dict |      104051 |     32.39 MB
    pandas.core.series.Series |           1 |     26.80 MB
                         code |       72842 |     12.39 MB
                         type |        9106 |      7.99 MB
                        tuple |       68311 |      3.84 MB
                          set |        6120 |      2.79 MB
                      weakref |       14059 |    988.52 KB
   builtin_function_or_method |       14052 |    988.03 KB
                          int |       32200 |    940.24 KB
                         cell |       21427 |    836.99 KB
                  abc.ABCMeta |         755 |    796.60 KB


In [34]:
import psutil
import gc

gc.collect()

# Get total, available, and used memory
mem = psutil.virtual_memory()
print(f"Total Memory: {mem.total / 1e9:.2f} GB")
print(f"Available Memory: {mem.available / 1e9:.2f} GB")
print(f"Used Memory: {mem.used / 1e9:.2f} GB")
print(f"Memory Usage: {mem.percent}%")


Total Memory: 17.18 GB
Available Memory: 7.25 GB
Used Memory: 8.65 GB
Memory Usage: 57.8%


In [38]:
# %whos

In [35]:
len(train_X)

318390

In [36]:
len(np.array([x[i] for x in padded_label]))

318390

**Training and saving models (commented out)**

In [37]:
prec_rf_classifiers = []
prec_scores = []
prec_rf_val_predictions = []

for i in range(1):
    print(f"Training level {i}")
    _classifiers, _scores, _val_predictions = train_random_forest(train_X, np.array([x[i] for x in padded_label]), folds, drop_none=False)
    prec_rf_classifiers.append(_classifiers)
    prec_scores.append(_scores)
    prec_rf_val_predictions.append(_val_predictions)

Training level 0
Fold score: 0.8138
Fold score: 0.8148
Fold score: 0.8151
Fold score: 0.8170
Fold score: 0.8168
Fold score: 0.8154
Fold score: 0.8133
Fold score: 0.8176
Fold score: 0.8161


KeyboardInterrupt: 

In [42]:
import pickle

# Loop through levels
for level in range(len(prec_rf_classifiers)):  # Iterate over levels (5 in this case)
    classifiers = prec_rf_classifiers[level]  # List of classifiers for this level
    for fold_idx, classifier in enumerate(classifiers):  # Iterate over folds
        # Define the file path for this classifier
        file_path = f"./rf_classifier_level_0_fold_{fold_idx}.pkl"
        
        # Save the classifier using pickle
        with open(file_path, "wb") as f:
            pickle.dump(classifier, f)
        print(f"Saved RF classifier for level 0, fold {fold_idx} to {file_path}")

In [38]:
prec_lgbm_classifiers = []
prec_scores = []
prec_lgbm_val_predictions = []

for i in range(4,5):
    print(f"Training level {i}")
    if (i == 1) or (i == 2) or (i == 3):
        _classifiers, _scores, _val_predictions = train_random_forest(train_X, np.array([x[i] for x in padded_label]), folds, drop_none=False)
    else:
        _classifiers, _scores, _val_predictions = train_lgbm_classifier(train_X, np.array([x[i] for x in padded_label]), folds, drop_none=False)
    prec_lgbm_classifiers.append(_classifiers)
    prec_scores.append(_scores)
    prec_lgbm_val_predictions.append(_val_predictions)

Training level 4
Sampled: none-ratio: 0.9884523173885277, removed: 285228
Fold score: 0.9479
Sampled: none-ratio: 0.9884313787074552, removed: 285225
Fold score: 0.9537
Sampled: none-ratio: 0.9884313787074552, removed: 285225
Fold score: 0.9475
Sampled: none-ratio: 0.9883197057417353, removed: 285213
Fold score: 0.9539
Sampled: none-ratio: 0.9883615831038803, removed: 285217
Fold score: 0.9440
Sampled: none-ratio: 0.9883511137633441, removed: 285216
Fold score: 0.9497
Sampled: none-ratio: 0.9884592969488852, removed: 285229
Fold score: 0.9546
Sampled: none-ratio: 0.988406950246204, removed: 285223
Fold score: 0.9421
Sampled: none-ratio: 0.9884313787074552, removed: 285225
Fold score: 0.9516
Sampled: none-ratio: 0.9884592969488852, removed: 285229
Fold score: 0.9543
Average score: 0.9499167687427368


In [44]:
# import pickle

# # Loop through levels
# for level in range(len(prec_lgbm_classifiers)):  # Iterate over levels (5 in this case)
#     classifiers = prec_lgbm_classifiers[level]  # List of classifiers for this level
#     for fold_idx, classifier in enumerate(classifiers):  # Iterate over folds
#         # Define the file path for this classifier
#         file_path = f"./lgbm_classifier_level_4_fold_{fold_idx}.pkl"
        
#         # Save the classifier using pickle
#         with open(file_path, "wb") as f:
#             pickle.dump(classifier, f)
#         print(f"Saved LGBM classifier for level 4, fold {fold_idx} to {file_path}")

In [45]:
# prec_xgb_classifiers = []
# prec_scores = []
# prec_xgb_val_predictions = []

# for i in range(5):
#     print(f"Training level {i}")
#     _classifiers, _scores, _val_predictions = train_xgboost(train_X, np.array([x[i] for x in padded_label]), folds, drop_none=False)
#     prec_xgb_classifiers.append(_classifiers)
#     prec_scores.append(_scores)
#     prec_xgb_val_predictions.append(_val_predictions)

In [46]:
# import pickle

# # Loop through levels
# for level in range(len(prec_xgb_classifiers)):  # Iterate over levels (5 in this case)
#     classifiers = prec_xgb_classifiers[level]  # List of classifiers for this level
#     for fold_idx, classifier in enumerate(classifiers):  # Iterate over folds
#         # Define the file path for this classifier
#         file_path = f"./xgb_classifier_level_{level}_fold_{fold_idx}.pkl"
        
#         # Save the classifier using pickle
#         with open(file_path, "wb") as f:
#             pickle.dump(classifier, f)
#         print(f"Saved XGBoost classifier for level {level}, fold {fold_idx} to {file_path}")

In [47]:
#RF Confusion

In [48]:
# # Create confusion matrix
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# from sklearn.metrics import classification_report

In [49]:
# # Stack all 5 folds' predictions and true labels together
# all_true_labels = np.concatenate([_fold['true_labels'] for _fold in prec_rf_val_predictions[2]])
# all_predicted_labels = np.concatenate([_fold['predicted_labels'] for _fold in prec_rf_val_predictions[2]])

# # Compute confusion matrix
# cm = confusion_matrix(all_true_labels, all_predicted_labels)

# # Create a figure with larger size
# plt.figure(figsize=(20, 16))

# # Create heatmap
# sns.heatmap(cm, 
#             xticklabels=np.unique(all_true_labels),
#             yticklabels=np.unique(all_true_labels),
#             annot=True, 
#             fmt='d',
#             cmap='Blues')

# plt.title('Confusion Matrix - Fold 0')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.xticks(rotation=90)
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()

# # Print classification report for more detailed metrics

# print("\nClassification Report:")
# print(classification_report(all_true_labels, all_predicted_labels))

In [50]:
#with open('random_forest_data.pkl', 'rb') as f:
#    data = pickle.load(f)
#prec_rf_classifiers = data['classifiers1']
#prec_lgbm_classifiers = data['classifiers2']

In [51]:
#lgbm confusion

In [52]:
# # Stack all 5 folds' predictions and true labels together
# all_true_labels = np.concatenate([_fold['true_labels'] for _fold in prec_lgbm_val_predictions[2]])
# all_predicted_labels = np.concatenate([_fold['predicted_labels'] for _fold in prec_lgbm_val_predictions[2]])

# # Compute confusion matrix
# cm = confusion_matrix(all_true_labels, all_predicted_labels)

# # Create a figure with larger size
# plt.figure(figsize=(20, 16))

# # Create heatmap
# sns.heatmap(cm, 
#             xticklabels=np.unique(all_true_labels),
#             yticklabels=np.unique(all_true_labels),
#             annot=True, 
#             fmt='d',
#             cmap='Blues')

# plt.title('Confusion Matrix - Fold 0')
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.xticks(rotation=90)
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()

# # Print classification report for more detailed metrics

# print("\nClassification Report:")
# print(classification_report(all_true_labels, all_predicted_labels))

In [53]:
#max_value = np.finfo(np.float32).max
#cliped_text_X = np.clip(test_X, -max_value, max_value)
#cliped_text_X = np.clip(test_X, a_min=None, a_max=np.finfo(np.float32).max)
#cliped_text_X = np.nan_to_num(test_X, nan=0.0, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min)

In [54]:
test_X = pd.read_csv('./data_features_fix/test_features_full_v3.csv', index_col=0)
test_X_1 = pd.read_csv('./data_features_fix/test_features_split1_2_v3.csv', index_col=0)
test_X_2 = pd.read_csv('./data_features_fix/test_features_split2_2_v3.csv', index_col=0)
test_X_3 = pd.read_csv('./data_features_fix/test_features_split1_3_v3.csv', index_col=0)
test_X_4 = pd.read_csv('./data_features_fix/test_features_split2_3_v3.csv', index_col=0)
test_X_5 = pd.read_csv('./data_features_fix/test_features_split3_3_v3.csv', index_col=0)

In [55]:
max_value = np.finfo(np.float32).max
test_X = np.clip(test_X, -max_value, max_value)
test_X = test_X.replace([np.inf, -np.inf], 0.0)  # Replace infinity
test_X = test_X.fillna(0.0)  # Replace NaN
test_X_1 = np.clip(test_X_1, -max_value, max_value)
test_X_1 = test_X_1.replace([np.inf, -np.inf], 0.0)  # Replace infinity
test_X_1 = test_X_1.fillna(0.0)  # Replace NaN
test_X_2 = np.clip(test_X_2, -max_value, max_value)
test_X_2 = test_X_2.replace([np.inf, -np.inf], 0.0)  # Replace infinity
test_X_2 = test_X_2.fillna(0.0)  # Replace NaN
test_X_3 = np.clip(test_X_1, -max_value, max_value)
test_X_3 = test_X_3.replace([np.inf, -np.inf], 0.0)  # Replace infinity
test_X_3 = test_X_3.fillna(0.0)  # Replace NaN
test_X_4 = np.clip(test_X_4, -max_value, max_value)
test_X_4 = test_X_4.replace([np.inf, -np.inf], 0.0)  # Replace infinity
test_X_4 = test_X_4.fillna(0.0)  # Replace NaN
test_X_5 = np.clip(test_X_5, -max_value, max_value)
test_X_5 = test_X_5.replace([np.inf, -np.inf], 0.0)  # Replace infinity
test_X_5 = test_X_5.fillna(0.0)  # Replace NaN

cliped_text_X

cliped_text_X = cliped_text_X.replace([np.inf, -np.inf], 0.0)  # Replace infinity
cliped_text_X = cliped_text_X.fillna(0.0)  # Replace NaN

print("Checking for NaN values:", np.any(np.isnan(cliped_text_X)))
print("Checking for infinite values:", np.any(np.isinf(cliped_text_X)))
print("Maximum value:", np.max(cliped_text_X))
print("Minimum value:", np.min(cliped_text_X))

cliped_text_X[cliped_text_X.isnull().any(axis=1)].index

Load in the downloaded classifiers

RF

In [56]:
import pickle

# Initialize the 2D list to store classifiers
prec_rf_classifiers = []

# Loop through each level
for level in range(5):
    level_classifiers = []  # List to store classifiers for the current level
    
    # Loop through each fold for the current level
    for fold in range(10):
        # Construct the filename
        file_path = f'./rf-models-default-params/rf_classifier_level_{level}_fold_{fold}.pkl'
        
        # Load the classifier
        with open(file_path, 'rb') as f:
            classifier = pickle.load(f)
        
        # Add the classifier to the list for the current level
        level_classifiers.append(classifier)
    
    # Append the list of classifiers for the current level to the main list
    prec_rf_classifiers.append(level_classifiers)

# The `prec_rf_classifiers` list now contains all classifiers organized by level and fold
print("All classifiers successfully loaded into `prec_rf_classifiers`.")

All classifiers successfully loaded into `prec_rf_classifiers`.


In [57]:
len(prec_rf_classifiers)

5

In [58]:
prec_rf_classifiers

[[RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42)],
 [RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_state=42),
  RandomForestClassifier(n_jobs=8, random_sta

LGBM

In [59]:
import pickle

# Initialize the 2D list to store classifiers
prec_lgbm_classifiers = []

# Loop through each level
for level in range(5):
    level_classifiers = []  # List to store classifiers for the current level
    
    # Loop through each fold for the current level
    for fold in range(10):
        # Construct the filename
        if (level == 1) or (level == 2) or (level == 3):
            level_classifiers.append(prec_rf_classifiers[level][fold])
            continue
        else:
            file_path = f'./lgbm-models-default-params/lgbm_classifier_level_{level}_fold_{fold}.pkl'
        
        # Load the classifier
        with open(file_path, 'rb') as f:
            classifier = pickle.load(f)
        
        # Add the classifier to the list for the current level
        level_classifiers.append(classifier)
    
    # Append the list of classifiers for the current level to the main list
    prec_lgbm_classifiers.append(level_classifiers)

# The `prec_rf_classifiers` list now contains all classifiers organized by level and fold
print("All classifiers successfully loaded into `prec_lgbm_classifiers`.")

All classifiers successfully loaded into `prec_lgbm_classifiers`.


In [60]:
prec_lgbm_classifiers

[[LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
  LGBMClassifier(n_jobs=8, num_class=6, objective='multiclass', random_state=42,
                 verbose=-1),
 

In [61]:
len(prec_lgbm_classifiers)

5

XGB

In [62]:
import pickle

# Initialize the 2D list to store classifiers
prec_xgb_classifiers = []

# Loop through each level
for level in range(5):
    level_classifiers = []  # List to store classifiers for the current level
    
    # Loop through each fold for the current level
    for fold in range(10):
        # Construct the filename
        file_path = f'./xgb-models-400-estimators-0.3-lr/xgb_classifier_level_{level}_fold_{fold}.pkl'
        
        # Load the classifier
        with open(file_path, 'rb') as f:
            classifier = pickle.load(f)
        
        # Add the classifier to the list for the current level
        level_classifiers.append(classifier)
    
    # Append the list of classifiers for the current level to the main list
    prec_xgb_classifiers.append(level_classifiers)

# The `prec_rf_classifiers` list now contains all classifiers organized by level and fold
print("All classifiers successfully loaded into `xgb_rf_classifiers`.")

All classifiers successfully loaded into `xgb_rf_classifiers`.


In [63]:
prec_xgb_classifiers

[[XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.8, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric='logloss',
                feature_types=None, gamma=0, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=0.3, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
                max_leaves=None, min_child_weight=1, missing=nan,
                monotone_constraints=None, multi_strategy=None, n_estimators=400,
                n_jobs=8, num_parallel_tree=None, objective='multi:softprob', ...),
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               

In [64]:
len(prec_xgb_classifiers)

5

In [65]:
def make_predictions_with_models_vote(classifiers1, classifiers2, classifiers3, test_data, test_data2, 
                                      test_data3, test_data4, test_data5, test_data6 ):
    """
    Make probability predictions using multiple classifier models
    
    Args:
        classifiers: List of trained classifier models
        test_data: Test data to make predictions on
        
    Returns:
        List of probability predictions from each classifier
    """
    test_preds_all = []
    for i in tqdm(range(len(classifiers1))):
    #for clf in tqdm(classifiers):
        pred1 = classifiers1[i].predict_proba(test_data)
        pred2 = classifiers2[i].predict_proba(test_data)
        pred3 = classifiers3[i].predict_proba(test_data)
        pred4 = classifiers1[i].predict_proba(test_data2)
        pred5 = classifiers2[i].predict_proba(test_data2)
        pred6 = classifiers3[i].predict_proba(test_data2)
        pred7 = classifiers1[i].predict_proba(test_data3)
        pred8 = classifiers2[i].predict_proba(test_data3)
        pred9 = classifiers3[i].predict_proba(test_data3)
        pred10 = classifiers1[i].predict_proba(test_data4)
        pred11 = classifiers2[i].predict_proba(test_data4)
        pred12 = classifiers3[i].predict_proba(test_data4)
        pred13 = classifiers1[i].predict_proba(test_data5)
        pred14 = classifiers2[i].predict_proba(test_data5)
        pred15 = classifiers3[i].predict_proba(test_data5)
        pred16 = classifiers1[i].predict_proba(test_data6)
        pred17 = classifiers2[i].predict_proba(test_data6)
        pred18 = classifiers3[i].predict_proba(test_data6)
        pred = (pred1+pred2+pred3+pred4+pred5+pred6+pred7+pred8+
                pred9+pred10+pred11+pred12+pred13+pred14+pred15+
               pred16+pred17+pred18)/18.0
        test_preds_all.append(pred)
    return test_preds_all

In [66]:
def align_and_combine_predictions(classifiers1, classifiers2, classifiers3, test_preds_all, test_data, threshold=0.0):
    """
    Aligns predictions from multiple classifiers and combines them through averaging
    
    Args:
        classifiers: List of trained classifier models
        test_preds_all: List of probability predictions from each classifier
        test_data: Test data used for predictions
        threshold: Minimum probability threshold for making predictions
        
    Returns:
        Final class predictions after aligning and combining probabilities
    """
    # Get the common classes across all classifiers
    all_classes = classifiers2[0].classes_
    test_preds_aligned = []

    # Make predictions with each fold's model and align them 
    for i, clf in tqdm(enumerate(classifiers2)):
        pred = test_preds_all[i]
        # Create a mapping to align predictions with common classes
        pred_dict = {_cls: idx for idx, _cls in enumerate(clf.classes_)}
        aligned_pred = np.zeros((len(test_data), len(all_classes)))
        
        for i, _cls in enumerate(all_classes):
            if _cls in pred_dict:
                aligned_pred[:, i] = pred[:, pred_dict[_cls]]
        
        test_preds_aligned.append(aligned_pred)

    # Stack and average the aligned predictions
    test_preds_all = np.stack(test_preds_aligned)
    test_preds_proba = test_preds_all.mean(axis=0)

    # Get max probabilities for each prediction
    max_probs = np.max(test_preds_proba, axis=1)
    
    # Convert probabilities to class predictions, using threshold
    test_preds = np.array(['None'] * len(test_data), dtype=object)
    confident_mask = max_probs >= threshold
    test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]
    
    return test_preds

In [67]:
def get_proba(classifiers1, classifiers2, classifiers3, test_preds_all, test_data, threshold=0.0):
    """
    Aligns predictions from multiple classifiers and combines them through averaging
    
    Args:
        classifiers: List of trained classifier models
        test_preds_all: List of probability predictions from each classifier
        test_data: Test data used for predictions
        threshold: Minimum probability threshold for making predictions
        
    Returns:
        Final class predictions after aligning and combining probabilities
    """
    # Get the common classes across all classifiers
    all_classes = classifiers1[0].classes_
    test_preds_aligned = []

    # Make predictions with each fold's model and align them 
    for i, clf in tqdm(enumerate(classifiers1)):
        pred = test_preds_all[i]
        # Create a mapping to align predictions with common classes
        pred_dict = {_cls: idx for idx, _cls in enumerate(clf.classes_)}
        aligned_pred = np.zeros((len(test_data), len(all_classes)))
        
        for i, _cls in enumerate(all_classes):
            if _cls in pred_dict:
                aligned_pred[:, i] = pred[:, pred_dict[_cls]]
        
        test_preds_aligned.append(aligned_pred)

    # Stack and average the aligned predictions
    test_preds_all = np.stack(test_preds_aligned)
    test_preds_proba = test_preds_all.mean(axis=0)

    ## Get max probabilities for each prediction
    #max_probs = np.max(test_preds_proba, axis=1)
    
    # Convert probabilities to class predictions, using threshold
    #test_preds = np.array(['None'] * len(test_data), dtype=object)
    #confident_mask = max_probs >= threshold
    #test_preds[confident_mask] = all_classes[np.argmax(test_preds_proba[confident_mask], axis=1)]
    
    return test_preds_proba

test_preds = []
thr_list = [0.0, 0.2, 0.3, 0.4, 0.5]
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models(prec_classifiers[i], cliped_text_X)
    test_preds.append(align_and_combine_predictions(prec_classifiers[i], test_preds_all, cliped_text_X))

In [68]:
import gc
gc.collect()

0

test_preds = []
thr_list = [0.0, 0.2, 0.3, 0.4, 0.5]
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models_vote(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], cliped_text_X)
    test_preds.append(align_and_combine_predictions(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_preds_all, cliped_text_X))
    #test_preds_all = make_predictions_with_models_vote(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_X)
    #test_preds.append(align_and_combine_predictions(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_preds_all, test_X))

test_preds_all[0][0]

# Convert to array and process None values
stacked = np.stack(test_preds).transpose()
for row in tqdm(stacked):
    # Find first occurrence of 'None' if any
    none_idx = np.where(row == 'None')[0]
    if len(none_idx) > 0:
        # Set all elements after first None to None
        first_none = none_idx[0]
        row[first_none:] = 'None'
        
stacked

In [69]:
#gets proba
test_preds_all_list = []
thr_list = [0.0, 0.2, 0.3, 0.4, 0.5]
for i in range(5):
    print(f"Predicting level {i}")
    test_preds_all = make_predictions_with_models_vote(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], 
                                                       test_X, test_X_1, test_X_2, test_X_3, test_X_4, test_X_5)
    test_preds_all_list.append(get_proba(prec_rf_classifiers[i], prec_lgbm_classifiers[i], prec_xgb_classifiers[i], test_preds_all, test_X))

Predicting level 0




0it [00:00, 104.09it/s]███████████████████████████████████████████████████████████████████████████| 10/10 [05:27<00:00, 32.75s/it]

Predicting level 1




0it [00:01,  5.06it/s]███████████████████████████████████████████████████████████████████████████| 10/10 [16:48<00:00, 100.83s/it]

Predicting level 2




0it [00:01,  8.11it/s]████████████████████████████████████████████████████████████████████████████| 10/10 [12:35<00:00, 75.55s/it]

Predicting level 3




0it [00:00, 12.00it/s]████████████████████████████████████████████████████████████████████████████| 10/10 [08:18<00:00, 49.86s/it]

Predicting level 4




0it [00:00, 50.73it/s]████████████████████████████████████████████████████████████████████████████| 10/10 [04:59<00:00, 30.00s/it]

In [70]:
pdlistlabl

Unnamed: 0,index,tier0,tier1,tier2,tier3,tier4,combine
0,0,Parameter,,,,,ParameterNoneNoneNoneNone
1,1,Setpoint,Temperature_Setpoint,Cooling_Temperature_Setpoint,,,SetpointTemperature_SetpointCooling_Temperatur...
2,2,Sensor,Current_Sensor,Load_Current_Sensor,,,SensorCurrent_SensorLoad_Current_SensorNoneNone
3,3,Sensor,Power_Sensor,Electrical_Power_Sensor,,,SensorPower_SensorElectrical_Power_SensorNoneNone
4,4,Setpoint,Speed_Setpoint,,,,SetpointSpeed_SetpointNoneNoneNone
...,...,...,...,...,...,...,...
86,6699,Sensor,Power_Sensor,,,,SensorPower_SensorNoneNoneNone
87,7929,Setpoint,Cooling_Demand_Setpoint,,,,SetpointCooling_Demand_SetpointNoneNoneNone
88,10899,Sensor,Temperature_Sensor,,,,SensorTemperature_SensorNoneNoneNone
89,13579,Sensor,Flow_Sensor,,,,SensorFlow_SensorNoneNoneNone


In [71]:
finalstack = []  # Initialize finalstack as a list to accumulate results

for i in tqdm(range(len(test_preds_all_list[0]))):
    rankbest = np.zeros([len(pdlistlabl), 5])  # Initialize rankbest
    nplistlab = pdlistlabl[pdlistlabl.columns[1:-1]].to_numpy()  # Convert DataFrame to NumPy array
    
    for j in range(5):
        for ii in range(len(rankbest)):  # Iterate over rows in rankbest
            try:
                tier_prob = test_preds_all_list[j][i]  # Get predictions for current fold
                tier_class = prec_rf_classifiers[j][0].classes_.tolist()  # Get class labels
                tier_idx = tier_class.index(nplistlab[ii][j])  # Get index of the label in tier_class
                rankbest[ii, j] = tier_prob[tier_idx]  # Assign the probability to rankbest
            except:
                tier_prob = test_preds_all_list[j+1][i]  # Get predictions for current fold
                tier_class = prec_rf_classifiers[j+1][0].classes_.tolist()  # Get class labels
                tier_idx = tier_class.index(nplistlab[ii][j])  # Get index of the label in tier_class
                rankbest[ii, j] = tier_prob[tier_idx]  # Assign the probability to rankbest
    # Get the index of the row in nplistlab corresponding to the maximum mean rankbest value
    selected_row = nplistlab[np.argmax(np.mean(rankbest, axis=1))]
    finalstack.append(selected_row)  # Append the selected row to finalstack

# Convert finalstack to a NumPy array if needed
finalstack = np.array(finalstack)


00%|█████████████████████████████████████████████████████████████████████████████████████| 315720/315720 [04:25<00:00, 1190.67it/s]

In [72]:
nplistlab[ii]

array(['Sensor', 'Outside_Air_Enthalpy_Sensor', 'None', 'None', 'None'],
      dtype=object)

In [73]:
#prec_rf_classifiers[j+1][0].classes_.tolist()

In [74]:
tier_class

['Average_Zone_Air_Temperature_Sensor',
 'Cooling_Temperature_Setpoint',
 'Differential_Supply_Return_Water_Temperature_Sensor',
 'Heating_Temperature_Setpoint',
 'None',
 'Outside_Air_Temperature_Setpoint',
 'Peak_Power_Demand_Sensor',
 'Return_Water_Temperature_Sensor',
 'Warmest_Zone_Air_Temperature_Sensor']

In [75]:
# # Assuming `number_to_label` is already defined from the previous code
# # Create a function to map numbers to labels
# def map_numbers_to_labels(array, mapping):
#     for row in tqdm(array):
#         for i, value in enumerate(row):
#             # If the value is numeric, map it to the label
#             if isinstance(value, (int, np.integer)):
#                 row[i] = mapping.get(value, value)  # Keep value if not in mapping

# # Example mapping dictionary (you should replace this with your actual mapping)
# number_to_label = {i: label for i, label in enumerate(train_y.columns[1:].tolist())}

# # Apply the mapping to the array
# map_numbers_to_labels(stacked, number_to_label)

# # Output the processed array
# stacked


In [76]:
finalstack

array([['Sensor', 'Power_Sensor', 'Electrical_Power_Sensor', 'None',
        'None'],
       ['Sensor', 'Flow_Sensor', 'Chilled_Water_Supply_Flow_Sensor',
        'Water_Flow_Sensor', 'None'],
       ['Sensor', 'Demand_Sensor', 'None', 'None', 'None'],
       ...,
       ['Sensor', 'None', 'None', 'None', 'None'],
       ['Sensor', 'Power_Sensor', 'None', 'None', 'None'],
       ['Alarm', 'None', 'None', 'None', 'None']], dtype=object)

In [77]:
columnlist = ['Active_Power_Sensor', 'Air_Flow_Sensor',
       'Air_Flow_Setpoint', 'Air_Temperature_Sensor',
       'Air_Temperature_Setpoint', 'Alarm', 'Angle_Sensor',
       'Average_Zone_Air_Temperature_Sensor',
       'Chilled_Water_Differential_Temperature_Sensor',
       'Chilled_Water_Return_Temperature_Sensor',
       'Chilled_Water_Supply_Flow_Sensor',
       'Chilled_Water_Supply_Temperature_Sensor', 'Command',
       'Cooling_Demand_Sensor', 'Cooling_Demand_Setpoint',
       'Cooling_Supply_Air_Temperature_Deadband_Setpoint',
       'Cooling_Temperature_Setpoint', 'Current_Sensor',
       'Damper_Position_Sensor', 'Damper_Position_Setpoint', 'Demand_Sensor',
       'Dew_Point_Setpoint', 'Differential_Pressure_Sensor',
       'Differential_Pressure_Setpoint',
       'Differential_Supply_Return_Water_Temperature_Sensor',
       'Discharge_Air_Dewpoint_Sensor', 'Discharge_Air_Temperature_Sensor',
       'Discharge_Air_Temperature_Setpoint',
       'Discharge_Water_Temperature_Sensor', 'Duration_Sensor',
       'Electrical_Power_Sensor', 'Energy_Usage_Sensor',
       'Filter_Differential_Pressure_Sensor', 'Flow_Sensor', 'Flow_Setpoint',
       'Frequency_Sensor', 'Heating_Demand_Sensor', 'Heating_Demand_Setpoint',
       'Heating_Supply_Air_Temperature_Deadband_Setpoint',
       'Heating_Temperature_Setpoint', 'Hot_Water_Flow_Sensor',
       'Hot_Water_Return_Temperature_Sensor',
       'Hot_Water_Supply_Temperature_Sensor', 'Humidity_Setpoint',
       'Load_Current_Sensor', 'Low_Outside_Air_Temperature_Enable_Setpoint',
       'Max_Air_Temperature_Setpoint', 'Min_Air_Temperature_Setpoint',
       'Outside_Air_CO2_Sensor', 'Outside_Air_Enthalpy_Sensor',
       'Outside_Air_Humidity_Sensor',
       'Outside_Air_Lockout_Temperature_Setpoint',
       'Outside_Air_Temperature_Sensor', 'Outside_Air_Temperature_Setpoint',
       'Parameter', 'Peak_Power_Demand_Sensor', 'Position_Sensor',
       'Power_Sensor', 'Pressure_Sensor', 'Rain_Sensor',
       'Reactive_Power_Sensor', 'Reset_Setpoint',
       'Return_Air_Temperature_Sensor', 'Return_Water_Temperature_Sensor',
       'Room_Air_Temperature_Setpoint', 'Sensor', 'Setpoint',
       'Solar_Radiance_Sensor', 'Speed_Setpoint', 'Static_Pressure_Sensor',
       'Static_Pressure_Setpoint', 'Status', 'Supply_Air_Humidity_Sensor',
       'Supply_Air_Static_Pressure_Sensor',
       'Supply_Air_Static_Pressure_Setpoint', 'Supply_Air_Temperature_Sensor',
       'Supply_Air_Temperature_Setpoint', 'Temperature_Sensor',
       'Temperature_Setpoint', 'Thermal_Power_Sensor', 'Time_Setpoint',
       'Usage_Sensor', 'Valve_Position_Sensor', 'Voltage_Sensor',
       'Warmest_Zone_Air_Temperature_Sensor', 'Water_Flow_Sensor',
       'Water_Temperature_Sensor', 'Water_Temperature_Setpoint',
       'Wind_Direction_Sensor', 'Wind_Speed_Sensor',
       'Zone_Air_Dewpoint_Sensor', 'Zone_Air_Humidity_Sensor',
       'Zone_Air_Humidity_Setpoint', 'Zone_Air_Temperature_Sensor'
]

In [78]:
zipftest = ZipFile('./test_X_v0.1.0.zip', 'r')

In [79]:
listtestfile = zipftest.namelist()[1:]

In [80]:
stackedfinalresult = pd.DataFrame(columns=['filename'])
stackedfinalresult['filename'] = pd.Series(listtestfile).apply(lambda x: x.split("/")[-1])

for labelname in columnlist:
    stackedfinalresult[labelname] = 0

test_preds = finalstack
for i in tqdm(range(len(test_preds))):
    # stackedfinalresult.loc[i, test_preds[i]] = 1
    predlist = test_preds[i].tolist()
    predlist = [x for x in predlist if x != 'None']
    for predlabelname in predlist:
    	stackedfinalresult.loc[i, predlabelname] = 1


00%|████████████████████████████████████████████████████████████████████████████████████| 315720/315720 [00:22<00:00, 13755.69it/s]

In [81]:
stackedfinalresult.loc[((stackedfinalresult['Peak_Power_Demand_Sensor'] == 1)), 
            'Demand_Sensor'] = 1
stackedfinalresult.loc[((stackedfinalresult['Cooling_Supply_Air_Temperature_Deadband_Setpoint'] == 1)), 
            'Air_Temperature_Setpoint'] = 1
stackedfinalresult.loc[((stackedfinalresult['Heating_Supply_Air_Temperature_Deadband_Setpoint'] == 1)), 
            'Air_Temperature_Setpoint'] = 1

In [82]:
stackedfinalresult = stackedfinalresult.assign(**{col: stackedfinalresult[col].astype(float) for col in stackedfinalresult.columns if col != "filename"})
stackedfinalresult.to_csv("./submit/18-01_v16jonas_post.csv.gz", index=False, compression='gzip')