## Optimize first Random Forest Model

In [1]:
from tqdm import tqdm
# Import the data sets
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
base_path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification"
data_path = r"\data\building-instinct-starter-notebook\Starter notebook"
preprocessing_path = r"\kai\preprocessing"
sys.path.append(base_path+data_path)
sys.path.append(base_path+"\kai")
sys.path.append(base_path+preprocessing_path)
from preprocessing.preprocessing import Preprocessor

pre_load = False
if pre_load:
    df_features_dict = {}

    for s in ["monthly", "weekly", "daily", "with_regional/monthly", "with_regional/weekly", "with_regional/daily"]:
        df_features = pd.read_parquet(base_path + f'/preprocessed_data/{s}_data.parquet', engine='pyarrow')
        df_features.sort_index(inplace=True)
        df_features_dict[s] = df_features

    # Full Dataset
    df_features_full = pd.read_parquet(base_path + '/preprocessed_data/standard_data.parquet', engine='pyarrow')
    df_features_full.sort_index(inplace=True)
    df_features_dict['full'] = df_features_full

    # Full with regional Dataset
    df_features_full = pd.read_parquet(base_path + '/preprocessed_data/with_regional/standard_data.parquet', engine='pyarrow')
    df_features_full.sort_index(inplace=True)
    df_features_dict['with_regional/full'] = df_features_full

# Labels
load_filepath_labels = os.path.join(base_path + data_path,'building-instinct-train-label', 'train_label.parquet')#path to the train label file
df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})

In [2]:
from scipy import stats
import xgboost as xgb
import cupy as cp
import multiprocessing
import gc
import psutil
# import torch
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.multioutput import MultiOutputClassifier
# from preprocessing.utils import print_memory_usage, print_gpu_memory, free_gpu_memory
from xgboost import XGBClassifier
from memory_profiler import profile


def sample_hyperparameters(param_grid):
    """
    Sample hyperparameters from the given grid using scipy.stats.

    Parameters:
    ----------
    param_grid : dict
        A dictionary where keys are hyperparameter names and values are lists of options or ranges.

    Returns:
    -------
    dict
        A dictionary with sampled hyperparameters.
    """
    sampled_params = {}
    
    for param, values in param_grid.items():
        if isinstance(values, list):
            # Randomly choose from list of options
            sampled_params[param] = np.random.choice(values)
        elif isinstance(values, tuple) and len(values) == 2:
            min_val, max_val = values
            if isinstance(min_val, int) and isinstance(max_val, int):
                # Sample integer values using scipy.stats.randint
                sampled_params[param] = stats.randint.rvs(min_val, max_val + 1)
            elif isinstance(min_val, float) and isinstance(max_val, float):
                # Sample float values using scipy.stats.uniform
                sampled_params[param] = stats.uniform.rvs(min_val, max_val - min_val)
        elif isinstance(values, str) and values == 'choice':
            # Sample from a list of options if 'choice' is specified
            sampled_params[param] = np.random.choice(param_grid[param])
        else:
            raise ValueError(f"Unsupported parameter type for {param}: {values}")
    
    # Adjust parameters for RandomForestClassifier
    if not sampled_params.get('bootstrap', True):
        sampled_params['max_samples'] = None  # Reset max_samples if bootstrap is False
        # sampled_params['oob_score'] = False
    # Ensure max_features is correctly set
    max_features = sampled_params.get('max_features')
    if isinstance(max_features, str) and max_features.startswith('0'):
        sampled_params['max_features'] = float(max_features)
        
    
    return sampled_params

# @profile(precision=4)
def random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type"):
    if target == "building_stock_type":
        results_file = base_path + r"\kai\model"+ "HP_results/multixgboost/multi_results.csv"
        y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})
    elif target == "com":
        results_file = base_path + r"\kai\model"+ "\HP_results/multixgboost/multi_com_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
        label_encoders = {}
        for col in y.columns:
            le = LabelEncoder()
            y[col] = le.fit_transform(y[col])
            label_encoders[col] = le
    elif target == "res":
        results_file = base_path + r"\kai\model"+ "\HP_results/multixgboost/multi_res_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
        label_encoders = {}
        for col in y.columns:
            le = LabelEncoder()
            y[col] = le.fit_transform(y[col])
            label_encoders[col] = le
    
    # Check if results file exists
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
    else:
        # Create an empty DataFrame if the results file doesn't exist
        results_df = pd.DataFrame(columns=[
            'train_set', 'f1_train', 'f1_val', 'n_estimators', 'criterion', 'max_depth', 
            'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap'
        ])

    for _ in tqdm(range(n_runs), file=sys.stdout, desc="Running Random Search"):
        # print(f"Run {_ + 1}/{n_runs}")
        # sample parameters
        # print_memory_usage()
        # print_gpu_memory()
        params = sample_hyperparameters(param_grid)
        tqdm.write(f"Run {_ + 1}/{n_runs} - Params: {params}")
        train_set = params['train_set']
        if df_features_dict is None:
            df_features = pd.read_parquet(base_path + f'/preprocessed_data/{train_set}_data.parquet', engine='pyarrow')
            df_features.sort_index(inplace=True)
        else:
            df_features = df_features_dict[train_set].copy()
        
        if target == "building_stock_type":
            X = df_features
        elif target == "com":
            X = df_features[df_targets['building_stock_type'] == 'commercial']
        elif target == "res":
            X = df_features[df_targets['building_stock_type'] == 'residential']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        xgb_params = {key: value for key, value in params.items() if key not in ['train_set', "num_boost_round"]}

        F1_l2_dict_train = {}
        F1_l2_dict_val = {}
        for i, col in enumerate(y.columns):
            y_train_col = y_train.iloc[:, i]
            y_val_col = y_val.iloc[:, i]
            
            if params["device"] == "cuda":
                X_train_gpu = cp.array(X_train)
                X_val_gpu = cp.array(X_val)
                y_train_gpu = cp.array(y_train_col)
                y_val_gpu = cp.array(y_val_col)
            else:
                dtrain = xgb.DMatrix(X_train, label=y_train_col)
                dval = xgb.DMatrix(X_val, label=y_val_col)
            
            # Train the model
            if "n_estimators" in xgb_params:
                bst = XGBClassifier(**xgb_params)
                bst.fit(X_train_gpu, y_train_gpu)
                y_train_pred = bst.predict(X_train_gpu)
                y_val_pred = bst.predict(X_val_gpu)
            else:# xgb.train can work with DMatrix objects
                dtrain = xgb.DMatrix(X_train_gpu, label=y_train_gpu)
                dval = xgb.DMatrix(X_val_gpu, label=y_val_gpu)
                bst = xgb.train(
                    xgb_params,
                    dtrain,
                    num_boost_round=params["num_boost_round"],)
                y_train_pred = bst.predict(dtrain)
                y_val_pred = bst.predict(dval)
            
            if params["device"] == "cuda":# retransform to cpu for F1 score computation
                y_train_pred = cp.asnumpy(y_train_pred)
                y_val_pred = cp.asnumpy(y_val_pred)

            # compute F1
            F1_l2_dict_train[col] = f1_score(y_train_col, y_train_pred.round(), average='macro')
            F1_l2_dict_val[col] = f1_score(y_val_col, y_val_pred.round(), average='macro')

            # Clean up GPU memory after each model
            if params["device"] == "cuda":
                del X_train_gpu, X_val_gpu, y_train_gpu, y_val_gpu
                cp.get_default_memory_pool().free_all_blocks()
                gc.collect()
            tqdm.write(f"Run {_ + 1}/{n_runs} - Finished column: {col} ({i+1}/{len(y.columns)}) with F1 Score (Val): {F1_l2_dict_val[col]:.4f}")

        if target == "building_stock_type":
            # Evaluate on the training set
            f1_train = f1_score(y_train, y_train, average='macro')
            train_conf_matrix = confusion_matrix(y_train, y_train)
            train_TN, train_FP, train_FN, train_TP = train_conf_matrix.ravel()
            f1_val = f1_score(y_val, y_val_pred, average='macro')
            val_conf_matrix = confusion_matrix(y_val, y_val_pred)

            # Extract TP, TN, FP, FN from the validation set confusion matrix
            val_TN, val_FP, val_FN, val_TP = val_conf_matrix.ravel()

            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                'train_TP': train_TP,
                'train_TN': train_TN,
                'train_FP': train_FP,
                'train_FN': train_FN,
                'val_TP': val_TP,
                'val_TN': val_TN,
                'val_FP': val_FP,
                'val_FN': val_FN,
                **xgb_params
                }
        else:
            f1_train = sum(F1_l2_dict_train.values()) / len(F1_l2_dict_train.values())
            f1_val = sum(F1_l2_dict_val.values()) / len(F1_l2_dict_val.values())
            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                **xgb_params
            }

        # Update tqdm description with the current F1 score
        tqdm.write(f"Run {_ + 1}/{n_runs} - F1 Score (Val): {f1_val:.4f}")
        # tqdm.set_description(f"Run {_ + 1}/{n_runs} - F1 Val: {f1_val:.4f}")
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
        results_df.to_csv(results_file, index=False)

param_grid_xgb = {
    'num_boost_round': (1, 4),  # Number of boosting rounds
    'max_depth': (3, 20),  # Maximum tree depth for base learners
    'eta': (0.05, 0.4),  # Boosting learning rate (xgb's "eta")
    'min_child_weight': (1, 20),  # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.5, 0.75, 1.0],  # Subsample ratio of the training instance
    'n_rounds': (2, 8),
    # 'max_leaves': (1, 100),  # Maximum number of leaves; 0 indicates no limit
    # 'max_bin': (256, 512),  # Maximum number of bins per feature for histogram-based algorithm
    # 'grow_policy': ['depthwise', 'lossguide'],  # Tree growing policy
    # 'learning_rate': (0.01, 0.3),  # Boosting learning rate (eta)
    # 'verbosity': [0, 1, 2, 3],  # Degree of verbosity (0: silent, 1: warning, 2: info, 3: debug)
    # 'objective': ['binary:logistic', 'multi:softprob', 'reg:squarederror'],  # Learning objective
    'booster': ['gbtree',],# 'gblinear', 'dart'],  # Booster to use
    'tree_method': ['hist'],  # Tree method
    # 'gamma': (0, 5),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
    # 'min_child_weight': (0, 10),  # Minimum sum of instance weight (hessian) needed in a child
    # 'subsample': (0.5, 1.0),  # Subsample ratio of the training instance
    # 'sampling_method': ['uniform', 'gradient_based'],  # Sampling method (only for GPU version of hist tree method)
    # 'colsample_bytree': (0.5, 1.0),  # Subsample ratio of columns when constructing each tree
    # 'colsample_bylevel': (0.5, 1.0),  # Subsample ratio of columns for each level
    # 'colsample_bynode': (0.5, 1.0),  # Subsample ratio of columns for each split
    # 'reg_alpha': (0, 1),  # L1 regularization term on weights
    # 'reg_lambda': (1, 10),  # L2 regularization term on weights
    # 'scale_pos_weight': (0.1, 10),  # Balancing of positive and negative weights
    # 'base_score': (0.5, 0.5),  # The initial prediction score of all instances, global bias
    # 'multi_strategy': ['one_output_per_tree', 'multi_output_tree'],
    'random_state': [42],  # Random number seed for reproducibility
    # 'early_stopping_rounds': (10, 100),  # Number of rounds for early stopping
    'device': ['cuda'],#, 'cuda'],  # Device to use
    'train_set': ['standard', 'daily', 'weekly', 'monthly', 'with_regional/monthly', 'with_regional/weekly', 'with_regional/daily', 'with_regional/standard']  # List of options
}

param_grid_xgb_1 = {
    'n_estimators': (4, 50),  # Number of boosting rounds
    'max_depth': (3, 100),  # Maximum tree depth for base learners
    'eta': (0.05, 0.4),  # Boosting learning rate (xgb's "eta")
    'random_state': [42],  # Random number seed for reproducibility
    'device': ['cuda'],#, 'cuda'],  # Device to use
    'train_set': ['standard', 'daily', 'weekly', 'monthly', 'with_regional/monthly', 'with_regional/weekly', 'with_regional/daily', 'with_regional/standard']  # List of options
    }

cp.get_default_memory_pool().free_all_blocks()
# print_memory_usage()
# print_gpu_memory()
random_search(None, df_targets, param_grid_xgb_1, n_runs=5, target="res")

Run 1/5 - Params: {'n_estimators': 13, 'max_depth': 81, 'eta': 0.2853758974319447, 'random_state': 42, 'device': 'cuda', 'train_set': 'with_regional/weekly'}
Run 1/5 - Finished column: in.bedrooms_res (1/13) with F1 Score (Val): 0.2662
Run 1/5 - Finished column: in.cooling_setpoint_res (2/13) with F1 Score (Val): 0.0976
Run 1/5 - Finished column: in.heating_setpoint_res (3/13) with F1 Score (Val): 0.1785
Run 1/5 - Finished column: in.geometry_building_type_recs_res (4/13) with F1 Score (Val): 0.3703
Run 1/5 - Finished column: in.geometry_floor_area_res (5/13) with F1 Score (Val): 0.2588
Run 1/5 - Finished column: in.geometry_foundation_type_res (6/13) with F1 Score (Val): 0.3031
Run 1/5 - Finished column: in.geometry_wall_type_res (7/13) with F1 Score (Val): 0.3904
Run 1/5 - Finished column: in.heating_fuel_res (8/13) with F1 Score (Val): 0.2619
Run 1/5 - Finished column: in.income_res (9/13) with F1 Score (Val): 0.0550
Run 1/5 - Finished column: in.roof_material_res (10/13) with F1 Sc

XGBoostError: [21:07:24] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0015a694724fa8361-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:861: Exception in gpu_hist: [21:07:24] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0015a694724fa8361-1\xgboost\xgboost-ci-windows\src\common\device_helpers.cuh:400: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory
- Free memory: 0
- Requested memory: 13169602560



## old

In [None]:
from scipy import stats
import xgboost as xgb
import cupy as cp
import multiprocessing
import gc
import psutil
import torch
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.multioutput import MultiOutputClassifier


def random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type"):
    if target == "building_stock_type":
        results_file = base_path + r"\kai\model"+ "HP_results/multixgboost/multi_results.csv"
        y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})
    elif target == "com":
        results_file = base_path + r"\kai\model"+ "\HP_results/multixgboost/multi_com_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
        label_encoders = {}
        for col in y.columns:
            le = LabelEncoder()
            y[col] = le.fit_transform(y[col])
            label_encoders[col] = le
    elif target == "res":
        results_file = base_path + r"\kai\model"+ "\HP_results/multixgboost/multi_res_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
        label_encoders = {}
        for col in y.columns:
            le = LabelEncoder()
            y[col] = le.fit_transform(y[col])
            label_encoders[col] = le
    
    # Check if results file exists
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
    else:
        # Create an empty DataFrame if the results file doesn't exist
        results_df = pd.DataFrame(columns=[
            'train_set', 'f1_train', 'f1_val', 'n_estimators', 'criterion', 'max_depth', 
            'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap'
        ])

    for _ in tqdm(range(n_runs), file=sys.stdout, desc="Running Random Search"):
        # print(f"Run {_ + 1}/{n_runs}")
        # sample parameters
        params = sample_hyperparameters(param_grid)
        tqdm.write(f"Run {_ + 1}/{n_runs} - Params: {params}")
        train_set = params['train_set']
        if df_features_dict is None:
            df_features = pd.read_parquet(base_path + f'/preprocessed_data/{train_set}_data.parquet', engine='pyarrow')
            df_features.sort_index(inplace=True)
        else:
            df_features = df_features_dict[train_set].copy()
        
        if target == "building_stock_type":
            X = df_features
        elif target == "com":
            X = df_features[df_targets['building_stock_type'] == 'commercial']
        elif target == "res":
            X = df_features[df_targets['building_stock_type'] == 'residential']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        xgb_params = {key: value for key, value in params.items() if key != 'train_set'}
        if params["device"] == "cuda":
            X_train = cp.array(X_train)
            X_val = cp.array(X_val)
            y_train = cp.array(y_train)
            y_val = cp.array(y_val)
            
            for key, value in xgb_params.items():
                if isinstance(value, np.int32) or isinstance(value, np.int64):
                    xgb_params[key] = int(value)
                elif isinstance(value, np.float32) or isinstance(value, np.float64):
                    xgb_params[key] = float(value)
                elif isinstance(value, np.str_):
                    xgb_params[key] = str(value)
        clf = Pipeline([('preprocessor', ColumnTransformer([
                            ('scaler', StandardScaler(), slice(0, X_train.shape[1])),
                            ('encoder', OneHotEncoder(), [])  # No need to change the encoder for GPU
                        ])),
                        ('classifier', MultiOutputClassifier(
                            xgb.XGBClassifier(**xgb_params),))
                    ])

        if params["device"] == "cuda":
            clf.fit(cp.asnumpy(X_train), cp.asnumpy(y_train))  # XGBoost expects NumPy input
            print("Fitted")
            # d_train = xgb.DMatrix(X_train, label=y_train)
            # d_val = xgb.DMatrix(X_val, label=y_val)
            y_train_pred = clf.predict(cp.asnumpy(X_train))
            y_val_pred = clf.predict(cp.asnumpy(X_val))
        else:
            clf.fit(X_train, y_train)
            y_train_pred = clf.predict(X_train)
            y_val_pred = clf.predict(X_val)

        if target == "building_stock_type":
            # Evaluate on the training set
            f1_train = f1_score(y_train, y_train, average='macro')
            train_conf_matrix = confusion_matrix(y_train, y_train)
            train_TN, train_FP, train_FN, train_TP = train_conf_matrix.ravel()
            f1_val = f1_score(y_val, y_val_pred, average='macro')
            val_conf_matrix = confusion_matrix(y_val, y_val_pred)

            # Extract TP, TN, FP, FN from the validation set confusion matrix
            val_TN, val_FP, val_FN, val_TP = val_conf_matrix.ravel()

            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                'train_TP': train_TP,
                'train_TN': train_TN,
                'train_FP': train_FP,
                'train_FN': train_FN,
                'val_TP': val_TP,
                'val_TN': val_TN,
                'val_FP': val_FP,
                'val_FN': val_FN,
                **xgb_params
                }
        else:
            y_train_pred = pd.DataFrame(y_train_pred, columns=[f'class_{i}' for i in range(y_train.shape[1])])
            y_val_pred = pd.DataFrame(y_val_pred, columns=[f'class_{i}' for i in range(y_val.shape[1])])

            if params["device"] == "cuda":
                F1_l2_dict_train = {column: f1_score(cp.asnumpy(y_train[:, i]), y_train_pred.iloc[:, i], average='macro') for i, column in enumerate(y_train_pred.columns)}
                F1_l2_dict_val = {column: f1_score(cp.asnumpy(y_val[:, i]), y_val_pred.iloc[:, i], average='macro') for i, column in enumerate(y_val_pred.columns)}
            else:
                F1_l2_dict_train = {column: f1_score(y_train.iloc[:, i], y_train_pred.iloc[:, i], average='macro') for i, column in enumerate(y_train_pred.columns)}
                F1_l2_dict_val = {column: f1_score(y_val.iloc[:, i], y_val_pred.iloc[:, i], average='macro') for i, column in enumerate(y_val_pred.columns)}
            f1_train = sum(F1_l2_dict_train.values()) / len(F1_l2_dict_train.values())
            f1_val = sum(F1_l2_dict_val.values()) / len(F1_l2_dict_val.values())
            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                **xgb_params
            }
            if params["device"] == "cuda":
                cp.get_default_memory_pool().free_all_blocks()
            del clf
            del X_train, X_val, y_train, y_val
            del df_features, X, y_train_pred, y_val_pred, F1_l2_dict_train, F1_l2_dict_val
            gc.collect()
            print_memory_usage()
        # Update tqdm description with the current F1 score
        tqdm.write(f"Run {_ + 1}/{n_runs} - F1 Score (Val): {f1_val:.4f}")
        # tqdm.set_description(f"Run {_ + 1}/{n_runs} - F1 Val: {f1_val:.4f}")
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
        results_df.to_csv(results_file, index=False)

param_grid_xgb = {
    'n_estimators': (4, 30),  # Number of boosting rounds
    'max_depth': (3, 20),  # Maximum tree depth for base learners
    # 'max_leaves': (1, 100),  # Maximum number of leaves; 0 indicates no limit
    # 'max_bin': (256, 512),  # Maximum number of bins per feature for histogram-based algorithm
    # 'grow_policy': ['depthwise', 'lossguide'],  # Tree growing policy
    # 'learning_rate': (0.01, 0.3),  # Boosting learning rate (eta)
    # 'verbosity': [0, 1, 2, 3],  # Degree of verbosity (0: silent, 1: warning, 2: info, 3: debug)
    # 'objective': ['binary:logistic', 'multi:softprob', 'reg:squarederror'],  # Learning objective
    # 'booster': ['gbtree', 'gblinear', 'dart'],  # Booster to use
    'tree_method': ['hist'],  # Tree method
    # 'gamma': (0, 5),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
    # 'min_child_weight': (0, 10),  # Minimum sum of instance weight (hessian) needed in a child
    # 'subsample': (0.5, 1.0),  # Subsample ratio of the training instance
    # 'sampling_method': ['uniform', 'gradient_based'],  # Sampling method (only for GPU version of hist tree method)
    # 'colsample_bytree': (0.5, 1.0),  # Subsample ratio of columns when constructing each tree
    # 'colsample_bylevel': (0.5, 1.0),  # Subsample ratio of columns for each level
    # 'colsample_bynode': (0.5, 1.0),  # Subsample ratio of columns for each split
    # 'reg_alpha': (0, 1),  # L1 regularization term on weights
    # 'reg_lambda': (1, 10),  # L2 regularization term on weights
    # 'scale_pos_weight': (0.1, 10),  # Balancing of positive and negative weights
    # 'base_score': (0.5, 0.5),  # The initial prediction score of all instances, global bias
    'random_state': [42],  # Random number seed for reproducibility
    # 'early_stopping_rounds': (10, 100),  # Number of rounds for early stopping
    'device': ['cuda'],#, 'cuda'],  # Device to use
    'train_set': ['standard', 'daily', 'weekly', 'monthly', 'with_regional/monthly', 'with_regional/weekly', 'with_regional/daily', 'with_regional/standard']  # List of options
}

cp.get_default_memory_pool().free_all_blocks()
print_memory_usage()
print_gpu_memory()
random_search(None, df_targets, param_grid_xgb, n_runs=10, target="res")

In [8]:
import xgboost as xgb
import cupy as cp
import multiprocessing
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.multioutput import MultiOutputClassifier
df_features_full = df_features_dict['with_regional/daily']
s="com"
if s=="res":
    X = df_features_full[df_targets['building_stock_type'] == 'residential']
    y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
if s=="com":
    X = df_features_full[df_targets['building_stock_type'] == 'commercial']
    y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')

label_encoders = {}
for col in y.columns:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col])
    label_encoders[col] = le

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# clf = Pipeline([('preprocessor', ColumnTransformer([
#         ('scaler', StandardScaler(), X_train.columns),
#         ('encoder', OneHotEncoder(), [])
#     ])),
#     ('classifier', MultiOutputClassifier(xgb.XGBClassifier(n_estimators=50, device="cuda", tree_method="hist",# verbose="1",
#                                                            max_depth=12,  # Reduce tree depth<
#                                                            )))
#                                                            ])

# Convert training and validation sets to CuPy arrays for GPU
X_train = cp.array(X_train)
X_val = cp.array(X_val)
y_train = cp.array(y_train)
y_val = cp.array(y_val)

# Set up pipeline
clf = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('scaler', StandardScaler(), slice(0, X_train.shape[1])),
        ('encoder', OneHotEncoder(), [])  # No need to change the encoder for GPU
    ])),
    ('classifier', MultiOutputClassifier(
        xgb.XGBClassifier(n_estimators=20,
                          tree_method='hist',
                          max_depth=8,
                          verbosity=1,
                          random_state=42),))
])
# Train and evaluate the model
# clf.fit(X_train, y_train)

# y_train_pred = clf.predict(X_train)
# y_val_pred = clf.predict(X_val)

clf.fit(cp.asnumpy(X_train), cp.asnumpy(y_train))  # XGBoost expects NumPy input

# Predictions
y_train_pred = clf.predict(cp.asnumpy(X_train))
y_val_pred = clf.predict(cp.asnumpy(X_val))

# y_train_pred = pd.DataFrame(y_train_pred, columns=y.columns)
# y_val_pred = pd.DataFrame(y_val_pred, columns=y.columns)

# # Evaluate on the training set
# F1_l2_dict = {column: 0 for column in y.columns}
# for col in y.columns:
#     F1_l2_dict[col] = f1_score(y_train[col], y_train_pred[col], average="macro")
# f1_train = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())

# # Evaluate on the validation set
# F1_l2_dict = {column: 0 for column in y.columns}
# for col in y.columns:
#     F1_l2_dict[col] = f1_score(y_val[col], y_val_pred[col], average="macro")
# f1_val = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())
# print(f"Training F1: {f1_train:.4f}")
# print(f"Validation F1: {f1_val:.4f}")

# Convert predictions back to DataFrame for evaluation
y_train_pred = pd.DataFrame(y_train_pred, columns=[f'class_{i}' for i in range(y_train.shape[1])])
y_val_pred = pd.DataFrame(y_val_pred, columns=[f'class_{i}' for i in range(y_val.shape[1])])

# Evaluate on the training set
F1_l2_dict_train = {column: f1_score(cp.asnumpy(y_train[:, i]), y_train_pred.iloc[:, i], average='macro') for i, column in enumerate(y_train_pred.columns)}
f1_train = sum(F1_l2_dict_train.values()) / len(F1_l2_dict_train.values())

# Evaluate on the validation set
F1_l2_dict_val = {column: f1_score(cp.asnumpy(y_val[:, i]), y_val_pred.iloc[:, i], average='macro') for i, column in enumerate(y_val_pred.columns)}
f1_val = sum(F1_l2_dict_val.values()) / len(F1_l2_dict_val.values())

print(f"Training F1: {f1_train:.4f}")
print(f"Validation F1: {f1_val:.4f}")


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.



Training F1: 0.9976
Validation F1: 0.2391


In [None]:
from scipy import stats

def sample_hyperparameters(param_grid):
    """
    Sample hyperparameters from the given grid using scipy.stats.

    Parameters:
    ----------
    param_grid : dict
        A dictionary where keys are hyperparameter names and values are lists of options or ranges.

    Returns:
    -------
    dict
        A dictionary with sampled hyperparameters.
    """
    sampled_params = {}
    
    for param, values in param_grid.items():
        if isinstance(values, list):
            # Randomly choose from list of options
            sampled_params[param] = np.random.choice(values)
        elif isinstance(values, tuple) and len(values) == 2:
            min_val, max_val = values
            if isinstance(min_val, int) and isinstance(max_val, int):
                # Sample integer values using scipy.stats.randint
                sampled_params[param] = stats.randint.rvs(min_val, max_val + 1)
            elif isinstance(min_val, float) and isinstance(max_val, float):
                # Sample float values using scipy.stats.uniform
                sampled_params[param] = stats.uniform.rvs(min_val, max_val - min_val)
        elif isinstance(values, str) and values == 'choice':
            # Sample from a list of options if 'choice' is specified
            sampled_params[param] = np.random.choice(param_grid[param])
        else:
            raise ValueError(f"Unsupported parameter type for {param}: {values}")
    
    # Adjust parameters for RandomForestClassifier
    if not sampled_params.get('bootstrap', True):
        sampled_params['max_samples'] = None  # Reset max_samples if bootstrap is False
        # sampled_params['oob_score'] = False
    # Ensure max_features is correctly set
    max_features = sampled_params.get('max_features')
    if isinstance(max_features, str) and max_features.startswith('0'):
        sampled_params['max_features'] = float(max_features)
        
    
    return sampled_params


def random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type"):
    if target == "building_stock_type":
        results_file = "results.csv"
        y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})
    elif target == "com":
        results_file = "com_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
    elif target == "res":
        results_file = "res_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
    
    # Check if results file exists
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
    else:
        # Create an empty DataFrame if the results file doesn't exist
        results_df = pd.DataFrame(columns=[
            'train_set', 'f1_train', 'f1_val', 'n_estimators', 'criterion', 'max_depth', 
            'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap'
        ])

    for _ in tqdm(range(n_runs), file=sys.stdout, desc="Running Random Search"):
        # print(f"Run {_ + 1}/{n_runs}")
        # sample parameters
        params = sample_hyperparameters(param_grid)
        tqdm.write(f"Run {_ + 1}/{n_runs} - Params: {params}")
        train_set = params['train_set']
        df_features = df_features_dict[train_set].copy()
        if target == "building_stock_type":
            X = df_features
        elif target == "com":
            X = df_features[df_targets['building_stock_type'] == 'commercial']
        elif target == "res":
            X = df_features[df_targets['building_stock_type'] == 'residential']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        
        rf_params = {key: value for key, value in params.items() if key != 'train_set'}
        clf = Pipeline([('preprocessor', ColumnTransformer([
                ('scaler', StandardScaler(), df_features.columns),
                ('encoder', OneHotEncoder(), [])
            ])),
            ('classifier', xgb.XGBClassifier(n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"))])

        # Train and evaluate the model
        clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)

        if target == "building_stock_type":
            # Evaluate on the training set
            f1_train = f1_score(y_train, y_train, average='macro')
            train_conf_matrix = confusion_matrix(y_train, y_train)
            train_TN, train_FP, train_FN, train_TP = train_conf_matrix.ravel()
            f1_val = f1_score(y_val, y_val_pred, average='macro')
            val_conf_matrix = confusion_matrix(y_val, y_val_pred)

            # Extract TP, TN, FP, FN from the validation set confusion matrix
            val_TN, val_FP, val_FN, val_TP = val_conf_matrix.ravel()

            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                'train_TP': train_TP,
                'train_TN': train_TN,
                'train_FP': train_FP,
                'train_FN': train_FN,
                'val_TP': val_TP,
                'val_TN': val_TN,
                'val_FP': val_FP,
                'val_FN': val_FN,
                **rf_params
                }
        else:
            y_train_pred = pd.DataFrame(y_train_pred, columns=y.columns)
            y_val_pred = pd.DataFrame(y_val_pred, columns=y.columns)

            # Evaluate on the training set
            F1_l2_dict = {column: 0 for column in y.columns}
            for col in y.columns:
                F1_l2_dict[col] = f1_score(y_train[col], y_train_pred[col], average="macro")
            f1_train = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())

            # Evaluate on the validation set
            F1_l2_dict = {column: 0 for column in y.columns}
            for col in y.columns:
                F1_l2_dict[col] = f1_score(y_val[col], y_val_pred[col], average="macro")
            f1_val = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())
        
            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                **rf_params
            }
        # Update tqdm description with the current F1 score
        tqdm.write(f"Run {_ + 1}/{n_runs} - F1 Score (Val): {f1_val:.4f}")
        # tqdm.set_description(f"Run {_ + 1}/{n_runs} - F1 Val: {f1_val:.4f}")
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
        results_df.to_csv(results_file, index=False)

# Example usage
param_grid = {
    'n_estimators': (10, 1000),  # Integer range
    'criterion': ['gini', 'entropy', 'log_loss'],  # List of options
    'max_depth': (1, 100),  # Integer range
    'min_samples_split': (2, 10),  # Integer range
    'min_samples_leaf': (1, 10),  # Integer range
    'max_features': ['sqrt', 'log2'],  # List of options
    'bootstrap': [True, False],  # List of options
    'train_set': ['full', 'daily', 'weekly', 'monthly', 'with_regional/monthly', 'with_regional/weekly', 'with_regional/daily', 'with_regional/full']  # List of options
}

# random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type")

In [1]:
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
import cupy as cp
X = cp.array(X)
y = cp.array(y)
y -= y.min()
num_round = 3000

# Leave most parameters as default
clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round)
# Train model
start = time.time()
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
gpu_res = clf.evals_result()

ModuleNotFoundError: No module named 'xgboost'

In [None]:
from tqdm import tqdm
# Import the data sets
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
base_path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification"
data_path = r"\data\building-instinct-starter-notebook\Starter notebook"
preprocessing_path = r"\kai\preprocessing"
sys.path.append(base_path+data_path)
sys.path.append(base_path+"\kai")
sys.path.append(base_path+preprocessing_path)
from preprocessing.preprocessing import Preprocessor

df_features_dict = {}

for s in ["monthly", "weekly", "daily"]:
    df_features = pd.read_parquet(base_path + f'/preprocessed_data/{s}_data.parquet', engine='pyarrow')
    df_features.sort_index(inplace=True)
    df_features_dict[s] = df_features

# Full Dataset
df_features_full = pd.read_parquet(base_path + '/preprocessed_data/standard_data.parquet', engine='pyarrow')
df_features_full.sort_index(inplace=True)
df_features_dict['full'] = df_features_full

# Labels
load_filepath_labels = os.path.join(base_path + data_path,'building-instinct-train-label', 'train_label.parquet')#path to the train label file
df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})

In [2]:
from scipy import stats

def sample_hyperparameters(param_grid):
    """
    Sample hyperparameters from the given grid using scipy.stats.

    Parameters:
    ----------
    param_grid : dict
        A dictionary where keys are hyperparameter names and values are lists of options or ranges.

    Returns:
    -------
    dict
        A dictionary with sampled hyperparameters.
    """
    sampled_params = {}
    
    for param, values in param_grid.items():
        if isinstance(values, list):
            # Randomly choose from list of options
            sampled_params[param] = np.random.choice(values)
        elif isinstance(values, tuple) and len(values) == 2:
            min_val, max_val = values
            if isinstance(min_val, int) and isinstance(max_val, int):
                # Sample integer values using scipy.stats.randint
                sampled_params[param] = stats.randint.rvs(min_val, max_val + 1)
            elif isinstance(min_val, float) and isinstance(max_val, float):
                # Sample float values using scipy.stats.uniform
                sampled_params[param] = stats.uniform.rvs(min_val, max_val - min_val)
        elif isinstance(values, str) and values == 'choice':
            # Sample from a list of options if 'choice' is specified
            sampled_params[param] = np.random.choice(param_grid[param])
        else:
            raise ValueError(f"Unsupported parameter type for {param}: {values}")
    
    # Adjust parameters for RandomForestClassifier
    if not sampled_params.get('bootstrap', True):
        sampled_params['max_samples'] = None  # Reset max_samples if bootstrap is False
        # sampled_params['oob_score'] = False
    # Ensure max_features is correctly set
    max_features = sampled_params.get('max_features')
    if isinstance(max_features, str) and max_features.startswith('0'):
        sampled_params['max_features'] = float(max_features)
        
    
    return sampled_params


def random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type"):
    if target == "building_stock_type":
        results_file = "results.csv"
        y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})
    elif target == "com":
        results_file = "com_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
    elif target == "res":
        results_file = "res_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
    
    # Check if results file exists
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
    else:
        # Create an empty DataFrame if the results file doesn't exist
        results_df = pd.DataFrame(columns=[
            'train_set', 'f1_train', 'f1_val', 'n_estimators', 'criterion', 'max_depth', 
            'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap'
        ])

    for _ in tqdm(range(n_runs), file=sys.stdout, desc="Running Random Search"):
        # print(f"Run {_ + 1}/{n_runs}")
        # sample parameters
        params = sample_hyperparameters(param_grid)
        tqdm.write(f"Run {_ + 1}/{n_runs} - Params: {params}")
        train_set = params['train_set']
        df_features = df_features_dict[train_set].copy()
        if target == "building_stock_type":
            X = df_features
        elif target == "com":
            X = df_features[df_targets['building_stock_type'] == 'commercial']
        elif target == "res":
            X = df_features[df_targets['building_stock_type'] == 'residential']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        
        rf_params = {key: value for key, value in params.items() if key != 'train_set'}
        clf = Pipeline([('preprocessor', ColumnTransformer([
                ('scaler', StandardScaler(), df_features.columns),
                ('encoder', OneHotEncoder(), [])
            ])),
            ('classifier', RandomForestClassifier(**rf_params, random_state=42))
        ])

        # Train and evaluate the model
        clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)

        if target == "building_stock_type":
            # Evaluate on the training set
            f1_train = f1_score(y_train, y_train, average='macro')
            train_conf_matrix = confusion_matrix(y_train, y_train)
            train_TN, train_FP, train_FN, train_TP = train_conf_matrix.ravel()
            f1_val = f1_score(y_val, y_val_pred, average='macro')
            val_conf_matrix = confusion_matrix(y_val, y_val_pred)

            # Extract TP, TN, FP, FN from the validation set confusion matrix
            val_TN, val_FP, val_FN, val_TP = val_conf_matrix.ravel()

            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                'train_TP': train_TP,
                'train_TN': train_TN,
                'train_FP': train_FP,
                'train_FN': train_FN,
                'val_TP': val_TP,
                'val_TN': val_TN,
                'val_FP': val_FP,
                'val_FN': val_FN,
                **rf_params
                }
        else:
            y_train_pred = pd.DataFrame(y_train_pred, columns=y.columns)
            y_val_pred = pd.DataFrame(y_val_pred, columns=y.columns)

            # Evaluate on the training set
            F1_l2_dict = {column: 0 for column in y.columns}
            for col in y.columns:
                F1_l2_dict[col] = f1_score(y_train[col], y_train_pred[col], average="macro")
            f1_train = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())

            # Evaluate on the validation set
            F1_l2_dict = {column: 0 for column in y.columns}
            for col in y.columns:
                F1_l2_dict[col] = f1_score(y_val[col], y_val_pred[col], average="macro")
            f1_val = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())
        
            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                **rf_params
            }
        # Update tqdm description with the current F1 score
        tqdm.write(f"Run {_ + 1}/{n_runs} - F1 Score (Val): {f1_val:.4f}")
        # tqdm.set_description(f"Run {_ + 1}/{n_runs} - F1 Val: {f1_val:.4f}")
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
        results_df.to_csv(results_file, index=False)

# Example usage
param_grid = {
    'n_estimators': (10, 1000),  # Integer range
    'criterion': ['gini', 'entropy', 'log_loss'],  # List of options
    'max_depth': (1, 100),  # Integer range
    'min_samples_split': (2, 10),  # Integer range
    'min_samples_leaf': (1, 10),  # Integer range
    'max_features': ['sqrt', 'log2'],  # List of options
    'bootstrap': [True, False],  # List of options
    'train_set': ['full', 'daily', 'weekly', 'monthly']  # List of options
}

# random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type")

## Commercial and residential columns

In [3]:
def calculate_hierarchical_f1_score(df_targets, df_pred, alpha=0.4, average='macro', F1_list=False):
    """
    Calculate the hierarchical F1-score for a multi-level classification problem.

    This function computes the F1-score at two hierarchical levels:
    1. The 'building_stock_type' level, which is the first level of hierarchy.
    2. The second level, which is conditional on the 'building_stock_type' being either 'commercial' or 'residential'.

    The final F1-score is a weighted average of the first level and second level F1-scores.

    Parameters:
    ----------
    df_targets : pd.DataFrame
        The dataframe containing the true target values. It must include a column 'building_stock_type' and other
        columns ending with '_com' or '_res' representing the second level of classification.

    df_pred : pd.DataFrame
        The dataframe containing the predicted values. It must be structured similarly to `df_targets`.

    alpha : float, optional, default=0.3
        The weight given to the first level F1-score in the final score calculation. The weight for the second level
        F1-score will be (1 - alpha).

    average : str, optional, default='macro'
        The averaging method for calculating the F1-score. It is passed directly to the `f1_score` function from sklearn.

    F1_list : bool, optional, default=False
        If True, the function returns a dictionary of F1-scores for all individual columns along with the overall F1-score.

    Returns:
    -------
    float or tuple
        If `F1_list` is False, returns a single float representing the overall hierarchical F1-score.
        If `F1_list` is True, returns a tuple where the first element is the overall hierarchical F1-score and the second
        element is a dictionary containing the F1-scores for all individual columns.

    """

    def calculate_f1_l2(df_targets, df_pred, average):
        """
        Calculate the F1-score for the second level of hierarchy.

        Parameters:
        ----------
        df_targets : pd.DataFrame
            The dataframe containing the true target values for the second level of hierarchy.
        df_pred : pd.DataFrame
            The dataframe containing the predicted values for the second level of hierarchy.
        average : str
            The averaging method for calculating the F1-score.

        Returns:
        -------
        dict
            A dictionary where keys are column names and values are the corresponding F1-scores.
        """
        F1_l2_dict = {column: 0 for column in df_targets.columns}

        # Find the intersection of indices
        common_indices = df_targets.index.intersection(df_pred.index)

        # Check if the intersection is empty
        if common_indices.empty:
            return F1_l2_dict
        else:
            # Select only the rows with common indices
            df_targets_common = df_targets.loc[common_indices]
            df_pred_common = df_pred.loc[common_indices]

            # Calculate the F1-score for each column based on the common rows
            for column in df_targets.columns:
                F1_l2_dict[column] = f1_score(df_targets_common[column], df_pred_common[column], average=average)

        return F1_l2_dict

    # Sort both dataframes based on index
    df_targets = df_targets.sort_index()
    df_pred = df_pred.sort_index()

    # Calculate F1 score for the first level of hierarchy
    F1_l1 = f1_score(df_targets['building_stock_type'], df_pred['building_stock_type'], average=average)
    F1_dict = {'building_stock_type': F1_l1}

    # Calculate F1 score for the second level of hierarchy (commercial buildings)
    df_com_targets = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
    df_com_pred = df_pred[df_pred['building_stock_type'] == 'commercial'].filter(like='_com')
    F1_l2_dict_com = calculate_f1_l2(df_com_targets, df_com_pred, average)
    F1_l2_com = sum(F1_l2_dict_com.values()) / len(F1_l2_dict_com.values())

    F1_l2_dict = {}
    F1_l2_dict.update(F1_l2_dict_com)

    # Calculate F1 score for the second level of hierarchy (residential buildings)
    df_res_targets = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
    df_res_pred = df_pred[df_pred['building_stock_type'] == 'residential'].filter(like='_res')
    F1_l2_dict_res = calculate_f1_l2(df_res_targets, df_res_pred, average)
    F1_l2_res = sum(F1_l2_dict_res.values()) / len(F1_l2_dict_res.values())

    F1_l2_dict.update(F1_l2_dict_res)
    F1_l2_dict_sorted = sorted(F1_l2_dict.items(), key=lambda x: x[1], reverse=True)
    F1_dict.update(F1_l2_dict_sorted)

    # Calculate F1 score for the second level of hierarchy
    F1_l2 = (F1_l2_com + F1_l2_res) / 2

    # Calculate overall F1 score
    F1 = alpha * F1_l1 + (1 - alpha) * F1_l2

    if F1_list:
        return F1, F1_dict

    return F1

# Example usage
param_grid = {
    'n_estimators': (100, 500),  # Integer range
    'criterion': ['gini', 'entropy', 'log_loss'],  # List of options
    'max_depth': (5, 40),  # Integer range
    'min_samples_split': (2, 10),  # Integer range
    'min_samples_leaf': (1, 5),  # Integer range
    'max_features': ['sqrt', 'log2'],#, 0.5, 0.8],  # List of options None,
    'bootstrap': [True, False],  # List of options
    'train_set': ['full', 'daily', 'weekly', 'monthly'],  # List of options
    'n_jobs': [-1],
    'random_state': [1, 420],
    # 'min_weight_fraction_leaf': (0.0, 0.5),  # Float range
    # 'max_leaf_nodes': (10, 1000),  # Integer range
    # 'min_impurity_decrease': (0.0, 0.1),  # Float range
    # # 'oob_score': [True, False],  # List of options
    # 'class_weight': [None, 'balanced', 'balanced_subsample'],  # Class weights
    # 'ccp_alpha': (0.0, 0.1),  # Complexity parameter for pruning
    # 'max_samples': [None, 0.5, 0.8, 1.0],  # Fraction or integer number of samples
}

# Call the function
random_search(df_features_dict, df_targets, param_grid, n_runs=30, target="res")

Run 1/30 - Params: {'n_estimators': 358, 'criterion': 'log_loss', 'max_depth': 35, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True, 'train_set': 'weekly', 'n_jobs': -1}
Run 1/30 - F1 Score (Val): 0.2786                            
Run 2/30 - Params: {'n_estimators': 443, 'criterion': 'log_loss', 'max_depth': 32, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': True, 'train_set': 'weekly', 'n_jobs': -1}
Run 2/30 - F1 Score (Val): 0.2821                                    
Run 3/30 - Params: {'n_estimators': 428, 'criterion': 'log_loss', 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': False, 'train_set': 'monthly', 'n_jobs': -1, 'max_samples': None}
Run 3/30 - F1 Score (Val): 0.2595                                       
Run 4/30 - Params: {'n_estimators': 448, 'criterion': 'entropy', 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features'

TODO:
- xgboost
- MulticolumnClassifier
- check submission quality

In [None]:
from sklearn.multioutput import MultiOutputClassifier
def multi_random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="building_stock_type"):
    if target == "building_stock_type":
        results_file = "multi_results.csv"
        y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1})
    elif target == "com":
        results_file = "multi_com_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
    elif target == "res":
        results_file = "multi_res_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
    
    # Check if results file exists
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
    else:
        # Create an empty DataFrame if the results file doesn't exist
        results_df = pd.DataFrame(columns=[
            'train_set', 'f1_train', 'f1_val', 'n_estimators', 'criterion', 'max_depth', 
            'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap'
        ])

    for _ in tqdm(range(n_runs), file=sys.stdout, desc="Running Random Search"):
        # print(f"Run {_ + 1}/{n_runs}")
        # sample parameters
        params = sample_hyperparameters(param_grid)
        train_set = params['train_set']
        df_features = df_features_dict[train_set].copy()
        if target == "building_stock_type":
            X = df_features
        elif target == "com":
            X = df_features[df_targets['building_stock_type'] == 'commercial']
        elif target == "res":
            X = df_features[df_targets['building_stock_type'] == 'residential']

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        
        rf_params = {key: value for key, value in params.items() if key != 'train_set'}
        clf = Pipeline([('preprocessor', ColumnTransformer([
                ('scaler', StandardScaler(), df_features.columns),
                ('encoder', OneHotEncoder(), [])
            ])),
            ('classifier', MultiOutputClassifier(RandomForestClassifier(**rf_params, random_state=42), n_jobs=-1))
        ])

        # Train and evaluate the model
        clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)

        if target == "building_stock_type":
            # Evaluate on the training set
            f1_train = f1_score(y_train, y_train, average='macro')
            train_conf_matrix = confusion_matrix(y_train, y_train)
            train_TN, train_FP, train_FN, train_TP = train_conf_matrix.ravel()
            f1_val = f1_score(y_val, y_val_pred, average='macro')
            val_conf_matrix = confusion_matrix(y_val, y_val_pred)

            # Extract TP, TN, FP, FN from the validation set confusion matrix
            val_TN, val_FP, val_FN, val_TP = val_conf_matrix.ravel()

            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                'train_TP': train_TP,
                'train_TN': train_TN,
                'train_FP': train_FP,
                'train_FN': train_FN,
                'val_TP': val_TP,
                'val_TN': val_TN,
                'val_FP': val_FP,
                'val_FN': val_FN,
                **rf_params
                }
        else:
            y_train_pred = pd.DataFrame(y_train_pred, columns=y.columns)
            y_val_pred = pd.DataFrame(y_val_pred, columns=y.columns)

            # Evaluate on the training set
            F1_l2_dict = {column: 0 for column in y.columns}
            for col in y.columns:
                F1_l2_dict[col] = f1_score(y_train[col], y_train_pred[col], average="macro")
            f1_train = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())

            # Evaluate on the validation set
            F1_l2_dict = {column: 0 for column in y.columns}
            for col in y.columns:
                F1_l2_dict[col] = f1_score(y_val[col], y_val_pred[col], average="macro")
            f1_val = sum(F1_l2_dict.values()) / len(F1_l2_dict.values())
        
            # Append the results to the DataFrame
            new_row = {
                'train_set': train_set,
                'f1_train': f1_train,
                'f1_val': f1_val,
                **rf_params
            }
        # Update tqdm description with the current F1 score
        tqdm.write(f"Run {_ + 1}/{n_runs} - F1 Score (Val): {f1_val:.4f}")
        # tqdm.set_description(f"Run {_ + 1}/{n_runs} - F1 Val: {f1_val:.4f}")
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
    results_df.to_csv(results_file, index=False)

# Example usage
param_grid = {
    'n_estimators': (100, 500),  # Integer range
    'criterion': ['gini', 'entropy', 'log_loss'],  # List of options
    'max_depth': (5, 40),  # Integer range
    'min_samples_split': (2, 10),  # Integer range
    'min_samples_leaf': (1, 10),  # Integer range
    'max_features': ['sqrt', 'log2', None, 0.5, 0.8],  # List of options
    'bootstrap': [True, False],  # List of options
    'train_set': ['full', 'daily', 'weekly', 'monthly'],  # List of options
    'n_jobs': [-1],
    'min_weight_fraction_leaf': (0.0, 0.5),  # Float range
    'max_leaf_nodes': (10, 1000),  # Integer range
    'min_impurity_decrease': (0.0, 0.1),  # Float range
    # 'oob_score': [True, False],  # List of options
    'class_weight': [None, 'balanced', 'balanced_subsample'],  # Class weights
    'ccp_alpha': (0.0, 0.1),  # Complexity parameter for pruning
    'max_samples': [None, 0.5, 0.8, 1.0],  # Fraction or integer number of samples
}

multi_random_search(df_features_dict, df_targets, param_grid, n_runs=5, target="com")

## Results analysis

In [None]:
df1 = pd.read_csv("results.csv")
df2 = pd.read_csv("com_results.csv")
df3 = pd.read_csv("res_results.csv")
display(df1)
display(df2)
display(df3)