In [1]:
from tqdm import tqdm
# Import the data sets
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
# Project root plus the sub-folders (relative to it) that are added to the
# import path below. All of them are Windows-style paths, hence raw strings.
base_path = r"C:\Users\KAI\Coding\ThinkOnward_challenge\thinkOnward_TSClassification"
data_path = r"\data\building-instinct-starter-notebook\Starter notebook"
preprocessing_path = r"\kai\preprocessing"
# Make the project folders importable so the `preprocessing.*` imports resolve.
sys.path.append(base_path+data_path)
# Raw string: in a normal literal "\k" is an invalid escape sequence, which
# newer Python versions warn about. The resulting path is unchanged.
sys.path.append(base_path+r"\kai")
sys.path.append(base_path+preprocessing_path)
from preprocessing.preprocessing import Preprocessor
from scipy import stats
import xgboost as xgb
import cupy as cp
import gc
from preprocessing.utils import print_memory_usage, print_gpu_memory, free_gpu_memory
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Switch to True to eagerly read every preprocessed feature table into
# df_features_dict; the labels below are always loaded.
pre_load = False
if pre_load:
    df_features_dict = {}

    # Aggregated tables, keyed by their path stem under preprocessed_data/.
    for s in ("monthly", "weekly", "daily", "with_regional/monthly", "with_regional/weekly", "with_regional/daily"):
        df_features = pd.read_parquet(f'{base_path}/preprocessed_data/{s}_data.parquet', engine='pyarrow').sort_index()
        df_features_dict[s] = df_features

    # "standard" tables are stored under the 'full' keys
    # (plain first, then the with_regional variant).
    for key, stem in (('full', 'standard'), ('with_regional/full', 'with_regional/standard')):
        df_features_full = pd.read_parquet(f'{base_path}/preprocessed_data/{stem}_data.parquet', engine='pyarrow').sort_index()
        df_features_dict[key] = df_features_full

# Labels
load_filepath_labels = os.path.join(
    base_path + data_path,
    'building-instinct-train-label',
    'train_label.parquet',
)  # path to the train label file
df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
# Binary target for the first-stage classifier: residential -> 0, commercial -> 1.
stock_type_codes = {"residential": 0, "commercial": 1}
y = df_targets["building_stock_type"].map(stock_type_codes)

In [6]:
from preprocessing.run import create_submission
from sklearn.multioutput import MultiOutputClassifier

def _fit_predict_per_column(X_train, y_train, X_pred, xgb_params):
    """Train one XGBoost pipeline per label column and predict ``X_pred``.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features, row-aligned with ``y_train``.
    y_train : pd.DataFrame
        One column per attribute to predict (raw, un-encoded labels).
    X_pred : pd.DataFrame
        Features of the buildings to predict.
    xgb_params : dict
        Keyword arguments forwarded to ``xgb.XGBClassifier``.

    Returns
    -------
    pd.DataFrame
        Indexed like ``X_pred``, with the columns of ``y_train`` holding the
        predicted labels decoded back to their original values.
    """
    if len(X_pred) == 0:
        # No building of this stock type was predicted: nothing to fit for.
        return pd.DataFrame(index=X_pred.index, columns=y_train.columns)

    predictions = {}
    for col in y_train.columns:
        print(f"Training model for {col}")
        # The classifier is trained on integer class ids; the encoder is kept
        # so the predictions can be mapped back to the original labels.
        encoder = LabelEncoder()
        y_col = encoder.fit_transform(y_train[col])
        bst = Pipeline([('preprocessor', ColumnTransformer([
                            ('scaler', StandardScaler(), slice(0, X_train.shape[1])),
                            ('encoder', OneHotEncoder(), [])  # empty column list: placeholder only
                        ])),
                        ('classifier', xgb.XGBClassifier(**xgb_params))
                    ])
        bst.fit(X_train, y_col)
        predictions[col] = encoder.inverse_transform(bst.predict(X_pred))
    return pd.DataFrame(predictions, index=X_pred.index, columns=y_train.columns)


def run_model(param_grid, param_grid_com, param_grid_res, submission_path="submission.parquet", read_from_file=(False, False, False)):
    '''
    Run the three-stage model and create the submission file.

    Stage 1 predicts ``building_stock_type`` with a random forest; stages 2
    and 3 predict the ``*_com`` / ``*_res`` attribute columns with one XGBoost
    classifier per column, for the test buildings predicted as commercial /
    residential respectively. Each stage writes its predictions to
    ``<base_path>/kai/model/XGBoost`` so a later call can reuse them.

    Parameters
    ----------
    param_grid : dict
        Keyword arguments for ``RandomForestClassifier`` (stage 1).
    param_grid_com, param_grid_res : dict
        Keyword arguments for ``xgb.XGBClassifier`` (stages 2 and 3).
    submission_path : str
        Path of the submission file, relative to ``base_path``.
    read_from_file : sequence of three bools
        For stage 1, 2, 3 respectively: if True, load the cached predictions
        instead of re-fitting the stage.

    Returns
    -------
    The value returned by ``create_submission``.
    '''
    model_dir = base_path + '/kai/model/XGBoost'

    # 0. Load the data
    X = pd.read_parquet(base_path + '/preprocessed_data/with_regional/standard_data.parquet', engine='pyarrow')
    X.sort_index(inplace=True)
    load_filepath_labels = os.path.join(base_path + data_path, 'building-instinct-train-label', 'train_label.parquet')  # path to the train label file
    df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
    # The estimators pair features and labels by position, so put the labels
    # in the same (sorted) row order as X. This is a no-op if already aligned.
    df_targets = df_targets.loc[X.index]
    stock_type = df_targets["building_stock_type"]
    y = stock_type.map({"residential": 0, "commercial": 1})
    X_test = pd.read_parquet(base_path + '/preprocessed_data/test/with_regional_data_test.parquet', engine='pyarrow')
    X_test.sort_index(inplace=True)

    # 1. prediction of building_type
    if read_from_file[0]:
        y_pred = pd.read_parquet(model_dir + '/y_pred.parquet', engine='pyarrow')
    else:
        clf = Pipeline([('preprocessor', ColumnTransformer([
                        ('scaler', StandardScaler(), X.columns),
                        ('encoder', OneHotEncoder(), [])
                    ])),
                    ('classifier', RandomForestClassifier(**param_grid, random_state=42))
                ])
        clf.fit(X, y)
        y_pred = pd.DataFrame(clf.predict(X_test), index=X_test.index, columns=["building_stock_type"])
        y_pred.to_parquet(model_dir + '/y_pred.parquet', engine='pyarrow')

    # Row masks must be 1-D Series: indexing X_test with a boolean *DataFrame*
    # (y_pred == 1) masks cells instead of selecting rows and yields an
    # all-NaN frame of full length.
    predicted_type = y_pred["building_stock_type"]

    # 2. making the com prediction
    if read_from_file[1]:
        y_com_pred = pd.read_parquet(model_dir + '/y_com_pred.parquet', engine='pyarrow')
    else:
        is_com = stock_type == 'commercial'
        y_com_pred = _fit_predict_per_column(
            X[is_com],
            df_targets[is_com].filter(like='_com'),
            X_test[predicted_type == 1],
            param_grid_com,
        )
        y_com_pred.to_parquet(model_dir + '/y_com_pred.parquet', engine='pyarrow')

    # 3. making the res prediction
    if read_from_file[2]:
        y_res_pred = pd.read_parquet(model_dir + '/y_res_pred.parquet', engine='pyarrow')
    else:
        is_res = stock_type == 'residential'
        y_res_pred = _fit_predict_per_column(
            X[is_res],
            df_targets[is_res].filter(like='_res'),
            X_test[predicted_type == 0],
            param_grid_res,
        )
        y_res_pred.to_parquet(model_dir + '/y_res_pred.parquet', engine='pyarrow')

    # submission_path is appended to base_path verbatim, so make sure it
    # starts with a separator (the default value does not).
    if not submission_path.startswith(("/", "\\")):
        submission_path = "/" + submission_path
    submission_df = create_submission(y_com_pred, y_res_pred, y_pred, save_filepath=base_path + submission_path)
    return submission_df

# Random-forest settings for the building_stock_type classifier.
param_grid = dict(
    n_estimators=136,
    criterion='gini',
    max_depth=61,
    min_samples_split=7,
    min_samples_leaf=3,
    max_features='sqrt',
    bootstrap=False,
)

# XGBoost settings for the per-column commercial models.
param_grid_com = dict(n_estimators=7, max_depth=5, random_state=42, device='cpu')

# XGBoost settings for the per-column residential models.
param_grid_res = dict(n_estimators=7, max_depth=5, random_state=42, device='cpu', eta=0.3273831428143833)


# Reuse the cached stage-1 and stage-2 predictions; only refit the residential stage.
run_model(
    param_grid,
    param_grid_com,
    param_grid_res,
    submission_path='/submissions/xgboost_submission_31_08.parquet',
    read_from_file=[True, True, False],
)

Training model for in.comstock_building_type_group_com
Training model for in.heating_fuel_com
Training model for in.hvac_category_com
Training model for in.number_of_stories_com
Training model for in.ownership_type_com
Training model for in.vintage_com
Training model for in.wall_construction_type_com
Training model for in.tstat_clg_sp_f..f_com
Training model for in.tstat_htg_sp_f..f_com
Training model for in.weekday_opening_time..hr_com
Training model for in.weekday_operating_hours..hr_com
Training model for in.bedrooms_res
Training model for in.cooling_setpoint_res
Training model for in.heating_setpoint_res
Training model for in.geometry_building_type_recs_res
Training model for in.geometry_floor_area_res
Training model for in.geometry_foundation_type_res
Training model for in.geometry_wall_type_res
Training model for in.heating_fuel_res


XGBoostError: [13:44:06] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0015a694724fa8361-1\xgboost\xgboost-ci-windows\src\common\io.h:320: bad_malloc: Failed to allocate 4245555648 bytes.

In [2]:
# Stage 1: predict building_stock_type with a random forest (fixed settings).
param_grid = dict(
    n_estimators=136,
    criterion='gini',
    max_depth=61,
    min_samples_split=7,
    min_samples_leaf=3,
    max_features='sqrt',
    bootstrap=False,
)
X = pd.read_parquet(f'{base_path}/preprocessed_data/with_regional/standard_data.parquet', engine='pyarrow')
X.sort_index(inplace=True)
# Standardise every feature column; the encoder gets an empty column list.
clf = Pipeline(steps=[
    (
        'preprocessor',
        ColumnTransformer([
            ('scaler', StandardScaler(), X.columns),
            ('encoder', OneHotEncoder(), []),
        ]),
    ),
    ('classifier', RandomForestClassifier(random_state=42, **param_grid)),
])
clf.fit(X, y)
X_test = pd.read_parquet(f'{base_path}/preprocessed_data/test/with_regional_data_test.parquet', engine='pyarrow')
X_test.sort_index(inplace=True)
y_pred = clf.predict(X_test)

In [3]:
# Stage 2: one XGBoost classifier per commercial attribute column.
com_params = dict(n_estimators=3, max_depth=7, random_state=42, device='cpu')
X_com = X[df_targets['building_stock_type'] == 'commercial']
y_com = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com')
# Encode every label column as integer class ids, keeping the encoders so
# predictions can be decoded afterwards.
label_encoders = {name: LabelEncoder() for name in y_com.columns}
for col, le in label_encoders.items():
    y_com[col] = le.fit_transform(y_com[col])
# Test buildings that stage 1 classified as commercial (code 1).
X_com_test = X_test[y_pred == 1]

y_com_pred = pd.DataFrame(columns=y_com.columns)
for col in y_com.columns:
    print(f"Training model for {col}")
    y_com_col = y_com[col]

    bst = XGBClassifier(**com_params)
    bst.fit(X_com, y_com_col)

    # Predict class ids, then map them back to the original labels.
    y_com_pred_col = label_encoders[col].inverse_transform(bst.predict(X_com_test))
    y_com_pred[col] = y_com_pred_col

Training model for in.comstock_building_type_group_com
Training model for in.heating_fuel_com
Training model for in.hvac_category_com
Training model for in.number_of_stories_com
Training model for in.ownership_type_com
Training model for in.vintage_com
Training model for in.wall_construction_type_com
Training model for in.tstat_clg_sp_f..f_com
Training model for in.tstat_htg_sp_f..f_com
Training model for in.weekday_opening_time..hr_com
Training model for in.weekday_operating_hours..hr_com


In [8]:
# Cache the commercial predictions on disk.
pd.DataFrame(y_com_pred).to_parquet(
    f'{base_path}/kai/model/XGBoost/y_com_pred.parquet',
    engine='pyarrow',
)

In [41]:
# Stage 3: one XGBoost classifier per residential attribute column.
res_params = dict(n_estimators=4, max_depth=3, random_state=42, device='cpu', eta=0.3273831428143833)
X_res = X[df_targets['building_stock_type'] == 'residential']
y_res = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res')
# Encode every label column as integer class ids, keeping the encoders so
# predictions can be decoded afterwards.
label_encoders = {name: LabelEncoder() for name in y_res.columns}
for col, le in label_encoders.items():
    y_res[col] = le.fit_transform(y_res[col])
# Test buildings that stage 1 classified as residential (code 0).
X_res_test = X_test[y_pred == 0]

# X_train_gpu = cp.array(X_res)
# X_test_gpu = cp.array(X_res_test)
y_res_pred = pd.DataFrame(columns=y_res.columns)
for col in y_res.columns:
    y_res_col = y_res[col]
    # y_train_gpu = cp.array(y_res_col)

    bst = XGBClassifier(**res_params)
    bst.fit(X_res, y_res_col)

    # Predict class ids, then map them back to the original labels.
    # y_res_pred_col = cp.asnumpy(y_res_pred_col)
    y_res_pred_col = label_encoders[col].inverse_transform(bst.predict(X_res_test))
    y_res_pred[col] = y_res_pred_col

In [42]:
# Cache the residential predictions on disk.
y_res_pred.to_parquet(
    f'{base_path}/kai/model/XGBoost/y_res_pred.parquet',
    engine='pyarrow',
)

In [43]:
def create_submission(df_com, df_res, df_test, save_filepath=None, bldg_ids=None):
    """
    Assemble the submission dataframe from the three prediction frames.

    Parameters
    ----------
    df_com : pd.DataFrame
        Predictions for the ``*_com`` columns, indexed by building id.
    df_res : pd.DataFrame
        Predictions for the ``*_res`` columns, indexed by building id.
    df_test : pd.DataFrame
        Indexed by building id, with a ``building_stock_type`` column that
        holds 0 (residential) or 1 (commercial).
    save_filepath : str, optional
        If given, the submission is also written there as parquet.
    bldg_ids : iterable, optional
        Building ids that make up the submission rows. Defaults to
        1..1440 inclusive, the range that used to be hard-coded here.

    Returns
    -------
    pd.DataFrame
        One row per building id, with the columns of the training label file
        in the same order; every value is converted to ``str``.

    Raises
    ------
    KeyError
        If a building id is missing from the prediction frame that matches
        its stock type.
    """
    # First load the training labels again to get the correct column order
    load_filepath_labels = os.path.join(base_path + data_path,'building-instinct-train-label', 'train_label.parquet')
    df_targets = pd.read_parquet(load_filepath_labels, engine='pyarrow')
    df_targets.sort_index(inplace=True)

    # Create an empty frame with one row per building and the label columns
    if bldg_ids is None:
        bldg_ids = range(1, 1441)
    df = pd.DataFrame(index=list(bldg_ids), columns=df_targets.columns)
    df.index.name = df_targets.index.name

    # Populate the first column 'building_stock_type' (aligned on building id)
    df['building_stock_type'] = df_test["building_stock_type"].map({0: 'residential', 1: 'commercial'})

    res_columns = [col for col in df_targets.columns if col.endswith('_res')]
    com_columns = [col for col in df_targets.columns if col.endswith('_com')]
    for bldg_id in df.index:
        # Anything that is not 'residential' is filled from the commercial
        # predictions, as before.
        if df.at[bldg_id, 'building_stock_type'] == 'residential':
            source, wanted, unused = df_res, res_columns, com_columns
        else:
            source, wanted, unused = df_com, com_columns, res_columns
        # Columns of the other stock type stay empty for this building.
        df.loc[bldg_id, unused] = np.nan
        for col in wanted:
            df.at[bldg_id, col] = source.at[bldg_id, col]
    df = df.astype(str)
    if save_filepath:
        df.to_parquet(save_filepath)
    return df

# Label all prediction frames with the building ids of the rows they were
# computed for, then assemble and save the submission.
y_pred = pd.DataFrame(y_pred, index=X_test.index, columns=["building_stock_type"])
y_com_pred.index = X_com_test.index
y_res_pred.index = X_res_test.index
submission_df = create_submission(
    y_com_pred,
    y_res_pred,
    y_pred,
    save_filepath=f'{base_path}/submissions/xgboost_submission_30_08.parquet',
)

In [32]:
from preprocessing.run import create_submission
# Index the stock-type predictions by building id (X_test.index). Without an
# explicit index the frame is numbered 0..n-1, which does not match the
# building ids used by the other prediction frames; this is presumably what
# caused the KeyError recorded for this cell.
submission_df = create_submission(
    y_com_pred,
    y_res_pred,
    pd.DataFrame(y_pred, index=X_test.index, columns=["building_stock_type"]),
    save_filepath=base_path + '/submissions/xgboost_submission_30_08.parquet',
)

KeyError: 2

In [3]:
def sample_hyperparameters(param_grid):
    """
    Sample one value per hyperparameter from the given grid.

    Parameters:
    ----------
    param_grid : dict
        Maps hyperparameter names to either
        * a list of options, from which one is picked uniformly, or
        * a ``(min, max)`` tuple: two ints give an integer drawn from the
          inclusive range; any other pair of numbers gives a float drawn
          uniformly between the bounds.

    Returns:
    -------
    dict
        A dictionary with sampled hyperparameters. If ``bootstrap`` was
        sampled as false, ``max_samples`` is additionally set to ``None``.

    Raises:
    ------
    ValueError
        If a grid value is neither a list nor a 2-tuple of numbers
        (or is an empty list).
    """
    sampled_params = {}

    for param, values in param_grid.items():
        if isinstance(values, list):
            # Draw an index and look the option up in the list. Passing the
            # list itself to np.random.choice converts it to an array first,
            # which turns mixed lists such as [0.5, 'sqrt'] into strings and
            # returns numpy scalars instead of the original objects.
            sampled_params[param] = values[np.random.choice(len(values))]
        elif isinstance(values, tuple) and len(values) == 2:
            min_val, max_val = values
            if isinstance(min_val, (int, np.integer)) and isinstance(max_val, (int, np.integer)):
                # randint's upper bound is exclusive, hence the + 1
                sampled_params[param] = stats.randint.rvs(min_val, max_val + 1)
            elif all(isinstance(v, (int, float, np.integer, np.floating)) for v in values):
                # Also covers mixed bounds such as (0, 0.5), which used to be
                # skipped silently and left the parameter out of the result.
                sampled_params[param] = stats.uniform.rvs(min_val, max_val - min_val)
            else:
                raise ValueError(f"Unsupported parameter type for {param}: {values}")
        else:
            raise ValueError(f"Unsupported parameter type for {param}: {values}")

    # Adjust parameters for RandomForestClassifier
    if not sampled_params.get('bootstrap', True):
        sampled_params['max_samples'] = None  # Reset max_samples if bootstrap is False
    # Fractions given as strings (e.g. '0.5') are converted to floats
    max_features = sampled_params.get('max_features')
    if isinstance(max_features, str) and max_features.startswith('0'):
        sampled_params['max_features'] = float(max_features)

    return sampled_params

# @profile(precision=4)
def produce_submission(df_features_dict, df_targets, param_grid_list, n_runs=5, target="building_stock_type"):
    """
    Random search over XGBoost hyperparameters for one target group.

    Every run samples a parameter set, fits one model per target column on an
    80/20 train/validation split and appends the macro-F1 scores (averaged
    over the columns) to a CSV results file, which is rewritten after every
    run.

    Parameters
    ----------
    df_features_dict : dict or None
        Feature tables keyed by train-set name. If None, the table for the
        sampled ``train_set`` is read from ``preprocessed_data``.
    df_targets : pd.DataFrame
        Training labels, indexed like the feature tables.
    param_grid_list : dict
        Search grid passed to ``sample_hyperparameters``. It must provide
        ``train_set``. ``device`` selects the GPU path when it is "cuda".
        If ``n_estimators`` is present an ``XGBClassifier`` is fitted,
        otherwise ``xgb.train`` is used and ``num_boost_round`` is required.
    n_runs : int
        Number of parameter sets to evaluate.
    target : {"building_stock_type", "com", "res"}
        Which labels to fit: the stock type itself, or the ``*_com`` /
        ``*_res`` columns of the commercial / residential buildings.

    Raises
    ------
    ValueError
        If ``target`` is not one of the three supported values.
    """
    results_dir = base_path + r"\kai\model" + r"\HP_results/multixgboost"
    if target == "building_stock_type":
        results_file = results_dir + "/multi_results.csv"
        # Keep y two-dimensional so all targets share the per-column loop.
        y = df_targets["building_stock_type"].map({"residential": 0, "commercial": 1}).to_frame()
    elif target == "com":
        results_file = results_dir + "/multi_com_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'commercial'].filter(like='_com').copy()
    elif target == "res":
        results_file = results_dir + "/multi_res_results.csv"
        y = df_targets[df_targets['building_stock_type'] == 'residential'].filter(like='_res').copy()
    else:
        raise ValueError(f"Unknown target: {target}")

    if target != "building_stock_type":
        # The attribute columns are encoded as integer class ids.
        for col in y.columns:
            y[col] = LabelEncoder().fit_transform(y[col])

    # Check if results file exists
    if os.path.exists(results_file):
        results_df = pd.read_csv(results_file)
    else:
        # Create an empty DataFrame if the results file doesn't exist
        results_df = pd.DataFrame(columns=[
            'train_set', 'f1_train', 'f1_val', 'n_estimators', 'criterion', 'max_depth',
            'min_samples_split', 'min_samples_leaf', 'max_features', 'bootstrap'
        ])
        os.makedirs(os.path.dirname(results_file), exist_ok=True)

    for run_idx in tqdm(range(n_runs), file=sys.stdout, desc="Running Random Search"):
        # Sample from the grid that was passed in, not from a notebook global.
        params = sample_hyperparameters(param_grid_list)
        tqdm.write(f"Run {run_idx + 1}/{n_runs} - Params: {params}")
        train_set = params['train_set']
        if df_features_dict is None:
            df_features = pd.read_parquet(base_path + f'/preprocessed_data/{train_set}_data.parquet', engine='pyarrow')
            df_features.sort_index(inplace=True)
        else:
            df_features = df_features_dict[train_set].copy()

        # Select exactly the buildings that have labels, in label order, so
        # features and labels are row-aligned for every target.
        X = df_features.loc[y.index]

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        xgb_params = {key: value for key, value in params.items() if key not in ['train_set', "num_boost_round"]}
        use_gpu = params.get("device") == "cuda"

        F1_l2_dict_train = {}
        F1_l2_dict_val = {}
        for i, col in enumerate(y.columns):
            y_train_col = y_train.iloc[:, i]
            y_val_col = y_val.iloc[:, i]

            if use_gpu:
                # Move the inputs to the GPU for this model only.
                X_train_in = cp.array(X_train)
                X_val_in = cp.array(X_val)
                y_train_in = cp.array(y_train_col)
                y_val_in = cp.array(y_val_col)
            else:
                X_train_in, X_val_in = X_train, X_val
                y_train_in, y_val_in = y_train_col, y_val_col

            # Train the model
            if "n_estimators" in xgb_params:
                bst = XGBClassifier(**xgb_params)
                bst.fit(X_train_in, y_train_in)
                y_train_pred = bst.predict(X_train_in)
                y_val_pred = bst.predict(X_val_in)
            else:  # xgb.train works on DMatrix objects
                dtrain = xgb.DMatrix(X_train_in, label=y_train_in)
                dval = xgb.DMatrix(X_val_in, label=y_val_in)
                bst = xgb.train(
                    xgb_params,
                    dtrain,
                    num_boost_round=params["num_boost_round"],)
                y_train_pred = bst.predict(dtrain)
                y_val_pred = bst.predict(dval)

            if use_gpu:  # bring predictions back to the CPU for scoring
                y_train_pred = cp.asnumpy(y_train_pred)
                y_val_pred = cp.asnumpy(y_val_pred)

            # xgb.train may return real-valued scores; the metrics need class ids.
            y_train_pred = y_train_pred.round().astype(int)
            y_val_pred = y_val_pred.round().astype(int)

            # compute F1
            F1_l2_dict_train[col] = f1_score(y_train_col, y_train_pred, average='macro')
            F1_l2_dict_val[col] = f1_score(y_val_col, y_val_pred, average='macro')

            # Clean up GPU memory after each model
            if use_gpu:
                del X_train_in, X_val_in, y_train_in, y_val_in
                cp.get_default_memory_pool().free_all_blocks()
                gc.collect()
            tqdm.write(f"Run {run_idx + 1}/{n_runs} - Finished column: {col} ({i+1}/{len(y.columns)}) with F1 Score (Val): {F1_l2_dict_val[col]:.4f}")

        # Average over the target columns (a single column for the stock type).
        f1_train = sum(F1_l2_dict_train.values()) / len(F1_l2_dict_train.values())
        f1_val = sum(F1_l2_dict_val.values()) / len(F1_l2_dict_val.values())
        new_row = {
            'train_set': train_set,
            'f1_train': f1_train,
            'f1_val': f1_val,
        }
        if target == "building_stock_type":
            # Binary target: also log the confusion-matrix counts, comparing
            # the labels with the predictions. labels=[0, 1] guarantees a
            # 2x2 matrix for the 4-way unpacking.
            train_TN, train_FP, train_FN, train_TP = confusion_matrix(y_train_col, y_train_pred, labels=[0, 1]).ravel()
            val_TN, val_FP, val_FN, val_TP = confusion_matrix(y_val_col, y_val_pred, labels=[0, 1]).ravel()
            new_row.update({
                'train_TP': train_TP,
                'train_TN': train_TN,
                'train_FP': train_FP,
                'train_FN': train_FN,
                'val_TP': val_TP,
                'val_TN': val_TN,
                'val_FP': val_FP,
                'val_FN': val_FN,
            })
        new_row.update(xgb_params)

        tqdm.write(f"Run {run_idx + 1}/{n_runs} - F1 Score (Val): {f1_val:.4f}")
        # Persist after every run so an interrupted search keeps its results.
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
        results_df.to_csv(results_file, index=False)

# Feature tables the search may draw from (sampled via the 'train_set' key).
_train_set_options = [
    'standard', 'daily', 'weekly', 'monthly',
    'with_regional/monthly', 'with_regional/weekly', 'with_regional/daily', 'with_regional/standard',
]

# Search grid without 'n_estimators' (uses 'num_boost_round' instead).
# Tuples are (min, max) ranges, lists are discrete options.
param_grid_xgb = dict(
    num_boost_round=(1, 4),        # number of boosting rounds
    max_depth=(3, 20),             # maximum tree depth for base learners
    eta=(0.05, 0.4),               # boosting learning rate
    min_child_weight=(1, 20),      # minimum sum of instance weight (hessian) needed in a child
    subsample=[0.5, 0.75, 1.0],    # subsample ratio of the training instances
    n_rounds=(2, 8),
    booster=['gbtree'],            # other candidates: 'gblinear', 'dart'
    tree_method=['hist'],
    random_state=[42],             # seed for reproducibility
    device=['cuda'],
    train_set=list(_train_set_options),
)
# Candidate entries that are currently not searched:
# 'max_leaves': (1, 100),  # Maximum number of leaves; 0 indicates no limit
# 'max_bin': (256, 512),  # Maximum number of bins per feature for histogram-based algorithm
# 'grow_policy': ['depthwise', 'lossguide'],  # Tree growing policy
# 'learning_rate': (0.01, 0.3),  # Boosting learning rate (eta)
# 'verbosity': [0, 1, 2, 3],  # Degree of verbosity (0: silent, 1: warning, 2: info, 3: debug)
# 'objective': ['binary:logistic', 'multi:softprob', 'reg:squarederror'],  # Learning objective
# 'gamma': (0, 5),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
# 'min_child_weight': (0, 10),  # Minimum sum of instance weight (hessian) needed in a child
# 'subsample': (0.5, 1.0),  # Subsample ratio of the training instance
# 'sampling_method': ['uniform', 'gradient_based'],  # Sampling method (only for GPU version of hist tree method)
# 'colsample_bytree': (0.5, 1.0),  # Subsample ratio of columns when constructing each tree
# 'colsample_bylevel': (0.5, 1.0),  # Subsample ratio of columns for each level
# 'colsample_bynode': (0.5, 1.0),  # Subsample ratio of columns for each split
# 'reg_alpha': (0, 1),  # L1 regularization term on weights
# 'reg_lambda': (1, 10),  # L2 regularization term on weights
# 'scale_pos_weight': (0.1, 10),  # Balancing of positive and negative weights
# 'base_score': (0.5, 0.5),  # The initial prediction score of all instances, global bias
# 'multi_strategy': ['one_output_per_tree', 'multi_output_tree'],
# 'early_stopping_rounds': (10, 100),  # Number of rounds for early stopping

# Search grid with 'n_estimators'.
param_grid_xgb_1 = dict(
    n_estimators=(4, 50),          # number of boosting rounds
    max_depth=(3, 100),            # maximum tree depth for base learners
    eta=(0.05, 0.4),               # boosting learning rate
    random_state=[42],             # seed for reproducibility
    device=['cuda'],
    train_set=list(_train_set_options),
)

# Release any GPU memory still held by CuPy's default memory pool.
cp.get_default_memory_pool().free_all_blocks()
# print_memory_usage()
# print_gpu_memory()
# random_search(None, df_targets, param_grid_xgb_1, n_runs=5, target="res")

In [6]:
# Latitude/longitude (decimal degrees) attached as a regional feature, keyed by
# the US state code (plus DC) read from a building's "in.state" column.
_STATE_COORDINATES = {
    "WI": {"latitude": 44.500000, "longitude": -89.500000},  # Wisconsin
    "WV": {"latitude": 39.000000, "longitude": -80.500000},  # West Virginia
    "VT": {"latitude": 44.000000, "longitude": -72.699997},  # Vermont
    "TX": {"latitude": 31.000000, "longitude": -100.000000}, # Texas
    "SD": {"latitude": 44.500000, "longitude": -100.000000}, # South Dakota
    "RI": {"latitude": 41.742325, "longitude": -71.742332},  # Rhode Island
    "OR": {"latitude": 44.000000, "longitude": -120.500000}, # Oregon
    "NY": {"latitude": 43.000000, "longitude": -75.000000},  # New York
    "NH": {"latitude": 44.000000, "longitude": -71.500000},  # New Hampshire
    "NE": {"latitude": 41.500000, "longitude": -100.000000}, # Nebraska
    "KS": {"latitude": 38.500000, "longitude": -98.000000},  # Kansas
    "MS": {"latitude": 33.000000, "longitude": -90.000000},  # Mississippi
    "IL": {"latitude": 40.000000, "longitude": -89.000000},  # Illinois
    "DE": {"latitude": 39.000000, "longitude": -75.500000},  # Delaware
    "CT": {"latitude": 41.599998, "longitude": -72.699997},  # Connecticut
    "AR": {"latitude": 34.799999, "longitude": -92.199997},  # Arkansas
    "IN": {"latitude": 40.273502, "longitude": -86.126976},  # Indiana
    "MO": {"latitude": 38.573936, "longitude": -92.603760},  # Missouri
    "FL": {"latitude": 27.994402, "longitude": -81.760254},  # Florida
    "NV": {"latitude": 39.876019, "longitude": -117.224121}, # Nevada
    "ME": {"latitude": 45.367584, "longitude": -68.972168},  # Maine
    "MI": {"latitude": 44.182205, "longitude": -84.506836},  # Michigan
    "GA": {"latitude": 33.247875, "longitude": -83.441162},  # Georgia
    "HI": {"latitude": 19.741755, "longitude": -155.844437}, # Hawaii
    "AK": {"latitude": 66.160507, "longitude": -153.369141}, # Alaska
    "TN": {"latitude": 35.860119, "longitude": -86.660156},  # Tennessee
    "VA": {"latitude": 37.926868, "longitude": -78.024902},  # Virginia
    "NJ": {"latitude": 39.833851, "longitude": -74.871826},  # New Jersey
    "KY": {"latitude": 37.839333, "longitude": -84.270020},  # Kentucky
    "ND": {"latitude": 47.650589, "longitude": -100.437012}, # North Dakota
    "MN": {"latitude": 46.392410, "longitude": -94.636230},  # Minnesota
    "OK": {"latitude": 36.084621, "longitude": -96.921387},  # Oklahoma
    "MT": {"latitude": 46.965260, "longitude": -109.533691}, # Montana
    "WA": {"latitude": 47.751076, "longitude": -120.740135}, # Washington
    "UT": {"latitude": 39.419220, "longitude": -111.950684}, # Utah
    "CO": {"latitude": 39.113014, "longitude": -105.358887}, # Colorado
    "OH": {"latitude": 40.367474, "longitude": -82.996216},  # Ohio
    "AL": {"latitude": 32.318230, "longitude": -86.902298},  # Alabama
    "IA": {"latitude": 42.032974, "longitude": -93.581543},  # Iowa
    "NM": {"latitude": 34.307144, "longitude": -106.018066}, # New Mexico
    "SC": {"latitude": 33.836082, "longitude": -81.163727},  # South Carolina
    "PA": {"latitude": 41.203323, "longitude": -77.194527},  # Pennsylvania
    "AZ": {"latitude": 34.048927, "longitude": -111.093735}, # Arizona
    "MD": {"latitude": 39.045753, "longitude": -76.641273},  # Maryland
    "MA": {"latitude": 42.407211, "longitude": -71.382439},  # Massachusetts
    "CA": {"latitude": 36.778259, "longitude": -119.417931}, # California
    "ID": {"latitude": 44.068203, "longitude": -114.742043}, # Idaho
    "WY": {"latitude": 43.075970, "longitude": -107.290283}, # Wyoming
    "NC": {"latitude": 35.782169, "longitude": -80.793457},  # North Carolina
    "LA": {"latitude": 30.391830, "longitude": -92.329102},  # Louisiana
    "DC": {"latitude": 38.907200, "longitude": -77.036900},  # Washington, D.C.
}

# Column holding the raw electricity readings in every building file.
_ENERGY_COLUMN = 'out.electricity.total.energy_consumption'


def calculate_average_energy_consumption(folder_path, season_months_dict=None, type='daily', with_regional=False):
    """
    Process multiple parquet files in a folder, calculate average energy consumption,
    and return a pandas DataFrame with each row corresponding to one file in the folder.

    Parameters:
    - folder_path (str): Path to the folder containing parquet files. Each file is expected to be
    named '<bldg_id>.parquet' and to contain 'timestamp', 'bldg_id' and
    'out.electricity.total.energy_consumption' columns (plus 'in.state' when with_regional is True).
    - season_months_dict (dict): A dictionary where keys are season names (strings) and values are lists
    of corresponding month numbers. For example, {'cold': [1, 2, 12], 'hot': [6, 7, 8], 'mild': [3, 4, 5, 9, 10, 11]}.
    Required when type is 'seasonal', ignored otherwise.
    - type (str): Aggregation period for the hourly profile. One of:
        'daily'    -> one column per (day_of_year, hour)
        'weekly'   -> one column per (ISO week, hour)
        'monthly'  -> one column per (month, hour)
        'seasonal' -> one column per (season, hour), hours relabelled 1..24 and values multiplied by 4
    (The name shadows the builtin; it is kept so existing keyword callers keep working.)
    - with_regional (bool): If True, append 'latitude' and 'longitude' columns looked up from the
    first value of the file's 'in.state' column.

    Returns:
    - df_ave (pd.DataFrame): A pandas DataFrame with each row corresponding to one file in the folder (i.e. one building).
    The columns are multi-layer with the first layer being the day/week/month/season and the second layer the hour of the day
    Index ('bldg_id') contains building IDs. Column values are average hourly electricity energy consumption

    Raises:
    - ValueError: if type is not one of the supported values, if type is 'seasonal' but no
    season_months_dict is given, or if folder_path contains no '.parquet' files.
    - KeyError: if with_regional is True and a file's state code is not in the coordinate table.
    """
    # Validate the arguments up front so a bad call fails before any file is read.
    if type not in ('daily', 'weekly', 'monthly', 'seasonal'):
        raise ValueError('Invalid type. Please select from daily, weekly, monthly, or seasonal.')
    if type == 'seasonal' and not season_months_dict:
        raise ValueError("season_months_dict is required when type is 'seasonal'.")

    # The month -> season lookup does not depend on the file, so build it once.
    month_to_season = None
    if type == 'seasonal':
        month_to_season = {
            month: season
            for season, months_list in season_months_dict.items()
            for month in months_list
        }

    # One single-row DataFrame per building file
    result_dfs = []

    # Iterate through all files in the folder_path
    for file_name in tqdm(os.listdir(folder_path)):
        if not file_name.endswith(".parquet"):
            continue

        # The building id is the file name without its extension
        bldg_id = int(file_name.split('.')[0])

        # Read the original parquet file
        df = pd.read_parquet(os.path.join(folder_path, file_name))

        # Convert 'timestamp' column to datetime and derive the hour of day
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['hour'] = df['timestamp'].dt.hour

        if with_regional:
            state = df["in.state"].unique()[0]
            coordinates = _STATE_COORDINATES[state]

        # Derive the period each reading belongs to. The period column name
        # becomes the name of the first column level of the pivoted result.
        if type == 'daily':  # 365 * 24 = 8,760 values per building
            period = 'day_of_year'
            df[period] = df['timestamp'].dt.day_of_year
        elif type == 'weekly':  # one value per ISO week and hour
            period = 'week'
            df[period] = df['timestamp'].dt.isocalendar().week
        elif type == 'monthly':  # 12 * 24 = 288 values per building
            period = 'month'
            df[period] = df['timestamp'].dt.month
        else:  # 'seasonal': number of seasons * 24 values per building
            period = 'season'
            df['month'] = df['timestamp'].dt.month
            df[period] = df['month'].map(month_to_season)

        # Mean reading for every (period, hour) pair, broadcast back to the rows
        average = df.groupby([period, 'hour'])[_ENERGY_COLUMN].transform('mean')
        if type == 'seasonal':
            # NOTE(review): only the seasonal variant scales by 4 -- presumably
            # turning a mean 15-minute reading into an hourly total; confirm.
            average = 4 * average
        df['average_energy_consumption'] = average

        # Pivot the dataframe to one row per building, one column per (period, hour)
        result_df = df.pivot_table(values='average_energy_consumption', index='bldg_id', columns=[period, 'hour'])

        if type == 'seasonal':
            # Relabel hours 0..23 as 1..24. The labels are derived from the
            # pivot's own (sorted) columns so each season name stays attached
            # to its data regardless of the order of season_months_dict.
            result_df.columns = pd.MultiIndex.from_tuples(
                [(season, hour + 1) for season, hour in result_df.columns]
            )

        # Replace the index with the id taken from the parquet file name
        result_df['bldg_id'] = bldg_id
        if with_regional:
            result_df["latitude"] = coordinates["latitude"]
            result_df["longitude"] = coordinates["longitude"]
        result_df.set_index('bldg_id', inplace=True)

        result_dfs.append(result_df)

    if not result_dfs:
        raise ValueError(f"No .parquet files found in {folder_path!r}.")

    # Concatenate all individual DataFrames into a single DataFrame
    df_ave = pd.concat(result_dfs, ignore_index=False)

    return df_ave


# Run the monthly aggregation over the test buildings with regional lookup
# enabled; only its 'latitude' / 'longitude' columns are used further down.
for s in ["monthly"]:
    folder_path = os.path.join(base_path + data_path, 'building-instinct-test-data')
    df_features = calculate_average_energy_consumption(
        folder_path,
        type=s,
        with_regional=True,
    ).sort_index()


# Load the already-preprocessed test features and order them by index so the
# coordinate columns below are matched to the same index labels.
df_features_full = pd.read_parquet(
    base_path + '/preprocessed_data/test/data_test.parquet',
    engine='pyarrow',
).sort_index()

# Copy the per-building coordinates across (pandas aligns on the index).
df_features_full["latitude"] = df_features["latitude"]
df_features_full["longitude"] = df_features["longitude"]

# Persist the enriched test feature set.
save_path = os.path.join(base_path + '/preprocessed_data/with_regional_data_test.parquet')
df_features_full.to_parquet(save_path, engine='pyarrow')

  0%|          | 0/1450 [00:00<?, ?it/s]

100%|██████████| 1450/1450 [01:07<00:00, 21.40it/s]
  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
