In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os
import glob

# Load the datasets
def load_data(time_series_folder, metadata_file, skip=1):
    # Load metadata    
    columns_to_use = [
        "building_id",
        #"site_id",
        "sqm",
        "lat",
        "lng",
        #"timezone",
        #"industry",
        "subindustry",
        "heatingtype",
        #"primaryspaceusage"
        #"yearbuilt",
        #"date_opened",
        #"numberoffloors",
    ]
    meta = pd.read_csv(metadata_file,
                usecols=columns_to_use
    )
    

    
    #meta["industry"].fillna("None", inplace=True)
    meta["subindustry"].fillna("None", inplace=True)
    meta["heatingtype"].fillna("None", inplace=True)
    
    meta.dropna(inplace=True)
    
    # Load time series data
    all_files = glob.glob(os.path.join(time_series_folder, "*.csv"))
    df_list = []
    for file in all_files[::skip]:
        building_id = os.path.basename(file).split('.')[0]
        
        # check if building_id is in metadata
        if building_id not in meta["building_id"].values:
            print(f"Building ID {building_id} not found in metadata. Skipping...")
            continue
        
        df = pd.read_csv(file, parse_dates=True, index_col='timestamp')
        df['building_id'] = os.path.basename(file).split('.')[0]
        df['hour'] = df.index.hour
        df['day'] = df.index.day
        df['month'] = df.index.month
        df['year'] = df.index.year
        df_list.append(df)
    time_series_data = pd.concat(df_list)
    
    time_series_data.head()
    
    return time_series_data, meta

# Merge datasets
def merge_data(time_series_data, metadata):
    merged_data = pd.merge(time_series_data, metadata, how='left', on='building_id')
    return merged_data

# Preprocess the data
def preprocess_data(df):
    # Handle missing values
    print("Filling missing values")
    df.fillna(method='ffill', inplace=True)
    df.info()
    df.head()
    # Feature engineering

    
    # Convert to supervised learning problem
    print("Creating dataset...")
    def create_dataset(data, target_col, time_step=1):
        X, Y = [], []
        for i in range(len(data) - time_step - 1):
            a = data[i:(i + time_step)]
            X.append(a)
            Y.append(data[i + time_step][target_col])
        return np.array(X), np.array(Y)
    
    time_step = 10
    feature_cols = ['hour', 'day', 'month', 'year', 'building_id', 
                    'airTemperature', 
                    'cloudCoverage', 
                    'dewTemperature', 
                    'precipDepth1HR', 
                    #'precipDepth6HR', 
                    'seaLvlPressure', 
                    'windDirection', 
                    'windSpeed',
                    'sqm',
                    'lat',
                    'heatingtype',
                    #'yearbuilt',
                    'subindustry']  # Add more features as needed
    target_col = 'electricity'
    X, Y = create_dataset(df[feature_cols + [target_col]].values, target_col, time_step)
    
    # Split into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    
    return X_train, X_test, Y_train, Y_test

# Build the Gradient Boosting model
def build_model():
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    return model

# Train the model
def train_model(model, X_train, Y_train):
    X_train_reshaped = X_train.reshape(X_train.shape[0], -1)  # Flatten the input for the model
    model.fit(X_train_reshaped, Y_train)

# Evaluate the model
def evaluate_model(model, X_test, Y_test):
    X_test_reshaped = X_test.reshape(X_test.shape[0], -1)  # Flatten the input for the model
    predictions = model.predict(X_test_reshaped)
    mse = mean_squared_error(Y_test, predictions)
    print(f"Mean Squared Error: {mse}")

# Main function
def main(time_series_folder, metadata_file, skip=1):
    time_series_data, metadata = load_data(time_series_folder, metadata_file, skip=skip)
    merged_data = merge_data(time_series_data, metadata)
    X_train, X_test, Y_train, Y_test = preprocess_data(merged_data)
    model = build_model()
    train_model(model, X_train, Y_train)
    evaluate_model(model, X_test, Y_test)



In [22]:
# Example usage
time_series_folder = '../data/buildings_cleaned'
metadata_file = '../data/metadata/metadata.csv'
main(time_series_folder, metadata_file, skip=79)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  meta["subindustry"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  meta["heatingtype"].fillna("None", inplace=True)


Building ID Eagle_education_Paul not found in metadata. Skipping...
Building ID Eagle_public_Missy not found in metadata. Skipping...
Building ID Gator_assembly_Lucia not found in metadata. Skipping...
Filling missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298239 entries, 0 to 298238
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   airTemperature  298239 non-null  float64
 1   cloudCoverage   298239 non-null  float64
 2   dewTemperature  298239 non-null  float64
 3   precipDepth1HR  298239 non-null  float64
 4   seaLvlPressure  298239 non-null  float64
 5   windDirection   298239 non-null  float64
 6   windSpeed       298239 non-null  float64
 7   electricity     298239 non-null  float64
 8   chilledwater    298239 non-null  float64
 9   hotwater        298239 non-null  int64  
 10  gas             298239 non-null  float64
 11  water           298239 non-null  float64
 12  building_id    

  df.fillna(method='ffill', inplace=True)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices