# Import Libraries and Define Constants

In this section, we import necessary libraries (`pandas` and `os`) and define constants that will be used throughout the notebook. These constants include paths to data files, directory paths for outputs, and configuration data for various utilities like electricity, gas, water, etc.

In [1]:
# Imports and Configuration Constants
import pandas as pd
import os

In [2]:

# Define dataset names and configurations for each dataset type
DATASET_NAMES = [
    "electricity",
    "gas",
    "solar",
    "steam",
    "water",
]
data_map = {
    "electricity": {
        0: {
            "site_name": "Mouse",
            "building_name": "science",
            "consumer_name": "Micheal",
        },
        1: {"site_name": "Mouse", "building_name": "health", "consumer_name": "Estela"},
    },
    "gas": {
        0: {
            "site_name": "Panther",
            "building_name": "education",
            "consumer_name": "Mohammad",
        },
        1: {
            "site_name": "Panther",
            "building_name": "lodging",
            "consumer_name": "Dean",
        },
    },
    "solar": {
        0: {
            "site_name": "Bobcat",
            "building_name": "education",
            "consumer_name": "Alissa",
        },
        1: {
            "site_name": "Bobcat",
            "building_name": "education",
            "consumer_name": "Coleman",
        },
    },
    "water": {
        0: {
            "site_name": "Panther",
            "building_name": "lodging",
            "consumer_name": "Cora",
        },
        1: {
            "site_name": "Wolf",
            "building_name": "education",
            "consumer_name": "Ursula",
        },
    },
}

TARGET_FILE_TEMPLATE = "../data/meters/cleaned/{}_cleaned.csv"
WEATHER_FILE = "../data/weather/weather.csv"
OUTPUT_DIR = "../data/meters/final/"
TARGET_FILE_ELECTRICITY = "../data/ELECTRICITY/ELECTRICITY.txt"


# Functions for Data Processing

Here, we define two core functions: `extract_and_merge_data` for data extraction, merging, and feature aggregation, and `split_train_test` for splitting the dataset into training and testing periods.


## `extract_and_merge_data`

This function loads and merges consumption and weather data for a given utility dataset, resamples the data to a daily frequency, and calculates daily aggregate statistics. The final processed dataset is saved as a CSV file.

### Parameters:
- `dataset_name` (str): Utility dataset name.
- `config` (dict): Configuration dictionary containing site, building, and consumer names.


In [3]:
def extract_and_merge_data(dataset_name, config):
    site_name = config["site_name"]
    building_name = config["building_name"]
    consumer_name = config["consumer_name"]

    # Define file paths and target/output columns
    target_file = TARGET_FILE_TEMPLATE.format(dataset_name)
    processed_file = f"{dataset_name}_{site_name}_{building_name}_{consumer_name}"
    output_file = os.path.join(
        OUTPUT_DIR,
        processed_file.upper(),
        f"{processed_file}.csv".upper(),
    )

    target_column = f"{site_name}_{building_name}_{consumer_name}"
    new_target_column = dataset_name.capitalize()

    try:
        # Load target data (meter readings)
        target_df = pd.read_csv(target_file, usecols=["timestamp", target_column])
        target_df = target_df.rename(columns={target_column: new_target_column})
        target_df["timestamp"] = pd.to_datetime(target_df["timestamp"])

        # Load weather data, filter for the specific site, and drop `site_id`
        weather_df = pd.read_csv(WEATHER_FILE)
        weather_df = weather_df[weather_df["site_id"] == site_name].drop(
            columns=["site_id"]
        )
        weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])

        # Merge on timestamp, keeping timestamps from the target data only
        merged_df = pd.merge(target_df, weather_df, on="timestamp", how="left")

        # Resample the data to daily frequency using sum for consumption and mean for weather data
        resampled_df = merged_df.resample("D", on="timestamp").agg(
            {
                new_target_column: [
                    "sum",
                    "mean",
                    "min",
                    "max",
                    "first",
                    "last",
                    "median",
                ],
                **{col: "mean" for col in weather_df.columns if col != "timestamp"},
            }
        )

        # Flatten column names after aggregation
        resampled_df.columns = [
            "sum_conso",
            "mean_conso",
            "min_conso",
            "max_conso",
            "first_conso",
            "last_conso",
            "median_conso",
        ] + list(weather_df.columns[1:])
        resampled_df.reset_index(inplace=True)

        # Add time-related features
        resampled_df["month"] = resampled_df["timestamp"].dt.month
        resampled_df["day_of_week"] = resampled_df["timestamp"].dt.dayofweek

        # Save to the specified output file without removing `timestamp`
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        resampled_df.to_csv(output_file, index=False)

        print(f"Data extracted and saved to {output_file}")
        return output_file

    except FileNotFoundError as e:
        print(
            f"File not found: {e}. Skipping {dataset_name} for {site_name} - {building_name} - {consumer_name}."
        )
    except ValueError as e:
        print(
            f"Column error: {e}. Skipping {dataset_name} for {site_name} - {building_name} - {consumer_name}."
        )
    except Exception as e:
        print(
            f"An error occurred: {e}. Skipping {dataset_name} for {site_name} - {building_name} - {consumer_name}."
        )

    return None


## `split_train_test`

Splits the processed dataset into training and testing sets based on the year of data. Training data consists of 2016 records, and testing data consists of 2017 records.

### Parameters:
- `output_file` (str): Path to the processed dataset file.


In [4]:
def split_train_test(output_file):
    if output_file is None:
        return

    try:
        merged_df = pd.read_csv(output_file, parse_dates=["timestamp"])
        train_df = merged_df[merged_df["timestamp"].dt.year == 2016]
        test_df = merged_df[merged_df["timestamp"].dt.year == 2017]

        train_file = output_file.replace(".CSV", "_TRAIN.CSV")
        test_file = output_file.replace(".CSV", "_TEST.CSV")

        train_df.drop(columns=["timestamp"]).to_csv(train_file, index=False)
        test_df.drop(columns=["timestamp"]).to_csv(test_file, index=False)

        print(f"Train data saved to {train_file}")
        print(f"Test data saved to {test_file}")

    except Exception as e:
        print(f"Error during train-test split: {e}")


# Execution Loop for Data Processing

This section iterates over each dataset in `DATASET_NAMES` and each configuration in `data_map`, calling `extract_and_merge_data` for data extraction and merging, followed by `split_train_test` for train-test splitting.


In [5]:
for dataset_name in DATASET_NAMES:
    if dataset_name in data_map:
        for config in data_map[dataset_name].values():
            output_file = extract_and_merge_data(dataset_name, config)
            split_train_test(output_file)


Data extracted and saved to ../data/meters/final/ELECTRICITY_MOUSE_SCIENCE_MICHEAL/ELECTRICITY_MOUSE_SCIENCE_MICHEAL.CSV
Train data saved to ../data/meters/final/ELECTRICITY_MOUSE_SCIENCE_MICHEAL/ELECTRICITY_MOUSE_SCIENCE_MICHEAL_TRAIN.CSV
Test data saved to ../data/meters/final/ELECTRICITY_MOUSE_SCIENCE_MICHEAL/ELECTRICITY_MOUSE_SCIENCE_MICHEAL_TEST.CSV
Data extracted and saved to ../data/meters/final/ELECTRICITY_MOUSE_HEALTH_ESTELA/ELECTRICITY_MOUSE_HEALTH_ESTELA.CSV
Train data saved to ../data/meters/final/ELECTRICITY_MOUSE_HEALTH_ESTELA/ELECTRICITY_MOUSE_HEALTH_ESTELA_TRAIN.CSV
Test data saved to ../data/meters/final/ELECTRICITY_MOUSE_HEALTH_ESTELA/ELECTRICITY_MOUSE_HEALTH_ESTELA_TEST.CSV
Data extracted and saved to ../data/meters/final/GAS_PANTHER_EDUCATION_MOHAMMAD/GAS_PANTHER_EDUCATION_MOHAMMAD.CSV
Train data saved to ../data/meters/final/GAS_PANTHER_EDUCATION_MOHAMMAD/GAS_PANTHER_EDUCATION_MOHAMMAD_TRAIN.CSV
Test data saved to ../data/meters/final/GAS_PANTHER_EDUCATION_MOHAMMAD

# Process Electricity Data

The `process_electricity_data` function loads a separate electricity dataset with specific aggregations and saves it in the specified output directory. The `split_train_test` function then separates this data into training and testing sets.


## `process_electricity_data`

Processes electricity data, resamples it to daily frequency, and calculates aggregation statistics. The result is saved in the specified output file.


In [13]:

def process_electricity_data():
    # Define file path and output file name
    processed_file = "electricity_global_reactive_power"
    output_file = os.path.join(
        OUTPUT_DIR, processed_file.upper(), f"{processed_file}.csv".upper()
    )

    # Load the electricity data with Date and Time combined into a timestamp
    try:
        electricity_df = pd.read_csv(
            TARGET_FILE_ELECTRICITY,
            sep=";",  # Separator is ';'
            parse_dates=[[0, 1]],  # Combine 'Date' and 'Time' columns
            dayfirst=True,  # Use day-first format for dates
            na_values="?",  # Handle missing values
        )
        electricity_df.columns = [
            "timestamp",
            "Global_active_power",
            "Global_reactive_power",
            "Voltage",
            "Global_intensity",
            "Sub_metering_1",
            "Sub_metering_2",
            "Sub_metering_3",
        ]
        electricity_df["timestamp"] = pd.to_datetime(
            electricity_df["timestamp"], errors="coerce"
        )

        # Drop rows with invalid dates
        electricity_df.dropna(subset=["timestamp"], inplace=True)

        # Define detailed aggregations for `Global_reactive_power`
        aggregation_dict = {
            "Global_reactive_power": [
                "sum",
                "mean",
                "min",
                "max",
                "first",
                "last",
                "median",
            ]  # Detailed aggregations
        }

        # Simple sum aggregation for other columns
        other_columns = {
            "Voltage": "sum",
            "Global_intensity": "sum",
            "Sub_metering_1": "sum",
            "Sub_metering_2": "sum",
            "Sub_metering_3": "sum",
        }

        # Combine all aggregations into a single dictionary
        aggregation_dict.update(other_columns)

        # Resample the data to daily frequency using the specified aggregations
        resampled_df = electricity_df.resample("D", on="timestamp").agg(
            aggregation_dict
        )

        # Rename columns for `Global_reactive_power` aggregations only
        resampled_df.columns = [
            "sum_conso",
            "mean_conso",
            "min_conso",
            "max_conso",
            "first_conso",
            "last_conso",
            "median_conso",
        ] + list(other_columns.keys())

        resampled_df.reset_index(inplace=True)

        # Add time-related features
        resampled_df["month"] = resampled_df["timestamp"].dt.month
        resampled_df["day_of_week"] = resampled_df["timestamp"].dt.dayofweek

        # Convert all column names to lowercase
        resampled_df.columns = [col.lower() for col in resampled_df.columns]

        # Save to the specified output file
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        resampled_df.to_csv(output_file, index=False)

        print(f"Electricity data processed and saved to {output_file}")
        return output_file

    except FileNotFoundError as e:
        print(f"File not found: {e}. Skipping electricity data.")
    except ValueError as e:
        print(f"Value error: {e}. Skipping electricity data.")
    except Exception as e:
        print(f"An error occurred: {e}. Skipping electricity data.")

    return None


# Run Electricity Data Processing

Finally, we call `process_electricity_data` and `split_train_test` to handle the electricity dataset specifically.


In [14]:
output_file = process_electricity_data()
split_train_test(output_file)


  electricity_df = pd.read_csv(


Electricity data processed and saved to ../data/meters/final/ELECTRICITY_GLOBAL_REACTIVE_POWER/ELECTRICITY_GLOBAL_REACTIVE_POWER.CSV
Train data saved to ../data/meters/final/ELECTRICITY_GLOBAL_REACTIVE_POWER/ELECTRICITY_GLOBAL_REACTIVE_POWER_TRAIN.CSV
Test data saved to ../data/meters/final/ELECTRICITY_GLOBAL_REACTIVE_POWER/ELECTRICITY_GLOBAL_REACTIVE_POWER_TEST.CSV


# Print out data names

In [12]:
import os

# Directory containing the processed data directories
base_directory = "../data/meters/final"

# Generate a list of paths to each main .csv file in subdirectories, excluding "TRAIN" and "TEST"
processed_data_paths = []
for subdirectory in os.listdir(base_directory):
    subdirectory_path = os.path.join(base_directory, subdirectory)
    # Check if it’s a directory and contains a .csv file without "TRAIN" or "TEST"
    if os.path.isdir(subdirectory_path):
        csv_filename = f"{subdirectory}.csv"
        csv_filepath = os.path.join(subdirectory_path, csv_filename)
        if os.path.isfile(csv_filepath) and "TRAIN" not in csv_filename and "TEST" not in csv_filename:
            # Add the file path, replacing backslashes with forward slashes for consistency
            processed_data_paths.append(csv_filepath.replace("\\", "/"))

# Print the list in the desired format
print("processed_data_paths = [")
for path in processed_data_paths:
    print(f'    "{path}",')
print("]")


processed_data_paths = [
    "../data/meters/final/GAS_PANTHER_EDUCATION_MOHAMMAD/GAS_PANTHER_EDUCATION_MOHAMMAD.csv",
    "../data/meters/final/ELECTRICITY_MOUSE_HEALTH_ESTELA/ELECTRICITY_MOUSE_HEALTH_ESTELA.csv",
    "../data/meters/final/HOTWATER_FOX_LODGING_ALANA/HOTWATER_FOX_LODGING_ALANA.csv",
    "../data/meters/final/SOLAR_BOBCAT_EDUCATION_ALISSA/SOLAR_BOBCAT_EDUCATION_ALISSA.csv",
    "../data/meters/final/SOLAR_BOBCAT_EDUCATION_COLEMAN/SOLAR_BOBCAT_EDUCATION_COLEMAN.csv",
    "../data/meters/final/WATER_PANTHER_LODGING_CORA/WATER_PANTHER_LODGING_CORA.csv",
    "../data/meters/final/ELECTRICITY_MOUSE_SCIENCE_MICHEAL/ELECTRICITY_MOUSE_SCIENCE_MICHEAL.csv",
    "../data/meters/final/HOTWATER_ROBIN_EDUCATION_MARGARITO/HOTWATER_ROBIN_EDUCATION_MARGARITO.csv",
    "../data/meters/final/WATER_WOLF_EDUCATION_URSULA/WATER_WOLF_EDUCATION_URSULA.csv",
    "../data/meters/final/ELECTRICITY_GLOBAL_REACTIVE_POWER/ELECTRICITY_GLOBAL_REACTIVE_POWER.csv",
    "../data/meters/final/GAS_PANTHER