In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import torch

In [19]:
import json
import os
from glob import glob
# madalena_energy = pd.read_json("datasets/madalena/E-145/E145-august.json")

def extract_energy_json(file_path, save_path="datasets/madalena_processed"):
    """Convert per-house energy JSON dumps into one CSV per house.

    Every sub-directory of ``file_path`` is treated as one house. Each
    ``*.json`` file inside it must contain the keys ``energy_a``,
    ``energy_b`` and ``energy_c`` (one series per phase), each a list of
    records with ``time`` and ``value`` fields. The three phases are
    inner-joined on the parsed timestamp and added together into a
    ``total_energy`` column.

    Parameters
    ----------
    file_path : str
        Directory that holds one sub-directory per house.
    save_path : str, optional
        Directory the CSVs are written to (created if missing). Each house
        is written to ``<save_path>/<house directory name>.csv`` with the
        columns ``timestamp`` (index) and ``total_energy``.

    Notes
    -----
    Entries of ``file_path`` that are not directories, and house directories
    without any JSON file, are skipped; otherwise ``pd.concat`` would be
    called with an empty list and raise ``ValueError``.
    """
    phases = ("energy_a", "energy_b", "energy_c")
    os.makedirs(save_path, exist_ok=True)

    for house_name in sorted(os.listdir(file_path)):
        house_dir = os.path.join(file_path, house_name)
        # Plain files living next to the house folders are not houses.
        if not os.path.isdir(house_dir):
            continue

        house_frames = []
        # glob() returns files in arbitrary order; sort them so the row
        # order of the resulting CSV is reproducible.
        for json_file in sorted(glob(os.path.join(house_dir, "*.json"))):
            with open(json_file, "r", encoding="utf-8") as handle:
                data = json.load(handle)

            # One frame per phase (b and c seem to be 0, but are still summed).
            phase_frames = []
            for phase in phases:
                phase_df = pd.DataFrame(data[phase]).rename(columns={"value": phase})
                phase_df["timestamp"] = pd.to_datetime(phase_df["time"])
                # Keep only what is needed so the merges below cannot
                # produce clashing/duplicated helper columns.
                phase_frames.append(phase_df[["timestamp", phase]])

            energy_df = phase_frames[0]
            for other_phase_df in phase_frames[1:]:
                energy_df = energy_df.merge(other_phase_df, on="timestamp", how="inner")

            # Explicit addition (not .sum) so a NaN in any phase stays NaN.
            energy_df["total_energy"] = (
                energy_df["energy_a"] + energy_df["energy_b"] + energy_df["energy_c"]
            )
            house_frames.append(
                energy_df[["timestamp", "total_energy"]].set_index("timestamp")
            )

        # Nothing to write for a house folder without JSON files.
        if not house_frames:
            continue

        pd.concat(house_frames).to_csv(f"{save_path}/{house_name}.csv")

# Convert the raw JSON dumps under datasets/madalena into one CSV per house.
extract_energy_json("datasets/madalena")

In [46]:
# Get the comfort metrics and join them with the energy data of each house.
# Energy and comfort may not start and end at exactly the same time, so we also need to handle that.
def merge_madalena(
    comfort_path="datasets/madalena/madalena_comfort.csv",
    energy_path="datasets/madalena_processed",
    save_dir="datasets/madalena_merged",
):
    """Join each house's energy CSV with its rows of the comfort CSV.

    The comfort CSV must have a ``date`` column (parsed into the
    ``timestamp`` index) and a ``Room`` column used to select the rows of one
    house. Every ``*.csv`` file in ``energy_path`` must have a ``timestamp``
    column. Both frames are trimmed to the time window they have in common
    and inner-joined on the timestamp index.

    Parameters
    ----------
    comfort_path : str, optional
        Path of the comfort CSV shared by all houses.
    energy_path : str, optional
        Directory holding one energy CSV per house.
    save_dir : str, optional
        Directory the merged CSVs are written to (created if missing), one
        ``E<id>.csv`` per energy file.
    """
    madalena_comfort_df = pd.read_csv(comfort_path)
    madalena_comfort_df["timestamp"] = pd.to_datetime(madalena_comfort_df["date"])
    madalena_comfort_df = madalena_comfort_df.drop(columns=["date"]).set_index("timestamp")
    # Same normalisation the energy index gets below: a tz-aware index cannot
    # be compared with a tz-naive one when computing the common window.
    if getattr(madalena_comfort_df.index, "tz", None) is not None:
        madalena_comfort_df.index = madalena_comfort_df.index.tz_convert(None)

    os.makedirs(save_dir, exist_ok=True)

    for file in sorted(os.listdir(energy_path)):
        energy_file = f"{energy_path}/{file}"
        # Skip anything that is not an energy CSV (e.g. .ipynb_checkpoints).
        if not (file.endswith(".csv") and os.path.isfile(energy_file)):
            continue

        madalena_energy_df = pd.read_csv(energy_file)
        madalena_energy_df["timestamp"] = pd.to_datetime(madalena_energy_df["timestamp"])
        madalena_energy_df = madalena_energy_df.set_index("timestamp")
        if madalena_energy_df.index.tz is not None:
            madalena_energy_df.index = madalena_energy_df.index.tz_convert(None)
        madalena_energy_df = madalena_energy_df.sort_index(kind="mergesort")

        # NOTE(review): assumes file names look like "<prefix>_<id>.csv";
        # a name without "_" raises IndexError here - confirm against the
        # directory names that extract_energy_json writes.
        house_name = file.split(".")[0].split("_")[1]
        house_name = f"E{house_name}"

        room_mask = madalena_comfort_df["Room"] == house_name
        # Label slicing with .loc needs a monotonic index; on an unsorted
        # DatetimeIndex it raises KeyError or returns the wrong rows.
        madalena_comfort_house = (
            madalena_comfort_df[room_mask]
            .drop(columns=["Room"])
            .sort_index(kind="mergesort")
        )

        # Common time window: latest start and earliest end of the two frames.
        start_time = max(madalena_energy_df.index.min(), madalena_comfort_house.index.min())
        end_time = min(madalena_energy_df.index.max(), madalena_comfort_house.index.max())

        madalena_energy_df = madalena_energy_df.loc[start_time:end_time]
        madalena_comfort_house = madalena_comfort_house.loc[start_time:end_time]

        madalena_merged_df = madalena_energy_df.merge(
            madalena_comfort_house, left_index=True, right_index=True, how="inner"
        )
        madalena_merged_df.to_csv(f"{save_dir}/{house_name}.csv")

# Write the merged energy + comfort CSV for every processed house.
merge_madalena()