In [1]:
import torch

In [2]:
# Extract the Korean dataset archives using the project-local helper.
# NOTE(review): the extraction target is presumably datasets/korean_extracted,
# which the loading functions in later cells read from — confirm in load_korean_data.
from load_korean_data import extract_zips

extract_zips()

Done extracting Korean data


In [3]:
import pandas as pd

# Load the Madalena comfort CSV into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index; consider index_col=0 if it is not needed as data — confirm.
df_madalena_comfort = pd.read_csv('datasets/madalena_comfort.csv')

# Preview the first rows to check which columns were read.
df_madalena_comfort.head()

Unnamed: 0,date,Room,CO2[ppm],PM4[ug/m3],Lighting[lux],T_in[°C],RH [%],PM10[ug/m3],PM2_5[ug/m3],PM1[ug/m3],PM0_5[ug/m3],T_out [°C]
0,2024-07-01 00:00:00,E145,624.23,65.43,0.0,28.46,55.51,65.44,65.42,65.22,56.81,24.5
1,2024-07-01 01:00:00,E145,629.0,66.77,0.0,28.48,56.39,66.78,66.76,66.56,57.97,24.0
2,2024-07-01 02:00:00,E145,640.13,67.78,0.0,28.45,56.74,67.79,67.77,67.57,58.85,22.3
3,2024-07-01 03:00:00,E145,639.77,68.0,0.0,28.39,57.08,68.01,67.98,67.78,59.04,22.0
4,2024-07-01 04:00:00,E145,643.31,67.25,0.03,28.35,57.57,67.26,67.24,67.04,58.39,21.5


In [4]:
from glob import glob
import os
from multiprocessing import Pool

def load_house_data(house_path):
    """Load every appliance reading for one house and attach the total load.

    Directory layout walked by this function::

        house_path/<sub-folder>/<timestamp-folder>/<name>.parquet.gzip

    Only the first sub-folder (in sorted order) is read. Every parquet file
    must provide a numeric ``timestamp`` column in epoch milliseconds. Files
    whose name contains ``"total"`` are treated as the house's total load;
    every other file is treated as one appliance. Each frame gets an
    ``appliance`` column holding the file name without its extension.

    Args:
        house_path: Directory of a single house.

    Returns:
        A DataFrame with all appliance rows concatenated. When total-load
        files were found, their columns are inner-joined on ``timestamp``
        (appliance rows without a matching total timestamp are dropped) and
        colliding column names from the total side get a ``_total`` suffix.
        Returns ``None`` when the house has no sub-folder or no appliance
        files.
    """
    # Sorted so the chosen sub-folder does not depend on os.listdir order.
    sub_folders = sorted(
        entry
        for entry in os.listdir(house_path)
        if os.path.isdir(os.path.join(house_path, entry))
    )
    if not sub_folders:
        print(f"⚠️ No sub-folder found in {house_path}")
        return None

    house_sub_folder = os.path.join(house_path, sub_folders[0])
    appliance_frames = []
    # Collect the total-load file of EVERY timestamp folder. Keeping only the
    # last one would make the inner merge below discard all appliance rows
    # that belong to the other folders.
    total_frames = []

    for timestamp_folder in sorted(os.listdir(house_sub_folder)):
        full_timestamp_path = os.path.join(house_sub_folder, timestamp_folder)
        if not os.path.isdir(full_timestamp_path):
            continue

        # Sorted for a reproducible row order in the concatenated result.
        for file in sorted(glob(os.path.join(full_timestamp_path, "*.parquet.gzip"))):
            appliance_name = os.path.basename(file).replace(".parquet.gzip", "")

            df = pd.read_parquet(file)
            # Convert epoch milliseconds directly; avoids the float rounding
            # introduced by dividing by 1000 before converting as seconds.
            df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
            df["appliance"] = appliance_name

            if "total" in appliance_name:
                total_frames.append(df)
            else:
                appliance_frames.append(df)

    if not appliance_frames:
        return None

    appliance_df = pd.concat(appliance_frames, ignore_index=True)

    if total_frames:
        total_load_df = pd.concat(total_frames, ignore_index=True)
        appliance_df = appliance_df.merge(total_load_df, on="timestamp", suffixes=("", "_total"))

    return appliance_df

def process_house(house_id, extracted_dir="datasets/korean_extracted"):
    """Load one house's data if its directory exists.

    Args:
        house_id: Name of the house directory inside ``extracted_dir``.
        extracted_dir: Root directory holding one folder per house. Defaults
            to the path that used to be hard-coded here, so single-argument
            callers (such as ``pool.map``) behave exactly as before.

    Returns:
        A ``(house_id, data)`` tuple. ``data`` is whatever ``load_house_data``
        returns for the house directory, or ``None`` when that directory does
        not exist.
    """
    house_path = os.path.join(extracted_dir, house_id)

    if not os.path.isdir(house_path):
        return house_id, None

    print(f"🔄 Processing {house_id}...")
    return house_id, load_house_data(house_path)

def process_all_houses():
    """Load every house under ``datasets/korean_extracted`` with 4 worker processes.

    Each directory directly inside the extraction root is treated as one
    house and handed to ``process_house`` through a process pool.

    Returns:
        Dict mapping house id to the data returned for it; houses whose
        result is ``None`` are left out.
    """
    root_dir = "datasets/korean_extracted"

    house_ids = []
    for entry in os.listdir(root_dir):
        if os.path.isdir(os.path.join(root_dir, entry)):
            house_ids.append(entry)

    # Pool size is fixed at 4; adjust based on CPU cores.
    with Pool(processes=4) as workers:
        loaded = workers.map(process_house, house_ids)

    all_houses_data = {}
    for house_id, frame in loaded:
        if frame is not None:
            all_houses_data[house_id] = frame

    print(f"✅ Loaded {len(all_houses_data)} houses' data in parallel!")
    return all_houses_data
# NOTE(review): neither name below is read by the functions in the visible cells
# (create_final_dataset builds its own local all_houses_data) — confirm whether
# other cells depend on them before relying on or removing them.
extracted_korean_path = 'datasets/korean_extracted'
all_houses_data = {}

def resample_to_5Hz(df, rule="200ms"):
    """Downsample readings onto a fixed time grid by averaging each bin.

    Args:
        df: DataFrame with a datetime-like ``timestamp`` column. It is not
            modified.
        rule: Resampling frequency passed to ``DataFrame.resample``. The
            default ``"200ms"`` corresponds to 5 Hz.

    Returns:
        A new DataFrame with ``timestamp`` as a regular column and the
        per-bin mean of every numeric column. Non-numeric columns (for
        example ``appliance``) are dropped, and bins without any rows hold
        NaN.

    Note:
        All rows that fall into a bin are averaged together, whichever
        appliance they came from. Filter or group the frame beforehand if
        per-appliance series are needed.
    """
    indexed = df.set_index("timestamp")  # resample() needs a datetime index
    # numeric_only=True: on pandas >= 2.0, mean() raises TypeError when the
    # frame still carries string columns such as "appliance".
    return indexed.resample(rule).mean(numeric_only=True).reset_index()


In [5]:
def create_final_dataset(output_dir="datasets/processed_data"):
    """Load all houses, resample each one and save one parquet file per house.

    Args:
        output_dir: Directory that receives the ``<house_id>_5Hz.parquet``
            files. Defaults to the previously hard-coded location and is
            created when missing.

    Returns:
        None. The results are written to disk only.
    """
    all_houses_data = process_all_houses()

    # to_parquet does not create missing parent folders, so make sure the
    # target directory exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    for house_id, df in all_houses_data.items():
        df_5Hz = resample_to_5Hz(df)
        df_5Hz.to_parquet(os.path.join(output_dir, f"{house_id}_5Hz.parquet"), compression="gzip")
        print(f"📁 Saved downsampled data for {house_id}")

    # Disabled: merge all houses into a single final dataset.
    # final_dataset = pd.concat([resample_to_5Hz(df) for df in all_houses_data.values()], ignore_index=True)
    # final_dataset.to_parquet("final_microgrid_5Hz.parquet", compression="gzip")

    # print("✅ Final dataset created and saved!")

In [None]:
# Run the full pipeline: load every house, resample it and write one parquet file per house.
create_final_dataset()