In [1]:
import torch
import pandas as pd

In [2]:
#extract korean dataset
from load_korean_data import extract_zips

extract_zips()

Done extracting Korean data


In [None]:

df_madalena_comfort = pd.read_csv('datasets/madalena_comfort.csv')

df_madalena_comfort.head(50)

Unnamed: 0,date,Room,CO2[ppm],PM4[ug/m3],Lighting[lux],T_in[°C],RH [%],PM10[ug/m3],PM2_5[ug/m3],PM1[ug/m3],PM0_5[ug/m3],T_out [°C]
0,2024-07-01 00:00:00,E145,624.23,65.43,0.0,28.46,55.51,65.44,65.42,65.22,56.81,24.5
1,2024-07-01 01:00:00,E145,629.0,66.77,0.0,28.48,56.39,66.78,66.76,66.56,57.97,24.0
2,2024-07-01 02:00:00,E145,640.13,67.78,0.0,28.45,56.74,67.79,67.77,67.57,58.85,22.3
3,2024-07-01 03:00:00,E145,639.77,68.0,0.0,28.39,57.08,68.01,67.98,67.78,59.04,22.0
4,2024-07-01 04:00:00,E145,643.31,67.25,0.03,28.35,57.57,67.26,67.24,67.04,58.39,21.5
5,2024-07-01 05:00:00,E145,619.97,68.98,3.82,28.41,57.81,68.99,68.96,68.76,59.89,21.3
6,2024-07-01 06:00:00,E145,565.29,74.8,35.68,28.62,57.59,74.81,74.79,74.57,64.95,22.0
7,2024-07-01 07:00:00,E145,525.4,69.85,32.15,28.7,57.31,69.86,69.84,69.63,60.65,22.5
8,2024-07-01 08:00:00,E145,494.33,67.51,5.58,28.79,56.73,67.53,67.5,67.3,58.62,23.3
9,2024-07-01 09:00:00,E145,486.38,82.8,3.48,28.8,56.46,82.81,82.78,82.54,71.89,25.5


In [None]:
from glob import glob
import os
from multiprocessing import Pool
import gc


def optimize_memory(df):
    for col in df.select_dtypes(include=["int64", "float64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")  
        df[col] = pd.to_numeric(df[col], downcast="float")   
    return df

def load_house_data(house_path, save_dir="datasets/korean_processed"):
    os.makedirs(save_dir, exist_ok=True)
    house_id = os.path.basename(house_path)
    sub_folders = [d for d in os.listdir(house_path) if os.path.isdir(os.path.join(house_path, d))]

    if not sub_folders:
        print(f"No sub-folder found in {house_path}")
        return None
    house_sub_folder = os.path.join(house_path, sub_folders[0])
    for timestamp_folder in sorted(os.listdir(house_sub_folder)):
        full_timestamp_path = os.path.join(house_sub_folder, timestamp_folder)
        print(f"Timestamp: {timestamp_folder}")
        if os.path.isdir(full_timestamp_path):
            daily_appliance_df_list = []
            daily_total_load_df = None
            for file in glob(os.path.join(full_timestamp_path, "*.parquet.gzip")):
                appliance_name = os.path.basename(file).replace(".parquet.gzip", "")

                df = pd.read_parquet(file, engine="pyarrow")

                # ✅ Convert timestamp correctly
                df["timestamp"] = pd.to_datetime(df["timestamp"] / 1000, unit="s")

                # ✅ Optimize memory usage
                df = optimize_memory(df)  
                if "total" in appliance_name.lower():
                    daily_total_load_df = df
                else:
                    df["appliance"] = appliance_name
                    daily_appliance_df_list.append(df)

            if daily_appliance_df_list:
                daily_appliance_df = pd.concat(daily_appliance_df_list, ignore_index=True)
                daily_appliance_df = daily_appliance_df.pivot_table(
                    index="timestamp",
                    columns="appliance",
                    values=["active_power", "reactive_power"]
                )
                daily_appliance_df.columns = [f"{col[1]}_{col[0]}" for col in daily_appliance_df.columns]
                daily_appliance_df = daily_appliance_df.reset_index()

                if daily_total_load_df is not None:
                    daily_appliance_df = daily_appliance_df.merge(daily_total_load_df, on="timestamp", suffixes=("", "_total"))

                daily_appliance_df.to_parquet(f"{save_dir}/{house_id}_{timestamp_folder}.parquet", compression="gzip")
                print(f"✅ Processed {house_id} for day {timestamp_folder}")

                del daily_appliance_df_list, daily_total_load_df
                gc.collect()
    print(f"Finished processing {house_id}")

def process_all_houses():
    extracted_data_dir = "datasets/korean_extracted"
    processed_data_dir = "datasets/korean_processed"

    os.makedirs(processed_data_dir, exist_ok=True) 
    houses = [d for d in os.listdir(extracted_data_dir) if os.path.isdir(os.path.join(extracted_data_dir, d))]
    for house_id in houses:
        house_path = os.path.join(extracted_data_dir, house_id)
        if os.path.isdir(house_path):
            print(f"Processing house {house_id}...")

            house_df = load_house_data(house_path)

            if house_df is not None:
                # Save each house’s data separately
                house_df.to_parquet(f"{processed_data_dir}/{house_id}.parquet", compression="gzip")
                print(f"Finished processing {house_id}")

def resample_to_5Hz():
    processed_data_dir = "datasets/korean_processed"
    resampled_data_dir = "datasets/resampled_data"

    os.makedirs(resampled_data_dir, exist_ok=True)

    for house_file in os.listdir(processed_data_dir):
        if house_file.endswith(".parquet"):
            house_path = os.path.join(processed_data_dir, house_file)
            house_df = pd.read_parquet(house_path)

            # Set timestamp as index and resample
            house_df = house_df.set_index("timestamp")
            house_df_5Hz = house_df.resample("200ms").mean().reset_index()

            # Save resampled data
            house_df_5Hz.to_parquet(f"{resampled_data_dir}/{house_file}", compression="gzip")
            print(f"Resampled {house_file} to 5Hz")


In [None]:
def create_final_dataset():
    all_houses_data = process_all_houses()

    for house_id, df in all_houses_data.items():
        df_5Hz = resample_to_5Hz(df)
        df_5Hz.to_parquet(f"datasets/processed_data/{house_id}_5Hz.parquet", compression="gzip")
        print(f"Saved downsampled data for {house_id}")

In [None]:
load_house_data("datasets/korean_extracted/enertalk-dataset-00") 

Done extracting Korean data
Timestamp: 20161101
✅ Processed enertalk-dataset-00 for day 20161101
Timestamp: 20161102
✅ Processed enertalk-dataset-00 for day 20161102
Timestamp: 20161103
✅ Processed enertalk-dataset-00 for day 20161103
Timestamp: 20161104
✅ Processed enertalk-dataset-00 for day 20161104
Timestamp: 20161105
✅ Processed enertalk-dataset-00 for day 20161105
Timestamp: 20161106
✅ Processed enertalk-dataset-00 for day 20161106
Timestamp: 20161107
✅ Processed enertalk-dataset-00 for day 20161107
Timestamp: 20161109
✅ Processed enertalk-dataset-00 for day 20161109
Timestamp: 20161110
✅ Processed enertalk-dataset-00 for day 20161110
Timestamp: 20161111
✅ Processed enertalk-dataset-00 for day 20161111
Timestamp: 20161112
✅ Processed enertalk-dataset-00 for day 20161112
Timestamp: 20161113
✅ Processed enertalk-dataset-00 for day 20161113
Timestamp: 20161114
✅ Processed enertalk-dataset-00 for day 20161114
Timestamp: 20161115
✅ Processed enertalk-dataset-00 for day 20161115
Timest

TypeError: resample_to_5Hz() missing 1 required positional argument: 'df'

In [None]:
def merge_house_data(house_id, processed_data_dir="datasets/korean_processed"):
    """
    Merges all daily parquet files for a house into a single DataFrame.
    """

    house_files = glob(os.path.join(processed_data_dir, f"{house_id}_*.parquet"))

    house_dataframes = [pd.read_parquet(file) for file in sorted(house_files)]

    merged_df = pd.concat(house_dataframes, ignore_index=True)
    
    return merged_df

house_0_df = merge_house_data("enertalk-dataset-00")

print(f" Merged dataset shape: {house_0_df.shape}")

✅ Merged dataset shape: (31882986, 19)


In [36]:
def process_parquet(file):
    df = pd.read_parquet(file, engine="pyarrow")
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")  
    df = df.set_index("timestamp").sort_index()
    df = df[~df.index.duplicated(keep="first")]
    time_diffs = df.index.to_series().diff().dropna()
    detected_freq = time_diffs.mode()[0]  
    detected_freq_str = f"{int(detected_freq.total_seconds() * 1000)}ms"
    df = df.asfreq(detected_freq_str)
    df = df.interpolate(method="time")
    df_resampled = df.resample("1min").mean()
    return df_resampled

    
house_path = "datasets/korean_extracted/enertalk-dataset-00/00"
full_timestamp_path = "datasets/korean_extracted/enertalk-dataset-00/00/20161101"

save_dir = "datasets/korean_processed"
def merge_daily_files(folder_path):
    files = [f for f in os.listdir(folder_path) if f.endswith(".parquet.gzip")]

    dfs = {}  

    for file in files:
        file_path = os.path.join(folder_path, file)
        df = process_parquet(file_path) 
        device_name = file.split("_", 1)[1].split(".parquet")[0]

        df = df.rename(columns={
            "active_power": f"{device_name}_active_power",
            "reactive_power": f"{device_name}_reactive_power"
        })

        dfs[device_name] = df  

    merged_df = list(dfs.values())[0]  

    for df in list(dfs.values())[1:]:
        merged_df = merged_df.merge(df, left_index=True, right_index=True, how="outer")

    del dfs
    gc.collect()
    return merged_df

merged_df = merge_daily_files(full_timestamp_path)
merged_df = merged_df.fillna(0)

merged_df.to_csv("datasets/korean_processed/merged_daily_data.csv", index=False)



In [37]:
test_df = pd.read_csv("datasets/korean_processed/merged_daily_data.csv")
print(test_df.head(100).to_string())

    total_active_power  total_reactive_power  washing-machine_active_power  washing-machine_reactive_power  rice-cooker_active_power  rice-cooker_reactive_power  water-purifier_active_power  water-purifier_reactive_power  microwave_active_power  microwave_reactive_power  kimchi-fridge_active_power  kimchi-fridge_reactive_power
0           226.278702            -41.480791                      0.000000                        0.000000                  0.000000                    0.000000                     0.000000                       0.000000                0.000000                  0.000000                    0.000000                      0.000000
1           426.839889            -85.463171                      0.117980                        6.110303                  0.000000                    0.000000                   340.030000                      -1.647143                0.475614                 -1.545066                    0.453393                     -3.483571
2           4