In [6]:
import pandas as pd
import numpy as np
import tqdm 
import os
from sklearn.model_selection import train_test_split

In [7]:
def count_labels(labels):
    unique_labels, counts = np.unique(labels, return_counts=True)
    label_counts = dict(zip(unique_labels, counts))
    return label_counts

# Train test data split

In [8]:
# Define the base directory
base_dir = "/opt/nilm-shared-data/nilm_device_detection/other_dataset/RAE_dataset/transform_data/labeled_data"

# Define the subdirectories
subdirectories = ["clothes_dryer", "fridge", "lp3", "lp16", "lp20", "oven"]

# Initialize an empty list to store dataframes
all_dataframes = []

# Iterate through each subdirectory
for subdir in subdirectories:
    subdir_path = os.path.join(base_dir, subdir)
    
    # Get a list of all excel files in the subdirectory
    excel_files = [file for file in os.listdir(subdir_path) if file.endswith(".xlsx")]
    
    # Iterate through each excel file
    for excel_file in tqdm.tqdm(excel_files, desc=f"Working on {subdir}"):
        file_path = os.path.join(subdir_path, excel_file)
        
        # Read the excel file into a pandas dataframe
        df = pd.read_excel(file_path)
        df = df[["unix_ts", "Irms", "pf", "P", "Q", "S", "label"]]
        df = df.rename(columns={
            "pf": "AvgPowerFactor",
            "label": "Label"
        })
        
        # Append the dataframe to the list
        all_dataframes.append(df)

Working on clothes_dryer:   0%|          | 0/44 [00:00<?, ?it/s]

Working on clothes_dryer: 100%|██████████| 44/44 [00:02<00:00, 18.87it/s]
Working on fridge: 100%|██████████| 3144/3144 [01:09<00:00, 45.34it/s]
Working on lp3: 100%|██████████| 70/70 [00:08<00:00,  7.96it/s]
Working on lp16: 100%|██████████| 298/298 [00:19<00:00, 15.25it/s]
Working on lp20: 100%|██████████| 372/372 [00:21<00:00, 17.32it/s]
Working on oven: 100%|██████████| 6/6 [00:00<00:00,  8.25it/s]


In [11]:
big_dataframe = pd.concat(all_dataframes)
print(big_dataframe.shape)
big_dataframe.head()

(4204963, 7)


Unnamed: 0,unix_ts,Irms,AvgPowerFactor,P,Q,S,Label
0,1460681086,41.4,0.995318,4889,372,4912,clothes_dryer_s2
1,1460681087,41.2,0.995694,4856,372,4877,clothes_dryer_s2
2,1460681088,41.0,0.995256,4825,373,4848,clothes_dryer_s2
3,1460681089,40.8,0.994827,4808,372,4833,clothes_dryer_s2
4,1460681090,40.5,0.994184,4786,383,4814,clothes_dryer_s2


In [12]:
original_df = big_dataframe.copy()

In [20]:
#Label distribution of the data
count_labels(big_dataframe["Label"])

{'clothes_dryer_s1': 33815,
 'clothes_dryer_s2': 60754,
 'fridge_s1': 2254224,
 'fridge_s2': 22230,
 'kitchen_oven_s1': 18089,
 'kitchen_oven_s2': 5958,
 'kitchen_oven_s3': 3047,
 'lp16_s1': 704397,
 'lp20_s1': 765575,
 'lp20_s2': 10194,
 'lp3_s1': 303787,
 'lp3_s2': 22893}

In [None]:
# Change lp20_s1 and lp3_s1 -> lp320_s1
# big_dataframe_change_label = big_dataframe.copy()
# big_dataframe_change_label.loc[big_dataframe_change_label["Label"].isin(["lp3_s1", "lp20_s1"]), "Label"] = "lp320_s1"
# count_labels(big_dataframe_change_label["Label"])

In [None]:
#Drop lp3
# df_rm_lp3 = big_dataframe[big_dataframe["Label"] != "lp3_s1"]
# df_rm_lp3 = df_rm_lp3[df_rm_lp3["Label"] != "lp3_s2"]
# count_labels(df_rm_lp3["Label"])

In [21]:
#Drop lp3
df_rm_lp20 = big_dataframe[big_dataframe["Label"] != "lp20_s1"]
df_rm_lp20 = df_rm_lp20[df_rm_lp20["Label"] != "lp20_s2"]
count_labels(df_rm_lp20["Label"])

{'clothes_dryer_s1': 33815,
 'clothes_dryer_s2': 60754,
 'fridge_s1': 2254224,
 'fridge_s2': 22230,
 'kitchen_oven_s1': 18089,
 'kitchen_oven_s2': 5958,
 'kitchen_oven_s3': 3047,
 'lp16_s1': 704397,
 'lp3_s1': 303787,
 'lp3_s2': 22893}

# Save to file

In [22]:
train_df, temp_df, y_train, y_temp = train_test_split(df_rm_lp20, df_rm_lp20["Label"], test_size=0.3, random_state=42)
test_df, val_df, y_test, y_val = train_test_split(temp_df, y_temp, test_size=0.5, random_state=42)

In [23]:
print(f"Train dataset: {len(train_df)}")
print(count_labels(train_df["Label"]))
print(f"Test dataset: {len(test_df)}")
print(count_labels(test_df["Label"]))
print(f"Validation dataset: {len(val_df)}")
print(count_labels(val_df["Label"]))

Train dataset: 2400435
{'clothes_dryer_s1': 23849, 'clothes_dryer_s2': 42667, 'fridge_s1': 1578031, 'fridge_s2': 15477, 'kitchen_oven_s1': 12756, 'kitchen_oven_s2': 4161, 'kitchen_oven_s3': 2130, 'lp16_s1': 492441, 'lp3_s1': 212872, 'lp3_s2': 16051}
Test dataset: 514379
{'clothes_dryer_s1': 4961, 'clothes_dryer_s2': 8893, 'fridge_s1': 337812, 'fridge_s2': 3329, 'kitchen_oven_s1': 2685, 'kitchen_oven_s2': 909, 'kitchen_oven_s3': 485, 'lp16_s1': 106122, 'lp3_s1': 45696, 'lp3_s2': 3487}
Validation dataset: 514380
{'clothes_dryer_s1': 5005, 'clothes_dryer_s2': 9194, 'fridge_s1': 338381, 'fridge_s2': 3424, 'kitchen_oven_s1': 2648, 'kitchen_oven_s2': 888, 'kitchen_oven_s3': 432, 'lp16_s1': 105834, 'lp3_s1': 45219, 'lp3_s2': 3355}


In [24]:
base_dir = "/opt/nilm-shared-data/nilm_device_detection/other_dataset/RAE_dataset/train_test_data/rm_lp20"
train_df.to_csv(os.path.join(base_dir, "RAE_train.csv"))
test_df.to_csv(os.path.join(base_dir, "RAE_test.csv"))
val_df.to_csv(os.path.join(base_dir, "RAE_val.csv"))

# Create segment series data

In [None]:
random_files = ['lights_plugs_3_48.xlsx' 'lights_plugs_3_10.xlsx'
 'lights_and_plugs_20_349.xlsx' 'lights_and_plugs_20_157.xlsx'
 'lights_and_plugs_20_299.xlsx' 'lights_and_plugs_20_347.xlsx'
 'lights_and_plugs_20_353.xlsx' 'lights_and_plugs_20_231.xlsx'
 'lights_plugs_3_12.xlsx' 'lights_and_plugs_20_218.xlsx'
 'lights_and_plugs_20_187.xlsx' 'lights_plugs_3_8.xlsx'
 'lights_and_plugs_20_74.xlsx' 'lights_and_plugs_20_155.xlsx'
 'lights_and_plugs_20_185.xlsx' 'lights_plugs_3_1.xlsx'
 'lights_and_plugs_20_190.xlsx']

In [None]:
series_data_files = np.array(lp3_segments_files)
series_data_files = np.concatenate((series_data_files, lp20_segments_files))
#Shuffle the data
np.random.seed(42)
np.random.shuffle(series_data_files)
#Check the files
print(len(series_data_files))
print(series_data_files)

In [None]:
lp3_base_dir = "/opt/nilm-shared-data/nilm_device_detection/other_dataset/RAE_dataset/transform_data/labeled_data/lp3"
lp20_base_dir = "/opt/nilm-shared-data/nilm_device_detection/other_dataset/RAE_dataset/transform_data/labeled_data/lp20"

series_test_df = None
for series_file in series_data_files:
    # Select data
    if "lights_plugs_3" in series_file:
        current_series_df = pd.read_excel(os.path.join(lp3_base_dir, series_file))
    elif "lights_and_plugs_20" in series_file:
        current_series_df = pd.read_excel(os.path.join(lp20_base_dir, series_file))
    # Change the name    
    current_series_df = current_series_df.rename(columns={
            "pf": "AvgPowerFactor",
            "label": "Label"
    })
    
    if series_test_df is None:
        series_test_df = current_series_df
    else:
        series_test_df = pd.concat([series_test_df, current_series_df])
        
print(len(series_test_df))
series_test_df.head()

In [None]:
series_test_df.to_csv("/opt/nilm-shared-data/nilm_device_detection/other_dataset/RAE_dataset/train_test_data/segments_test/series_data/lp_3_20_series_data_1.csv")