In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time as time

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from data_repository import DataRepository
from model_training_ultils import ModelEvaluationUltis
from sklearn.ensemble import RandomForestClassifier

# Import data

## Import train and test data

In [2]:
# Shared helper objects for the rest of the notebook.
# NOTE(review): `evaludation_tool` is not referenced by any cell visible in this notebook — confirm it is still needed.
evaludation_tool = ModelEvaluationUltis()
# The repository receives the path of the .env file one directory up (presumably its configuration — TODO confirm).
data_repo = DataRepository("../.env")

In [3]:
# Load the FFT data twice: once with clean_data=True and once with clean_data=False.
# Only positions 0, 1, 4 and 5 of each returned 6-tuple are kept (train X/y and test X/y);
# positions 2 and 3 are discarded — presumably a validation split, TODO confirm.
train_clean, y_train_clean, _, _, test_clean, y_test_clean = data_repo.load_fft_data(clean_data=True)
train_unclean, y_train_unclean, _, _, test_unclean, y_test_unclean = data_repo.load_fft_data(clean_data=False)

In [4]:
# Report feature-matrix and label shapes for the clean and unclean training sets.
# Both lines use the same "y: " label so the two reports read identically.
print(f"Train dataset clean: {train_clean.shape}, y: {y_train_clean.shape}")
print(f"Train dataset unclean: {train_unclean.shape}, y: {y_train_unclean.shape}")

Train dataset clean: (8188, 4501), y: (8188,)
Train dataset unclean: (8649, 4501), y(8649,)


In [5]:
# Report feature-matrix and label shapes for the clean and unclean test sets.
# Both lines use the same "y: " label so the two reports read identically.
print(f"Test dataset clean: {test_clean.shape}, y: {y_test_clean.shape}")
print(f"Test dataset unclean: {test_unclean.shape}, y: {y_test_unclean.shape}")

Test dataset clean: (2012, 4501), y: (2012,)
Test dataset unclean: (2160, 4501), y(2160,)


In [6]:
def find_top_k_indices(amplitudes, k):
    """Return the ``k`` largest values of ``amplitudes``, largest first.

    Despite its name, this returns the amplitude values themselves, not
    their positions.

    Args:
        amplitudes: 1-D NumPy array of amplitudes.
        k: Number of values to keep. When ``k`` exceeds the array length,
            every value is returned.

    Returns:
        The selected values in descending order; an empty slice of
        ``amplitudes`` when ``k`` is 0.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if k == 0:
        # ``[-0:]`` is the same slice as ``[0:]`` and would select every
        # element, so the zero case has to be handled explicitly.
        return amplitudes[:0]
    # argsort is ascending: keep the last k positions, then reverse them.
    top_k_indices = np.argsort(amplitudes)[-k:][::-1]
    return amplitudes[top_k_indices]

def get_x_by_top_ampls(k, ampls):
    """Build a feature matrix from the top-``k`` amplitudes of each row.

    Args:
        k: Number of largest amplitudes kept per row.
        ampls: Iterable of per-sample amplitude arrays.

    Returns:
        NumPy array with one row of ``find_top_k_indices`` output per sample.
    """
    return np.array([find_top_k_indices(amplitudes=row, k=k) for row in ampls])

# One feature per sample: the single largest amplitude (k=1) of each row.
X_train_clean, X_train_unclean, X_test_clean, X_test_unclean = (
    get_x_by_top_ampls(k=1, ampls=spectra)
    for spectra in (train_clean, train_unclean, test_clean, test_unclean)
)

__Label encoder__

In [7]:
# Fit the encoder on the training labels only, so the class mapping never
# depends on test data, then apply that same mapping to every split.
# A test label that is absent from training now raises in `transform`.
# NOTE: the label arrays are overwritten in place, so this cell must run
# exactly once per kernel session; re-running it would refit the encoder on
# already-encoded integers and break later transforms of string labels.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate([y_train_clean, y_train_unclean]))
y_train_clean = label_encoder.transform(y_train_clean)
y_train_unclean = label_encoder.transform(y_train_unclean)
y_test_clean = label_encoder.transform(y_test_clean)
y_test_unclean = label_encoder.transform(y_test_unclean)

In [8]:
# Show the class names known to the encoder, then the integer code assigned to each.
print(list(label_encoder.classes_))
print(label_encoder.transform(list(label_encoder.classes_)))

['error', 'normal', 'overcurrent', 'overheating', 'zero']
[0 1 2 3 4]


In [9]:
# Label counts for each split, computed on the already-encoded labels.
# Printed order: unclean test, clean test, clean train, unclean train.
print(data_repo.count_labels(y_test_unclean))
print(data_repo.count_labels(y_test_clean))
print(data_repo.count_labels(y_train_clean))
print(data_repo.count_labels(y_train_unclean))

{0: 445, 1: 419, 2: 454, 3: 411, 4: 431}
{0: 414, 1: 394, 2: 421, 3: 374, 4: 409}
{0: 1622, 1: 1612, 2: 1653, 3: 1688, 4: 1613}
{0: 1718, 1: 1734, 2: 1709, 3: 1766, 4: 1722}


In [10]:
# Shapes of the top-amplitude feature matrices, one per line.
print(X_train_clean.shape, X_test_clean.shape, X_test_unclean.shape, sep="\n")

(8188, 1)
(2012, 1)
(2160, 1)


In [11]:
def get_accuracy_with_training_sets(file_name, X_test, y_test):
    """Train one random forest per training set in ``file_name`` and score each on a fixed test set.

    Args:
        file_name: Path handed to ``data_repo.read_train_fft_survey_data``.
        X_test: Test features every fitted model predicts on.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        1-D NumPy array with one accuracy per training set, in index order.
    """
    X_trains, y_trains = data_repo.read_train_fft_survey_data(file_name)
    n_sets = X_trains.shape[0]
    scores = []
    for idx in range(n_sets):
        # Same feature extraction and label encoding as the main datasets.
        features = get_x_by_top_ampls(k=1, ampls=X_trains[idx])
        targets = label_encoder.transform(y_trains[idx])
        forest = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
        forest.fit(features, targets)
        predictions = forest.predict(X_test)
        scores.append(accuracy_score(y_true=y_test, y_pred=predictions))
    return np.array(scores)

In [12]:
def get_accuracy_with_test_sets(file_name, X_train, y_train):
    """Score a random forest trained on ``(X_train, y_train)`` against every test set in ``file_name``.

    Args:
        file_name: Path handed to ``data_repo.read_test_fft_survey_data``.
        X_train: Training features for the classifier.
        y_train: Encoded training labels.

    Returns:
        1-D NumPy array with one accuracy per test set, in index order.
    """
    print("[+] Working with file: "+file_name)
    X_tests, y_tests = data_repo.read_test_fft_survey_data(file_name)

    # The training data and the seed are the same for every test set, so a
    # single fit gives exactly the model that a per-iteration refit would.
    rf_current = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
    rf_current.fit(X_train, y_train)

    accuracy_arr = []
    for i in range(X_tests.shape[0]):
        X_test = get_x_by_top_ampls(k=1, ampls=X_tests[i])
        y_test = label_encoder.transform(y_tests[i])
        y_pred = rf_current.predict(X_test)
        accuracy_arr.append(accuracy_score(y_pred=y_pred, y_true=y_test))
    return np.array(accuracy_arr)

# Boundary survey (Khảo sát biên)

In [13]:
# Baseline model trained on the unclean training set (100 trees, entropy criterion, fixed seed).
unclean_rf_fft = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
unclean_rf_fft.fit(X_train_unclean, y_train_unclean)

In [14]:
# Baseline model trained on the clean training set, with the same hyper-parameters as the unclean baseline.
clean_rf_fft = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
clean_rf_fft.fit(X_train_clean, y_train_clean)

In [29]:
# Unclean train - unclean test: accuracy of the unclean-trained model on the unclean test set.
accuracy_score(y_pred=unclean_rf_fft.predict(X_test_unclean), y_true=y_test_unclean)

0.9550925925925926

In [28]:
# Unclean train - clean test: accuracy of the unclean-trained model on the clean test set.
accuracy_score(y_pred=unclean_rf_fft.predict(X_test_clean), y_true=y_test_clean)

0.9726640159045725

In [25]:
# Clean train - unclean test: accuracy of the clean-trained model on the unclean test set.
accuracy_score(y_pred=clean_rf_fft.predict(X_test_unclean), y_true=y_test_unclean)

0.950925925925926

In [27]:
# Clean train - clean test: accuracy of the clean-trained model on the clean test set.
accuracy_score(y_pred=clean_rf_fft.predict(X_test_clean), y_true=y_test_clean)

0.9751491053677932

# Survey with training dataset

## With CL

In [18]:
# For each value of i, load the matching "dirty removed" training file, train
# one model per training set in it, and score every model on the clean test set.
# NOTE(review): the data path is an absolute, machine-specific location.
clean_accuracy_scores = []
for i in [0.1, 0.2, 0.4, 0.6, 0.8]:
    print(f"[+] Working with i = {i}")
    file_name = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_{i}_dirty_removed.xlsx"
    # The keyword must match the function's `file_name` parameter; a
    # misspelled keyword raises TypeError before any work is done.
    clean_accuracy_scores.append(get_accuracy_with_training_sets(file_name=file_name, X_test=X_test_clean, y_test=y_test_clean))
# One row per value of i, one column per training set in the file.
clean_accuracy_scores = np.array(clean_accuracy_scores)

[+] Working with i = 0.1
[+] Working with i = 0.2
[+] Working with i = 0.4
[+] Working with i = 0.6
[+] Working with i = 0.8


## Without CL

In [20]:
# Same training files as the clean run, but every model is scored on the
# unclean test set.
unclean_accuracy_scores = []
for i in (0.1, 0.2, 0.4, 0.6, 0.8):
    print(f"[+] Working with i = {i}")
    file_name = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_{i}_dirty_removed.xlsx"
    unclean_accuracy_scores += [
        get_accuracy_with_training_sets(
            file_name=file_name,
            X_test=X_test_unclean,
            y_test=y_test_unclean,
        )
    ]
# One row per value of i, one column per training set in the file.
unclean_accuracy_scores = np.array(unclean_accuracy_scores)

[+] Working with i = 0.1
[+] Working with i = 0.2
[+] Working with i = 0.4
[+] Working with i = 0.6
[+] Working with i = 0.8


## Write to files

In [21]:
# Wrap the clean-test accuracy matrix in a DataFrame and display it.
clean_accuracy_scores_df = pd.DataFrame(clean_accuracy_scores)
clean_accuracy_scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.974155,0.973161,0.970676,0.974652,0.973161,0.974155,0.973658,0.972167,0.972664,0.972664
1,0.973161,0.972167,0.972664,0.973161,0.973161,0.970676,0.973161,0.972664,0.969682,0.974155
2,0.974652,0.97167,0.972167,0.973658,0.975646,0.972167,0.974155,0.975646,0.973161,0.972664
3,0.974155,0.971173,0.973161,0.974155,0.977137,0.972167,0.974155,0.970676,0.973658,0.976143
4,0.974155,0.974155,0.975646,0.974652,0.977634,0.974155,0.977137,0.974652,0.975646,0.975149


In [22]:
# Wrap the unclean-test accuracy matrix in a DataFrame and display it.
unclean_accuracy_scores_df = pd.DataFrame(unclean_accuracy_scores)
unclean_accuracy_scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.955556,0.955556,0.953241,0.958333,0.955556,0.956481,0.954167,0.955556,0.95463,0.95463
1,0.95463,0.95463,0.955093,0.956019,0.955556,0.953241,0.955556,0.955093,0.952778,0.956481
2,0.95787,0.955556,0.956019,0.956481,0.958333,0.956481,0.956481,0.958333,0.955093,0.956944
3,0.958333,0.95463,0.954167,0.957407,0.959722,0.955093,0.958333,0.953241,0.958796,0.959722
4,0.953704,0.956944,0.958796,0.956481,0.959259,0.956019,0.958796,0.958796,0.958333,0.959259


In [23]:
# Save the clean-test accuracies of the training-set survey to Excel, without the row index.
clean_accuracy_scores_df.to_excel("../../output/survey_data/20240115_remove_dirty_fft_train_set_clean_acc_scores.xlsx", index=False)

In [24]:
# Save the unclean-test accuracies of the training-set survey to Excel, without the row index.
unclean_accuracy_scores_df.to_excel("../../output/survey_data/20240115_remove_dirty_fft_train_set_unclean_acc_scores.xlsx", index=False)

# Survey with test dataset

## Clean dataset

In [15]:
# Reference accuracy: clean-trained model evaluated on the full unclean test set.
# NOTE(review): this rebuilds `clean_rf_fft` with the same data and hyper-parameters as the
# earlier baseline cell, and `acc_0` repeats the "Clean train - Unclean test" score.
clean_rf_fft = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
clean_rf_fft.fit(X_train_clean, y_train_clean)
y_pred_0 = clean_rf_fft.predict(X_test_unclean)
acc_0 = accuracy_score(y_pred=y_pred_0, y_true=y_test_unclean)
acc_0

0.950925925925926

In [16]:
# Score the clean-trained model on every test set of each "dirty removed"
# test file, one file per value of i.
clean_test_accuracy = []
for i in (0.1, 0.2, 0.4, 0.6, 0.8):
    test_file_name = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_{i}_dirty_removed.xlsx"
    print(f"[+] Working with i = {i}")
    clean_test_accuracy += [
        get_accuracy_with_test_sets(
            file_name=test_file_name,
            X_train=X_train_clean,
            y_train=y_train_clean,
        )
    ]
# One row per value of i, one column per test set in the file.
clean_test_accuracy = np.array(clean_test_accuracy)

[+] Working with i = 0.1
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.1_dirty_removed.xlsx
[+] Working with i = 0.2
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.2_dirty_removed.xlsx
[+] Working with i = 0.4
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.4_dirty_removed.xlsx
[+] Working with i = 0.6
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.6_dirty_removed.xlsx
[+] Working with i = 0.8
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.8_dirty_removed.xlsx


## Unclean dataset

In [17]:
# Same test files as the clean run, but the model is trained on the unclean
# training set.
unclean_test_accuracy = []
for i in (0.1, 0.2, 0.4, 0.6, 0.8):
    test_file_name = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_{i}_dirty_removed.xlsx"
    print(f"[+] Working with i = {i}")
    unclean_test_accuracy += [
        get_accuracy_with_test_sets(
            file_name=test_file_name,
            X_train=X_train_unclean,
            y_train=y_train_unclean,
        )
    ]
# One row per value of i, one column per test set in the file.
unclean_test_accuracy = np.array(unclean_test_accuracy)

[+] Working with i = 0.1
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.1_dirty_removed.xlsx
[+] Working with i = 0.2
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.2_dirty_removed.xlsx
[+] Working with i = 0.4
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.4_dirty_removed.xlsx
[+] Working with i = 0.6
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.6_dirty_removed.xlsx
[+] Working with i = 0.8
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.8_dirty_removed.xlsx


## Write to files

In [18]:
# Wrap both test-set survey accuracy matrices in DataFrames for display and export.
clean_test_accuracy_df = pd.DataFrame(clean_test_accuracy)
unclean_test_accuracy_df = pd.DataFrame(unclean_test_accuracy)

In [19]:
# Display the accuracies obtained with the clean-trained model.
clean_test_accuracy_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.952958,0.952958,0.953423,0.953889,0.952492,0.953889,0.954355,0.952492,0.953889,0.953889
1,0.954503,0.955441,0.954972,0.956848,0.954972,0.954503,0.955441,0.954503,0.954972,0.954503
2,0.960533,0.958631,0.961008,0.96291,0.958631,0.960533,0.961484,0.959582,0.959106,0.96291
3,0.963821,0.964785,0.965268,0.96575,0.96575,0.964785,0.963821,0.964785,0.964785,0.966233
4,0.970646,0.970646,0.969178,0.96771,0.968689,0.969178,0.970157,0.9682,0.969667,0.970157


In [20]:
# Display the accuracies obtained with the unclean-trained model.
unclean_test_accuracy_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.956218,0.956218,0.95715,0.95715,0.955752,0.956684,0.95715,0.956218,0.956684,0.95715
1,0.954972,0.956848,0.958724,0.9606,0.959662,0.957786,0.958724,0.957786,0.958255,0.957786
2,0.961959,0.958155,0.963386,0.96291,0.961959,0.96291,0.961959,0.961484,0.95768,0.964812
3,0.964303,0.962856,0.964303,0.96575,0.96575,0.965268,0.964785,0.96575,0.963338,0.964785
4,0.969178,0.967221,0.96771,0.967221,0.968689,0.9682,0.968689,0.9682,0.9682,0.9682


In [21]:
# Save both test-set survey accuracy tables to Excel.
# NOTE(review): unlike the training-set survey exports, these calls do not pass index=False,
# so the row index is written as an extra first column — confirm whether that is intended.
clean_test_accuracy_df.to_excel("../../output/survey_data/20240115_remove_dirty_fft_clean_test_set_acc_scores.xlsx")
unclean_test_accuracy_df.to_excel("../../output/survey_data/20240115_remove_dirty_fft_unclean_test_set_acc_scores.xlsx")