In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time as time

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from data_repository import DataRepository
from model_training_ultils import ModelEvaluationUltis
from sklearn.ensemble import RandomForestClassifier

# Import data

## Load train and test data

In [2]:
# Project helpers used by the notebook.
# NOTE(review): `evaludation_tool` is not referenced by any cell visible here.
evaludation_tool = ModelEvaluationUltis()
# The repository is constructed from an env-file path (presumably holds data locations — confirm).
data_repo = DataRepository("../.env")

In [3]:
# Load the cleaned and the uncleaned variant of the current dataset.
# load_current_data returns six values; the third and fourth are discarded here
# (presumably a validation split — TODO confirm against DataRepository).
X_train_clean, y_train_clean, _, _, X_test_clean, y_test_clean = data_repo.load_current_data(clean_data=True)
X_train_unclean, y_train_unclean, _, _, X_test_unclean, y_test_unclean = data_repo.load_current_data(clean_data=False)

In [4]:
# Report the shape of every split. Each line names the variant (clean / unclean)
# it refers to, so the four lines can be told apart in the output.
print(f"Train dataset (clean): X: {X_train_clean.shape}, y: {y_train_clean.shape}")
print(f"Train dataset (unclean): X: {X_train_unclean.shape}, y: {y_train_unclean.shape}")
print(f"Test dataset (clean): X: {X_test_clean.shape}, y: {y_test_clean.shape}")
print(f"Test dataset (unclean): X: {X_test_unclean.shape}, y: {y_test_unclean.shape}")

Train dataset: X: (8188, 9000), y: (8188,)
Train dataset X: (8649, 9000), y(8649,)
Test dataset X: (2012, 9000), y(2012,)
Test dataset X: (2160, 9000), y(2160,)


__Label encoder__

In [5]:
# Encode the class labels as integers.
# The encoder is fitted on the labels of every split rather than on a single test
# split, so transform() cannot fail on a class that one split happens to lack.
# NOTE: the y_* arrays are overwritten in place; re-run the data-loading cell before
# re-running this one, otherwise the encoder is fitted on already-encoded integers.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate([y_train_clean, y_train_unclean, y_test_clean, y_test_unclean]))
y_test_clean = label_encoder.transform(y_test_clean)
y_test_unclean = label_encoder.transform(y_test_unclean)
y_train_clean = label_encoder.transform(y_train_clean)
y_train_unclean = label_encoder.transform(y_train_unclean)

In [6]:
# Show the class names known to the encoder and the integer code assigned to each.
class_names = list(label_encoder.classes_)
print(class_names)
print(label_encoder.transform(class_names))

['error', 'normal', 'overcurrent', 'overheating', 'zero']
[0 1 2 3 4]


In [7]:
# Per-class sample counts, printed in this order:
# unclean test, clean test, clean train, unclean train.
for encoded_labels in (y_test_unclean, y_test_clean, y_train_clean, y_train_unclean):
    print(data_repo.count_labels(encoded_labels))

{0: 445, 1: 419, 2: 454, 3: 411, 4: 431}
{0: 414, 1: 394, 2: 421, 3: 374, 4: 409}
{0: 1622, 1: 1612, 2: 1653, 3: 1688, 4: 1613}
{0: 1718, 1: 1734, 2: 1709, 3: 1766, 4: 1722}


In [8]:
def get_accuracy_with_training_set(file_name, X_test, y_test):
    """Train one random forest per training subset in a survey file and score each on a fixed test set.

    Parameters
    ----------
    file_name : str
        Path passed to ``data_repo.read_train_current_survey_data``.
    X_test, y_test
        Test features and the labels to score against. The labels are compared
        directly with the model predictions, so they must already be encoded
        with ``label_encoder``.

    Returns
    -------
    numpy.ndarray
        One accuracy score per training subset, in the order the repository returned them.
    """
    print("[+] Working with file: "+file_name)
    X_trains, y_trains = data_repo.read_train_current_survey_data(file_name)

    def _fit_and_score(subset_idx):
        # A fresh forest per subset; the fixed seed makes each fit repeatable.
        features = X_trains[subset_idx]
        targets = label_encoder.transform(y_trains[subset_idx])
        forest = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
        forest.fit(features, targets)
        return accuracy_score(y_true=y_test, y_pred=forest.predict(X_test))

    return np.array([_fit_and_score(subset_idx) for subset_idx in range(X_trains.shape[0])])

In [9]:
def get_accuracy_with_test_set(file_name, X_train, y_train):
    """Score one fixed training set against every test subset stored in a survey file.

    Parameters
    ----------
    file_name : str
        Path passed to ``data_repo.read_test_current_survey_data``.
    X_train, y_train
        Training features and labels used to fit the forest. The labels are fed
        to ``fit`` as-is, so they must already be encoded with ``label_encoder``.

    Returns
    -------
    numpy.ndarray
        One accuracy score per test subset, in the order the repository returned them.
    """
    print("[+] Working with file: "+file_name)
    X_tests, y_tests = data_repo.read_test_current_survey_data(file_name)

    # The training data and random_state are the same for every test subset, so
    # re-fitting inside the loop would rebuild the identical forest each time.
    # Fit once and reuse the model for all subsets.
    rf_current = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
    rf_current.fit(X_train, y_train)

    accuracy_arr = []
    for i in range(X_tests.shape[0]):
        # Survey files carry raw labels; encode them before scoring.
        y_test = label_encoder.transform(y_tests[i])
        y_pred = rf_current.predict(X_tests[i])
        accuracy_arr.append(accuracy_score(y_pred=y_pred, y_true=y_test))
    return np.array(accuracy_arr)

# Boundary survey (Khảo sát biên)

In [30]:
# Reference model fitted on the unclean training split.
unclean_rf_current = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)
unclean_rf_current.fit(X_train_unclean, y_train_unclean)

In [31]:
# Reference model fitted on the clean training split.
clean_rf_current = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)
clean_rf_current.fit(X_train_clean, y_train_clean)

In [33]:
# Clean train -> clean test: accuracy of the clean-trained model on the clean test split.
pred_clean_on_clean = clean_rf_current.predict(X_test_clean)
accuracy_score(y_true=y_test_clean, y_pred=pred_clean_on_clean)

0.9811133200795229

In [25]:
# Clean train -> unclean test: accuracy of the clean-trained model on the unclean test split.
pred_clean_on_unclean = clean_rf_current.predict(X_test_unclean)
accuracy_score(y_true=y_test_unclean, y_pred=pred_clean_on_unclean)

0.9550925925925926

In [26]:
# Unclean train -> unclean test: accuracy of the unclean-trained model on the unclean test split.
pred_unclean_on_unclean = unclean_rf_current.predict(X_test_unclean)
accuracy_score(y_true=y_test_unclean, y_pred=pred_unclean_on_unclean)

0.9564814814814815

In [24]:
# Unclean train -> clean test: accuracy of the unclean-trained model on the clean test split.
pred_unclean_on_clean = unclean_rf_current.predict(X_test_clean)
accuracy_score(y_true=y_test_clean, y_pred=pred_unclean_on_clean)

0.9786282306163022

# Survey with training dataset

## With CL

In [28]:
# For each survey level, train on every subset stored in that level's file and
# score the resulting models on the clean test split.
survey_levels = [0.1, 0.2, 0.4, 0.6, 0.8]
per_level_scores = []
for level in survey_levels:
    print(f"[+] Working with i = {level}")
    train_file = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_{level}_dirty_removed.xlsx"
    level_scores = get_accuracy_with_training_set(file_name=train_file, X_test=X_test_clean, y_test=y_test_clean)
    per_level_scores.append(level_scores)
# Stack into a 2-D array: one row per survey level.
clean_accuracy_scores = np.array(per_level_scores)

[+] Working with i = 0.1
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.1_dirty_removed.xlsx
[+] Working with i = 0.2
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.2_dirty_removed.xlsx
[+] Working with i = 0.4
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.4_dirty_removed.xlsx
[+] Working with i = 0.6
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.6_dirty_removed.xlsx
[+] Working with i = 0.8
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.8_dirty_removed.xlsx


## Without CL

In [33]:
# Same training files as the clean-test survey, but the models are scored on the
# unclean test split.
survey_levels = [0.1, 0.2, 0.4, 0.6, 0.8]
per_level_scores = []
for level in survey_levels:
    print(f"[+] Working with i = {level}")
    train_file = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_{level}_dirty_removed.xlsx"
    level_scores = get_accuracy_with_training_set(file_name=train_file, X_test=X_test_unclean, y_test=y_test_unclean)
    per_level_scores.append(level_scores)
# Stack into a 2-D array: one row per survey level.
unclean_accuracy_scores = np.array(per_level_scores)

[+] Working with i = 0.1
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.1_dirty_removed.xlsx
[+] Working with i = 0.2
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.2_dirty_removed.xlsx
[+] Working with i = 0.4
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.4_dirty_removed.xlsx
[+] Working with i = 0.6
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.6_dirty_removed.xlsx
[+] Working with i = 0.8
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/train_dataset/cl_train_files_0.8_dirty_removed.xlsx


## Write to files

In [31]:
# Wrap the clean-test score matrix in a DataFrame for display and export.
# Rows follow the survey levels 0.1, 0.2, 0.4, 0.6, 0.8; columns are the training
# subsets read from each level's file.
clean_accuracy_scores_df = pd.DataFrame(clean_accuracy_scores)
clean_accuracy_scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.977137,0.977634,0.978628,0.981113,0.980616,0.980119,0.978628,0.979622,0.977634,0.980616
1,0.980119,0.979125,0.977137,0.980616,0.98161,0.977137,0.978628,0.978131,0.980616,0.980119
2,0.981113,0.97664,0.97664,0.981113,0.978628,0.978131,0.980119,0.979622,0.977634,0.983101
3,0.980616,0.980616,0.981113,0.978628,0.980119,0.980616,0.981113,0.979125,0.983101,0.978628
4,0.980119,0.979622,0.979622,0.980616,0.980616,0.977634,0.98161,0.980616,0.98161,0.980119


In [34]:
# Wrap the unclean-test score matrix in a DataFrame for display and export.
# Rows follow the survey levels 0.1, 0.2, 0.4, 0.6, 0.8; columns are the training
# subsets read from each level's file.
unclean_accuracy_scores_df = pd.DataFrame(unclean_accuracy_scores)
unclean_accuracy_scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.95463,0.954167,0.95463,0.959722,0.957407,0.958796,0.953704,0.956019,0.95463,0.955556
1,0.956481,0.95463,0.954167,0.956481,0.959259,0.953704,0.95463,0.954167,0.955093,0.95787
2,0.956944,0.953704,0.950926,0.956019,0.955556,0.954167,0.955093,0.952778,0.955556,0.95787
3,0.956019,0.958796,0.956019,0.955093,0.95463,0.956481,0.957407,0.954167,0.958796,0.954167
4,0.953704,0.954167,0.954167,0.956019,0.95463,0.952778,0.957407,0.955556,0.956944,0.956481


In [37]:
# Persist the clean-test scores. index=False omits the row labels, so rows are
# identified only by their position (survey-level order).
clean_accuracy_scores_df.to_excel("../../output/survey_data/20240115_remove_dirty_current_train_set_clean_acc_scores.xlsx", index=False)

In [38]:
# Persist the unclean-test scores. index=False omits the row labels, so rows are
# identified only by their position (survey-level order).
unclean_accuracy_scores_df.to_excel("../../output/survey_data/20240115_remove_dirty_current_train_set_unclean_acc_scores.xlsx", index=False)

# Survey with test dataset

## With CL

In [13]:
# For each survey level, score a model trained on the clean training split
# against every test subset stored in that level's file.
survey_levels = [0.1, 0.2, 0.4, 0.6, 0.8]
per_level_scores = []
for level in survey_levels:
    survey_file = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_{level}_dirty_removed.xlsx"
    print(f"[+] Working with i = {level}")
    level_scores = get_accuracy_with_test_set(file_name=survey_file, X_train=X_train_clean, y_train=y_train_clean)
    per_level_scores.append(level_scores)
# Stack into a 2-D array: one row per survey level.
test_accuracy_clean = np.array(per_level_scores)

[+] Working with i = 0.1
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.1_dirty_removed.xlsx
[+] Working with i = 0.2
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.2_dirty_removed.xlsx
[+] Working with i = 0.4
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.4_dirty_removed.xlsx
[+] Working with i = 0.6
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.6_dirty_removed.xlsx
[+] Working with i = 0.8
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.8_dirty_removed.xlsx


## Without CL

In [17]:
# Same test files as the clean-train survey, but the model is trained on the
# unclean training split.
survey_levels = [0.1, 0.2, 0.4, 0.6, 0.8]
per_level_scores = []
for level in survey_levels:
    survey_file = f"/home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_{level}_dirty_removed.xlsx"
    print(f"[+] Working with i = {level}")
    level_scores = get_accuracy_with_test_set(file_name=survey_file, X_train=X_train_unclean, y_train=y_train_unclean)
    per_level_scores.append(level_scores)
# Stack into a 2-D array: one row per survey level.
test_accuracy_unclean = np.array(per_level_scores)

[+] Working with i = 0.1
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.1_dirty_removed.xlsx
[+] Working with i = 0.2
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.2_dirty_removed.xlsx
[+] Working with i = 0.4
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.4_dirty_removed.xlsx
[+] Working with i = 0.6
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.6_dirty_removed.xlsx
[+] Working with i = 0.8
[+] Working with file: /home/mrcong/Code/nilm_as/data/cl_survey_data/test_dataset/cl_test_files_0.8_dirty_removed.xlsx


## Write to files

In [19]:
# Wrap both test-survey score matrices in DataFrames for display and export.
# Rows follow the survey levels 0.1, 0.2, 0.4, 0.6, 0.8; columns are the test
# subsets read from each level's file.
clean_test_accuracy_df = pd.DataFrame(test_accuracy_clean)
unclean_test_acc_df = pd.DataFrame(test_accuracy_unclean)

In [20]:
# Display the scores of the model trained on the clean training split.
clean_test_accuracy_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.95715,0.95715,0.956684,0.957615,0.957615,0.957615,0.95715,0.956684,0.956684,0.957615
1,0.958724,0.960131,0.959193,0.961069,0.960131,0.960131,0.959662,0.959193,0.959662,0.959193
2,0.964812,0.963861,0.964337,0.966239,0.963861,0.965288,0.964337,0.964812,0.963386,0.965763
3,0.969609,0.970092,0.970092,0.970092,0.970092,0.970574,0.968644,0.970092,0.968162,0.970574
4,0.97456,0.975538,0.975049,0.973581,0.975049,0.975538,0.975049,0.975049,0.97456,0.975049


In [21]:
# Display the scores of the model trained on the unclean training split.
unclean_test_acc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.959013,0.958547,0.958081,0.958081,0.958547,0.958081,0.958547,0.957615,0.95715,0.959013
1,0.9606,0.962008,0.959662,0.961069,0.960131,0.9606,0.961538,0.959193,0.959193,0.960131
2,0.965288,0.965288,0.965288,0.966239,0.963386,0.965763,0.964812,0.964812,0.963386,0.964812
3,0.969127,0.969127,0.968644,0.970092,0.970092,0.970092,0.966715,0.968644,0.966715,0.968644
4,0.972603,0.973092,0.97407,0.971624,0.973581,0.973581,0.973581,0.97456,0.972603,0.972114


In [22]:
# Persist both test-survey score tables. index=False omits the row labels, so
# rows are identified only by their position (survey-level order).
clean_test_accuracy_df.to_excel("../../output/survey_data/20240115_remove_dirty_current_clean_test_set_acc_scores.xlsx", index=False)
unclean_test_acc_df.to_excel("../../output/survey_data/20240115_remove_dirty_current_unclean_test_set_acc_scores.xlsx", index=False)