In [8]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time as time

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from data_repository import DataRepository
from model_training_ultils import ModelEvaluationUltis
from sklearn.ensemble import RandomForestClassifier

# Import data

## Import test data

In [2]:
# Instantiate the two project helpers used by this notebook.
# NOTE(review): `evaludation_tool` is not referenced again in the cells shown here.
evaludation_tool = ModelEvaluationUltis()
# The repository is constructed with the path to ../.env (presumably its configuration file — confirm).
data_repo = DataRepository("../.env")

In [3]:
# load_current_data returns six values; only the last two are kept here
# (presumably the test features and test labels — TODO confirm against DataRepository).
# The call is made once with clean_data=True and once with clean_data=False.
_, _, _, _, X_test_clean, y_test_clean = data_repo.load_current_data(clean_data=True)
_, _, _, _, X_test_unclean, y_test_unclean = data_repo.load_current_data(clean_data=False)

In [4]:
# Both frames are test sets (clean vs. unclean variant), so label them as such
# instead of "Train"/"Test"; also keep the "y: " separator consistent.
print(f"Clean test dataset: X: {X_test_clean.shape}, y: {y_test_clean.shape}")
print(f"Unclean test dataset: X: {X_test_unclean.shape}, y: {y_test_unclean.shape}")

Train dataset: X: (2012, 9000), y: (2012,)
Test dataset X: (2160, 9000), y(2160,)


__Label encoder__

In [5]:
# Fit the encoder on the clean test labels, then apply the same mapping to the
# unclean test labels. The functions defined further down reuse this encoder
# for the training labels as well.
# NOTE(review): both y_test_* names are overwritten with their encoded form, so
# re-running this cell without reloading the data would refit the encoder on
# already-encoded integers.
label_encoder = LabelEncoder()
y_test_clean = label_encoder.fit_transform(y_test_clean)
y_test_unclean = label_encoder.transform(y_test_unclean)

In [6]:
# Show the class names in encoder order, followed by the integer code assigned to each.
print(list(label_encoder.classes_))
print(label_encoder.transform(list(label_encoder.classes_)))

['error', 'normal', 'overcurrent', 'overheating', 'zero']
[0 1 2 3 4]


In [7]:
# Per-class sample counts of the encoded labels: unclean test set first, then clean.
print(data_repo.count_labels(y_test_unclean))
print(data_repo.count_labels(y_test_clean))

{0: 445, 1: 419, 2: 454, 3: 411, 4: 431}
{0: 414, 1: 394, 2: 421, 3: 374, 4: 409}


In [10]:
def get_accuracy_on_clean_test_set(file_name, clean_data: bool):
    """Fit one random forest per training set in `file_name` and score each of them.

    Despite the function name, the evaluation set is selected by `clean_data`:
    the clean test set when it is true, the unclean test set otherwise.

    Relies on the notebook-level `data_repo`, `label_encoder` and the
    `X_test_*` / `y_test_*` variables.

    Args:
        file_name: Path handed to `data_repo.read_train_current_survey_data`.
        clean_data: Whether to evaluate on the clean (True) or unclean (False) test set.

    Returns:
        A 1-D numpy array holding one accuracy per training set, in index order.
    """
    X_trains, y_trains = data_repo.read_train_current_survey_data(file_name)

    # The evaluation set is the same for every run, so choose it a single time.
    if clean_data:
        X_eval, y_eval = X_test_clean, y_test_clean
    else:
        X_eval, y_eval = X_test_unclean, y_test_unclean

    scores = []
    for run in range(X_trains.shape[0]):
        encoded_labels = label_encoder.transform(y_trains[run])
        forest = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
        forest.fit(X_trains[run], encoded_labels)
        predictions = forest.predict(X_eval)
        scores.append(accuracy_score(y_pred=predictions, y_true=y_eval))
    return np.array(scores)

In [9]:
def get_training_time(file_name):
    """Measure how long it takes to fit a random forest on each training set in `file_name`.

    Relies on the notebook-level `data_repo` and `label_encoder`.

    Args:
        file_name: Path handed to `data_repo.read_train_current_survey_data`.

    Returns:
        A 1-D numpy array with one elapsed time in seconds per training set,
        in index order. Each measurement covers constructing the classifier
        and fitting it; label encoding is excluded.
    """
    training_times = []
    X_trains, y_trains = data_repo.read_train_current_survey_data(file_name)
    for i in range(X_trains.shape[0]):
        X_train = X_trains[i]
        y_train = label_encoder.transform(y_trains[i])
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments the way
        # a difference of time.time() readings can.
        start_time = time.perf_counter()
        rf_current = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
        rf_current.fit(X_train, y_train)
        end_time = time.perf_counter()
        training_times.append(end_time - start_time)
    return np.array(training_times)

# Surveying

## Clean data

__Accuracy__

In [10]:
# Score every clean training-set size on the clean test set.
# The result has one row per training size and one column per training set in the file.
accuracy_scores = np.array([
    get_accuracy_on_clean_test_set(
        file_name=f"../../data/survery_data/clean_data/train_files_{train_size}.xlsx",
        clean_data=True,
    )
    for train_size in [1000, 2000, 4000, 6000, 8000]
])

In [11]:
# Rows follow the training sizes looped over above; columns are the individual training sets in each file.
accuracy_scores

array([[0.94980119, 0.94930417, 0.95526839, 0.95328032, 0.95477137,
        0.95477137, 0.95178926, 0.94880716, 0.95079523, 0.95079523],
       [0.96371769, 0.95725646, 0.96222664, 0.9666998 , 0.95775348,
        0.96719682, 0.95576541, 0.95079523, 0.96371769, 0.97017893],
       [0.97564612, 0.97564612, 0.97862823, 0.97465209, 0.97813121,
        0.972167  , 0.97813121, 0.97564612, 0.98011928, 0.97614314],
       [0.98011928, 0.97962227, 0.97912525, 0.9806163 , 0.98111332,
        0.97813121, 0.97813121, 0.97664016, 0.97912525, 0.97912525],
       [0.97912525, 0.98111332, 0.9806163 , 0.97912525, 0.98210736,
        0.98111332, 0.98111332, 0.97912525, 0.9806163 , 0.98111332]])

In [12]:
# Wrap the accuracy matrix in a DataFrame (default integer row/column labels) for export.
clean_accuracy_scores = pd.DataFrame(accuracy_scores)

In [13]:
# Write the clean-data accuracies to Excel; index=False leaves the row index out of the sheet.
clean_accuracy_scores.to_excel("../../output/survey_data/20240109_clean_acc_scores.xlsx", index=False)

__Training time__

In [11]:
# Time the random-forest fit for every clean training-set size.
# The result has one row per training size and one column per training set in the file.
clean_training_time = np.array([
    get_training_time(
        file_name=f"../../data/survery_data/clean_data/train_files_{train_size}.xlsx"
    )
    for train_size in [1000, 2000, 4000, 6000, 8000]
])

In [12]:
# Sanity check: one row per training size, one column per training set in the file; values are seconds.
print(clean_training_time.shape)
print(clean_training_time)

(5, 10)
[[0.47248745 0.47308135 0.4732492  0.48269606 0.46145749 0.45695114
  0.48279285 0.49483848 0.47021914 0.46414495]
 [0.92775583 0.93511176 0.93307734 0.95550537 0.92694712 0.90880775
  0.9388721  0.95024228 0.93568444 0.90597796]
 [1.98778987 1.9055841  2.02529216 2.08870959 2.01255226 1.93543696
  2.04200602 1.95783734 1.95797181 2.04196739]
 [3.12076378 2.98412442 3.08451939 3.09969521 3.05897212 3.0110755
  3.08352852 3.11764765 3.02921414 3.02827287]
 [3.97341871 3.98595738 3.99378157 3.96893668 3.9870739  4.01407743
  3.99215794 3.94830537 3.98925567 3.97640133]]


## Unclean data

__Accuracy__

In [32]:
# Score every unclean training-set size on the unclean test set.
# The result has one row per training size and one column per training set in the file.
unclean_accuracy_scores = np.array([
    get_accuracy_on_clean_test_set(
        file_name=f"../../data/survery_data/unclean_data/train_files_{train_size}.xlsx",
        clean_data=False,
    )
    for train_size in [1000, 2000, 4000, 6000, 8000]
])

In [33]:
# Rows follow the training sizes looped over above; columns are the individual training sets in each file.
unclean_accuracy_scores

array([[0.92361111, 0.91944444, 0.90740741, 0.90787037, 0.9287037 ,
        0.92731481, 0.91990741, 0.93518519, 0.925     , 0.925     ],
       [0.92453704, 0.93148148, 0.93842593, 0.9375    , 0.94583333,
        0.94259259, 0.9287037 , 0.93703704, 0.94074074, 0.93657407],
       [0.94953704, 0.95046296, 0.94953704, 0.95694444, 0.95324074,
        0.95509259, 0.95231481, 0.95694444, 0.94814815, 0.94814815],
       [0.95462963, 0.95833333, 0.95648148, 0.95694444, 0.95694444,
        0.95740741, 0.95648148, 0.95740741, 0.95694444, 0.95694444],
       [0.95462963, 0.95694444, 0.95787037, 0.95555556, 0.95416667,
        0.95555556, 0.95601852, 0.95092593, 0.95416667, 0.95787037]])

In [40]:
# Wrap the unclean accuracy matrix in a DataFrame (default integer row/column labels) for export.
unclean_accuracy_scores_df = pd.DataFrame(unclean_accuracy_scores)

In [41]:
# Write the unclean-data accuracies to Excel; index=False leaves the row index out of the sheet.
unclean_accuracy_scores_df.to_excel("../../output/survey_data/20240109_unclean_acc_scores_training_size.xlsx", index=False)

__Training time__

In [13]:
# Time the random-forest fit for every unclean training-set size.
# The result has one row per training size and one column per training set in the file.
unclean_training_time = np.array([
    get_training_time(
        file_name=f"../../data/survery_data/unclean_data/train_files_{train_size}.xlsx"
    )
    for train_size in [1000, 2000, 4000, 6000, 8000]
])

In [14]:
# Sanity check: one row per training size, one column per training set in the file; values are seconds.
print(unclean_training_time.shape)
print(unclean_training_time)

(5, 10)
[[0.51997209 0.53142667 0.53553104 0.52388406 0.56661606 0.54328966
  0.56497312 0.55669856 0.54467845 0.53489161]
 [1.10153937 1.11818528 1.12229133 1.12344813 1.12854171 1.11650658
  1.13870645 1.15917945 1.09220195 1.09401155]
 [2.33148503 2.34348941 2.32309508 2.38808846 2.48055363 2.47954369
  2.45864868 2.44991946 2.40015841 2.39717841]
 [3.84642339 3.8426621  3.93744087 3.85481572 3.85821509 3.9126966
  3.85255623 3.84200716 3.73213696 3.85249162]
 [4.97727299 4.99130344 4.99779224 4.99063516 5.0135107  4.95985079
  4.98642468 4.97688437 4.98004341 4.98881745]]


# Write to file

In [15]:
# Persist both timing matrices as Excel workbooks, clean data first.
# NOTE(review): unlike the accuracy exports, these calls do not pass index=False,
# so the DataFrame row index is written as an extra leading column.
clean_training_time_df = pd.DataFrame(clean_training_time)
clean_training_time_df.to_excel("../../output/survey_data/20240110_clean_rf-current_training_time.xlsx")

unclean_training_time_df = pd.DataFrame(unclean_training_time)
unclean_training_time_df.to_excel("../../output/survey_data/20240110_unclean_rf-current_training_time.xlsx")