In [12]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time as time

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from data_repository import DataRepository
from model_training_ultils import ModelEvaluationUltis
from sklearn.ensemble import RandomForestClassifier


# Import data

In [2]:
# Shared helper objects used by the rest of the notebook.
# NOTE(review): `evaludation_tool` is not referenced by any later cell visible here.
evaludation_tool = ModelEvaluationUltis()
# The repository is constructed from the path to an env file
# (presumably holding data locations — confirm in data_repository).
data_repo = DataRepository("../.env")

In [3]:
# load_fft_data returns six values; only the last two (test inputs and test labels) are kept.
# NOTE(review): the four discarded values are presumably train/validation splits — confirm in DataRepository.
_, _, _, _, test_clean, y_test_clean = data_repo.load_fft_data(clean_data=True)
_, _, _, _, test_unclean, y_test_unclean = data_repo.load_fft_data(clean_data=False)

In [4]:
def find_top_k_indices(amplitudes, k):
    """Return the ``k`` largest values of ``amplitudes``, largest first.

    Despite its name, this returns the amplitude *values*, not their
    indices; the name is kept because other cells call it.

    Args:
        amplitudes: 1-D array-like of amplitudes.
        k: Number of values to keep. If ``k`` exceeds the input length,
            every value is returned.

    Returns:
        1-D ``numpy.ndarray`` with at most ``k`` values in descending order
        (empty when ``k`` is 0).

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    amplitudes = np.asarray(amplitudes)
    if k == 0:
        # A bare ``[-k:]`` slice with k == 0 is ``[0:]`` and would select
        # every element instead of none.
        return amplitudes[:0]
    # argsort is ascending, so the last k positions hold the k largest
    # values; reverse them to get descending order.
    top_k_indices = np.argsort(amplitudes)[-k:][::-1]
    return amplitudes[top_k_indices]

def get_x_by_top_ampls(k, ampls):
    """Build a feature matrix from the ``k`` largest amplitudes of each sample.

    Args:
        k: Number of top amplitudes kept per sample.
        ampls: Iterable of per-sample amplitude arrays.

    Returns:
        ``numpy.ndarray`` built from one entry per element of ``ampls``.
    """
    rows = [find_top_k_indices(amplitudes=sample, k=k) for sample in ampls]
    return np.array(rows)

# With k=1 every test sample is reduced to a single feature: its largest amplitude value.
X_test_clean = get_x_by_top_ampls(k=1, ampls=test_clean)
X_test_unclean = get_x_by_top_ampls(k=1, ampls=test_unclean)

In [5]:
# Sanity check: feature and label arrays should have the same number of samples.
# NOTE(review): the second message prints "y" without the ": " separator used in the first.
print(f"Test dataset clean: {X_test_clean.shape}, y: {y_test_clean.shape}")
print(f"Test dataset unclean: {X_test_unclean.shape}, y{y_test_unclean.shape}")

Test dataset clean: (2012, 1), y: (2012,)
Test dataset unclean: (2160, 1), y(2160,)


__Label encoder__

In [6]:
# Fit the encoder on the clean test labels, then reuse the same mapping for the unclean labels.
# NOTE(review): both y_test_* names are overwritten with their encoded form, so re-running
# this cell would refit the encoder on already-encoded labels.
label_encoder = LabelEncoder()
y_test_clean = label_encoder.fit_transform(y_test_clean)
y_test_unclean = label_encoder.transform(y_test_unclean)

In [7]:
# Show the class names and the integer code the encoder assigns to each of them.
print(list(label_encoder.classes_))
print(label_encoder.transform(list(label_encoder.classes_)))

['error', 'normal', 'overcurrent', 'overheating', 'zero']
[0 1 2 3 4]


In [8]:
# Per-class sample counts of the encoded test labels (unclean set first, then clean set).
print(data_repo.count_labels(y_test_unclean))
print(data_repo.count_labels(y_test_clean))

{0: 445, 1: 419, 2: 454, 3: 411, 4: 431}
{0: 414, 1: 394, 2: 421, 3: 374, 4: 409}


In [9]:
def get_accuracy_on_clean_test_set(file_name, clean_data: bool):
    """Train one random forest per training split in ``file_name`` and score each on a test set.

    Despite the function name, ``clean_data`` decides which test set is used.

    Args:
        file_name: Path passed to ``data_repo.read_train_fft_survey_data``.
        clean_data: If true, score against ``X_test_clean`` / ``y_test_clean``;
            otherwise against ``X_test_unclean`` / ``y_test_unclean``.

    Returns:
        1-D ``numpy.ndarray`` of accuracy scores, one per training split.
    """
    X_trains, y_trains = data_repo.read_train_fft_survey_data(file_name)
    scores = []
    for split_idx in range(X_trains.shape[0]):
        # Each split is reduced to its single largest amplitude per sample.
        features = get_x_by_top_ampls(k=1, ampls=X_trains[split_idx])
        targets = label_encoder.transform(y_trains[split_idx])
        forest = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
        forest.fit(features, targets)
        # Pick the evaluation set according to the flag.
        if clean_data:
            X_eval, y_eval = X_test_clean, y_test_clean
        else:
            X_eval, y_eval = X_test_unclean, y_test_unclean
        predictions = forest.predict(X_eval)
        scores.append(accuracy_score(y_true=y_eval, y_pred=predictions))
    return np.array(scores)

In [10]:
def get_training_time(file_name):
    """Measure how long fitting a random forest takes on each training split in ``file_name``.

    Args:
        file_name: Path passed to ``data_repo.read_train_fft_survey_data``.

    Returns:
        List of elapsed times in seconds, one per training split. Feature
        extraction and label encoding happen before the timer starts and are
        not included.
    """
    training_times = []
    X_trains, y_trains = data_repo.read_train_fft_survey_data(file_name)
    for i in range(X_trains.shape[0]):
        train_ampls = X_trains[i]
        X_train = get_x_by_top_ampls(k=1, ampls=train_ampls)
        y_train = label_encoder.transform(y_trains[i])
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the wall clock, which can jump while a sub-second fit is being timed.
        start_time = time.perf_counter()
        rf_current = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
        rf_current.fit(X_train, y_train)
        end_time = time.perf_counter()
        training_times.append(end_time - start_time)
    return training_times

# Surveying

## Clean data

### Accuracy

In [25]:
# Accuracy on the clean test set for every training file: one row per entry
# of the size list, one column per training split stored in that file.
# NOTE(review): "survery_data" differs from the "survey_data" output directory;
# it is kept as-is because it must match the directory on disk.
clean_accuracy_scores = np.array([
    get_accuracy_on_clean_test_set(
        file_name=f"../../data/survery_data/clean_data/train_files_{n_train}.xlsx",
        clean_data=True,
    )
    for n_train in [1000, 2000, 4000, 6000, 8000]
])

In [30]:
# Expect one row per training file and one column per training split.
clean_accuracy_scores.shape

(5, 10)

In [33]:
# Raw accuracy matrix; rows follow the order of the training-file list.
clean_accuracy_scores

array([[0.97415507, 0.97713718, 0.972167  , 0.97117296, 0.96322068,
        0.96819085, 0.97415507, 0.96471173, 0.972167  , 0.97415507],
       [0.96918489, 0.97166998, 0.97365805, 0.97365805, 0.98210736,
        0.97614314, 0.97266402, 0.97067594, 0.97813121, 0.97117296],
       [0.98011928, 0.97912525, 0.97813121, 0.97316103, 0.97713718,
        0.97912525, 0.97713718, 0.97763419, 0.97465209, 0.96968191],
       [0.98011928, 0.98161034, 0.97862823, 0.97813121, 0.97614314,
        0.97664016, 0.97813121, 0.97415507, 0.97862823, 0.97415507],
       [0.97713718, 0.97713718, 0.97713718, 0.97713718, 0.97763419,
        0.97614314, 0.97465209, 0.97614314, 0.97713718, 0.97614314]])

### Training time

In [13]:
# Fit time in seconds for every clean training file: one row per entry of the
# size list, one column per training split stored in that file.
# NOTE(review): "survery_data" differs from the "survey_data" output directory;
# it is kept as-is because it must match the directory on disk.
clean_training_time = np.array([
    get_training_time(
        file_name=f"../../data/survery_data/clean_data/train_files_{n_train}.xlsx"
    )
    for n_train in [1000, 2000, 4000, 6000, 8000]
])

In [14]:
# Expect one row per training file and one column per training split.
clean_training_time.shape

(5, 10)

In [15]:
# Raw fit times in seconds; rows follow the order of the training-file list.
clean_training_time

array([[0.07059479, 0.06880856, 0.06947517, 0.06915379, 0.06918359,
        0.0686667 , 0.06967807, 0.07008505, 0.06868911, 0.06851149],
       [0.09302878, 0.09278393, 0.09866023, 0.0947485 , 0.09389424,
        0.09315228, 0.09298611, 0.0952332 , 0.09353089, 0.09247923],
       [0.14365292, 0.14135218, 0.14731669, 0.14474058, 0.14457393,
        0.14436889, 0.1434865 , 0.14407253, 0.14424706, 0.14095235],
       [0.20282054, 0.2011447 , 0.20278716, 0.20369101, 0.20062327,
        0.20018053, 0.2085762 , 0.20055556, 0.20144796, 0.20207024],
       [0.25165272, 0.25160551, 0.25158262, 0.25111866, 0.25367427,
        0.25029683, 0.2518971 , 0.25136495, 0.25151324, 0.25141215]])

## Unclean data

### Accuracy score with unclean data

In [16]:
# Accuracy on the unclean test set for every unclean training file: one row per
# entry of the size list, one column per training split stored in that file.
# NOTE(review): "survery_data" differs from the "survey_data" output directory;
# it is kept as-is because it must match the directory on disk.
unclean_accuracy_scores = np.array([
    get_accuracy_on_clean_test_set(
        file_name=f"../../data/survery_data/unclean_data/train_files_{n_train}.xlsx",
        clean_data=False,
    )
    for n_train in [1000, 2000, 4000, 6000, 8000]
])

In [17]:
# Expect one row per training file and one column per training split.
unclean_accuracy_scores.shape

(5, 10)

In [18]:
# Raw accuracy matrix; rows follow the order of the training-file list.
unclean_accuracy_scores

array([[0.95555556, 0.93981481, 0.94907407, 0.96527778, 0.93935185,
        0.95925926, 0.95324074, 0.9537037 , 0.94953704, 0.95833333],
       [0.94907407, 0.95092593, 0.95138889, 0.9625    , 0.94814815,
        0.95      , 0.94537037, 0.95185185, 0.95138889, 0.95324074],
       [0.94861111, 0.95277778, 0.95509259, 0.95694444, 0.9537037 ,
        0.9537037 , 0.95231481, 0.95972222, 0.95324074, 0.95416667],
       [0.95416667, 0.95462963, 0.95555556, 0.95833333, 0.95462963,
        0.95509259, 0.95231481, 0.95787037, 0.95787037, 0.95324074],
       [0.95462963, 0.95462963, 0.95416667, 0.95416667, 0.95555556,
        0.95555556, 0.95277778, 0.95972222, 0.95462963, 0.95694444]])

### Training time with unclean data

In [19]:
# Fit time in seconds for every unclean training file: one row per entry of the
# size list, one column per training split stored in that file.
# NOTE(review): "survery_data" differs from the "survey_data" output directory;
# it is kept as-is because it must match the directory on disk.
unclean_training_time = np.array([
    get_training_time(
        file_name=f"../../data/survery_data/unclean_data/train_files_{n_train}.xlsx"
    )
    for n_train in [1000, 2000, 4000, 6000, 8000]
])

In [20]:
# Raw fit times in seconds; rows follow the order of the training-file list.
unclean_training_time

array([[0.07299209, 0.07270193, 0.07272339, 0.07280636, 0.07376599,
        0.07345295, 0.07413793, 0.07297182, 0.07276917, 0.07181978],
       [0.09959769, 0.09990406, 0.09957647, 0.09919906, 0.10720992,
        0.10178375, 0.09988523, 0.10051847, 0.09963799, 0.10031438],
       [0.15742517, 0.15798044, 0.15540481, 0.15824342, 0.1584146 ,
        0.15974069, 0.15738583, 0.15706515, 0.15868068, 0.15705276],
       [0.2287209 , 0.22587585, 0.22205114, 0.2244556 , 0.22211814,
        0.2240715 , 0.23219991, 0.22534084, 0.22195077, 0.23160768],
       [0.28620005, 0.28422856, 0.2837739 , 0.27941418, 0.27993941,
        0.28415561, 0.28335524, 0.28166652, 0.2813611 , 0.28078914]])

## Write to files

In [36]:
# Persist both accuracy matrices as Excel sheets. No index or column names are given,
# so rows and columns carry pandas' default integer labels.
clean_accuracy_scores_df = pd.DataFrame(clean_accuracy_scores)
unclean_accuracy_scores_df = pd.DataFrame(unclean_accuracy_scores)
clean_accuracy_scores_df.to_excel("../../output/survey_data/20240110_clean_rf-fft_acc_scores_training_size.xlsx")
unclean_accuracy_scores_df.to_excel("../../output/survey_data/20240110_unclean_rf-fft_acc_scores_training_size.xlsx")

In [22]:
# Persist both training-time matrices as Excel sheets. No index or column names are given,
# so rows and columns carry pandas' default integer labels.
clean_training_time_df = pd.DataFrame(clean_training_time)
unclean_training_time_df = pd.DataFrame(unclean_training_time)
clean_training_time_df.to_excel("../../output/survey_data/20240110_clean_rf-fft_training_time.xlsx")
unclean_training_time_df.to_excel("../../output/survey_data/20240110_unclean_rf-fft_training_time.xlsx")