In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from data_repository import DataRepository
from model_training_ultils import ModelMemCalculation
from model_training_ultils import ModelEvaluationUltis

# Import data

In [2]:
mem_cal = ModelMemCalculation()
evaludation_tool = ModelEvaluationUltis()

In [3]:
data_repo = DataRepository("../.env")
#Clean data
train_ampls, y_train, validation_ampls, y_validation, test_ampls, y_test = data_repo.load_fft_data(clean_data=True)

In [4]:
print(f"Train dataset: X: {train_ampls.shape}, y: {y_train.shape}")
print(f"Validation dataset: X: {validation_ampls.shape}, y: {y_validation.shape}")
print(f"Test dataset: X: {test_ampls.shape}, y: {y_test.shape}")

Train dataset: X: (8188, 4501), y: (8188,)
Validation dataset: X: (1123, 4501), y: (1123,)
Test dataset: X: (2012, 4501), y: (2012,)


__Label encoder__

In [5]:
# Chuyển đổi danh sách labels thành mã số
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_validation = label_encoder.transform(y_validation)
y_test = label_encoder.transform(y_test)

In [6]:
print(list(label_encoder.classes_))
print(label_encoder.transform(list(label_encoder.classes_)))

['error', 'normal', 'overcurrent', 'overheating', 'zero']
[0 1 2 3 4]


# Feature selection survey based on k best

In [7]:
def find_top_k_indices(amplitudes, k):
    # Get the indices of the top 50 elements
    top_k_indices = np.argsort(amplitudes)[-k:][::-1]
    highest_ampls = amplitudes[top_k_indices]
    return highest_ampls

def get_x_by_top_ampls(k, ampls):
    X = []
    for ampl in ampls:
        X.append(find_top_k_indices(amplitudes=ampl, k=k))
    return np.array(X)

In [16]:
test_x = get_x_by_top_ampls(k=2, ampls=train_ampls)
test_x.shape

(8188, 2)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tqdm import tqdm

svm_acs, svm_f1s = [], []
rf_acs, rf_f1s = [], []
xgb_acs, xgb_f1s = [], []
k_arr = []

for k in range(1, 4501, 10):
    # Feature selection
    print(f"[+] Working with {k} features")
    #Using top 50 ampls
    k_arr.append(k)
    X_train = get_x_by_top_ampls(k=k, ampls=train_ampls)
    X_validation = get_x_by_top_ampls(k=k, ampls=validation_ampls)
    X_test = get_x_by_top_ampls(k=k, ampls=test_ampls)
    # Tạo mô hình SVM
    svm_model2 = SVC(kernel='rbf', random_state=42, probability=False)
    svm_model2.fit(X_train, y_train)
    # Tạo mô hình RF
    rf_model2 = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
    rf_model2.fit(X_train, y_train)
    # Tạo mô hình XgBoost
    xgboost_model2 = XGBClassifier(objective='binary:logistic', tree_method="gpu_hist")
    xgboost_model2.fit(X_train, y_train)
    # SVM
    svm_accuracy, svm_f1_macro = evaludation_tool.evaluate_models(model=svm_model2, X_test=X_test, y_test=y_test)
    svm_acs.append(svm_accuracy)
    svm_f1s.append(svm_f1_macro)
    # Random forest
    rf_accuracy, rf_f1_macro = evaludation_tool.evaluate_models(model=rf_model2, X_test=X_test, y_test=y_test)
    rf_acs.append(rf_accuracy)
    rf_f1s.append(rf_f1_macro)
    # XgBoost
    xgb_accuracy, xgb_f1_macro = evaludation_tool.evaluate_models(model=xgboost_model2, X_test=X_test, y_test=y_test)
    xgb_acs.append(xgb_accuracy)
    xgb_f1s.append(xgb_f1_macro)
    print(f"[+] Finished {k} features")

In [None]:
X_test.shape

(2012, 101)

In [None]:
X_train

In [None]:
print(len(k_arr), len(svm_acs))

501 501


In [None]:
#Create a dict
test_evaluation_dict = {
    "k values": k_arr,
    "SVM accuracy": svm_acs,
    "SVM F1-Score": svm_f1s,
    "RF accuracy": rf_acs,
    "RF F1-Score": rf_f1s,
    "XGB accuracy": xgb_acs,
    "XGB F1-Score": xgb_f1s,
}
test_eval_df = pd.DataFrame(test_evaluation_dict)
test_eval_df.head()

ValueError: All arrays must be of the same length

In [None]:
test_eval_df.to_excel("../../output/others/20240107_select_k_test_set.xlsx", index=False)

In [11]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tqdm import tqdm

svm_acs, svm_f1s = [], []
rf_acs, rf_f1s = [], []
xgb_acs, xgb_f1s = [], []
k_arr = []

for k in range(4491, 4502, 10):
    # Feature selection
    print(f"[+] Working with {k} features")
    #Using top 50 ampls
    k_arr.append(k)
    X_train = get_x_by_top_ampls(k=k, ampls=train_ampls)
    X_validation = get_x_by_top_ampls(k=k, ampls=validation_ampls)
    X_test = get_x_by_top_ampls(k=k, ampls=test_ampls)
    # Tạo mô hình SVM
    svm_model2 = SVC(kernel='rbf', random_state=42, probability=False)
    svm_model2.fit(X_train, y_train)
    # Tạo mô hình RF
    rf_model2 = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
    rf_model2.fit(X_train, y_train)
    # Tạo mô hình XgBoost
    xgboost_model2 = XGBClassifier(objective='binary:logistic', tree_method="gpu_hist")
    xgboost_model2.fit(X_train, y_train)
    # SVM
    svm_accuracy, svm_f1_macro = evaludation_tool.evaluate_models(model=svm_model2, X_test=X_validation, y_test=y_validation)
    svm_acs.append(svm_accuracy)
    svm_f1s.append(svm_f1_macro)
    # Random forest
    rf_accuracy, rf_f1_macro = evaludation_tool.evaluate_models(model=rf_model2, X_test=X_validation, y_test=y_validation)
    rf_acs.append(rf_accuracy)
    rf_f1s.append(rf_f1_macro)
    # XgBoost
    xgb_accuracy, xgb_f1_macro = evaludation_tool.evaluate_models(model=xgboost_model2, X_test=X_validation, y_test=y_validation)
    xgb_acs.append(xgb_accuracy)
    xgb_f1s.append(xgb_f1_macro)
    print(f"[+] Finished {k} features")

[+] Working with 4491 features
[+] Finished 4491 features
[+] Working with 4501 features
[+] Finished 4501 features


In [12]:
#Create a dict
validation_evaluation_dict = {
    "k values": k_arr,
    "SVM accuracy": svm_acs,
    "SVM F1-Score": svm_f1s,
    "RF accuracy": rf_acs,
    "RF F1-Score": rf_f1s,
    "XGB accuracy": xgb_acs,
    "XGB F1-Score": xgb_f1s,
}
validation_eval_df = pd.DataFrame(validation_evaluation_dict)
validation_eval_df.head()

Unnamed: 0,k values,SVM accuracy,SVM F1-Score,RF accuracy,RF F1-Score,XGB accuracy,XGB F1-Score
0,4491,0.932324,0.930513,0.739092,0.75729,0.968833,0.97018
1,4501,0.932324,0.930513,0.750668,0.768625,0.967943,0.969336


In [10]:
validation_eval_df.to_excel("../../output/others/20240107_select_k_validation_set.xlsx", index=False)