## Load libraries

In [3]:
import os
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

from mrmr import mrmr_classif

In [4]:
# Load the watermark IPython extension and print the versions of the imported
# packages, so the environment used for this run is recorded in the notebook.
%load_ext watermark
%watermark --iversions

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
matplotlib: 3.10.8
mrmr      : 0.2.8
numpy     : 2.3.5
pandas    : 2.3.3
sklearn   : 1.8.0
tensorflow: 2.20.0



## Load data

In [3]:
# Feature tables for the training and validation splits
# (file names suggest z-score normalisation — TODO confirm upstream).
x_train_Znorm=pd.read_csv('data/x_train_Znorm.csv')
x_val_Znorm=pd.read_csv('data/x_val_Znorm.csv')

# The label files hold 2-D arrays; argmax over axis 1 reduces every row to a
# single class index (presumably one-hot encoded labels — TODO confirm).
# The labels are kept as single-column DataFrames for the cells below.
y_train=pd.DataFrame(np.argmax(np.load('data/y_train.npy'),axis=1))
y_val=pd.DataFrame(np.argmax(np.load('data/y_val.npy'),axis=1))

# Calculation of class weights based on training data.
# compute_class_weight expects 1-D labels, so the single-column frame is
# flattened first instead of relying on scikit-learn's implicit conversion
# (which emits a DataConversionWarning).
y_train_flat = y_train.to_numpy().ravel()
classis = np.unique(y_train_flat)
weight_classis = compute_class_weight(class_weight='balanced', classes=classis, y=y_train_flat)
# Mapping {class label: weight}, passed as class_weight= to the estimators below.
weight_classis_dict = dict(zip(classis, weight_classis))

## Classification without feature selection

In [4]:
def _macro_scores(model, x_fit, y_fit, x_eval, y_eval):
    """Fit ``model`` and score its predictions on the evaluation split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; fitted in place.
    x_fit, y_fit : features and labels used for fitting.
    x_eval, y_eval : features and labels used for scoring.

    Returns
    -------
    list
        ``[accuracy, macro precision, macro recall, macro F1]`` read from
        ``classification_report(..., output_dict=True)``.
    """
    # Labels are flattened to 1-D so that a single-column DataFrame does not
    # trigger a DataConversionWarning in scikit-learn on every fit.
    model.fit(x_fit, np.ravel(y_fit))
    report_dict = classification_report(np.ravel(y_eval), model.predict(x_eval), output_dict=True)
    macro_avg = report_dict['macro avg']
    return [report_dict['accuracy'], macro_avg['precision'], macro_avg['recall'], macro_avg['f1-score']]


# Every model is trained on the full feature set and scored on the validation
# split; each *_scores list is [accuracy, precision, recall, f1].

#Gaussian Naive Bayes
nb_model = GaussianNB()
nb_scores = _macro_scores(nb_model, x_train_Znorm, y_train, x_val_Znorm, y_val)

#Random Forest
rf_model = RandomForestClassifier(n_estimators=5000,max_depth=None,class_weight=weight_classis_dict,random_state=5000,n_jobs=-1)
rf_scores = _macro_scores(rf_model, x_train_Znorm, y_train, x_val_Znorm, y_val)

#K nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5,weights='distance',n_jobs=-1)
knn_scores = _macro_scores(knn_model, x_train_Znorm, y_train, x_val_Znorm, y_val)

#Support Vector Machine
# NOTE(review): probability=True makes fit do extra internal cross-validation
# for predict_proba, which this cell never calls; kept so the fitted model is
# unchanged for any later use — consider dropping it to speed up training.
svm_model = SVC(kernel='rbf',class_weight=weight_classis_dict,probability=True,random_state=42)
svm_scores = _macro_scores(svm_model, x_train_Znorm, y_train, x_val_Znorm, y_val)

#Gradient Boosting
# NOTE(review): max_depth=100 is unusually deep for boosted trees — confirm intentional.
gb_model = GradientBoostingClassifier(n_estimators=300,learning_rate=0.1,max_depth=100,random_state=42)
gb_scores = _macro_scores(gb_model, x_train_Znorm, y_train, x_val_Znorm, y_val)

#Logistic Regression
lr_model = LogisticRegression(solver='lbfgs',class_weight=weight_classis_dict,max_iter=1000,random_state=42,n_jobs=-1)
lr_scores = _macro_scores(lr_model, x_train_Znorm, y_train, x_val_Znorm, y_val)


In [5]:
# Print a plain-text summary of the validation scores of every model trained
# without feature selection. Each *_scores sequence is indexed as
# [accuracy, precision, recall, f1].
for banner_line in (
    "ML model scores without feature selection:",
    "------------------------------------------------",
    "Model                   Accuracy,  Precision,       Recall,       F1-score",
):
    print(banner_line)

# Labels carry their own trailing padding so the columns line up in the output.
summary_rows = (
    ('Gaussian Naive Bayes    ', nb_scores),
    ('Random Forest           ', rf_scores),
    ('K nearest Neighbors     ', knn_scores),
    ('Support Vector Machine  ', svm_scores),
    ('Gradient Boosting       ', gb_scores),
    ('Logistic Regression     ', lr_scores),
)
for padded_label, metrics in summary_rows:
    print(f'{padded_label}acc={metrics[0]:.3f}, Precision={metrics[1]:.3f}, Recall={metrics[2]:.3f}, F1={metrics[3]:.3f}')

ML model scores without feature selection:
------------------------------------------------
Model                   Accuracy,  Precision,       Recall,       F1-score
Gaussian Naive Bayes    acc=0.800, Precision=0.814, Recall=0.801, F1=0.800
Random Forest           acc=0.833, Precision=0.839, Recall=0.829, F1=0.829
K nearest Neighbors     acc=0.867, Precision=0.876, Recall=0.861, F1=0.855
Support Vector Machine  acc=0.883, Precision=0.881, Recall=0.881, F1=0.879
Gradient Boosting       acc=0.883, Precision=0.872, Recall=0.874, F1=0.872
Logistic Regression     acc=0.900, Precision=0.909, Recall=0.899, F1=0.899


## Classification with mRMR feature selection

In [16]:
def _macro_scores(model, x_fit, y_fit, x_eval, y_eval):
    """Fit ``model`` and score its predictions on the evaluation split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; fitted in place.
    x_fit, y_fit : features and labels used for fitting.
    x_eval, y_eval : features and labels used for scoring.

    Returns
    -------
    list
        ``[accuracy, macro precision, macro recall, macro F1]`` read from
        ``classification_report(..., output_dict=True)``.
    """
    # Labels are flattened to 1-D so that a single-column DataFrame does not
    # trigger a DataConversionWarning in scikit-learn on every fit.
    model.fit(x_fit, np.ravel(y_fit))
    report_dict = classification_report(np.ravel(y_eval), model.predict(x_eval), output_dict=True)
    macro_avg = report_dict['macro avg']
    return [report_dict['accuracy'], macro_avg['precision'], macro_avg['recall'], macro_avg['f1-score']]


# Rank features once with mRMR on the training split; every K in the sweep
# then uses the first K entries of this ranking.
selected = mrmr_classif(X=x_train_Znorm, y=y_train, K=72)
# NOTE(review): range(10, 72, 1) stops at K=71, so the complete 72-feature
# ranking is never evaluated here — confirm that this is intended.
K_values = range(10, 72,1)

# One [accuracy, precision, recall, f1] row is appended per K and per model.
rf_scores = []
svm_scores = []
knn_scores = []
gb_scores = []
lr_scores = []
nb_scores = []

for K in K_values:
    top_k_features = selected[:K]
    x_train_sel = x_train_Znorm[top_k_features]
    x_val_sel = x_val_Znorm[top_k_features]

    #Gaussian Naive Bayes
    nb_model = GaussianNB()
    nb_scores.append(_macro_scores(nb_model, x_train_sel, y_train, x_val_sel, y_val))

    #Random Forest
    rf_model = RandomForestClassifier(n_estimators=5000,max_depth=None,class_weight=weight_classis_dict,random_state=5000,n_jobs=-1)
    rf_scores.append(_macro_scores(rf_model, x_train_sel, y_train, x_val_sel, y_val))

    #KNN
    knn_model = KNeighborsClassifier(n_neighbors=5,weights='distance',n_jobs=-1)
    knn_scores.append(_macro_scores(knn_model, x_train_sel, y_train, x_val_sel, y_val))

    #SVM
    # NOTE(review): probability=True adds internal cross-validation to every
    # fit although only predict() is used in this loop — consider dropping it.
    svm_model = SVC(kernel='rbf',class_weight=weight_classis_dict,probability=True,random_state=42)
    svm_scores.append(_macro_scores(svm_model, x_train_sel, y_train, x_val_sel, y_val))

    #Gradient Boosting
    gb_model = GradientBoostingClassifier(n_estimators=300,learning_rate=0.1,max_depth=100,random_state=42)
    gb_scores.append(_macro_scores(gb_model, x_train_sel, y_train, x_val_sel, y_val))

    #Logistic Regression
    lr_model = LogisticRegression(solver='lbfgs',class_weight=weight_classis_dict,max_iter=1000,random_state=42,n_jobs=-1)
    lr_scores.append(_macro_scores(lr_model, x_train_sel, y_train, x_val_sel, y_val))

    print(f'K={K} done')
    

100%|██████████| 72/72 [00:02<00:00, 28.70it/s]


K=10 done
K=11 done
K=12 done
K=13 done
K=14 done
K=15 done
K=16 done
K=17 done
K=18 done
K=19 done
K=20 done
K=21 done
K=22 done
K=23 done
K=24 done
K=25 done
K=26 done
K=27 done
K=28 done
K=29 done
K=30 done
K=31 done
K=32 done
K=33 done
K=34 done
K=35 done
K=36 done
K=37 done
K=38 done
K=39 done
K=40 done
K=41 done
K=42 done
K=43 done
K=44 done
K=45 done
K=46 done
K=47 done
K=48 done
K=49 done
K=50 done
K=51 done
K=52 done
K=53 done
K=54 done
K=55 done
K=56 done
K=57 done
K=58 done
K=59 done
K=60 done
K=61 done
K=62 done
K=63 done
K=64 done
K=65 done
K=66 done
K=67 done
K=68 done
K=69 done
K=70 done
K=71 done


In [17]:
# Turn each per-K list of [accuracy, precision, recall, f1] rows into a NumPy
# array so the cell below can slice by column.
rf_scores, svm_scores, knn_scores, gb_scores, lr_scores, nb_scores = (
    np.array(score_rows)
    for score_rows in (rf_scores, svm_scores, knn_scores, gb_scores, lr_scores, nb_scores)
)

In [18]:
# For every model, report the K with the highest validation accuracy together
# with the precision, recall and F1 measured at that same K. np.argmax returns
# the first maximum, so ties resolve to the smallest K.
# The 'Optimální K' text (Czech for "optimal K") is runtime output and is kept as is.
for model_tag, score_table in (
    ('NB', nb_scores),
    ('RF', rf_scores),
    ('KNN', knn_scores),
    ('SVM', svm_scores),
    ('GB', gb_scores),
    ('LR', lr_scores),
):
    best_row = np.argmax(score_table[:,0])
    print(f'Optimální K: {K_values[best_row]}, {model_tag} acc={np.max(score_table[:,0]):.3f}, Precision={score_table[best_row,1]:.3f}, Recall={score_table[best_row,2]:.3f}, F1={score_table[best_row,3]:.3f}')


Optimální K: 58, NB acc=0.817, Precision=0.828, Recall=0.818, F1=0.815
Optimální K: 34, RF acc=0.850, Precision=0.862, Recall=0.847, F1=0.849
Optimální K: 39, KNN acc=0.900, Precision=0.906, Recall=0.896, F1=0.894
Optimální K: 35, SVM acc=0.900, Precision=0.908, Recall=0.901, F1=0.900
Optimální K: 56, GB acc=0.883, Precision=0.872, Recall=0.874, F1=0.872
Optimální K: 40, LR acc=0.917, Precision=0.920, Recall=0.918, F1=0.916
